lxml.html

1 """The ``lxml.html`` tool set for HTML handling. 2 """ 3 4 import threading 5 import re 6 try: 7 from urlparse import urljoin 8 except ImportError: 9 # Python 3 10 from urllib.parse import urljoin 11 import copy 12 from lxml import etree 13 from lxml.html import defs 14 from lxml import cssselect 15 from lxml.html._setmixin import SetMixin 16 try: 17 from UserDict import DictMixin 18 except ImportError: 19 # DictMixin was introduced in Python 2.4 20 from lxml.html._dictmixin import DictMixin 21 try: 22 set 23 except NameError: 24 # Python 2.3 25 from sets import Set as set 26 try: 27 bytes = __builtins__["bytes"] 28 except (KeyError, NameError): 29 # Python < 2.6 30 bytes = str 31 try: 32 unicode = __builtins__["unicode"] 33 except (KeyError, NameError): 34 # Python 3 35 unicode = str 36 try: 37 basestring = __builtins__["basestring"] 38 except (KeyError, NameError): 39 # Python 3 40 basestring = (str, bytes) 41

def __fix_docstring(s):
    """Rewrite prefixed string reprs at the start of lines in ``s``.

    On Python 3 a ``u'`` that starts a line (after optional whitespace)
    is replaced by ``'``; on Python 2 the same is done for ``b'``.
    A false value (``None``, ``''``) is returned unchanged.
    """
    if not s:
        return s
    import sys
    prefix = 'b'
    if sys.version_info[0] >= 3:
        prefix = 'u'
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)

# Public names exported by ``from lxml.html import *``.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and the same tag in the XHTML namespace (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)`` references; group 1 is the (possibly quoted)
# URL.  The outer parentheses are literal and therefore must be escaped.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# Matches ``@import "..."`` statements; group 1 is the imported URL.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space separated URL list of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

76 -def _unquote_match(s, pos):

77 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 78 return s[1:-1], pos+1 79 else: 80 return s,pos

81

def _transform_result(typ, result):
    """Convert the result back into the input type.

    For a byte string type the result is serialised as UTF-8, for a
    unicode type it is serialised to a unicode string; for any other
    type ``result`` is returned as it is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = unicode
    else:
        return result
    return tostring(result, encoding=encoding)

91

def _nons(tag):
    """Return ``tag`` with a leading XHTML namespace removed.

    Non-string tags and tags that do not start with ``{`` followed by
    the XHTML namespace are returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

97

class HtmlMixin(object):
    """
    Mix-in with the HTML specific convenience API: document level
    properties (``base_url``, ``forms``, ``body``, ``head``, ``label``),
    tree editing helpers and the link inspection/rewriting functions.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        el_id = self.get('id')
        if not el_id:
            return None
        result = _label_xpath(self, id=el_id)
        if not result:
            return None
        else:
            return result[0]
    def _label__set(self, label):
        el_id = self.get('id')
        if not el_id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', el_id)
    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding=unicode))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element by its children, in place.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
        that pre-compiling the expression can provide a substantial
        speedup.
        """
        return cssselect.CSSSelector(expr)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no ``base_url`` is given and the document
        has none either.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urljoin(base_url, href)
        # <base href> has either been handled just above or must be left
        # alone, so rewrite_links() must not resolve it (again) itself.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
        # All <base> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            else:
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # archive holds a space separated list of URLs
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                for match in _css_url_re.finditer(el.text):
                    url, start = _unquote_match(match.group(1), match.start(1))
                    yield (el, None, url, start)
                for match in _css_import_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
            if 'style' in attribs:
                for match in _css_url_re.finditer(attribs['style']):
                    url, start = _unquote_match(match.group(1), match.start(1))
                    yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        # NOTE: ``pos`` values come from iterlinks() and refer to the text
        # as it was when the matches were computed; they are not adjusted
        # after an earlier replacement in the same text changed its length.
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue
            if attrib is None:
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.attrib[attrib]
                if not pos and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                    el.attrib[attrib] = new

406 407

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to call on the document
        # copy: default for the ``copy`` keyword of __call__
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring()`` first and may not
        be combined with the ``copy`` keyword (TypeError).  For an element
        the ``copy`` keyword (default: the value given to the constructor)
        selects whether the method runs on a deep copy.
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Do not call the flag ``copy``: that would shadow the ``copy``
            # module that is needed for the deep copy below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result

# Function forms of the HtmlMixin methods of the same name.

# These are created with copy=False.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
iterlinks = _MethodFunc('iterlinks', copy=False)

# These are created with copy=True.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment class (``etree.CommentBase``) extended with ``HtmlMixin``."""

452

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class (``etree.ElementBase``) extended with ``HtmlMixin``."""

455

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class (``etree.PIBase``) extended with ``HtmlMixin``."""

458

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class (``etree.EntityBase``) extended with ``HtmlMixin``."""

461 462

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.
    """
    # tag name -> Element class; filled in by the element modules
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        """
        ``classes`` defaults to a copy of the registered default classes.
        A ``classes`` dict passed by the caller is updated in place when
        ``mixins`` are given.  ``mixins`` may be a mapping or an iterable
        of ``(tag name, mixin class)`` pairs.
        """
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The documented form is a mapping; iterating a dict directly
            # would only yield its keys, so go through items() if there is
            # one and keep accepting plain sequences of pairs as well.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type; element
        classes are looked up by lower-cased tag name.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

503 504 ################################################################################ 505 # parsing 506 ################################################################################ 507

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a complete document and return its root element.

    Uses the module's ``html_parser`` unless ``parser`` is given; extra
    keyword arguments are passed on to ``etree.fromstring()``.  Raises
    ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")

516

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (text that is only
    whitespace is dropped).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    looks_complete = start.startswith('<html') or start.startswith('<!doctype')
    if not looks_complete:
        # Wrap a bare fragment so that the parser sees a whole document.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

550

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.
    The tag is ``div`` unless create_parent is a string.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        wrapper = create_parent
        if not isinstance(wrapper, basestring):
            wrapper = 'div'
        wrapped = '<%s>%s</%s>' % (wrapper, html, wrapper)
        return fragment_fromstring(wrapped, parser=parser,
                                   base_url=base_url, **kw)
    elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                    base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    result = elements[0]
    tail = result.tail
    if tail and tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % tail)
    # Only whitespace followed the element; drop it.
    result.tail = None
    return result

586

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  The result is the whole document if the
    input looks like one or contains a <head>, the single element if the
    input held just one, or otherwise the parsed <body> renamed to
    ``div`` or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    is_full_html = start.startswith('<html') or start.startswith('<!doctype')
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        # Looks like a full HTML document
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <body> nor <head> was found; there is nothing to
        # unwrap, so hand back the document as it was parsed.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

649

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    tree = etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
    return tree

662

def _contains_block_level_tag(el):
    """
    Return True if ``el`` or any of its descendants has a tag listed in
    ``defs.block_tags`` (XHTML namespace ignored), else False.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.iter():
        tag = _nons(node.tag)
        if tag in defs.block_tags:
            return True
    return False

670

def _element_name(el):
    """
    Return a short name for an item of a parsed fragment list:
    ``'comment'`` for comments, ``'string'`` for text, otherwise the
    tag name without XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

678 679 ################################################################################ 680 # form handling 681 ################################################################################ 682

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        # Assigning a mapping sets the given fields and resets every other
        # named field of the form to None.
        fields = self.fields
        prev_keys = list(fields.keys())
        # items() instead of the Python 2 only iteritems()
        for key, item in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = item
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Name used in reprs: the name attribute, else '#' + id, else the
        # index of this form among the forms of the document body.
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs and submit/image/reset
        inputs are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.  When the document has
        a base URL, the value read is joined with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Use FormElement for <form> tags by default.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; for the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; it is appended to the values of the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        values.extend(extras)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, values)

822

def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form()``, based on the standard library.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as
    query string; for any other method they are sent as request body.
    Returns the response object of ``urlopen()``.  Raises ValueError if
    no URL is given.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:
        # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3: the request body must be bytes
            data = data.encode('ASCII')
    return urlopen(url, data)

836

class FieldsDict(DictMixin):
    """
    Dictionary-like view on the values of the inputs held by an
    input accessor: items are read from and written to the ``.value``
    of ``inputs[name]``.  Items cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        inputs = self.inputs
        return inputs.keys()

    def __contains__(self, item):
        inputs = self.inputs
        return item in inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)

857

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # select/input/textarea descendants with a given name / all of them;
    # matching on local-name() ignores the namespace of the elements.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        first = results[0]
        input_type = first.get('type')
        if len(results) > 1:
            # Several radio buttons or checkboxes with one name form a group.
            group = None
            if input_type == 'radio':
                group = RadioGroup(results)
            elif input_type == 'checkbox':
                group = CheckboxGroup(results)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        names = set([el.name for el in self])
        # unnamed inputs have no key
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)

923

class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a missing name is not an error.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # The type is only shown for elements that have a non-empty one.
        suffix = ''
        input_type = getattr(self, 'type', None)
        if input_type:
            suffix = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)

951

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        content = self.text
        if content:
            return content
        return ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element.
        self.text = ''

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

# Use TextareaElement for <textarea> tags by default.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  Either way, ``.value_options`` lists the
    values that can be chosen.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # The first selected option wins.  Without a ``value``
            # attribute the option text is used instead.
            chosen = option.get('value')
            if chosen is None:
                chosen = option.text or ''
            if chosen:
                chosen = chosen.strip()
            return chosen
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            value = value.strip()
            # Find the option to select before touching any attribute,
            # so that an unknown value leaves the element unchanged.
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in _options_xpath(self):
            current = option.get('value')
            if current is None:
                current = option.text or ''
            if current:
                current = current.strip()
            values.append(current)
        return values

    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set,
                        doc=_multiple__get.__doc__)


# Map the 'select' tag name to SelectElement in the lookup's default
# element class table.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement


class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.

    The value of an option is its ``value`` attribute or, when that
    attribute is missing, its stripped text.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    options = property(options)

    def _option_value(self, option):
        # An <option> without a ``value`` attribute is identified by
        # its text content.
        value = option.get('value')
        if value is None:
            value = (option.text or '').strip()
        return value

    def __iter__(self):
        # Only selected options are members of the set.  Yielding
        # unselected ones as well would make any generic set operation
        # built on iteration plus remove() fail on them.
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the option whose value is ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the option whose value is ``item``.

        Raises ValueError if no option has that value or if the
        matching option is not currently selected.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1119

class RadioGroup(list):
    """
    A list of the ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` returns the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Find the radio to check before modifying anything, so an
            # unknown value leaves the group as it was.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        items = list.__repr__(self)
        return '%s(%s)' % (class_name, items)

1171

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching the current selection so that a
        # rejected assignment leaves the checkboxes as they were.
        # Strings are rejected explicitly: on Python 3 they have
        # ``__iter__`` and would otherwise be consumed one character
        # at a time.
        if not hasattr(value, '__iter__') or isinstance(value, basestring):
            # An empty group has no element to take the name from.
            if self:
                name = self[0].name
            else:
                name = None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1203

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the checkbox whose value is ``value``.

        Raises KeyError if the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the checkbox whose value is ``value``.

        Raises KeyError if the group has no such checkbox or if the
        matching checkbox is already unchecked.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def __repr__(self):
        group = self.group
        try:
            name = group.name
        except AttributeError:
            # A plain list-like group carries no ``name`` of its own;
            # every member shares the same name, so use the first
            # member's (None for an empty group).
            name = None
            for el in group:
                name = el.name
                break
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            name)

1246

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``).

    The value can be read and written through ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is
    false for all other types) and a boolean attribute ``.checked``.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only a string replaces the ``value`` attribute; any
            # other true value just checks the input.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)


# Map the 'input' tag name to InputElement in the lookup's default
# element class table.
HtmlElementClassLookup._default_element_classes['input'] = InputElement


class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        id = self.get('for')
        if not id:
            return None
        try:
            return self.body.get_element_by_id(id)
        except KeyError:
            # NOTE(review): assumes get_element_by_id() signals an
            # unknown id with KeyError; this honours the documented
            # "return None" contract in that case.
            return None

    def _for_element__set(self, other):
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    def _for_element__del(self):
        # The link to the target lives in this label's ``for``
        # attribute, so that is what gets removed; the label's own
        # ``id`` is left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set,
                           _for_element__del,
                           doc=_for_element__get.__doc__)


# Map the 'label' tag name to LabelElement in the lookup's default
# element class table.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################


def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, converting the tree to XHTML in place.

    ``html`` may be an element or anything with a ``getroot()`` method.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # Already an element: there is no getroot() to call.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter():
        name = node.tag
        # Nodes whose tag is not a string are skipped, as are tags that
        # already carry a namespace.
        if isinstance(name, basestring) and name[0] != '{':
            node.tag = xhtml_prefix + name

1382

def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to HTML in place.

    ``xhtml`` may be an element or anything with a ``getroot()`` method.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # Already an element: there is no getroot() to call.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    skip = len(xhtml_prefix)
    # Only elements in the XHTML namespace are visited, so each tag is
    # known to start with the prefix being cut off.
    for node in xhtml.iter(xhtml_prefix + "*"):
        qualified = node.tag
        node.tag = qualified[skip:]


# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Two ``sub`` callables over the same pattern: one built from a text
# pattern, one from its ASCII-encoded (bytes) form.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialized = etree.tostring(doc, method=method,
                                pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    # The serialiser may return text or bytes; use the substitution
    # helper that matches the type of the result.
    if isinstance(serialized, str):
        return __str_replace_meta_content_type('', serialized)
    return __bytes_replace_meta_content_type(bytes(), serialized)


# Drop the string-literal prefix (``u'`` on Python 3, ``b'`` otherwise)
# from the start of the docstring's example output lines.
tostring.__doc__ = __fix_docstring(tostring.__doc__)


def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write()`` method, or an element,
    which is then wrapped in an ``ElementTree`` for writing.  The
    temporary file is not removed afterwards.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file atomically under a unique name.
    # os.tempnam() only returned a name (leaving a window for another
    # process to claim it) and is not available on Python 3.
    handle, fn = tempfile.mkstemp(suffix='.html')
    # The document is written by path below, so the descriptor is not
    # needed.
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1468 1469 ################################################################################ 1470 # configure Element class lookup 1471 ################################################################################ 1472

class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` that uses ``HtmlElementClassLookup`` to
    choose its element classes.
    """

    def __init__(self, **options):
        super(HTMLParser, self).__init__(**options)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1477

class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` that uses ``HtmlElementClassLookup`` to
    choose its element classes.
    """

    def __init__(self, **options):
        super(XHTMLParser, self).__init__(**options)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1482

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are passed on unchanged to the ``makeelement()``
    method of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level parser instances; ``Element()`` creates its elements
# through ``html_parser``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()


Source Code for Package lxml.html