"""The ``lxml.html`` tool set for HTML handling. """ import threading import re try: from urlparse import urljoin except ImportError: # Python 3 from urllib.parse import urljoin import copy from lxml import etree from lxml.html import defs from lxml import cssselect from lxml.html._setmixin import SetMixin try: from UserDict import DictMixin except ImportError: # DictMixin was introduced in Python 2.4 from lxml.html._dictmixin import DictMixin try: set except NameError: # Python 2.3 from sets import Set as set try: bytes = __builtins__["bytes"] except (KeyError, NameError): # Python < 2.6 bytes = str try: unicode = __builtins__["unicode"] except (KeyError, NameError): # Python 3 unicode = str try: basestring = __builtins__["basestring"] except (KeyError, NameError): # Python 3 basestring = (str, bytes) def __fix_docstring(s): if not s: return s import sys if sys.version_info[0] >= 3: sub = re.compile(r"^(\s*)u'", re.M).sub else: sub = re.compile(r"^(\s*)b'", re.M).sub return sub(r"\1'", s) __all__ = [ 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute', 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", namespaces={'x':XHTML_NAMESPACE}) _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", namespaces={'x':XHTML_NAMESPACE}) _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", namespaces={'x':XHTML_NAMESPACE}) #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") _collect_string_content = etree.XPath("string()") _css_url_re = re.compile(r'url$('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)$', re.I) _css_import_re = re.compile(r'@import "(.*?)"') _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", namespaces={'x':XHTML_NAMESPACE}) _archive_re = re.compile(r'[^ ]+') def _unquote_match(s, pos): if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": return s[1:-1], pos+1 else: return s,pos def _transform_result(typ, result): """Convert the result back into the input type. """ if issubclass(typ, bytes): return tostring(result, encoding='utf-8') elif issubclass(typ, unicode): return tostring(result, encoding=unicode) else: return result def _nons(tag): if isinstance(tag, basestring): if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: return tag.split('}')[-1] return tag class HtmlMixin(object): def base_url(self): """ Returns the base URL, given when the page was parsed. Use with ``urlparse.urljoin(el.base_url, href)`` to get absolute URLs. """ return self.getroottree().docinfo.URL base_url = property(base_url, doc=base_url.__doc__) def forms(self): """ Return a list of all the forms """ return _forms_xpath(self) forms = property(forms, doc=forms.__doc__) def body(self): """ Return the element. Can be called from a child element to get the document's head. """ return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] body = property(body, doc=body.__doc__) def head(self): """ Returns the element. Can be called from a child element to get the document's head. """ return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] head = property(head, doc=head.__doc__) def _label__get(self): """ Get or set any element associated with this element. """ id = self.get('id') if not id: return None result = _label_xpath(self, id=id) if not result: return None else: return result[0] def _label__set(self, label): id = self.get('id') if not id: raise TypeError( "You cannot set a label for an element (%r) that has no id" % self) if _nons(label.tag) != 'label': raise TypeError( "You can only assign label to a label element (not %r)" % label) label.set('for', id) def _label__del(self): label = self.label if label is not None: del label.attrib['for'] label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) def drop_tree(self): """ Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent. """ parent = self.getparent() assert parent is not None if self.tail: previous = self.getprevious() if previous is None: parent.text = (parent.text or '') + self.tail else: previous.tail = (previous.tail or '') + self.tail parent.remove(self) def drop_tag(self): """ Remove the tag, but not its children or text. The children and text are merged into the parent. Example:: >>> h = fragment_fromstring('

Hello World!

') >>> h.find('.//b').drop_tag() >>> print(tostring(h, encoding=unicode))

Hello World!

""" parent = self.getparent() assert parent is not None previous = self.getprevious() if self.text and isinstance(self.tag, basestring): # not a Comment, etc. if previous is None: parent.text = (parent.text or '') + self.text else: previous.tail = (previous.tail or '') + self.text if self.tail: if len(self): last = self[-1] last.tail = (last.tail or '') + self.tail elif previous is None: parent.text = (parent.text or '') + self.tail else: previous.tail = (previous.tail or '') + self.tail index = parent.index(self) parent[index:index+1] = self[:] def find_rel_links(self, rel): """ Find any links like ``...``; returns a list of elements. """ rel = rel.lower() return [el for el in _rel_links_xpath(self) if el.get('rel').lower() == rel] def find_class(self, class_name): """ Find any elements with the given class name. """ return _class_xpath(self, class_name=class_name) def get_element_by_id(self, id, *default): """ Get the first element in a document with the given id. If none is found, return the default argument if provided or raise KeyError otherwise. Note that there can be more than one element with the same id, and this isn't uncommon in HTML documents found in the wild. Browsers return only the first match, and this function does the same. """ try: # FIXME: should this check for multiple matches? # browsers just return the first one return _id_xpath(self, id=id)[0] except IndexError: if default: return default[0] else: raise KeyError(id) def text_content(self): """ Return the text content of the tag (and the text in any children). """ return _collect_string_content(self) def cssselect(self, expr): """ Run the CSS expression on this element and its children, returning a list of the results. Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note that pre-compiling the expression can provide a substantial speedup. """ return cssselect.CSSSelector(expr)(self) ######################################## ## Link functions ######################################## def make_links_absolute(self, base_url=None, resolve_base_href=True): """ Make all links in the document absolute, given the ``base_url`` for the document (the full URL where the document came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. If ``resolve_base_href`` is true, then any ```` tags in the document are used *and* removed from the document. If it is false then any such tag is ignored. """ if base_url is None: base_url = self.base_url if base_url is None: raise TypeError( "No base_url given, and the document has no base_url") if resolve_base_href: self.resolve_base_href() def link_repl(href): return urljoin(base_url, href) self.rewrite_links(link_repl) def resolve_base_href(self): """ Find any ```` tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied. """ base_href = None basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}) for b in basetags: base_href = b.get('href') b.drop_tree() if not base_href: return self.make_links_absolute(base_href, resolve_base_href=False) def iterlinks(self): """ Yield (element, attribute, link, pos), where attribute may be None (indicating the link is in the text). ``pos`` is the position where the link occurs; often 0, but sometimes something else in the case of links in stylesheets or style tags. Note: is *not* taken into account in any way. The link you get is exactly the link in the document. """ link_attrs = defs.link_attrs for el in self.iter(): attribs = el.attrib tag = _nons(el.tag) if tag != 'object': for attrib in link_attrs: if attrib in attribs: yield (el, attrib, attribs[attrib], 0) elif tag == 'object': codebase = None ##