"""The ``lxml.html`` tool set for HTML handling. """ import threading import re try: from urlparse import urljoin except ImportError: # Python 3 from urllib.parse import urljoin import copy from lxml import etree from lxml.html import defs from lxml import cssselect from lxml.html._setmixin import SetMixin try: from UserDict import DictMixin except ImportError: # DictMixin was introduced in Python 2.4 from lxml.html._dictmixin import DictMixin try: set except NameError: # Python 2.3 from sets import Set as set try: bytes = __builtins__["bytes"] except (KeyError, NameError): # Python < 2.6 bytes = str try: unicode = __builtins__["unicode"] except (KeyError, NameError): # Python 3 unicode = str try: basestring = __builtins__["basestring"] except (KeyError, NameError): # Python 3 basestring = (str, bytes) def __fix_docstring(s): if not s: return s import sys if sys.version_info[0] >= 3: sub = re.compile(r"^(\s*)u'", re.M).sub else: sub = re.compile(r"^(\s*)b'", re.M).sub return sub(r"\1'", s) __all__ = [ 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute', 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", namespaces={'x':XHTML_NAMESPACE}) _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", namespaces={'x':XHTML_NAMESPACE}) _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", namespaces={'x':XHTML_NAMESPACE}) #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") _collect_string_content = etree.XPath("string()") _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I) _css_import_re = re.compile(r'@import "(.*?)"') _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", namespaces={'x':XHTML_NAMESPACE}) _archive_re = re.compile(r'[^ ]+') def _unquote_match(s, pos): if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": return s[1:-1], pos+1 else: return s,pos def _transform_result(typ, result): """Convert the result back into the input type. """ if issubclass(typ, bytes): return tostring(result, encoding='utf-8') elif issubclass(typ, unicode): return tostring(result, encoding=unicode) else: return result def _nons(tag): if isinstance(tag, basestring): if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: return tag.split('}')[-1] return tag class HtmlMixin(object): def base_url(self): """ Returns the base URL, given when the page was parsed. Use with ``urlparse.urljoin(el.base_url, href)`` to get absolute URLs. """ return self.getroottree().docinfo.URL base_url = property(base_url, doc=base_url.__doc__) def forms(self): """ Return a list of all the forms """ return _forms_xpath(self) forms = property(forms, doc=forms.__doc__) def body(self): """ Return the element. Can be called from a child element to get the document's head. """ return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] body = property(body, doc=body.__doc__) def head(self): """ Returns the element. Can be called from a child element to get the document's head. """ return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] head = property(head, doc=head.__doc__) def _label__get(self): """ Get or set any