"""The ``lxml.html`` tool set for HTML handling.
"""
import threading
import re
try:
from urlparse import urljoin
except ImportError:
# Python 3
from urllib.parse import urljoin
import copy
from lxml import etree
from lxml.html import defs
from lxml import cssselect
from lxml.html._setmixin import SetMixin
try:
from UserDict import DictMixin
except ImportError:
# DictMixin was introduced in Python 2.4
from lxml.html._dictmixin import DictMixin
try:
set
except NameError:
# Python 2.3
from sets import Set as set
# String-type compatibility aliases.
#
# These are resolved by plain name lookup rather than by subscripting
# ``__builtins__``: that name is a dict in imported modules but a module
# object in ``__main__``, where ``__builtins__["bytes"]`` raises a TypeError
# that the old ``except (KeyError, NameError)`` clauses did not catch.
# The self-assignments keep the names bound as module attributes.
try:
    bytes = bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    unicode = unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring = basestring
except NameError:
    # Python 3
    basestring = (str, bytes)
def __fix_docstring(s):
    """Strip the string-prefix that does not apply to the running Python.

    On Python 3 a ``u'`` that opens a line (after optional whitespace) is
    reduced to ``'``; on Python 2 the same is done for ``b'``.  Falsy input
    (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    import sys
    if sys.version_info[0] >= 3:
        prefix_pattern = r"^(\s*)u'"
    else:
        prefix_pattern = r"^(\s*)b'"
    # re.M lets ``^`` anchor at the start of every line of the docstring.
    strip_prefix = re.compile(prefix_pattern, re.M).sub
    return strip_prefix(r"\1'", s)
# Public API of the module.  Each name is listed exactly once
# ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
# Namespace URI used to recognise XHTML-namespaced elements; the XPath
# expressions below bind it to the prefix ``x``.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
# <a> elements (plain or XHTML-namespaced) that carry a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
namespaces={'x':XHTML_NAMESPACE})
# Alternative class matcher based on EXSLT regular expressions (disabled):
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose space-separated ``class`` attribute contains $class_name as
# a whole word: both sides are padded with spaces before the substring test.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements at or below the context node whose ``id`` equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# ``url(...)`` references in CSS; group 1 is the argument, still including any
# surrounding double or single quotes.  Matched case-insensitively.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# ``@import "..."`` statements in CSS (double-quoted form only); group 1 is
# the imported URL.
_css_import_re = re.compile(r'@import "(.*?)"')
# <label> elements anywhere in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
namespaces={'x':XHTML_NAMESPACE})
# One run of non-space characters, used to split an ``archive`` attribute
# into its individual entries.
_archive_re = re.compile(r'[^ ]+')
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` starts and ends with the same
    quote character (``"`` or ``'``) the quotes are removed and ``pos`` is
    advanced by one to point at the unquoted text; otherwise both values
    are returned unchanged.
    """
    first, last = s[:1], s[-1:]
    if first == last and first in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
def _transform_result(typ, result):
    """Serialize ``result`` to match the type the caller originally passed.

    ``typ`` is the type of the original input: for a byte string type the
    result is serialized as UTF-8, for a unicode string type it is
    serialized to unicode, and for anything else ``result`` is returned
    as it is.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result
def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    String tags of the form ``{<XHTML namespace>...}name`` are reduced to
    the part after the last ``}``.  Any other value, including non-string
    tags, is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    in_xhtml = tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE
    if in_xhtml:
        return tag.split('}')[-1]
    return tag
class HtmlMixin(object):
    """
    Mixin with the HTML-specific helpers (properties, tree surgery and
    link handling) shared by the HTML node classes of this module.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding=unicode))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows our last child once the children are
                # spliced into the parent.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of
        elements.  The comparison is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
        that pre-compiling the expression can provide a substantial
        speedup.
        """
        return cssselect.CSSSelector(expr)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no base URL is available at all.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urljoin(base_url, href)
        self.rewrite_links(link_repl)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
        # All <base href> tags are dropped; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        When one attribute or one ``<style>`` text holds several links
        (``archive`` attributes, CSS ``url()``/``@import``), they are
        yielded from the highest position to the lowest, so that a
        consumer which rewrites each link in place as it is yielded
        never invalidates the positions still to come.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            elif tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # Collect first, then report back to front (see docstring).
                    archives = list(_archive_re.finditer(el.get('archive')))
                    archives.reverse()
                    for match in archives:
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                # Gather url() and @import matches against one snapshot of
                # the text, then report them highest position first.
                css_text = el.text
                found = []
                for match in _css_url_re.finditer(css_text):
                    url, start = _unquote_match(match.group(1), match.start(1))
                    found.append((start, url))
                for match in _css_import_re.finditer(css_text):
                    found.append((match.start(1), match.group(1)))
                found.sort()
                found.reverse()
                for start, url in found:
                    yield (el, None, url, start)
            if 'style' in attribs:
                found = []
                for match in _css_url_re.finditer(attribs['style']):
                    url, start = _unquote_match(match.group(1), match.start(1))
                    found.append((start, url))
                found.reverse()
                for start, url in found:
                    yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        # iterlinks() reports several links in one container back to front,
        # so splicing at ``pos`` stays valid while we edit in place.
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                elif attrib in el.attrib:
                    # May already be gone if an earlier link in the same
                    # attribute was removed.
                    del el.attrib[attrib]
                continue
            if attrib is None:
                text = el.text
                if not text:
                    # Text was cleared by an earlier removal; nothing left
                    # to splice into.
                    continue
                el.text = text[:pos] + new_link + text[pos+len(link):]
            else:
                cur = el.get(attrib)
                if cur is None:
                    # Attribute was deleted by an earlier removal.
                    continue
                if not pos and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    el.attrib[attrib] = cur[:pos] + new_link + cur[pos+len(link):]
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to look up on the document.
        # copy: default for whether element inputs are deep-copied first.
        # source_class: class whose method docstring is reused.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        ``doc`` is either an element or an HTML string (which is parsed
        with ``fromstring`` first).  For element input, the keyword
        ``copy`` overrides the default given to the constructor; it is
        a TypeError to pass ``copy`` together with string input.
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Keep the flag under its own name: binding it to ``copy`` would
            # shadow the ``copy`` module needed on the next line.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result
# Module-level function forms of the HtmlMixin methods of the same name.
# ``copy=True`` is the default for the wrappers whose method edits the
# document (make_links_absolute, resolve_base_href, rewrite_links);
# the read-only lookups default to ``copy=False``.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that adds the ``HtmlMixin`` helpers to ``etree.CommentBase``."""
    pass
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that adds the ``HtmlMixin`` helpers to ``etree.ElementBase``."""
    pass
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that adds the ``HtmlMixin`` helpers to ``etree.PIBase``."""
    pass
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that adds the ``HtmlMixin`` helpers to ``etree.EntityBase``."""
    pass
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which would
            # then be mis-unpacked as (name, value); use its items instead.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # Apply to every tag that has an explicit class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class for a node, or None to fall back to the default lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
################################################################################
# parsing
################################################################################
def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return the resulting root.

    Uses the module's ``html_parser`` unless ``parser`` is given; extra
    keyword arguments are passed on to ``etree.fromstring``.  Raises
    ``etree.ParserError`` when parsing yields no root at all.
    """
    active_parser = parser
    if active_parser is None:
        active_parser = html_parser
    root = etree.fromstring(html, active_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    # NOTE(review): the markup literals and the tail of this function were
    # lost from the file (tag stripping); everything from the
    # ``startswith`` test to the ``return`` is reconstructed -- confirm
    # against version control.
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    if not start.startswith('<html') and not start.startswith('<!doctype'):
        # Wrap bare fragments so the parser builds a predictable
        # html/body skeleton around them.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if no_leading_text and body.text and body.text.strip():
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if body.text and body.text.strip():
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificially makes
    # some assumptions
    return elements

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    # NOTE(review): the signature, docstring and ``create_parent`` branch
    # header were lost from the file and are reconstructed -- confirm
    # against version control.
    if parser is None:
        parser = html_parser
    if create_parent:
        if not isinstance(create_parent, basestring):
            # Any true non-string value selects the default wrapper tag.
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent),
            parser=parser, base_url=base_url, **kw)
    elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                    base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Whitespace-only tail is discarded.
    el.tail = None
    return el
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    # NOTE(review): the lines between the ``startswith`` test and the
    # multiple-bodies check were lost from the file (tag stripping) and are
    # reconstructed -- confirm against version control.
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body: nothing to unwrap, and the checks below
        # would otherwise call len() on None.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The return value is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when the input is a file-like object.  Without ``parser`` the
    module's ``html_parser`` is used.
    """
    if parser is not None:
        chosen_parser = parser
    else:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
def _contains_block_level_tag(el):
    """Return True if ``el`` or any node below it has a tag listed in ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False
def _element_name(el):
    """Return a short label for ``el`` for use in error messages.

    Comments give ``'comment'``, plain strings give ``'string'`` and
    anything else gives its tag with the XHTML namespace removed.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
################################################################################
# form handling
################################################################################
class FormElement(HtmlElement):
"""
Represents a