1 """
2 An interface to html5lib.
3 """
4
5 import urllib
6 from html5lib import HTMLParser as _HTMLParser, XHTMLParser as _XHTMLParser
7 from lxml import etree
8 from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE
9 from lxml.html._html5builder import TreeBuilder
10
11
12 try:
13 _strings = basestring
14 except NameError:
15 _strings = (bytes, str)
16
17
19 """An html5lib HTML parser with lxml as tree."""
20
23
24
26 """An html5lib XHTML Parser with lxml as tree."""
27
30
31
37
38
48
49
52 """Parses several HTML elements, returning a list of elements.
53
54 The first item in the list may be a string. If no_leading_text is true,
55 then it will be an error if there is leading text, and it will always be
56 a list of only elements.
57
58 If `guess_charset` is `True` and the text was not unicode but a
59 bytestring, the `chardet` library will perform charset guessing on the
60 string.
61 """
62 if not isinstance(html, _strings):
63 raise TypeError('string required')
64
65 if parser is None:
66 parser = html_parser
67
68 children = parser.parseFragment(html, 'div', useChardet=guess_charset)
69 if children and isinstance(children[0], _strings):
70 if no_leading_text:
71 if children[0].strip():
72 raise etree.ParserError('There is leading text: %r' %
73 children[0])
74 del children[0]
75 return children
76
77
80 """Parses a single HTML element; it is an error if there is more than
81 one element, or if anything but whitespace precedes or follows the
82 element.
83
84 If create_parent is true (or is a tag name) then a parent node
85 will be created to encapsulate the HTML in a single element.
86 """
87 if not isinstance(html, _strings):
88 raise TypeError('string required')
89
90 if create_parent:
91 container = create_parent or 'div'
92 html = '<%s>%s</%s>' % (container, html, container)
93
94 children = fragments_fromstring(html, True, guess_charset, parser)
95 if not children:
96 raise etree.ParserError('No elements found')
97 if len(children) > 1:
98 raise etree.ParserError('Multiple elements found')
99
100 result = children[0]
101 if result.tail and result.tail.strip():
102 raise etree.ParserError('Element followed by text: %r' % result.tail)
103 result.tail = None
104 return result
105
106
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    `html` must be a string (text or bytes); anything else raises
    ``TypeError``.  `guess_charset` and `parser` are forwarded unchanged to
    ``document_fromstring``.

    The return value is one of:

    * the parsed document, when the input starts with ``<html`` or a doctype,
      or when the parsed ``<head>`` has children;
    * the only child of ``<body>``, when the body holds exactly one element
      and no surrounding non-whitespace text;
    * otherwise the ``<body>`` element itself, renamed to ``div`` (if it
      contains a block-level tag) or ``span``.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Sniff the first 50 characters: a leading <html> or doctype means the
    # caller passed a complete document.
    start = html[:50]
    if isinstance(start, bytes):
        # bytes.startswith() rejects str prefixes on Python 3, so compare as
        # text.  Only ASCII markers are looked for, so undecodable bytes can
        # safely be replaced.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # A non-empty head also means we were given a full document.
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has exactly one element and no significant text around it,
    # so it was probably a single element that was passed in.
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Otherwise the body holds a mix of tags/text.  Reuse it as a neutral
    # container by renaming it, since a <body> tag implies too much
    # structure for a fragment.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
147
148
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    `filename_url_or_file` may be a string naming a local file or a URL, or
    an already opened file-like object.  `guess_charset` is passed to the
    parser as ``useChardet``.  `parser` defaults to the module-level
    ``html_parser``.

    Streams opened by this function are closed before returning; a file-like
    object supplied by the caller is left open.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # Caller-owned stream: parse it as is and do not close it.
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    # Imported here so the function works on both Python 3 and Python 2;
    # ``urllib.urlopen`` only exists on Python 2.
    try:
        from urllib.request import urlopen
        from urllib.parse import urlparse
    except ImportError:
        from urllib import urlopen
        from urlparse import urlparse

    # A scheme longer than one character marks a URL.  An empty scheme is a
    # plain path, and a single letter is treated as a Windows drive letter
    # ("C:\\page.html"), i.e. also a local file.
    scheme = urlparse(filename_url_or_file)[0]
    if len(scheme) > 1:
        fp = urlopen(filename_url_or_file)
    else:
        fp = open(filename_url_or_file, 'rb')
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        # We opened the stream, so we are responsible for releasing it.
        fp.close()
161
162
# Shared module-level parser instances, created once at import time.
# ``html_parser`` is the fallback the functions in this module use when they
# are called with ``parser=None``.
html_parser = HTMLParser()
# XHTML counterpart; pass it explicitly as ``parser=`` to use it.
xhtml_parser = XHTMLParser()
165