
Source Code for Module lxml.html.html5parser

  1  """ 
  2  An interface to html5lib. 
  3  """ 
  4   
try:
    from urllib import urlopen
except ImportError:
    # Python 3
    from urllib.request import urlopen
from html5lib import HTMLParser as _HTMLParser, XHTMLParser as _XHTMLParser
from lxml import etree
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE
from lxml.html._html5builder import TreeBuilder

# python3 compatibility
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)


class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False):
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder)


class XHTMLParser(_XHTMLParser):
    """An html5lib XHTML Parser with lxml as tree."""

    def __init__(self, strict=False):
        _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder)

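As an illustration (not part of the module source), either parser class can be
instantiated directly and passed to the helper functions defined below; the
module also creates shared ``html_parser`` and ``xhtml_parser`` instances at
the end of this file:

    from lxml.html.html5parser import HTMLParser, document_fromstring

    parser = HTMLParser()                    # html5lib parser building lxml elements
    root = document_fromstring('<title>demo</title>', parser=parser)
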
def _find_tag(tree, tag):
    elem = tree.find(tag)
    if elem is not None:
        return elem
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))


def document_fromstring(html, guess_charset=True, parser=None):
    """Parse a whole document from a string."""
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    return parser.parse(html, useChardet=guess_charset).getroot()

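A brief usage sketch (the HTML string is made up); the return value is the
root ``<html>`` element of the parsed document:

    from lxml.html import html5parser

    page = '<!DOCTYPE html><html><head><title>t</title></head><body><p>hi</p></body></html>'
    root = html5parser.document_fromstring(page)
    print(len(root))                         # 2: the head and body elements
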
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=False, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is `True` and the text was not unicode but a
    bytestring, the `chardet` library will perform charset guessing on the
    string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    children = parser.parseFragment(html, 'div', useChardet=guess_charset)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            del children[0]
    return children

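For illustration, how leading text is handled (the input strings are made up):

    from lxml.html import html5parser

    parts = html5parser.fragments_fromstring('hello <b>world</b>')
    # parts[0] is the leading text 'hello ', parts[1] is the <b> element

    html5parser.fragments_fromstring('hello <b>world</b>', no_leading_text=True)
    # raises lxml.etree.ParserError because of the leading text
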
def fragment_fromstring(html, create_parent=False,
                        guess_charset=False, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if create_parent:
        # create_parent=True wraps the content in a default <div>; a string
        # value is used as the tag name of the wrapper element.
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        html = '<%s>%s</%s>' % (create_parent, html, create_parent)

    children = fragments_fromstring(html, True, guess_charset, parser)
    if not children:
        raise etree.ParserError('No elements found')
    if len(children) > 1:
        raise etree.ParserError('Multiple elements found')

    result = children[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result

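A short sketch of the behaviours described above (inputs are made up):

    from lxml.html import html5parser

    el = html5parser.fragment_fromstring('<span>one</span>')      # a single element parses cleanly
    el = html5parser.fragment_fromstring('one <span>two</span>',
                                         create_parent='div')     # wrapped in a new <div> parent
    html5parser.fragment_fromstring('<i>a</i> <i>b</i>')          # ParserError: multiple elements
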
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # allow the prefix comparison below to work for byte input on Python 3
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

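A sketch of the heuristic (inputs are made up): a full document comes back as
the ``<html>`` root, a single element comes back as itself, and mixed content
is wrapped in a fake ``div`` or ``span`` container:

    from lxml.html import html5parser

    html5parser.fromstring('<!DOCTYPE html><html><body></body></html>')  # the <html> root
    html5parser.fromstring('<p>one paragraph</p>')                       # the <p> element itself
    html5parser.fromstring('text with <b>inline</b> markup').tag         # 'span': no block-level tags
    html5parser.fromstring('<p>one</p><p>two</p>').tag                   # 'div': block-level content
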
def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.
    """
    if parser is None:
        parser = html_parser
    if isinstance(filename_url_or_file, _strings):
        fp = urlopen(filename_url_or_file)
    else:
        fp = filename_url_or_file
    return parser.parse(fp, useChardet=guess_charset)


html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

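For illustration, parsing from an already-open file object (``page.html`` is a
hypothetical local file); ``parse()`` returns a tree, so ``getroot()`` is
needed to reach the document element:

    from lxml.html import html5parser

    with open('page.html', 'rb') as f:
        tree = html5parser.parse(f)
    root = tree.getroot()                    # the <html> element of the document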