# -*- coding: utf-8 -*-
#
# Copyright (C) 2006-2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.
"""Support for constructing markup streams from files, strings, or other
sources.
"""
from itertools import chain
from xml.parsers import expat
try:
frozenset
except NameError:
from sets import ImmutableSet as frozenset
import HTMLParser as html
import htmlentitydefs
from StringIO import StringIO
from genshi.core import Attrs, QName, Stream, stripentities
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, END_NS, \
START_CDATA, END_CDATA, PI, COMMENT
__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
__docformat__ = 'restructuredtext en'
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    The element is emitted as a START event, followed by its text (if any),
    the events of every child element in document order, an END event and
    finally the element's tail text (if any). ElementTree carries no source
    positions, so every event uses the position ``(None, -1, -1)``.

    :param element: an ElementTree element
    :return: a markup stream
    """
    pos = (None, -1, -1)
    # The leading brace of an ElementTree ``{namespace}local`` name is
    # stripped before the name is handed to `QName`.
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs([(QName(attr.lstrip('{')), value)
                   for attr, value in element.items()])

    yield START, (tag_name, attrs), pos
    if element.text:
        yield TEXT, element.text, pos
    # Iterate over the element itself rather than calling `getchildren()`,
    # which is deprecated and no longer available in newer ElementTree
    # versions; plain iteration is supported by all of them.
    for child in element:
        for item in ET(child):
            yield item
    yield END, tag_name, pos
    if element.tail:
        yield TEXT, element.tail, pos
class ParseError(Exception):
"""Exception raised when fatal syntax errors are found in the input being
parsed.
"""
def __init__(self, message, filename=None, lineno=-1, offset=-1):
"""Exception initializer.
:param message: the error message from the parser
:param filename: the path to the file that was parsed
:param lineno: the number of the line on which the error was encountered
:param offset: the column number where the error was encountered
"""
self.msg = message
if filename:
message += ', in ' + filename
Exception.__init__(self, message)
self.filename = filename or '
'))
>>> for kind, data, pos in parser:
... print kind, data
START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')]))
START (QName(u'li'), Attrs())
TEXT Foo
END li
END ul
"""
# Names of the elements treated as empty: `handle_starttag` queues their END
# event immediately after the START event, and `handle_endtag` ignores any
# closing tag given for them.
_EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                          'hr', 'img', 'input', 'isindex', 'link', 'meta',
                          'param'])
def __init__(self, source, filename=None, encoding='utf-8'):
    """Set up the parser for the given HTML input.

    :param source: the HTML text as a file-like object
    :param filename: the name of the file, if known
    :param encoding: encoding of the file; ignored if the input is unicode
    """
    html.HTMLParser.__init__(self)
    # Events produced by the handler callbacks but not yet yielded, and the
    # names of the elements that are still waiting for their end tag.
    self._queue, self._open_tags = [], []
    self.source, self.filename, self.encoding = source, filename, encoding
def parse(self):
"""Generator that parses the HTML source, yielding markup events.
:return: a markup event stream
:raises ParseError: if the HTML text is not well formed
"""
def _generate():
try:
bufsize = 4 * 1024 # 4K
done = False
while 1:
while not done and len(self._queue) == 0:
data = self.source.read(bufsize)
if data == '': # end of data
self.close()
done = True
else:
self.feed(data)
for kind, data, pos in self._queue:
yield kind, data, pos
self._queue = []
if done:
open_tags = self._open_tags
open_tags.reverse()
for tag in open_tags:
yield END, QName(tag), pos
break
except html.HTMLParseError, e:
msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
raise ParseError(msg, self.filename, e.lineno, e.offset)
return Stream(_generate()).filter(_coalesce)
def __iter__(self):
    """Return an iterator over the events produced by `parse()`."""
    events = self.parse()
    return iter(events)
def _enqueue(self, kind, data, pos=None):
    """Append an event to the queue of pending events.

    :param kind: the kind of the event
    :param data: the data of the event
    :param pos: the position of the event; when omitted (or `None`), the
                position reported by `_getpos()` is used
    """
    location = pos
    if location is None:
        location = self._getpos()
    event = (kind, data, location)
    self._queue.append(event)
def _getpos(self):
    """Return the current position as a ``(filename, lineno, column)``
    tuple, combining `self.filename` with the result of `getpos()`.
    """
    position = self.getpos()
    lineno, column = position
    return (self.filename, lineno, column)
def handle_starttag(self, tag, attrib):
    """Queue a START event for an opening tag.

    Attribute values that are not unicode are decoded using the parser's
    encoding (undecodable bytes are replaced), and all values are passed
    through `stripentities`. Elements listed in `_EMPTY_ELEMS` are closed
    immediately; any other element is remembered as open.

    :param tag: the name of the element
    :param attrib: the attributes as a list of ``(name, value)`` pairs
    """
    attrs = []
    for name, value in attrib:
        if value is None:
            # Fixup minimized attributes: the name doubles as the value
            value = unicode(name)
        elif not isinstance(value, unicode):
            value = value.decode(self.encoding, 'replace')
        attrs.append((QName(name), stripentities(value)))

    self._enqueue(START, (QName(tag), Attrs(attrs)))
    if tag not in self._EMPTY_ELEMS:
        self._open_tags.append(tag)
        return
    self._enqueue(END, QName(tag))
def handle_endtag(self, tag):
    """Queue the END events implied by a closing tag.

    Every element opened after the innermost element matching `tag` is
    closed first, followed by that element itself. Closing tags of elements
    in `_EMPTY_ELEMS` are ignored because those elements were already closed
    when they were opened.

    A closing tag that matches none of the open elements is ignored as
    well. Previously such a stray tag popped, and thereby closed, every
    open element, so the remainder of the document ended up outside of its
    enclosing elements.

    :param tag: the name of the element being closed
    """
    if tag in self._EMPTY_ELEMS:
        return
    open_tags = self._open_tags
    wanted = tag.lower()
    # Search from the innermost open element outwards for a match.
    for depth in range(len(open_tags) - 1, -1, -1):
        if open_tags[depth].lower() == wanted:
            break
    else:
        # Stray end tag: nothing to close
        return
    # Unwind the stack down to, and including, the matching element.
    while len(open_tags) > depth:
        self._enqueue(END, QName(open_tags.pop()))
def handle_data(self, text):
    """Queue a TEXT event for character data.

    :param text: the character data; anything that is not unicode is
                 decoded using the parser's encoding, replacing bytes that
                 cannot be decoded
    """
    if isinstance(text, unicode):
        decoded = text
    else:
        decoded = text.decode(self.encoding, 'replace')
    self._enqueue(TEXT, decoded)
def handle_charref(self, name):
    """Queue a TEXT event for a numeric character reference.

    :param name: the reference without the surrounding ``&#`` and ``;``;
                 either decimal digits, or ``x``/``X`` followed by
                 hexadecimal digits
    """
    try:
        if name.lower().startswith('x'):
            text = unichr(int(name[1:], 16))
        else:
            text = unichr(int(name))
    except (ValueError, OverflowError):
        # The reference is not a valid number, or denotes a code point
        # outside the range `unichr` supports (e.g. ``&#99999999;``). Pass
        # it through literally instead of aborting the parse, just like
        # `handle_entityref` does for unknown entity names.
        text = '&#%s;' % name
    self._enqueue(TEXT, text)
def handle_entityref(self, name):
    """Queue a TEXT event for a named entity reference.

    Names found in `htmlentitydefs.name2codepoint` are replaced by the
    corresponding character; any other reference is passed through
    literally.

    :param name: the entity name without the surrounding ``&`` and ``;``
    """
    codepoint = htmlentitydefs.name2codepoint.get(name)
    if codepoint is None:
        text = '&%s;' % name
    else:
        text = unichr(codepoint)
    self._enqueue(TEXT, text)
def handle_pi(self, data):
    """Queue a PI event for a processing instruction.

    The text is split at the first run of whitespace into the target and
    the remaining data. A trailing ``?`` (as left over from the XML-style
    ``<?target data?>`` notation) is dropped.

    :param data: the text of the processing instruction
    """
    if data.endswith('?'):
        data = data[:-1]
    try:
        target, data = data.split(None, 1)
    except ValueError:
        # Processing instruction without any data, such as ``<?php?>``:
        # everything there is makes up the target.
        target = data
        data = ''
    self._enqueue(PI, (target.strip(), data.strip()))
def handle_comment(self, text):
    """Queue a COMMENT event for a comment.

    :param text: the text of the comment
    """
    # No explicit position: `_enqueue` falls back to the current one
    self._enqueue(COMMENT, text, None)
def HTML(text, encoding='utf-8'):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>')
    >>> print html
    <body><h1>Foo</h1></body>
    >>> print html.select('h1')
    <h1>Foo</h1>
    >>> print html.select('h1/text()')
    Foo

    :param text: the HTML source
    :param encoding: the encoding passed on to `HTMLParser`
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    parser = HTMLParser(StringIO(text), encoding=encoding)
    # Collect all events up front: a list can be iterated over repeatedly,
    # which is what makes the resulting stream reusable.
    events = list(parser)
    return Stream(events)
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    :param stream: an iterable of ``(kind, data, pos)`` events
    :return: a generator over the same events, in which every run of
             consecutive TEXT events is replaced by one TEXT event carrying
             the joined text and the position of the first event of the run
    """
    pending = []      # text of the run of TEXT events being collected
    first_pos = None  # position of the first event of that run
    # The trailing sentinel makes sure text at the very end gets flushed.
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if first_pos is None:
                first_pos = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), first_pos
            pending = []
        first_pos = None
        if kind:
            yield kind, data, pos