# $Id: lexer.py 2133 2006-09-06 18:52:56Z dairiki $
# lexer.py - template parsing routines for Myghty
# Copyright (C) 2004, 2005 Michael Bayer mike_mp@zzzcomputing.com
# Original Perl code and documentation copyright (c) 1998-2003 by Jonathan Swartz.
#
# This module is part of Myghty and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
#

"""Initial parser for a Myghty file; locates tokens and fires events in a
Compiler object.

Lexer is currently stateful and is not thread safe.  The clone() method can
be used to create copies of this object for use in multiple threads.
"""

import string, re, sys, codecs
from myghty import exception
from myghty.util import *


# map of block names pointing to parse method names.
# a value of the form 'synonym:<name>' means "treat this block as <name>".
BLOCKS = {
    'args' : 'variable_list_block',
    'attr' : 'key_value_block',
    'flags' : 'key_value_block',
    'cleanup' : 'raw_block',
    'doc' : 'doc_block',
    'filter' : 'raw_block',
    'init' : 'raw_block',
    'once' : 'raw_block',
    'global' : 'synonym:once',
    'threadonce' : 'raw_block',
    'threadlocal': 'synonym:threadonce',
    'python' : 'raw_block',
    'shared' : 'raw_block',
    'requestlocal' : 'synonym:shared',
    'requestonce' : 'synonym:shared',
    'text' : 'text_block',
}

# map of <%python scope="..."> values to the block type reported to the compiler
PYTHON_SCOPES = {
    'component': 'python',
    'request': 'shared',
    'thread': 'threadonce',
    'global': 'once',
    'init' : 'init',
    'cleanup': 'cleanup'
}


class Lexer:
    """Tokenizes component source and reports what it finds to a compiler.

    The state of the lex in progress is kept on the instance (self.current,
    self.input_file), so one instance must not be shared between threads.
    """

    def __init__(self, **params):
        self.current = None
        # set by lex(); initialized here so that raise_syntax_error() never
        # hits a missing attribute
        self.input_file = None

    def get_object_id(self):
        """returns an ID that can identify this lexer"""
        return "Myghty.Lexer"

    def clone(self, **params):
        """creates a clone of this Lexer.  allow the Prototype pattern
        to be used in creating lexers for use in other threads."""
        return Lexer(**params)

    class LexContext:
        """an object tracking the lexer's progress through a component block."""

        def __init__(self, source, name, compiler):
            self.source = source
            self.name = name
            self.compiler = compiler

            # a regular expression to match the "end" of whatever construct
            # the parser located
            # different methods override this to locate different kinds of
            # endings
            # This will be overridden if entering a def or method section.
            self.ending = re.compile(r'\Z', re.S)

            # place to begin regular expression matching
            # since I cannot find an equivalent of perl's \G in python
            self.match_position = 0

            self.in_def = False
            self.in_method = False
            self.block_name = None
            self.block_type = None

            # number of newlines consumed so far
            self.lines = 0

        def set_in_named_block(self, block_type, name):
            """Record that the lexer has entered a def, method or closure.

            Raises exception.Compiler for any other block_type.
            """
            if block_type == 'def':
                self.in_def = True
            elif block_type == 'method':
                self.in_method = True
            elif block_type == 'closure':
                pass
            else:
                # the original raised a plain string here, which is not a
                # valid exception object
                raise exception.Compiler("invalid block type %s" % block_type)

            self.block_name = name
            self.block_type = block_type

        def reset_in_named_block(self):
            """Clear the def/method flags and the current block name."""
            self.in_def = False
            self.in_method = False
            self.block_name = None

        def match_pos(self, regstring = None, flags = None, regexp = None):
            """Match a pattern at the current position and advance past it.

            Either a compiled pattern (regexp) or a pattern string
            (regstring, with optional flags) may be given.  Returns the match
            object, or None when the pattern does not match at match_position.
            """
            if regexp is None:
                if flags:
                    regexp = re.compile(regstring, flags)
                else:
                    regexp = re.compile(regstring)

            match = regexp.match(self.source, self.match_position)
            if match:
                (start, end) = match.span()

                # attempt to simulate perl's \G operator.  usually works, except
                # it behaves differently with zero-length matches.
                # well actually perl's operator behaves more strangely.
                # see def variable_list_block for further \G angst
                if end == start:
                    self.match_position = end + 1
                else:
                    self.match_position = end

            return match

    def lex(self, source, name, compiler, input_file = None):
        """Lex a component, firing events on the given compiler.

        source     -- the component source text
        name       -- component name, used in syntax error reports
        compiler   -- object receiving the start_component(), text_block(),
                      ... end_component() calls
        input_file -- optional full path of the file the source came from;
                      passed through to syntax errors for reporting

        compiler.end_component() is called even if lexing fails.
        """
        # Holds information about the current lex.
        current = Lexer.LexContext(source, name, compiler)

        # set current lex to this one
        self.current = current
        self.input_file = input_file

        # Clean up Mac and DOS line endings
        current.source = re.sub(r'\r\n?', "\n", current.source)

        # Detect and remove leading UTF-8 byte-order-marker
        # Some windows editors add these at the beginning of a file to
        # mark their content as UTF-8.
        if current.source.startswith(codecs.BOM_UTF8):
            current.source = current.source[len(codecs.BOM_UTF8):]
            current.compiler.magic_encoding_comment('utf_8')

        try:
            current.compiler.start_component()
            self.start()
        finally:
            current.compiler.end_component()

    def start(self):
        """Main lexing loop.

        Runs until the current ending pattern matches or the source is
        exhausted.  Called recursively by match_named_block() for the body of
        a def/method/closure.
        """
        end = None
        length = len(self.current.source)

        while True:
            if self.current.match_position > length:
                break

            end = self.match_end()
            if end:
                break

            # the order matters: match_block() must run before
            # match_substitute(), which assumes no <%foo> tag is present
            if self.match_block(): continue
            if self.match_named_block(): continue
            if self.match_substitute(): continue
            if self.match_comp_call(): continue
            if self.match_python_line(): continue
            if self.match_comp_content_call(): continue
            if self.match_comp_content_call_end(): continue
            if self.match_text(): continue

            isend = self.current.match_position > len(self.current.source)
            if (self.current.in_def or self.current.in_method) and isend:
                self.raise_syntax_error(
                    "Missing closing </%%%s> tag" % self.current.block_type)
            if isend:
                break

            raise exception.Compiler(
                "Infinite parsing loop encountered - Lexer bug?")

        if self.current.in_def or self.current.in_method:
            block_type = self.current.block_type
            # match_end() returns the matched text, or True for an empty
            # match; only real text can be a closing tag.  (Not testing for
            # str specifically, so unicode sources behave the same.)
            if (not end or end is True
                    or not self.current.ending.match(end)):
                block_name = self.current.block_name
                self.raise_syntax_error(
                    "no closing </%%%s> tag for <%%%s %s> block"
                    % (block_type, block_type, block_name))

    def match_block(self):
        """Match an opening <%name ...> tag for one of BLOCKS and lex the block.

        Returns True if a block was consumed, False otherwise.
        """
        match = self.current.match_pos(regexp = re.compile(
            r'\<%(' + '|'.join(BLOCKS.keys()) + r')(\s+[^>]*)?\s*>',
            re.I | re.S))
        if not match:
            return False

        block_type = match.group(1).lower()
        attr = match.group(2)

        # remember the enclosing block type (e.g. 'def') so that it can be
        # put back once this block has been consumed
        enclosing_type = self.current.block_type
        self.current.block_type = block_type

        attributes = {}
        if attr:
            attrmatch = re.findall(
                r"\s*((\w+)\s*=\s*('[^']*'|\"[^\"]*\"|\w+))\s*", attr)
            for (full, key, val) in attrmatch:
                try:
                    # NOTE(review): eval() on template text.  The pattern
                    # above limits val to a quoted string or a bare word, but
                    # a bare word is still looked up as a Python name.
                    attributes[key] = eval(val)
                except Exception:
                    (e, msg) = sys.exc_info()[0:2]
                    self.raise_syntax_error(
                        "Non-evaluable attribute value: '%s' (%s: %s)"
                        % (val, e, msg))

        syntype = None

        # get method name for this block
        try:
            method = BLOCKS[block_type]
            if ':' in method:
                syntype = method.split(':', 1)[-1]
                method = BLOCKS[syntype]
        except KeyError:
            self.raise_syntax_error("no such block type '%s'" % block_type)

        # a scope attribute on a python block selects the block type that is
        # reported to the compiler
        if 'scope' in attributes and (block_type == 'python'
                                      or syntype == 'python'):
            try:
                syntype = PYTHON_SCOPES[attributes['scope']]
            except KeyError:
                self.raise_syntax_error(
                    "unknown python scope '%s'" % attributes['scope'])

        if syntype:
            self.current.compiler.start_block(
                block_type = syntype, attributes = attributes)
        else:
            self.current.compiler.start_block(
                block_type = block_type, attributes = attributes)

        # call method dynamically
        getattr(self, method)(block_type = block_type,
                              synonym_for = syntype,
                              attributes = attributes)

        self.current.block_type = enclosing_type
        return True

    def match_named_block(self):
        """Match <%def name>, <%method name> or <%closure name> and lex its body.

        Returns True if a named block was consumed, False otherwise.
        """
        match = self.current.match_pos(regexp = re.compile(
            r"<%(def|method|closure)(?:\s+([^\n]+?))?(\s+[^>]*)?\s*>",
            re.I | re.S))
        if not match:
            return False

        block_type = match.group(1).lower()
        name = match.group(2)
        attr = match.group(3)

        attributes = {}
        if attr:
            attrmatch = re.findall(
                r"\s*((\w+)\s*=\s*('[^']*'|\"[^\"]*\"|\w+))\s*", attr)
            for (full, key, val) in attrmatch:
                # unlike match_block(), values are kept as raw text here
                attributes[key] = val

        if not name:
            self.raise_syntax_error("%s block without a name" % block_type)

        self.current.compiler.start_named_block(
            block_type = block_type, name = name, attributes = attributes)

        # preserve the state of the enclosing block, so that a nested
        # def/method/closure does not wipe it out
        current = self.current
        saved = (current.ending, current.in_def, current.in_method,
                 current.block_name, current.block_type)

        # screw with the current compile context
        current.ending = re.compile(r"<\/%%%s>(\n?)" % block_type, re.I)
        current.set_in_named_block(block_type = block_type, name = name)

        # recursively call the start() stuff
        self.start()

        # tell compiler to close up the block
        self.current.compiler.end_named_block(block_type = block_type)

        # restore the state of the current compile
        current.reset_in_named_block()
        (current.ending, current.in_def, current.in_method,
         current.block_name, current.block_type) = saved

        # give our caller the good news
        return True

    def match_text(self):
        """Match plain text up to the next construct and report it.

        Returns True if the pattern matched (possibly with empty text),
        False otherwise.
        """
        current = self.current
        match = current.match_pos(regexp = re.compile(r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[%#]) # an eval or comment line, preceded by a consumed \n
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                              # - don't consume
                 |
                 (\\\n)       # an escaped newline  - throw away
                 |
                 \Z           # end of string
                )""", re.X | re.S))
        if not match:
            return False

        text = match.group(1)
        current.compiler.text_block(block = text)
        current.lines += self._count_lines(text)
        if match.group(3):
            # the discarded escaped newline still counts as a line
            current.lines += 1
        return True

    def match_substitute(self):
        """Match a <% expression | flags %> substitution.

        Returns True if one was consumed, False if the source does not
        continue with '<%'.  Raises a syntax error for an unterminated '<%'.
        """
        # This routine relies on there *not* to be an opening <%foo> tag
        # present, so match_block() must happen first.
        if not self.current.match_pos(r"<%"):
            return False

        # NOTE(review): the middle of this pattern was missing from the
        # source under review; the part between the lookbehind and the
        # closing tag is reconstructed from the three groups unpacked below -
        # confirm the flag-list syntax against the upstream file.
        match = self.current.match_pos(regexp = re.compile(r"""
               (.+?)                # Substitution body ($1)
               (
                \s*
                (?<!\|)             # Not preceded by a '|'
                \|                  # A '|'
                \s*
                (                   # (Start $3)
                 [a-z_]\w*          # A flag
                 (?:\s*,\s*[a-z_]\w*)*   # More flags, with comma separators
                )
                \s*
               )?
               %>                   # Closing tag
               """, re.X | re.I | re.S))

        if not match:
            self.raise_syntax_error("'<%' without matching '%>'")

        (body, extra, escape) = match.group(1, 2, 3)
        self.current.lines += self._count_lines(body)
        if extra:
            self.current.lines += self._count_lines(extra)

        self.current.compiler.substitution(body, escape)
        return True

    def match_comp_call(self):
        """Match a <& component call &>.

        Returns True if one was consumed, False otherwise.  Raises a syntax
        error if '<&' has no matching '&>'.
        """
        # (?!\|) leaves '<&|' to match_comp_content_call()
        if not self.current.match_pos(regexp = re.compile(r"<&(?!\|)", re.S)):
            return False

        match = self.current.match_pos(regexp = re.compile(r"(.*?)&>", re.S))
        if not match:
            self.raise_syntax_error("'<&' without matching '&>'")

        call = match.group(1)
        self.current.compiler.component_call(call)
        self.current.lines += self._count_lines(call)
        return True

    def match_comp_content_call(self):
        """Match the opening <&| component call &> of a call with content.

        Returns True if one was consumed, False otherwise.  Raises a syntax
        error if '<&|' has no matching '&>'.
        """
        if not self.current.match_pos(regexp = re.compile(r"<&\|", re.S)):
            return False

        match = self.current.match_pos(regexp = re.compile(r"(.*?)&>", re.S))
        if not match:
            self.raise_syntax_error("'<&|' without matching '&>'")

        call = match.group(1)
        self.current.compiler.component_content_call(call)
        self.current.lines += self._count_lines(call)
        return True

    def match_comp_content_call_end(self):
        """Match the </&> that closes a component call with content.

        Returns True if it was consumed, False otherwise.
        """
        if self.current.match_pos(r"</&>"):
            self.current.compiler.component_content_call_end()
            return True
        else:
            return False

    def match_block_end(self, block_type, allow_text = True, **params):
        """Consume everything up to and including the </%block_type> tag.

        With allow_text, returns a (text, newline) tuple where text is what
        preceded the closing tag.  Without it, only whitespace may precede
        the tag and just the newline string is returned.  newline is "\\n"
        or "".  Raises a syntax error if the closing tag is not found.
        """
        if allow_text:
            regex = re.compile(r"(.*?)</%%%s>(\n?)" % block_type, re.I | re.S)
        else:
            regex = re.compile(r"\s*</%%%s>(\n?)" % block_type, re.I | re.S)

        match = self.current.match_pos(regexp = regex)
        if not match:
            self.raise_syntax_error("Invalid <%%%s> section line" % block_type)

        if allow_text:
            return tuple(match.group(1, 2))
        else:
            return match.group(1)

    def match_python_line(self):
        """Match a line starting with '%' (python code) or '#' (comment).

        Returns True if a line was consumed, False otherwise.
        """
        match = self.current.match_pos(
            r"(?<=^)([%#])([^\n]*)(?:\n|\Z)", re.M)
        if not match:
            return False

        if match.group(1) == '#':
            # comment
            if self.current.lines < 2:
                # Magic -*- encoding: foo -*- comment; only looked for on
                # the first two lines
                m = re.search(r'coding[=:]\s*([-\w.]+)', match.group(2))
                if m:
                    self.current.compiler.magic_encoding_comment(m.group(1))
            self.current.lines += 1
            return True

        self.current.compiler.python_line(line = match.group(2))
        self.current.lines += 1
        return True

    def match_end(self):
        """Match the current ending pattern.

        Returns the matched text if it is non-empty, True for an empty
        match, and False if the ending does not match here.
        """
        match = self.current.match_pos(regexp = self.current.ending)
        if not match:
            return False

        matched = match.group()
        self.current.lines += self._count_lines(matched)
        if matched:
            return matched
        else:
            return True

    def variable_list_block(self, block_type, attributes = None, **params):
        """Lex the body of an <%args>-style block of variable declarations.

        Reports each declaration through compiler.variable_declaration(),
        passing along the block's scope attribute if it has one, then
        consumes the closing tag and calls compiler.end_block().
        """
        # python doesnt quite do the regexp here the same way as perl (which seems to
        # do it, incorrectly ???  somehow perl magically knows to stop global matching beyond
        # the </%args> line based on the (?= </%args> ) match at the end.  python doesnt.
        # or maybe i just goofed.).
        # anyway, just to get this to work, get the whole ARG block out of the source first,
        # then operate upon that.  if theres some all-in-one way
        # to do it in python, or i goofed, be my guest.
        match = self.current.match_pos(regexp = re.compile(
            r""".*?(?= <\/%%%s> )""" % block_type, re.M | re.S | re.X))
        if match:
            source = match.group()
        else:
            source = ''

        # operate upon the stuff inside of <%block></%block>
        regexp = re.compile(r"""
            (?:
              (?:
                [ \t]*
                ( [^\W\d]\w* )     #only allows valid Python variable names
                [ \t]*
                (?:
                  (?:              # begin optional part of arg
                    =
                    ( [^\n]+ )     # default value, also consumes an inline comment, if any
                  )
                  |
                  (?:              # an optional comment after an arg without a default
                    [ \t]*
                    \#
                    [^\n]*
                  )
                )?
              )
              |
              [ \t]*               # a comment line
              \#
              [^\n]*
              |
              [ \t]*               # just space
            )
            (\n?)                  # optional newline.  the ? makes finditer() go into an endless loop.
            """, re.VERBOSE | re.I | re.M)

        # finditer has a bug here.  goes into an endless loop.
        # but findall works.  if i take the ? off the last newline there, then
        # finditer works, but we lose the args if it looks like <%args>foo</%args>
        # with no newline.  *shrug*
        matches = regexp.findall(source)

        scope = None
        if attributes is not None:
            scope = attributes.get('scope')

        for (name, default, linebr) in matches:
            if name:
                self.current.compiler.variable_declaration(
                    block_type = block_type, name = name,
                    default = default, scope = scope)
            if linebr:
                self.current.lines += 1

        params['allow_text'] = False
        nl = self.match_block_end(block_type = block_type, **params)
        if nl:
            self.current.lines += 1
        self.current.compiler.end_block(block_type = block_type)

    def key_value_block(self, block_type, **params):
        """Lex the body of an <%attr> or <%flags> block of key/value pairs.

        Reports each pair through compiler.key_value_pair(), then consumes
        the closing tag and calls compiler.end_block().
        """
        # do this like the variable_list_block
        # see that method for regexp quirks
        match = self.current.match_pos(regexp = re.compile(
            r""".*?(?= <\/%%%s> )""" % block_type, re.M | re.S | re.X))
        if match:
            source = match.group()
        else:
            source = ''

        regexp = re.compile(r"""
            (?:
              [ \t]*
              ([\w_]+)          # identifier
              [ \t]*[=:][ \t]*  # separator
              (\S[^\n]*)        # value ( must start with a non-space char)
              |
              [ \t]*            # an optional comment
              \#
              [^\n]*
              |
              [ \t]*            # just space
            )
            (\n?)
            """, re.VERBOSE | re.I)

        matches = regexp.findall(source)

        for (key, value, newline) in matches:
            if key:
                self.current.compiler.key_value_pair(
                    block_type = block_type, key = key, value = value)
            if newline:
                self.current.lines += 1

        params['allow_text'] = False
        nl = self.match_block_end(block_type = block_type, **params)
        if nl:
            self.current.lines += 1
        self.current.compiler.end_block(block_type = block_type)

    def generic_block(self, method, **params):
        """Lex a block whose whole body is handed to the compiler as text.

        method is the name of the compiler method to call with the body.
        params must contain block_type and may contain synonym_for; when
        synonym_for is set it is the block type reported to the compiler.
        """
        params['allow_text'] = True
        (block, n1) = self.match_block_end(**params)

        compiler_block_type = params.get('synonym_for')
        if compiler_block_type is None:
            compiler_block_type = params['block_type']

        getattr(self.current.compiler, method)(
            block_type = compiler_block_type, block = block)

        self.current.lines += self._count_lines(block)
        if n1:
            self.current.lines += 1

        self.current.compiler.end_block(block_type = compiler_block_type)

    def text_block(self, **params):
        """Lex a <%text> block."""
        self.generic_block('text_block', **params)

    def raw_block(self, **params):
        """Lex a block of raw python code (<%python>, <%init>, ...)."""
        self.generic_block('raw_block', **params)

    def doc_block(self, **params):
        """Lex a <%doc> block."""
        self.generic_block('doc_block', **params)

    def line_number(self):
        """Return the 1-based number of the line currently being lexed."""
        return self.current.lines + 1

    def get_name(self):
        """Return the name of the component currently being lexed."""
        return self.current.name

    def _count_lines(self, text):
        """Return the number of newlines in text."""
        return text.count("\n")

    def _current_line(self):
        """Return the text of the current line as consumed so far, or ''."""
        consumed = self.current.source[0:self.current.match_position]
        lines = consumed.split("\n")
        if len(lines) <= self.current.lines:
            return ''
        else:
            return lines[self.current.lines]

    def raise_syntax_error(self, error):
        """Raise exception.Syntax for the current position.  Never returns."""
        raise exception.Syntax(
            error = error,
            comp_name = self.get_name(),
            source_line = self._current_line(),
            line_number = self.line_number(),
            source = self.current.source,
            file = self.input_file,
            source_encoding = self.current.compiler.get_encoding())