"""Parse a Python source code encoding string""" import codecs import re # Regexp to match python magic encoding line PYTHON_MAGIC_COMMENT_re = re.compile( r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', re.VERBOSE) def parse_encoding(lines): """Deduce the encoding of a source file from magic comment. It does this in the same way as the `Python interpreter`__ .. __: http://docs.python.org/ref/encodings.html The ``lines`` argument should be a list of the first 2 lines of the source code. (From Jeff Dairiki) """ try: line1 = lines[0] has_bom = line1.startswith(codecs.BOM_UTF8) if has_bom: line1 = line1[len(codecs.BOM_UTF8):] m = PYTHON_MAGIC_COMMENT_re.match(line1) if not m: try: import parser parser.suite(line1) except (ImportError, SyntaxError): # Either it's a real syntax error, in which case the source is # not valid python source, or line2 is a continuation of line1, # in which case we don't want to scan line2 for a magic # comment. pass else: line2 = lines[1] m = PYTHON_MAGIC_COMMENT_re.match(line2) if has_bom: if m: raise SyntaxError( "python refuses to compile code with both a UTF8 " "byte-order-mark and a magic encoding comment") return 'utf_8' elif m: return m.group(1) else: return None except: return None