# home *** CD-ROM | disk | FTP | other *** search
# Wrap
# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)
"""Helpers for guessing the encoding of a text resource (Python 2).

Encoding hints are collected from three places -- the HTTP ``Content-Type``
header, an HTML ``<meta http-equiv="content-type">`` element and the XML
declaration / byte order mark -- and combined by `getEncodingInfo`.
"""
__all__ = [
    'buildlog',
    'encodingByMediaType',
    'getHTTPInfo',
    'getMetaInfo',
    'detectXMLEncoding',
    'getEncodingInfo',
    'tryEncodings',
    'EncodingInfo']
__docformat__ = 'restructuredtext'
__author__ = 'Christof Hoeke, Robert Siemer'
__version__ = '$Id: __init__.py 1712 2009-04-23 12:46:03Z cthedot $'

import HTMLParser
import StringIO
import cgi
import httplib
import re
import sys
import types
import urllib

VERSION = '0.9'


class _MetaHTMLParser(HTMLParser.HTMLParser):
    """Remember the ``content`` of ``<meta http-equiv="content-type">``.

    After feeding a document, `content_type` holds the (lower-cased)
    ``content`` attribute of the first matching ``<meta>`` element, or
    ``None`` if there was none.
    """
    content_type = None

    def handle_starttag(self, tag, attrs):
        # NOTE(review): the decompiled body of this method was an empty
        # ``pass``, which made `getMetaInfo` always return (None, None).
        # The logic below is reconstructed from how `getMetaInfo` consumes
        # ``content_type`` -- confirm against the original source if available.
        if tag != 'meta' or self.content_type:
            # only the first content-type declaration is used
            return
        # attribute values may be None for valueless attributes
        atts = dict([(name.lower(), (value or u'').lower())
                     for (name, value) in attrs])
        if atts.get('http-equiv', u'').strip() == u'content-type':
            self.content_type = atts.get('content')


# Text type categories returned by `_getTextTypeByMediaType` / `_getTextType`.
_XML_APPLICATION_TYPE = 0
_XML_TEXT_TYPE = 1
_HTML_TEXT_TYPE = 2
_TEXT_TYPE = 3
_TEXT_UTF8 = 5
_OTHER_TYPE = 4


class EncodingInfo(object):
    """Result container filled by `getEncodingInfo`.

    Attributes (all initially ``None``):

    - ``encoding``: the finally chosen encoding
    - ``mismatch``: True if two of the explicit sources disagree
    - ``logtext``: text collected from the internal log stream
    - ``http_encoding`` / ``http_media_type``: values from the HTTP header
    - ``meta_encoding`` / ``meta_media_type``: values from HTML ``<meta>``
    - ``xml_encoding``: value from the XML BOM or declaration
    """

    def __init__(self):
        self.encoding = None
        self.mismatch = None
        self.logtext = None
        self.http_encoding = None
        self.http_media_type = None
        self.meta_encoding = None
        self.meta_media_type = None
        self.xml_encoding = None

    def __str__(self):
        """Return the chosen encoding, or an empty string if there is none."""
        if self.encoding:
            return self.encoding
        return u''

    def __repr__(self):
        return '<%s.%s object encoding=%r mismatch=%s at 0x%x>' % (
            self.__class__.__module__,
            self.__class__.__name__,
            self.encoding,
            self.mismatch,
            id(self))


def buildlog(logname='encutils', level='INFO', stream=sys.stderr,
             filename=None, filemode='w',
             format='%(levelname)s\t%(message)s'):
    """Return the logger named `logname` with one new handler attached.

    :param logname: name passed to ``logging.getLogger``
    :param level: name of a ``logging`` level; unknown names fall back to
        ``logging.INFO``
    :param stream: stream for a ``StreamHandler`` (used if no `filename`)
    :param filename: if given, a ``FileHandler`` on this file is used instead
    :param filemode: open mode for `filename`
    :param format: format string for the handler's formatter

    Every call adds another handler to the logger.
    """
    import logging

    log = logging.getLogger(logname)
    if filename:
        hdlr = logging.FileHandler(filename, filemode)
    else:
        hdlr = logging.StreamHandler(stream)
    hdlr.setFormatter(logging.Formatter(format))
    log.addHandler(hdlr)
    log.setLevel(logging.__dict__.get(level, logging.INFO))
    return log


def _getTextTypeByMediaType(media_type, log=None):
    """Return the ``_*_TYPE`` category for `media_type`.

    An empty or missing media type yields ``_OTHER_TYPE``. `log` is accepted
    for signature symmetry but not used.
    """
    if not media_type:
        return _OTHER_TYPE

    # the first entry of each list is a regular expression, the rest are
    # compared literally
    xml_application_types = [
        u'application/.*?\\+xml',
        u'application/xml',
        u'application/xml-dtd',
        u'application/xml-external-parsed-entity']
    xml_text_types = [
        u'text\\/.*?\\+xml',
        u'text/xml',
        u'text/xml-external-parsed-entity']

    media_type = media_type.strip().lower()
    re_flags = re.I | re.S | re.X

    if (media_type in xml_application_types
            or re.match(xml_application_types[0], media_type, re_flags)):
        return _XML_APPLICATION_TYPE
    if (media_type in xml_text_types
            or re.match(xml_text_types[0], media_type, re_flags)):
        return _XML_TEXT_TYPE
    if media_type == u'text/html':
        return _HTML_TEXT_TYPE
    if media_type == u'text/css':
        return _TEXT_UTF8
    if media_type.startswith(u'text/'):
        return _TEXT_TYPE
    return _OTHER_TYPE


def _getTextType(text, log=None):
    """Guess the text type from `text` itself.

    Returns ``_XML_APPLICATION_TYPE`` if ``<?xml version=`` occurs within the
    first 30 characters, else ``_OTHER_TYPE``. `log` is not used.
    """
    if text[:30].find(u'<?xml version=') != -1:
        return _XML_APPLICATION_TYPE
    return _OTHER_TYPE


def encodingByMediaType(media_type, log=None):
    """Return the default encoding for `media_type`, or ``None``.

    :param media_type: media type string, e.g. ``u'text/html'``
    :param log: optional logger; the result is reported at debug level
    """
    defaultencodings = {
        _XML_APPLICATION_TYPE: u'utf-8',
        _XML_TEXT_TYPE: u'ascii',
        _HTML_TEXT_TYPE: u'iso-8859-1',
        _TEXT_TYPE: u'iso-8859-1',
        _TEXT_UTF8: u'utf-8',
        _OTHER_TYPE: None}
    texttype = _getTextTypeByMediaType(media_type)
    encoding = defaultencodings.get(texttype, None)
    if log:
        if not encoding:
            log.debug(u'"%s" Media-Type has no default encoding', media_type)
        else:
            log.debug(u'Default encoding for Media Type "%s": %s',
                      media_type, encoding)
    return encoding


def getHTTPInfo(response, log=None):
    """Return ``(media_type, encoding)`` from the headers of `response`.

    :param response: object whose ``info()`` result offers ``gettype()`` and
        ``getparam()`` (as returned by ``urllib.urlopen``)
    :param log: optional logger; both values are reported at info level

    The encoding is lower-cased; it is ``None`` if no charset is given.
    """
    info = response.info()
    media_type = info.gettype()
    encoding = info.getparam('charset')
    if encoding:
        encoding = encoding.lower()
    if log:
        log.info(u'HTTP media_type: %s', media_type)
        log.info(u'HTTP encoding: %s', encoding)
    return (media_type, encoding)


def getMetaInfo(text, log=None):
    """Return ``(media_type, encoding)`` declared in an HTML ``<meta>``.

    :param text: HTML source to scan
    :param log: optional logger; found values are reported at info level

    Both values are ``None`` if no content-type ``<meta>`` element is found.
    Parse errors are ignored; whatever was found before the error is used.
    """
    p = _MetaHTMLParser()
    try:
        p.feed(text)
    except HTMLParser.HTMLParseError:
        # best effort: keep what has been parsed so far
        pass

    if not p.content_type:
        return (None, None)

    (media_type, params) = cgi.parse_header(p.content_type)
    encoding = params.get('charset')
    if encoding:
        encoding = encoding.lower()
    if log:
        log.info(u'HTML META media_type: %s', media_type)
        log.info(u'HTML META encoding: %s', encoding)
    return (media_type, encoding)


def detectXMLEncoding(fp, log=None, includeDefault=True):
    """Return the encoding of an XML document read from `fp`.

    :param fp: a string or a seekable file-like object; a string is wrapped
        in a ``StringIO``
    :param log: optional logger; a detected encoding is reported at info level
    :param includeDefault: if true ``u'utf-8'`` is returned when neither a
        byte order mark nor an XML declaration with ``encoding`` is found,
        otherwise ``None``

    A byte order mark takes precedence over the XML declaration, which is
    searched for in the first 2048 characters. The position of `fp` is
    restored before returning, also when an exception is raised.

    :raises ValueError: if fewer than four characters can be read from `fp`
    """
    if isinstance(fp, types.StringTypes):
        fp = StringIO.StringIO(fp)

    bomDict = {
        (0, 0, 254, 255): 'utf_32_be',
        (255, 254, 0, 0): 'utf_32_le',
        (254, 255, None, None): 'utf_16_be',
        (255, 254, None, None): 'utf_16_le',
        (239, 187, 191, None): 'utf-8'}

    xmlDeclPattern = r"""
    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
    .+?                 # some chars (version info), matched minimal
    encoding=           # encoding attribute begins
    ["']                # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
     [^"']+             # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    ["']                # attribute end delimiter
    .*?                 # some chars optionally (standalone decl or whitespace)
    \?>                 # xmldecl end
    """

    oldFP = fp.tell()
    try:
        fp.seek(0)
        # unpacking raises ValueError for input shorter than four characters
        (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))

        # try the longest byte order mark first
        bomDetection = (bomDict.get((byte1, byte2, byte3, byte4))
                        or bomDict.get((byte1, byte2, byte3, None))
                        or bomDict.get((byte1, byte2, None, None)))
        if bomDetection:
            if log:
                log.info(u'XML BOM encoding: %s' % bomDetection)
            return bomDetection

        fp.seek(0)
        buffer = fp.read(2048)
        match = re.compile(xmlDeclPattern, re.VERBOSE).search(buffer)
    finally:
        fp.seek(oldFP)

    if match:
        enc = match.group('encstr').lower()
        if log:
            log.info(u'XML encoding="%s"' % enc)
        return enc
    if includeDefault:
        return u'utf-8'
    return None


def tryEncodings(text, log=None):
    """Guess the encoding of the byte string `text`.

    If ``chardet`` can be imported its detection result is returned.
    Otherwise a notice is emitted (via ``log.warn`` or, without `log`,
    printed) and `text` is test-decoded as ``ascii``, ``iso-8859-1`` and
    ``utf-8`` in this order; the first codec that works is returned. If
    ``iso-8859-1`` works and decoding as ``windows-1252`` yields a euro sign,
    ``'windows-1252'`` is returned instead.

    Returns ``None`` if none of the fallback codecs can decode `text`.
    """
    try:
        import chardet
        encoding = chardet.detect(text)['encoding']
    except ImportError:
        msg = ('Using simplified encoding detection, '
               'you might want to install chardet.')
        if log:
            log.warn(msg)
        else:
            print(msg)

        encoding = None
        for e in ('ascii', 'iso-8859-1', 'utf-8'):
            try:
                text.decode(e)
            except UnicodeDecodeError:
                continue
            if 'iso-8859-1' == e:
                # iso-8859-1 accepts every byte; a euro sign (U+20AC) is a
                # hint that the text is windows-1252 in fact
                try:
                    if u'\u20ac' in text.decode('windows-1252'):
                        return 'windows-1252'
                except UnicodeDecodeError:
                    pass
            return e
    return encoding


def _collectEncodingInfo(encinfo, response, text, log):
    """Fill `encinfo` with all encoding hints for `response` / `text`."""
    # HTTP header, or a look at the text itself if there is no response
    if response:
        (encinfo.http_media_type,
         encinfo.http_encoding) = getHTTPInfo(response, log)
        texttype = _getTextTypeByMediaType(encinfo.http_media_type, log)
    else:
        texttype = _getTextType(text, log)

    # XML declaration; for HTML (might be XHTML) without the utf-8 default
    if texttype == _XML_APPLICATION_TYPE:
        try:
            encinfo.xml_encoding = detectXMLEncoding(text, log)
        except (AttributeError, ValueError):
            encinfo.xml_encoding = None
    if texttype == _HTML_TEXT_TYPE:
        try:
            encinfo.xml_encoding = detectXMLEncoding(
                text, log, includeDefault=False)
        except (AttributeError, ValueError):
            encinfo.xml_encoding = None

    # HTML <meta>
    if texttype == _HTML_TEXT_TYPE or texttype == _TEXT_TYPE:
        (encinfo.meta_media_type,
         encinfo.meta_encoding) = getMetaInfo(text, log)

    # the HTTP charset wins, the fallbacks depend on the text type
    encinfo.encoding = encinfo.http_encoding
    encinfo.mismatch = False

    if texttype == _XML_APPLICATION_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encinfo.xml_encoding
    elif texttype == _HTML_TEXT_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encinfo.meta_encoding
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
        if not encinfo.encoding:
            encinfo.encoding = tryEncodings(text)
    elif texttype == _XML_TEXT_TYPE or texttype == _TEXT_TYPE:
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)
    elif texttype == _TEXT_UTF8:
        if not encinfo.encoding:
            encinfo.encoding = encodingByMediaType(encinfo.http_media_type)

    # a mismatch is reported if two sources are both present but differ
    if (encinfo.http_encoding and encinfo.xml_encoding
            and encinfo.http_encoding != encinfo.xml_encoding):
        encinfo.mismatch = True
        log.warn(u'"%s" (HTTP) != "%s" (XML) encoding mismatch' % (
            encinfo.http_encoding, encinfo.xml_encoding))
    if (encinfo.http_encoding and encinfo.meta_encoding
            and encinfo.http_encoding != encinfo.meta_encoding):
        encinfo.mismatch = True
        log.warn(u'"%s" (HTTP) != "%s" (HTML <meta>) encoding mismatch' % (
            encinfo.http_encoding, encinfo.meta_encoding))
    if (encinfo.xml_encoding and encinfo.meta_encoding
            and encinfo.xml_encoding != encinfo.meta_encoding):
        encinfo.mismatch = True
        log.warn(u'"%s" (XML) != "%s" (HTML <meta>) encoding mismatch' % (
            encinfo.xml_encoding, encinfo.meta_encoding))

    log.info(u'Encoding (probably): %s (Mismatch: %s)',
             encinfo.encoding, encinfo.mismatch)


def getEncodingInfo(response=None, text=u'', log=None, url=None):
    """Return an `EncodingInfo` for the given resource.

    :param response: HTTP response object as returned by ``urllib.urlopen``
    :param text: the resource's content; if ``None`` it is read from
        `response` (an ``IOError`` while reading is ignored and the text is
        treated as empty)
    :param log: logger to report to; without one, a logger writing to an
        internal stream is built with `buildlog`
    :param url: if given, it is opened with ``urllib.urlopen`` and replaces
        `response`; errors while opening propagate

    ``logtext`` of the result holds what was written to the internal stream,
    so it stays empty if a `log` is passed in.
    """
    if url:
        response = urllib.urlopen(url)

    if text is None and response is not None:
        try:
            text = response.read()
        except IOError:
            pass
    if text is None:
        text = ''

    encinfo = EncodingInfo()
    logstream = StringIO.StringIO()
    ownhandler = None
    if not log:
        log = buildlog(stream=logstream, format='%(message)s')
        # buildlog appends its handler; remember it to detach it again
        ownhandler = log.handlers[-1]

    try:
        _collectEncodingInfo(encinfo, response, text, log)
        encinfo.logtext = logstream.getvalue()
    finally:
        # otherwise every call would leave another handler (and its stream)
        # attached to the shared logger
        if ownhandler is not None:
            log.removeHandler(ownhandler)
    return encinfo


if __name__ == '__main__':
    import pydoc
    pydoc.help(__name__)