home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- __version__ = '1.0'
- import re
-
def detect(aBuf):
    """Run calibre's bundled universal charset detector over ``aBuf``.

    A fresh ``UniversalDetector`` is created, reset, fed the whole buffer in
    one call and closed; its ``result`` attribute is returned unchanged.
    Callers in this module read the ``'encoding'`` and ``'confidence'`` keys
    of that result.
    """
    # Imported lazily, at call time, exactly as the original block does.
    from calibre.ebooks.chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result
-
# Patterns that locate an encoding declaration in markup.  In both of them
# group(1) captures the declared charset name, and both ignore case.
ENCODING_PATS = [
    # Processing-instruction form, e.g. <?xml version="1.0" encoding="utf-8"?>
    re.compile(r'<\?[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
    # HTML form, e.g. <meta ... content="text/html; charset=utf-8">
    re.compile(
        r'''<meta\s+?[^<>]+?content=['"][^'"]*?charset=([-a-z0-9]+)[^'"]*?['"][^<>]*>''',
        re.IGNORECASE),
]

# Matches "&...;" with no whitespace inside; group(1) is the text between
# the ampersand and the semicolon.
ENTITY_PATTERN = re.compile(r'&(\S+?);')
-
def strip_encoding_declarations(raw):
    """Return ``raw`` with every encoding declaration removed.

    Each pattern in ``ENCODING_PATS`` is applied in order and all of its
    matches are replaced by the empty string.
    """
    stripped = raw
    for declaration_re in ENCODING_PATS:
        stripped = declaration_re.sub('', stripped)
    return stripped
-
-
def substitute_entites(raw):
    """Replace ``&...;`` entity references in ``raw``.

    Every match of ``ENTITY_PATTERN`` is handed (as a match object) to
    calibre's ``xml_entity_to_unicode``; whatever that callable returns is
    substituted for the match.

    :param raw: text to process
    :return: ``raw`` with the substitutions applied
    """
    # The decompiler rendered this from-import as the self-assignment
    # ``xml_entity_to_unicode = xml_entity_to_unicode`` followed by
    # ``import calibre``, which raises UnboundLocalError on every call.
    # Kept function-local so importing this module does not import calibre.
    from calibre import xml_entity_to_unicode

    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
-
# Detected charset names that are rewritten to another spelling before
# being returned by force_encoding().
_CHARSET_ALIASES = {
    'macintosh': 'mac-roman',
    'x-sjis': 'shift-jis',
}


def force_encoding(raw, verbose, assume_utf8=False):
    """Pick an encoding name for ``raw`` using charset detection.

    :param raw: the data handed to ``detect()``
    :param verbose: if true, print a warning when the detection confidence
        is below 1
    :param assume_utf8: if true, use ``'utf-8'`` whenever the detection
        confidence is below 1
    :return: a lower-cased encoding name.  Falls back to calibre's
        ``preferred_encoding`` when detection fails or yields nothing,
        applies ``_CHARSET_ALIASES``, and reports ``'ascii'`` as ``'utf-8'``.
    """
    # The decompiler rendered this from-import as
    # ``preferred_encoding = preferred_encoding`` + ``import calibre.constants``,
    # which raises UnboundLocalError on every call.
    from calibre.constants import preferred_encoding

    try:
        chardet = detect(raw)
    except Exception:
        # Detection is best-effort: any failure means "no idea", so use the
        # preferred encoding with zero confidence.
        chardet = {'encoding': preferred_encoding, 'confidence': 0}

    encoding = chardet['encoding']
    uncertain = chardet['confidence'] < 1

    if uncertain and assume_utf8:
        encoding = 'utf-8'

    if uncertain and verbose:
        # Parenthesised on purpose: ``fmt % conf * 100`` would format first
        # and then repeat the resulting string 100 times.
        print('WARNING: Encoding detection confidence %d%%'
              % (chardet['confidence'] * 100))

    if not encoding:
        encoding = preferred_encoding

    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)

    if encoding == 'ascii':
        encoding = 'utf-8'

    return encoding
-
-
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    """Convert ``raw`` to unicode and report the encoding that was used.

    For input that is not already unicode the encoding is chosen, in order,
    from: a UTF-16 byte-order mark, an encoding declaration matched by
    ``ENCODING_PATS``, and finally ``force_encoding()``.  Decoding uses the
    ``'replace'`` error handler; an encoding name unknown to Python falls
    back to ``'utf-8'``.

    :param raw: byte string or unicode string; a falsy value short-circuits
    :param verbose: forwarded to ``force_encoding()``
    :param strip_encoding_pats: if true, pass the result through
        ``strip_encoding_declarations()``
    :param resolve_entities: if true, pass the result through
        ``substitute_entites()``
    :param assume_utf8: forwarded to ``force_encoding()``
    :return: ``(text, encoding)``; ``encoding`` is ``None`` when ``raw`` was
        empty or was already unicode
    """
    encoding = None
    if not raw:
        return (u'', encoding)

    if not isinstance(raw, unicode):
        # A UTF-16 byte-order mark settles the encoding; [1:] drops the
        # decoded BOM character itself.
        if raw.startswith('\xff\xfe'):
            raw = raw.decode('utf-16-le')[1:]
            encoding = 'utf-16-le'
        elif raw.startswith('\xfe\xff'):
            raw = raw.decode('utf-16-be')[1:]
            encoding = 'utf-16-be'

    # Re-checked on purpose: the BOM branch above may already have decoded.
    if not isinstance(raw, unicode):
        for pat in ENCODING_PATS:
            match = pat.search(raw)
            if match:
                encoding = match.group(1)
                break

        if encoding is None:
            encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)

        if encoding.lower().strip() == 'macintosh':
            encoding = 'mac-roman'

        try:
            raw = raw.decode(encoding, 'replace')
        except LookupError:
            # The declared/detected name is not a codec Python knows.
            encoding = 'utf-8'
            raw = raw.decode(encoding, 'replace')

    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)

    if resolve_entities:
        raw = substitute_entites(raw)

    return (raw, encoding)
-
-