home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- import constants
- import sys
- from latin1prober import Latin1Prober
- from mbcsgroupprober import MBCSGroupProber
- from sbcsgroupprober import SBCSGroupProber
- from escprober import EscCharSetProber
- import re
# Confidence a prober must exceed before close() reports its charset.
MINIMUM_THRESHOLD = 0.2

# Values stored in UniversalDetector._mInputState.
ePureAscii = 0  # nothing but 7-bit bytes seen so far
eEscAscii = 1   # an ESC character or the "~{" sequence was seen
eHighbyte = 2   # at least one byte in the range 0x80-0xFF was seen


class UniversalDetector:
    """Incrementally guess the character encoding of a stream of bytes.

    Call feed() with successive chunks, then close().  The outcome is kept
    in ``self.result``, a dict with the keys 'encoding' and 'confidence';
    ``self.done`` becomes true once no further input is needed.
    """

    def __init__(self):
        self._highBitDetector = re.compile(r'[\x80-\xFF]')
        self._escDetector = re.compile(r'(\033|~{)')
        # Probers are created lazily by feed(), only once the input state
        # shows they are needed.
        self._mEscCharSetProber = None
        self._mCharSetProbers = []
        self.reset()

    def reset(self):
        """Return the detector to its initial state so it can be reused.

        Probers that were already created are kept and reset in place.
        """
        self.result = {
            'encoding': None,
            'confidence': 0}
        self.done = False
        self._mStart = True
        self._mGotData = False
        self._mInputState = ePureAscii
        self._mLastChar = ''
        if self._mEscCharSetProber:
            self._mEscCharSetProber.reset()
        for prober in self._mCharSetProbers:
            prober.reset()

    def _update_input_state(self, aBuf):
        """Leave the pure-ASCII state if aBuf contains a tell-tale byte.

        High bytes take precedence over escape sequences.  Once the state
        has left ePureAscii it is never changed again until reset().
        """
        if self._mInputState != ePureAscii:
            return
        if self._highBitDetector.search(aBuf):
            self._mInputState = eHighbyte
        # The last character of the previous chunk is prepended so that a
        # "~{" split across two feed() calls is still recognised.
        elif self._escDetector.search(self._mLastChar + aBuf):
            self._mInputState = eEscAscii

    def feed(self, aBuf):
        """Feed the next chunk of input to the detector.

        Does nothing when detection is already done or aBuf is empty.
        Always returns None; inspect ``self.done`` / ``self.result``.
        """
        if self.done:
            return None
        if not len(aBuf):
            return None
        self._mGotData = True
        # NOTE(review): with the logic visible in this class, result is only
        # filled in together with done = True, so this branch looks
        # unreachable.  Presumably an earlier step (e.g. byte-order-mark
        # sniffing) was lost in decompilation -- confirm against upstream.
        if self.result['encoding'] and self.result['confidence'] > 0:
            self.done = True
            return None

        self._update_input_state(aBuf)
        self._mLastChar = aBuf[-1]

        if self._mInputState == eEscAscii:
            if not self._mEscCharSetProber:
                self._mEscCharSetProber = EscCharSetProber()
            if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
                self.result = {
                    'encoding': self._mEscCharSetProber.get_charset_name(),
                    'confidence': self._mEscCharSetProber.get_confidence()}
                self.done = True
        elif self._mInputState == eHighbyte:
            if not self._mCharSetProbers:
                self._mCharSetProbers = [
                    MBCSGroupProber(),
                    SBCSGroupProber(),
                    Latin1Prober()]
            for prober in self._mCharSetProbers:
                if prober.feed(aBuf) == constants.eFoundIt:
                    self.result = {
                        'encoding': prober.get_charset_name(),
                        'confidence': prober.get_confidence()}
                    self.done = True
                    break
        return None

    def close(self):
        """Finish detection and settle on a final result.

        Returns ``self.result`` when a verdict is reached here (pure ASCII
        input, or a prober whose confidence exceeds MINIMUM_THRESHOLD).
        Returns None when detection was already done, when no data was ever
        fed, or when no prober is confident enough.
        """
        if self.done:
            return None
        if not self._mGotData:
            if constants._debug:
                sys.stderr.write('no data received!\n')
            return None
        self.done = True

        if self._mInputState == ePureAscii:
            self.result = {
                'encoding': 'ascii',
                'confidence': 1}
            return self.result

        if self._mInputState == eHighbyte:
            maxProberConfidence = 0
            maxProber = None
            for prober in self._mCharSetProbers:
                # Skip empty slots instead of calling methods on them.
                if not prober:
                    continue
                proberConfidence = prober.get_confidence()
                if proberConfidence > maxProberConfidence:
                    maxProberConfidence = proberConfidence
                    maxProber = prober
            if maxProber and maxProberConfidence > MINIMUM_THRESHOLD:
                self.result = {
                    'encoding': maxProber.get_charset_name(),
                    'confidence': maxProber.get_confidence()}
                return self.result

        if constants._debug:
            sys.stderr.write('no probers hit minimum threshhold\n')
            # The prober list is empty when the input never reached the
            # high-byte state; guard so the debug dump cannot raise.
            if self._mCharSetProbers:
                for prober in self._mCharSetProbers[0].mProbers:
                    if not prober:
                        continue
                    sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), prober.get_confidence()))
        return None
-
-
-
-
-