home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2010 November / maximum-cd-2010-11.iso / DiscContents / calibre-0.7.13.msi / file_854 (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2010-08-06  |  3.1 KB  |  83 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. import constants
  5. import sys
  6. from charsetprober import CharSetProber
  7. SAMPLE_SIZE = 64
  8. SB_ENOUGH_REL_THRESHOLD = 1024
  9. POSITIVE_SHORTCUT_THRESHOLD = 0.95
  10. NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  11. SYMBOL_CAT_ORDER = 250
  12. NUMBER_OF_SEQ_CAT = 4
  13. POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
  14.  
  15. class SingleByteCharSetProber(CharSetProber):
  16.     
  17.     def __init__(self, model, reversed = constants.False, nameProber = None):
  18.         CharSetProber.__init__(self)
  19.         self._mModel = model
  20.         self._mReversed = reversed
  21.         self._mNameProber = nameProber
  22.         self.reset()
  23.  
  24.     
  25.     def reset(self):
  26.         CharSetProber.reset(self)
  27.         self._mLastOrder = 255
  28.         self._mSeqCounters = [
  29.             0] * NUMBER_OF_SEQ_CAT
  30.         self._mTotalSeqs = 0
  31.         self._mTotalChar = 0
  32.         self._mFreqChar = 0
  33.  
  34.     
  35.     def get_charset_name(self):
  36.         if self._mNameProber:
  37.             return self._mNameProber.get_charset_name()
  38.         return self._mModel['charsetName']
  39.  
  40.     
  41.     def feed(self, aBuf):
  42.         if not self._mModel['keepEnglishLetter']:
  43.             aBuf = self.filter_without_english_letters(aBuf)
  44.         
  45.         aLen = len(aBuf)
  46.         if not aLen:
  47.             return self.get_state()
  48.         for c in aBuf:
  49.             order = self._mModel['charToOrderMap'][ord(c)]
  50.             self._mLastOrder = order
  51.         
  52.         if self.get_state() == constants.eDetecting:
  53.             if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
  54.                 cf = self.get_confidence()
  55.                 if cf > POSITIVE_SHORTCUT_THRESHOLD:
  56.                     if constants._debug:
  57.                         sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
  58.                     
  59.                     self._mState = constants.eFoundIt
  60.                 elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
  61.                     if constants._debug:
  62.                         sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
  63.                     
  64.                     self._mState = constants.eNotMe
  65.                 
  66.             
  67.         
  68.         return self.get_state()
  69.  
  70.     
  71.     def get_confidence(self):
  72.         r = 0.01
  73.         if self._mTotalSeqs > 0:
  74.             r = 1 * self._mSeqCounters[POSITIVE_CAT] / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
  75.             r = r * self._mFreqChar / self._mTotalChar
  76.             if r >= 1:
  77.                 r = 0.99
  78.             
  79.         
  80.         return r
  81.  
  82.  
  83.