home *** CD-ROM | disk | FTP | other *** search
/ Freelog 125 / Freelog_MarsAvril2015_No125.iso / Bureautique / LibreOffice / LibreOffice_4.3.5_Win_x86.msi / utf_8_sig.py < prev    next >
Text File  |  2014-12-12  |  4KB  |  131 lines

  1. """ Python 'utf-8-sig' Codec
  2. This works similarly to UTF-8 with the following changes:
  3.  
  4. * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  5.   first three bytes.
  6.  
  7. * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  8.   bytes will be skipped.
  9. """
  10. import codecs
  11.  
  12. ### Codec APIs
  13.  
  14. def encode(input, errors='strict'):
  15.     return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
  16.             len(input))
  17.  
  18. def decode(input, errors='strict'):
  19.     prefix = 0
  20.     if input[:3] == codecs.BOM_UTF8:
  21.         input = input[3:]
  22.         prefix = 3
  23.     (output, consumed) = codecs.utf_8_decode(input, errors, True)
  24.     return (output, consumed+prefix)
  25.  
  26. class IncrementalEncoder(codecs.IncrementalEncoder):
  27.     def __init__(self, errors='strict'):
  28.         codecs.IncrementalEncoder.__init__(self, errors)
  29.         self.first = 1
  30.  
  31.     def encode(self, input, final=False):
  32.         if self.first:
  33.             self.first = 0
  34.             return codecs.BOM_UTF8 + \
  35.                    codecs.utf_8_encode(input, self.errors)[0]
  36.         else:
  37.             return codecs.utf_8_encode(input, self.errors)[0]
  38.  
  39.     def reset(self):
  40.         codecs.IncrementalEncoder.reset(self)
  41.         self.first = 1
  42.  
  43.     def getstate(self):
  44.         return self.first
  45.  
  46.     def setstate(self, state):
  47.         self.first = state
  48.  
  49. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  50.     def __init__(self, errors='strict'):
  51.         codecs.BufferedIncrementalDecoder.__init__(self, errors)
  52.         self.first = 1
  53.  
  54.     def _buffer_decode(self, input, errors, final):
  55.         if self.first:
  56.             if len(input) < 3:
  57.                 if codecs.BOM_UTF8.startswith(input):
  58.                     # not enough data to decide if this really is a BOM
  59.                     # => try again on the next call
  60.                     return ("", 0)
  61.                 else:
  62.                     self.first = 0
  63.             else:
  64.                 self.first = 0
  65.                 if input[:3] == codecs.BOM_UTF8:
  66.                     (output, consumed) = \
  67.                        codecs.utf_8_decode(input[3:], errors, final)
  68.                     return (output, consumed+3)
  69.         return codecs.utf_8_decode(input, errors, final)
  70.  
  71.     def reset(self):
  72.         codecs.BufferedIncrementalDecoder.reset(self)
  73.         self.first = 1
  74.  
  75.     def getstate(self):
  76.         state = codecs.BufferedIncrementalDecoder.getstate(self)
  77.         # state[1] must be 0 here, as it isn't passed along to the caller
  78.         return (state[0], self.first)
  79.  
  80.     def setstate(self, state):
  81.         # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
  82.         codecs.BufferedIncrementalDecoder.setstate(self, state)
  83.         self.first = state[1]
  84.  
  85. class StreamWriter(codecs.StreamWriter):
  86.     def reset(self):
  87.         codecs.StreamWriter.reset(self)
  88.         try:
  89.             del self.encode
  90.         except AttributeError:
  91.             pass
  92.  
  93.     def encode(self, input, errors='strict'):
  94.         self.encode = codecs.utf_8_encode
  95.         return encode(input, errors)
  96.  
  97. class StreamReader(codecs.StreamReader):
  98.     def reset(self):
  99.         codecs.StreamReader.reset(self)
  100.         try:
  101.             del self.decode
  102.         except AttributeError:
  103.             pass
  104.  
  105.     def decode(self, input, errors='strict'):
  106.         if len(input) < 3:
  107.             if codecs.BOM_UTF8.startswith(input):
  108.                 # not enough data to decide if this is a BOM
  109.                 # => try again on the next call
  110.                 return ("", 0)
  111.         elif input[:3] == codecs.BOM_UTF8:
  112.             self.decode = codecs.utf_8_decode
  113.             (output, consumed) = codecs.utf_8_decode(input[3:],errors)
  114.             return (output, consumed+3)
  115.         # (else) no BOM present
  116.         self.decode = codecs.utf_8_decode
  117.         return codecs.utf_8_decode(input, errors)
  118.  
  119. ### encodings module API
  120.  
  121. def getregentry():
  122.     return codecs.CodecInfo(
  123.         name='utf-8-sig',
  124.         encode=encode,
  125.         decode=decode,
  126.         incrementalencoder=IncrementalEncoder,
  127.         incrementaldecoder=IncrementalDecoder,
  128.         streamreader=StreamReader,
  129.         streamwriter=StreamWriter,
  130.     )
  131.