home *** CD-ROM | disk | FTP | other *** search
/ Maximum CD 2011 July / maximum-cd-2011-07.iso / DiscContents / LibO_3.3.2_Win_x86_install_multi.exe / libreoffice1.cab / test_multibytecodec.py < prev    next >
Encoding:
Python Source  |  2011-03-15  |  9.5 KB  |  245 lines

  1. #!/usr/bin/env python
  2. #
  3. # test_multibytecodec.py
  4. #   Unit test for multibytecodec itself
  5. #
  6.  
  7. from test import test_support
  8. from test import test_multibytecodec_support
  9. from test.test_support import TESTFN
  10. import unittest, StringIO, codecs, sys, os
  11. import _multibytecodec
  12.  
  13. ALL_CJKENCODINGS = [
  14. # _codecs_cn
  15.     'gb2312', 'gbk', 'gb18030', 'hz',
  16. # _codecs_hk
  17.     'big5hkscs',
  18. # _codecs_jp
  19.     'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
  20.     'euc_jis_2004', 'shift_jis_2004',
  21. # _codecs_kr
  22.     'cp949', 'euc_kr', 'johab',
  23. # _codecs_tw
  24.     'big5', 'cp950',
  25. # _codecs_iso2022
  26.     'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
  27.     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
  28. ]
  29.  
  30. class Test_MultibyteCodec(unittest.TestCase):
  31.  
  32.     def test_nullcoding(self):
  33.         for enc in ALL_CJKENCODINGS:
  34.             self.assertEqual(''.decode(enc), u'')
  35.             self.assertEqual(unicode('', enc), u'')
  36.             self.assertEqual(u''.encode(enc), '')
  37.  
  38.     def test_str_decode(self):
  39.         for enc in ALL_CJKENCODINGS:
  40.             self.assertEqual('abcd'.encode(enc), 'abcd')
  41.  
  42.     def test_errorcallback_longindex(self):
  43.         dec = codecs.getdecoder('euc-kr')
  44.         myreplace  = lambda exc: (u'', sys.maxint+1)
  45.         codecs.register_error('test.cjktest', myreplace)
  46.         self.assertRaises(IndexError, dec,
  47.                           'apple\x92ham\x93spam', 'test.cjktest')
  48.  
  49.     def test_codingspec(self):
  50.         try:
  51.             for enc in ALL_CJKENCODINGS:
  52.                 print >> open(TESTFN, 'w'), '# coding:', enc
  53.                 exec open(TESTFN)
  54.         finally:
  55.             os.unlink(TESTFN)
  56.  
  57.     def test_init_segfault(self):
  58.         # bug #3305: this used to segfault
  59.         self.assertRaises(AttributeError,
  60.                           _multibytecodec.MultibyteStreamReader, None)
  61.         self.assertRaises(AttributeError,
  62.                           _multibytecodec.MultibyteStreamWriter, None)
  63.  
  64.  
  65. class Test_IncrementalEncoder(unittest.TestCase):
  66.  
  67.     def test_stateless(self):
  68.         # cp949 encoder isn't stateful at all.
  69.         encoder = codecs.getincrementalencoder('cp949')()
  70.         self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
  71.                          '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
  72.         self.assertEqual(encoder.reset(), None)
  73.         self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
  74.                          '\xa1\xd9\xa1\xad\xa1\xd9')
  75.         self.assertEqual(encoder.reset(), None)
  76.         self.assertEqual(encoder.encode(u'', True), '')
  77.         self.assertEqual(encoder.encode(u'', False), '')
  78.         self.assertEqual(encoder.reset(), None)
  79.  
  80.     def test_stateful(self):
  81.         # jisx0213 encoder is stateful for a few codepoints. eg)
  82.         #   U+00E6 => A9DC
  83.         #   U+00E6 U+0300 => ABC4
  84.         #   U+0300 => ABDC
  85.  
  86.         encoder = codecs.getincrementalencoder('jisx0213')()
  87.         self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
  88.         self.assertEqual(encoder.encode(u'\u00e6'), '')
  89.         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
  90.         self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
  91.  
  92.         self.assertEqual(encoder.reset(), None)
  93.         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
  94.  
  95.         self.assertEqual(encoder.encode(u'\u00e6'), '')
  96.         self.assertEqual(encoder.encode('', True), '\xa9\xdc')
  97.         self.assertEqual(encoder.encode('', True), '')
  98.  
  99.     def test_stateful_keep_buffer(self):
  100.         encoder = codecs.getincrementalencoder('jisx0213')()
  101.         self.assertEqual(encoder.encode(u'\u00e6'), '')
  102.         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
  103.         self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
  104.         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
  105.         self.assertEqual(encoder.reset(), None)
  106.         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
  107.         self.assertEqual(encoder.encode(u'\u00e6'), '')
  108.         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
  109.         self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
  110.  
  111.  
  112. class Test_IncrementalDecoder(unittest.TestCase):
  113.  
  114.     def test_dbcs(self):
  115.         # cp949 decoder is simple with only 1 or 2 bytes sequences.
  116.         decoder = codecs.getincrementaldecoder('cp949')()
  117.         self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
  118.                          u'\ud30c\uc774')
  119.         self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
  120.                          u'\uc36c \ub9c8\uc744')
  121.         self.assertEqual(decoder.decode(''), u'')
  122.  
  123.     def test_dbcs_keep_buffer(self):
  124.         decoder = codecs.getincrementaldecoder('cp949')()
  125.         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
  126.         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
  127.         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
  128.  
  129.         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
  130.         self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
  131.         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
  132.  
  133.     def test_iso2022(self):
  134.         decoder = codecs.getincrementaldecoder('iso2022-jp')()
  135.         ESC = '\x1b'
  136.         self.assertEqual(decoder.decode(ESC + '('), u'')
  137.         self.assertEqual(decoder.decode('B', True), u'')
  138.         self.assertEqual(decoder.decode(ESC + '$'), u'')
  139.         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
  140.         self.assertEqual(decoder.decode('@$@'), u'\u4e16')
  141.         self.assertEqual(decoder.decode('$', True), u'\u4e16')
  142.         self.assertEqual(decoder.reset(), None)
  143.         self.assertEqual(decoder.decode('@$'), u'@$')
  144.         self.assertEqual(decoder.decode(ESC + '$'), u'')
  145.         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
  146.         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
  147.  
  148. class Test_StreamReader(unittest.TestCase):
  149.     def test_bug1728403(self):
  150.         try:
  151.             open(TESTFN, 'w').write('\xa1')
  152.             f = codecs.open(TESTFN, encoding='cp949')
  153.             self.assertRaises(UnicodeDecodeError, f.read, 2)
  154.         finally:
  155.             try: f.close()
  156.             except: pass
  157.             os.unlink(TESTFN)
  158.  
  159. class Test_StreamWriter(unittest.TestCase):
  160.     if len(u'\U00012345') == 2: # UCS2
  161.         def test_gb18030(self):
  162.             s = StringIO.StringIO()
  163.             c = codecs.getwriter('gb18030')(s)
  164.             c.write(u'123')
  165.             self.assertEqual(s.getvalue(), '123')
  166.             c.write(u'\U00012345')
  167.             self.assertEqual(s.getvalue(), '123\x907\x959')
  168.             c.write(u'\U00012345'[0])
  169.             self.assertEqual(s.getvalue(), '123\x907\x959')
  170.             c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
  171.             self.assertEqual(s.getvalue(),
  172.                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
  173.             c.write(u'\U00012345'[0])
  174.             self.assertEqual(s.getvalue(),
  175.                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
  176.             self.assertRaises(UnicodeError, c.reset)
  177.             self.assertEqual(s.getvalue(),
  178.                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
  179.  
  180.         def test_utf_8(self):
  181.             s= StringIO.StringIO()
  182.             c = codecs.getwriter('utf-8')(s)
  183.             c.write(u'123')
  184.             self.assertEqual(s.getvalue(), '123')
  185.             c.write(u'\U00012345')
  186.             self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
  187.  
  188.             # Python utf-8 codec can't buffer surrogate pairs yet.
  189.             if 0:
  190.                 c.write(u'\U00012345'[0])
  191.                 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
  192.                 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
  193.                 self.assertEqual(s.getvalue(),
  194.                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
  195.                     '\xea\xb0\x80\xc2\xac')
  196.                 c.write(u'\U00012345'[0])
  197.                 self.assertEqual(s.getvalue(),
  198.                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
  199.                     '\xea\xb0\x80\xc2\xac')
  200.                 c.reset()
  201.                 self.assertEqual(s.getvalue(),
  202.                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
  203.                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
  204.                 c.write(u'\U00012345'[1])
  205.                 self.assertEqual(s.getvalue(),
  206.                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
  207.                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
  208.  
  209.     else: # UCS4
  210.         pass
  211.  
  212.     def test_streamwriter_strwrite(self):
  213.         s = StringIO.StringIO()
  214.         wr = codecs.getwriter('gb18030')(s)
  215.         wr.write('abcd')
  216.         self.assertEqual(s.getvalue(), 'abcd')
  217.  
  218. class Test_ISO2022(unittest.TestCase):
  219.     def test_g2(self):
  220.         iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
  221.         uni = u':hu4:unit\xe9 de famille'
  222.         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
  223.  
  224.     def test_iso2022_jp_g0(self):
  225.         self.failIf('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
  226.         for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
  227.             e = u'\u3406'.encode(encoding)
  228.             self.failIf(filter(lambda x: x >= '\x80', e))
  229.  
  230.     def test_bug1572832(self):
  231.         if sys.maxunicode >= 0x10000:
  232.             myunichr = unichr
  233.         else:
  234.             myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
  235.  
  236.         for x in xrange(0x10000, 0x110000):
  237.             # Any ISO 2022 codec will cause the segfault
  238.             myunichr(x).encode('iso_2022_jp', 'ignore')
  239.  
def test_main():
    """Pass this module's name to test_support.run_unittest."""
    test_support.run_unittest(__name__)

# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
  245.