# Source Generated with Decompyle++
# File: in.pyc (Python 2.6)

import os
import re
import tempfile
from calibre.ebooks.rtf2xml import copy

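# Tokenize rewrites an RTF file in place so that every control word, group
# delimiter, and text run ends up on its own line; this one-token-per-line
# form is presumably what the later rtf2xml stages consume.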
class Tokenize:

    def __init__(self, in_file, bug_handler, copy=None, run_level=1):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__special_tokens = ['_', '~', "'", '{', '}']
        self.__write_to = tempfile.mktemp()

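    # Convert the signed decimal argument of an RTF \uN control word into a
    # hexadecimal character reference (e.g. \u-3913 becomes &#xF0B7;);
    # negative values are how RTF encodes code points above 32767 in its
    # signed 16-bit field.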
    def __from_ms_to_utf8(self, match_obj):
        uni_char = int(match_obj.group(1))
        if uni_char < 0:
            uni_char += 65536
        return '&#x' + str('%X' % uni_char) + ';'

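    # Same wrap-around conversion, but only for negative arguments; this
    # helper is not referenced anywhere else in this file.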
    def __neg_unicode_func(self, match_obj):
        neg_uni_char = int(match_obj.group(1)) * -1
        uni_char = neg_uni_char + 65536
        return '&#x' + str('%X' % uni_char) + ';'

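    # Escape XML-special characters, spell out a handful of RTF control
    # symbols (\~, \_, \{, \}, ...) as control words, and rewrite \uN and
    # \'hh escapes using the expressions built in __compile_expressions.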
    def __sub_line_reg(self, line):
        line = line.replace('\\\\', '\\backslash ')
        line = line.replace('\\~', '\\~ ')
        line = line.replace('\\;', '\\; ')
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')
        line = line.replace('\\~', '\\~ ')
        line = line.replace('\\_', '\\_ ')
        line = line.replace('\\:', '\\: ')
        line = line.replace('\\-', '\\- ')
        line = line.replace('\\{', '\\ob ')
        line = line.replace('\\}', '\\cb ')
        line = line.replace('{', '\\{')
        line = line.replace('}', '\\}')
        line = re.sub(self.__utf_exp, self.__from_ms_to_utf8, line)
        line = re.sub(self.__ms_hex_exp, '\\mshex0\\g<1> ', line)
        line = re.sub(self.__par_exp, '\\par ', line)
        return line

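    # Build the regular expressions used above and in __create_tokens:
    # \'hh hex escapes, \uN Unicode escapes, the main token-splitting
    # pattern, a bare backslash at end of line (treated as \par), and a
    # control-word/text pattern (__mixed_exp, unused in this file).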
    def __compile_expressions(self):
        self.__ms_hex_exp = re.compile("\\\\\\'(..)")
        self.__utf_exp = re.compile('\\\\u(-?\\d{3,6}) {0,1}')
        self.__splitexp = re.compile('(\\\\[\\\\{}]|{|}|\\\\[^\\s\\\\{}&]+(?:\\s)?)')
        self.__par_exp = re.compile('\\\\$')
        self.__mixed_exp = re.compile('(\\\\[a-zA-Z]+\\d+)(\\D+)')

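    # Read the RTF source line by line, normalize each line with
    # __sub_line_reg, split it on __splitexp (which captures control words
    # and group braces as separate tokens), and write every non-empty token
    # to the temporary file, one per line.  Roughly, a line such as
    # '{\b bold}' should come out as the tokens '\{', '\b ', 'bold', '\}'
    # (an illustrative example, not taken from the original source).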
    def __create_tokens(self):
        self.__compile_expressions()
        read_obj = open(self.__file, 'r')
        write_obj = open(self.__write_to, 'w')
        line_to_read = 'dummy'
        while line_to_read:
            line_to_read = read_obj.readline()
            line = line_to_read
            line = line.replace('\n', '')
            line = self.__sub_line_reg(line)
            tokens = re.split(self.__splitexp, line)
            for token in tokens:
                if token != '':
                    write_obj.write(token + '\n')
        read_obj.close()
        write_obj.close()

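    # Public entry point: write the token stream to the temporary file,
    # optionally keep a debugging copy as 'tokenize.data', then replace the
    # original input file with the tokenized version.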
    def tokenize(self):
        self.__create_tokens()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, 'tokenize.data')
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
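
# The block below is not part of the decompiled module; it is a minimal usage
# sketch added for illustration.  It assumes calibre is importable and that a
# plain Exception subclass is an acceptable bug_handler (the real caller in
# calibre passes its own handler class); 'sample.rtf' and BugHandler are
# hypothetical names.
if __name__ == '__main__':

    class BugHandler(Exception):
        # hypothetical stand-in for the bug-handler class used by rtf2xml
        pass

    tok = Tokenize('sample.rtf', BugHandler)
    tok.tokenize()  # appears to rewrite sample.rtf in place, one token per line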