home *** CD-ROM | disk | FTP | other *** search
Python Source | 2000-10-25 | 10.9 KB | 366 lines |
- #
- # (re)generate unicode property and type databases
- #
- # this script converts a unicode 3.0 database file to
- # Modules/unicodedata_db.h and Objects/unicodetype_db.h
- #
- # history:
- # 2000-09-24 fl created (based on bits and pieces from unidb)
- # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
- # 2000-09-25 fl added character type table
- # 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
- #
- # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
- #
-
- import sys
-
# name and version of this generator; both are quoted in the banner
# comment written at the top of each generated header
SCRIPT = sys.argv[0]
VERSION = "1.1"

# input file, looked up relative to the current directory
UNICODE_DATA = "UnicodeData-Latest.txt"

# General category names.  maketables() stores a category as its
# position in this list (found with .index()), so the order must not
# change.  "Cn" appears twice; .index() always returns the first
# position (0).
CATEGORY_NAMES = [
    "Cn",
    "Lu", "Ll", "Lt",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
    "Lm", "Lo",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
]

# Bidirectional category names, also stored by list position.  The
# empty string occupies position 0.
BIDIRECTIONAL_NAMES = [
    "",
    "L", "LRE", "LRO",
    "R", "AL", "RLE", "RLO",
    "PDF",
    "EN", "ES", "ET", "AN", "CS",
    "NSM", "BN",
    "B", "S", "WS", "ON",
]

# character type flag bits
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
-
def maketables():
    """Regenerate both generated header files.

    Loads the UNICODE_DATA file through UnicodeData, then writes
    Modules/unicodedata_db.h and Objects/unicodetype_db.h.  Both output
    paths are relative to the current directory.  The number of unique
    type records is reported on standard output.
    """
    db = UnicodeData(UNICODE_DATA)
    _make_unicodedata_db(db)
    _make_unicodetype_db(db)


def _write_name_table(out, declaration, names):
    # write a NULL-terminated C array of string literals
    print >>out, declaration
    for name in names:
        print >>out, " \"%s\"," % name
    print >>out, " NULL"
    print >>out, "};"


def _case_delta(field, char):
    # case mappings are stored as 16-bit offsets from the character
    # itself ("delta predictor"); an empty field means "no mapping"
    if field:
        return (int(field, 16) - char) & 0xffff
    return 0


def _make_unicodedata_db(db):
    # build the property and decomposition tables and write them to
    # Modules/unicodedata_db.h

    # 1) database properties
    #
    # "table" holds the unique records, "cache" maps a record to its
    # position in "table", and "index" maps a character to a position.
    # position 0 is the all-zero record used for characters that have
    # no entry in the database.
    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if record:
            # extract database properties
            item = (
                CATEGORY_NAMES.index(record[2]),
                int(record[3]),
                BIDIRECTIONAL_NAMES.index(record[4]),
                record[9] == "Y",
            )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    # position 0 is the empty string, meaning "no decomposition".
    # decomp_cache gives constant-time lookup of strings already stored;
    # positions are handed out in first-seen order.
    decomp_data = [""]
    decomp_cache = {}
    decomp_index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if record:
            decomp = record[5]
            if decomp:
                i = decomp_cache.get(decomp)
                if i is None:
                    decomp_cache[decomp] = i = len(decomp_data)
                    decomp_data.append(decomp)
            else:
                i = 0
            decomp_index[char] = i

    # write to an explicit file object (instead of rebinding sys.stdout)
    # so that the file is always closed and stdout is never disturbed,
    # even if generation fails half way through
    out = open("Modules/unicodedata_db.h", "w")
    try:
        print >>out, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
        print >>out
        print >>out, "/* a list of unique database records */"
        print >>out, "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
        for item in table:
            print >>out, " {%d, %d, %d, %d}," % item
        print >>out, "};"
        print >>out

        # FIXME: the following tables should be made static, and
        # the support code moved into unicodedatabase.c

        print >>out, "/* string literals */"
        _write_name_table(
            out, "const char *_PyUnicode_CategoryNames[] = {", CATEGORY_NAMES)
        _write_name_table(
            out, "const char *_PyUnicode_BidirectionalNames[] = {",
            BIDIRECTIONAL_NAMES)
        _write_name_table(
            out, "static const char *decomp_data[] = {", decomp_data)

        # split record index table
        index1, index2, shift = splitbins(index)

        print >>out, "/* index tables for the database records */"
        print >>out, "#define SHIFT", shift
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index)

        print >>out, "/* index tables for the decomposition data */"
        print >>out, "#define DECOMP_SHIFT", shift
        Array("decomp_index1", index1).dump(out)
        Array("decomp_index2", index2).dump(out)
    finally:
        out.close()


def _make_unicodetype_db(db):
    # build the character type table and write it to
    # Objects/unicodetype_db.h

    # 3) unicode type data
    #
    # same table/cache/index scheme as for the database records; the
    # all-zero record at position 0 is shared by unlisted characters
    # and by listed characters that have no flags or mappings.
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(db.chars)

    for char in db.chars:
        record = db.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            upper = _case_delta(record[12], char)
            lower = _case_delta(record[13], char)
            title = _case_delta(record[14], char)
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
            )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # progress report for whoever runs the script
    print >>sys.stdout, len(table), "ctype entries"

    out = open("Objects/unicodetype_db.h", "w")
    try:
        print >>out, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
        print >>out
        print >>out, "/* a list of unique character type descriptors */"
        print >>out, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
        for item in table:
            print >>out, " {%d, %d, %d, %d, %d, %d}," % item
        print >>out, "};"
        print >>out

        # split type index table
        index1, index2, shift = splitbins(index)

        print >>out, "/* type indexes */"
        print >>out, "#define SHIFT", shift
        Array("index1", index1).dump(out)
        Array("index2", index2).dump(out)
    finally:
        out.close()
-
- # --------------------------------------------------------------------
- # the following support code is taken from the unidb utilities
- # Copyright (c) 1999-2000 by Secret Labs AB
-
- # load a unicode-data file from disk
-
- import string, sys
-
class UnicodeData:
    """In-memory copy of a semicolon-separated unicode data file.

    Public attributes:

    filename -- the path that was passed to the constructor
    table -- list of 65536 entries indexed by character code; each
        entry is the list of fields from that character's line, or
        None if the file has no line for the character
    chars -- the character codes to process: 0..65535, or 0..255
        after uselatin1() has been called
    """

    def __init__(self, filename):
        """Load the file at *filename*.

        The first field of every line is parsed as a hexadecimal
        character code.  Blank lines are ignored.  Raises IOError if
        the file cannot be opened, ValueError if a code is not valid
        hexadecimal, and IndexError if a code is 65536 or larger.
        """
        table = [None] * 65536
        fp = open(filename)
        try:
            for line in fp.readlines():
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing one)
                    continue
                fields = line.split(";")
                table[int(fields[0], 16)] = fields
        finally:
            # always release the file, even if a line fails to parse
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        """Restrict the character range to ISO Latin 1 (0..255)."""
        self.chars = range(256)
-
- # stuff to deal with arrays of unsigned integers
-
class Array:
    """A named sequence of unsigned integers that can be written as C."""

    def __init__(self, name, data):
        # name: identifier used for the C array
        # data: sequence of non-negative integers
        self.name = name
        self.data = data

    def dump(self, file):
        """Write the data to *file* as a static C array definition.

        The element type is the narrowest unsigned type that getsize()
        says can hold the largest value.  Values are packed onto
        lines of at most 78 characters.
        """
        width = getsize(self.data)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static ")
        file.write(ctype)
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            line = " "
            for value in self.data:
                text = str(value) + ", "
                if len(line) + len(text) > 78:
                    # current line is full; flush it and start a new one
                    file.write(line + "\n")
                    line = " " + text
                else:
                    line = line + text
            # flush whatever is left, unless it is only indentation
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")
-
def getsize(data):
    """Return the smallest integer size, in bytes, for the given array.

    The result is 1, 2 or 4, chosen from the largest value in *data*.
    *data* must not be empty (max() raises ValueError for an empty
    sequence).
    """
    largest = max(data)
    if largest >= 65536:
        return 4
    if largest >= 256:
        return 2
    return 1
-
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints, and the split pays off when many of them
    repeat.  The table is cut into bins of 2**shift entries; each
    distinct bin is stored once in t2, and t1 records which bin covers
    each position, so that for every i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.  Every
    shift is tried and the one with the smallest combined size of t1
    and t2 (as C arrays, measured with getsize()) is returned.

    If optional arg trace is true (default false), progress info is
    printed to sys.stderr.
    """

    import sys

    def report(first, second, nbits, nbytes):
        # one line of progress output (only called when tracing)
        print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
            len(first), len(second), nbits, nbytes)

    if trace:
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
              "bytes"

    # the most we can shift the last valid index and still have
    # something left
    maxshift = 0
    remaining = len(t) - 1
    while remaining > 1:
        remaining = remaining >> 1
        maxshift = maxshift + 1

    smallest = sys.maxint  # smallest total size so far
    t = tuple(t)  # so slices can be dict keys
    for shift in range(maxshift + 1):
        binsize = 1 << shift
        t1 = []
        t2 = []
        seen = {}  # bin contents -> offset of that bin in t2
        for start in range(0, len(t), binsize):
            chunk = t[start:start+binsize]
            try:
                offset = seen[chunk]
            except KeyError:
                offset = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        total = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace:
            report(t1, t2, shift, total)
        if total < smallest:
            best = t1, t2, shift
            smallest = total

    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
-
# regenerate the tables only when run as a script, not when imported
if __name__ == "__main__":
    maketables()
-