home *** CD-ROM | disk | FTP | other *** search
Python Source | 2000-10-25 | 14.1 KB | 454 lines |
- #! /usr/bin/env python
- # Originally written by Barry Warsaw <bwarsaw@python.org>
- #
- # minimally patched to make it even more xgettext compatible
- # by Peter Funk <pf@artcom-gmbh.de>
-
# For self-testing: use fintl's gettext as the translation function when
# that module can be imported; otherwise fall back to an identity function
# so that _() merely marks strings.
try:
    import fintl
except ImportError:
    def _(s):
        return s
else:
    _ = fintl.gettext
-
-
- __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
-
- Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
- internationalization of C programs. Most of these tools are independent of
- the programming language and can be used from within Python programs. Martin
- von Loewis' work[1] helps considerably in this regard.
-
- There's one problem though; xgettext is the program that scans source code
- looking for message strings, but it groks only C (or C++). Python introduces
- a few wrinkles, such as dual quoting characters, triple quoted strings, and
- raw strings. xgettext understands none of this.
-
- Enter pygettext, which uses Python's standard tokenize module to scan Python
- source code, generating .pot files identical to what GNU xgettext[2] generates
- for C and C++ code. From there, the standard GNU tools can be used.
-
- A word about marking Python strings as candidates for translation. GNU
- xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
- gettext_noop. But those can be a lot of text to include all over your code.
- C and C++ have a trick: they use the C preprocessor. Most internationalized C
- source includes a #define for gettext() to _() so that what has to be written
- in the source is much less. Thus these are both translatable strings:
-
- gettext("Translatable String")
- _("Translatable String")
-
- Python of course has no preprocessor so this doesn't work so well. Thus,
- pygettext searches only for _() by default, but see the -k/--keyword flag
- below for how to augment this.
-
- [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
- [2] http://www.gnu.org/software/gettext/gettext.html
-
- NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
- wherever possible. However, some options are still missing or are not fully
- implemented. Also, xgettext's use of command line switches with option
- arguments is broken, and in these cases, pygettext just defines additional
- switches.
-
- Usage: pygettext [options] inputfile ...
-
- Options:
-
- -a
- --extract-all
- Extract all strings
-
- -d name
- --default-domain=name
- Rename the default output file from messages.pot to name.pot
-
- -E
- --escape
- replace non-ASCII characters with octal escape sequences.
-
- -h
- --help
- print this help message and exit
-
- -k word
- --keyword=word
- Keywords to look for in addition to the default set, which are:
- %(DEFAULTKEYWORDS)s
-
- You can have multiple -k flags on the command line.
-
- -K
- --no-default-keywords
- Disable the default set of keywords (see above). Any keywords
- explicitly added with the -k/--keyword option are still recognized.
-
- --no-location
- Do not write filename/lineno location comments.
-
- -n
- --add-location
- Write filename/lineno location comments indicating where each
- extracted string is found in the source. These lines appear before
- each msgid. The style of comments is controlled by the -S/--style
- option. This is the default.
-
- -S stylename
- --style=stylename
- Specify which style to use for location comments. Two styles are
- supported:
-
- Solaris # File: filename, line: line-number
- GNU #: filename:line
-
- The style name is case insensitive. GNU style is the default.
-
- -o filename
- --output=filename
- Rename the default output file from messages.pot to filename. If
- filename is `-' then the output is sent to standard out.
-
- -p dir
- --output-dir=dir
- Output files will be placed in directory dir.
-
- -v
- --verbose
- Print the names of the files being processed.
-
- -V
- --version
- Print the version of pygettext and exit.
-
- -w columns
- --width=columns
- Set width of output to columns.
-
- -x filename
- --exclude-file=filename
- Specify a file that contains a list of strings that are not to be
- extracted from the input files. Each string to be excluded must
- appear on a line by itself in the file.
-
- If `inputfile' is -, standard input is read.
-
- """)
-
- import os
- import sys
- import time
- import getopt
- import tokenize
-
# Version of this tool; interpolated into the generated header and printed
# by -V/--version.
__version__ = '1.1'

# Keywords treated as translation markers unless -K/--no-default-keywords
# is given.
default_keywords = ['_']
# Comma-separated form of the defaults, substituted for %(DEFAULTKEYWORDS)s
# when the usage text is printed.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments back together.
EMPTYSTRING = ''
-
-
-
# The normal pot-file header.  msgmerge and Emacs' po-mode work better if
# it's there.
#
# The extraction time (%(time)s) goes into POT-Creation-Date, which is the
# field a template generator such as GNU xgettext fills in.
# PO-Revision-Date is left as a placeholder: it describes the last edit of
# a translated catalog and is set by the translator's tooling, so stamping
# it with the extraction time would be misleading.
#
# The template is interpolated with a mapping providing 'time' and
# 'version'.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
-
-
def usage(code, msg=''):
    """Print the module help text, then an optional message, and exit.

    code is passed to sys.exit().  msg, when true, is printed on its own
    line after the help text.  The help text is the module's __doc__ with
    its %(...)s placeholders filled in from the module globals.
    """
    help_text = __doc__ % globals()
    print(help_text)
    if msg:
        print(msg)
    sys.exit(code)
-
-
-
# Escape table indexed by character code (0..255).  It is empty until
# make_escapes() has been called.
escapes = []


def make_escapes(pass_iso8859):
    """(Re)build the module-level `escapes` table.

    After the call, escapes[n] is the text to emit for the character with
    code n, for n in 0..255.

    pass_iso8859 -- when true, codes 160..254 (those whose low seven bits
        fall in the printable range 32..126) are passed through unchanged;
        when false, every code outside 32..126 becomes a three-digit octal
        escape.

    Backslash, tab, carriage return, newline and the double quote always
    get their C-style two-character escapes.

    The table is rebuilt from scratch on every call, so calling this again
    with a different argument replaces the previous table instead of
    appending a second one behind it.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through, so that e.g. a msgid
        # containing o-umlaut (code 0xF6) is written as that character and
        # not as the octal escape \366.  Otherwise we escape any character
        # outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the contents in place so that every existing reference to the
    # list sees the new table.
    escapes[:] = table
-
-
def escape(s):
    """Return s with each character replaced by its `escapes` entry.

    The ordinal of every character is used as an index into the
    module-level escapes table, and the looked-up pieces are joined back
    into a single string.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
-
-
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    The text is evaluated with an empty __builtins__ mapping as globals and
    an empty dictionary as locals.
    """
    # NOTE(review): this is still eval().  Emptying __builtins__ narrows
    # what an expression can reach but is not a real sandbox, so only feed
    # it text that is known to be a string literal.
    no_builtins = {'__builtins__': {}}
    no_locals = {}
    return eval(s, no_builtins, no_locals)
-
-
def normalize(s):
    """Render the string s as a quoted .po-file value.

    A string without newlines becomes one double-quoted, escaped string.
    A string containing newlines becomes an empty "" followed by one
    quoted segment per source line, each on its own output line.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline: drop the empty tail that split() produced
        # and put the newline back on the real last line, so that it is
        # escaped along with that line.
        pieces.pop()
        pieces[-1] += '\n'
    quoted = [escape(piece) for piece in pieces]
    # Each segment but the last is closed with an escaped newline and a
    # quote; the next segment then opens on a fresh output line.
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
-
-
-
class TokenEater:
    """Callable token sink that collects strings passed to keyword calls.

    An instance is meant to be handed to tokenize as the per-token
    callback.  It runs a three-state machine: wait for a NAME token listed
    in options.keywords, require the very next token to be '(', then gather
    every STRING token until a ')' token arrives.  The gathered strings are
    concatenated into one message and recorded together with the current
    file name and the line number of the opening parenthesis.

    The options object must provide: keywords, toexclude, writelocations,
    locationstyle, width, and the constants GNU and SOLARIS.
    """

    def __init__(self, options):
        self.__options = options
        # Maps message text -> list of (filename, lineno) occurrences.
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Name of the file being scanned; None until set_filename() is
        # called, so that recording a message never fails on a missing
        # attribute.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state, passing the token's start row
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for one of the marker keywords."""
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        """A keyword was just seen: it only counts if '(' follows directly."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather strings until the first ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = EMPTYSTRING.join(self.__data)
                if msg not in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    linenos = self.__messages.get(msg)
                    if linenos is None:
                        self.__messages[msg] = [entry]
                    else:
                        linenos.append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def set_filename(self, filename):
        """Set the file name recorded with subsequently found messages."""
        self.__curfile = filename

    def write(self, fp):
        """Write the header and all collected messages to fp.

        fp only needs a write() method.  Output goes straight to fp;
        sys.stdout is neither replaced nor reset, so any redirection the
        caller has installed stays in place.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        header = pot_header % {'time': timestamp, 'version': __version__}
        emit(header + '\n')
        for k, v in self.__messages.items():
            if not options.writelocations:
                pass
            # location comments are different b/w Solaris and GNU:
            elif options.locationstyle == options.SOLARIS:
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    emit(_('# File: %(filename)s, line: %(lineno)d') % d
                         + '\n')
            elif options.locationstyle == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceed 'options.width'
                locline = '#:'
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        emit(locline + '\n')
                        locline = "#:" + s
                if len(locline) > 2:
                    emit(locline + '\n')
            # TBD: sorting, normalizing
            emit('msgid ' + normalize(k) + '\n')
            # The entry ends with a blank separator line.
            emit('msgstr ""\n\n')
-
-
def main():
    """Command-line entry point.

    Parses sys.argv, builds the escape table, feeds every input file (or
    standard input for '-') through a TokenEater, and writes the collected
    messages to the selected output (standard output for '-').

    Exits through usage() on bad options and with status 1 when the
    exclude file cannot be read.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:Ehk:Kno:p:S:Vvw:x:',
            # 'default-domain=' needs the trailing '=' so that the long
            # form accepts its argument just like -d does.
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             ])
    except getopt.error:
        # The exception object is fetched through sys.exc_info() so that
        # this clause parses on old and new interpreters alike.
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg

    # calculate escapes.  -E/--escape asks for non-ASCII characters to be
    # replaced by octal escapes, which is the opposite of letting iso-8859
    # characters pass through, hence the negation.
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # Drop each line's terminator: collected messages are
                # compared against these entries verbatim, and an entry
                # still carrying its newline would never match.
                options.toexclude = []
                for line in fp.readlines():
                    if line[-1:] == '\n':
                        line = line[:-1]
                    options.toexclude.append(line)
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_("Can't read --exclude-file: %s") %
                             options.excludefilename + '\n')
            sys.exit(1)
    else:
        options.toexclude = []

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            tokenize.tokenize(fp.readline, eater)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
-
-
# Run the extractor only when this file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # (a marked unicode literal; presumably here so that running the tool
    # on its own source has one to extract -- see "for selftesting" above)
    _(u'a unicode string')
-