dupeguru/hscommon/pygettext.py

381 lines
12 KiB
Python

# This module was taken from CPython's Tools/i18n and dirtily hacked to bypass the need for cmdline
# invocation.
# Originally written by Barry Warsaw <barry@zope.com>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#
import os
import importlib.machinery
import importlib.util
import sys
import glob
import token
import tokenize
__version__ = "1.5"
default_keywords = ["_"]
DEFAULTKEYWORDS = ", ".join(default_keywords)
EMPTYSTRING = ""
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = """
msgid ""
msgstr ""
"Content-Type: text/plain; charset=utf-8\\n"
"Content-Transfer-Encoding: utf-8\\n"
"""
def usage(code, msg=""):
print(__doc__ % globals(), file=sys.stderr)
if msg:
print(msg, file=sys.stderr)
sys.exit(code)
escapes = []
def make_escapes(pass_iso8859):
global escapes
if pass_iso8859:
# Allow iso-8859 characters to pass through so that e.g. 'msgid
# "H?he"' would result not result in 'msgid "H\366he"'. Otherwise we
# escape any character outside the 32..126 range.
mod = 128
else:
mod = 256
for i in range(256):
if 32 <= (i % mod) <= 126:
escapes.append(chr(i))
else:
escapes.append("\\%03o" % i)
escapes[ord("\\")] = "\\\\"
escapes[ord("\t")] = "\\t"
escapes[ord("\r")] = "\\r"
escapes[ord("\n")] = "\\n"
escapes[ord('"')] = '\\"'
def escape(s):
global escapes
s = list(s)
for i in range(len(s)):
s[i] = escapes[ord(s[i])]
return EMPTYSTRING.join(s)
def safe_eval(s):
# unwrap quotes, safely
return eval(s, {"__builtins__": {}}, {})
def normalize(s):
# This converts the various Python string types into a format that is
# appropriate for .po files, namely much closer to C style.
lines = s.split("\n")
if len(lines) == 1:
s = '"' + escape(s) + '"'
else:
if not lines[-1]:
del lines[-1]
lines[-1] = lines[-1] + "\n"
for i in range(len(lines)):
lines[i] = escape(lines[i])
lineterm = '\\n"\n"'
s = '""\n"' + lineterm.join(lines) + '"'
return s
def containsAny(str, set):
"""Check whether 'str' contains ANY of the chars in 'set'"""
return 1 in [c in str for c in set]
def _visit_pyfiles(list, dirname, names):
"""Helper for getFilesForName()."""
# get extension for python source files
if "_py_ext" not in globals():
global _py_ext
_py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
# don't recurse into CVS directories
if "CVS" in names:
names.remove("CVS")
# add all *.py files to list
list.extend([os.path.join(dirname, file) for file in names if os.path.splitext(file)[1] == _py_ext])
def getFilesForName(name):
"""Get a list of module files for a filename, a module or package name,
or a directory.
"""
if not os.path.exists(name):
# check for glob chars
if containsAny(name, "*?[]"):
files = glob.glob(name)
file_list = []
for file in files:
file_list.extend(getFilesForName(file))
return file_list
# try to find module or package
try:
spec = importlib.util.find_spec(name)
name = spec.origin
except ImportError:
name = None
if not name:
return []
if os.path.isdir(name):
# find all python files in directory
file_list = []
os.walk(name, _visit_pyfiles, file_list)
return file_list
elif os.path.exists(name):
# a single file
return [name]
return []
class TokenEater:
def __init__(self, options):
self.__options = options
self.__messages = {}
self.__state = self.__waiting
self.__data = []
self.__lineno = -1
self.__freshmodule = 1
self.__curfile = None
def __call__(self, ttype, tstring, stup, etup, line):
# dispatch
# import token
# print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
# 'tstring:', tstring
self.__state(ttype, tstring, stup[0])
def __waiting(self, ttype, tstring, lineno):
opts = self.__options
# Do docstring extractions, if enabled
if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
# module docstring?
if self.__freshmodule:
if ttype == tokenize.STRING:
self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
self.__freshmodule = 0
elif ttype not in (tokenize.COMMENT, tokenize.NL):
self.__freshmodule = 0
return
# class docstring?
if ttype == tokenize.NAME and tstring in ("class", "def"):
self.__state = self.__suiteseen
return
if ttype == tokenize.NAME and tstring in opts.keywords:
self.__state = self.__keywordseen
def __suiteseen(self, ttype, tstring, lineno):
# ignore anything until we see the colon
if ttype == tokenize.OP and tstring == ":":
self.__state = self.__suitedocstring
def __suitedocstring(self, ttype, tstring, lineno):
# ignore any intervening noise
if ttype == tokenize.STRING:
self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
self.__state = self.__waiting
elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, tokenize.COMMENT):
# there was no class docstring
self.__state = self.__waiting
def __keywordseen(self, ttype, tstring, lineno):
if ttype == tokenize.OP and tstring == "(":
self.__data = []
self.__lineno = lineno
self.__state = self.__openseen
else:
self.__state = self.__waiting
def __openseen(self, ttype, tstring, lineno):
if ttype == tokenize.OP and tstring == ")":
# We've seen the last of the translatable strings. Record the
# line number of the first line of the strings and update the list
# of messages seen. Reset state for the next batch. If there
# were no strings inside _(), then just ignore this entry.
if self.__data:
self.__addentry(EMPTYSTRING.join(self.__data))
self.__state = self.__waiting
elif ttype == tokenize.STRING:
self.__data.append(safe_eval(tstring))
elif ttype not in [
tokenize.COMMENT,
token.INDENT,
token.DEDENT,
token.NEWLINE,
tokenize.NL,
]:
# warn if we see anything else than STRING or whitespace
print(
'*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
% {"token": tstring, "file": self.__curfile, "lineno": self.__lineno},
file=sys.stderr,
)
self.__state = self.__waiting
def __addentry(self, msg, lineno=None, isdocstring=0):
if lineno is None:
lineno = self.__lineno
if msg not in self.__options.toexclude:
entry = (self.__curfile, lineno)
self.__messages.setdefault(msg, {})[entry] = isdocstring
def set_filename(self, filename):
self.__curfile = filename
self.__freshmodule = 1
def write(self, fp):
options = self.__options
# The time stamp in the header doesn't have the same format as that
# generated by xgettext...
print(pot_header, file=fp)
# Sort the entries. First sort each particular entry's keys, then
# sort all the entries by their first item.
reverse = {}
for k, v in self.__messages.items():
keys = sorted(v.keys())
reverse.setdefault(tuple(keys), []).append((k, v))
rkeys = sorted(reverse.keys())
for rkey in rkeys:
rentries = reverse[rkey]
rentries.sort()
for k, v in rentries:
# If the entry was gleaned out of a docstring, then add a
# comment stating so. This is to aid translators who may wish
# to skip translating some unimportant docstrings.
isdocstring = any(v.values())
# k is the message string, v is a dictionary-set of (filename,
# lineno) tuples. We want to sort the entries in v first by
# file name and then by line number.
v = sorted(v.keys())
if not options.writelocations:
pass
# location comments are different b/w Solaris and GNU:
elif options.locationstyle == options.SOLARIS:
for filename, lineno in v:
d = {"filename": filename, "lineno": lineno}
print("# File: %(filename)s, line: %(lineno)d" % d, file=fp)
elif options.locationstyle == options.GNU:
# fit as many locations on one line, as long as the
# resulting line length doesn't exceeds 'options.width'
locline = "#:"
for filename, lineno in v:
d = {"filename": filename, "lineno": lineno}
s = " %(filename)s:%(lineno)d" % d
if len(locline) + len(s) <= options.width:
locline = locline + s
else:
print(locline, file=fp)
locline = "#:" + s
if len(locline) > 2:
print(locline, file=fp)
if isdocstring:
print("#, docstring", file=fp)
print("msgid", normalize(k), file=fp)
print('msgstr ""\n', file=fp)
def main(source_files, outpath, keywords=None):
global default_keywords
# for holding option values
class Options:
# constants
GNU = 1
SOLARIS = 2
# defaults
extractall = 0 # FIXME: currently this option has no effect at all.
escape = 0
keywords = []
outfile = "messages.pot"
writelocations = 1
locationstyle = GNU
verbose = 0
width = 78
excludefilename = ""
docstrings = 0
nodocstrings = {}
options = Options()
options.outfile = outpath
if keywords:
options.keywords = keywords
# calculate escapes
make_escapes(options.escape)
# calculate all keywords
options.keywords.extend(default_keywords)
# initialize list of strings to exclude
if options.excludefilename:
try:
fp = open(options.excludefilename, encoding="utf-8")
options.toexclude = fp.readlines()
fp.close()
except OSError:
print(
"Can't read --exclude-file: %s" % options.excludefilename,
file=sys.stderr,
)
sys.exit(1)
else:
options.toexclude = []
# slurp through all the files
eater = TokenEater(options)
for filename in source_files:
if options.verbose:
print("Working on %s" % filename)
fp = open(filename, encoding="utf-8")
closep = 1
try:
eater.set_filename(filename)
try:
tokens = tokenize.generate_tokens(fp.readline)
for _token in tokens:
eater(*_token)
except tokenize.TokenError as e:
print(
"%s: %s, line %d, column %d" % (e.args[0], filename, e.args[1][0], e.args[1][1]),
file=sys.stderr,
)
finally:
if closep:
fp.close()
fp = open(options.outfile, "w", encoding="utf-8")
closep = 1
try:
eater.write(fp)
finally:
if closep:
fp.close()