mirror of
https://github.com/arsenetar/dupeguru.git
synced 2024-11-12 18:49:03 +00:00
381 lines
12 KiB
Python
381 lines
12 KiB
Python
# This module was taken from CPython's Tools/i18n and dirtily hacked to bypass the need for cmdline
|
|
# invocation.
|
|
|
|
# Originally written by Barry Warsaw <barry@zope.com>
|
|
#
|
|
# Minimally patched to make it even more xgettext compatible
|
|
# by Peter Funk <pf@artcom-gmbh.de>
|
|
#
|
|
# 2002-11-22 Jürgen Hermann <jh@web.de>
|
|
# Added checks that _() only contains string literals, and
|
|
# command line args are resolved to module lists, i.e. you
|
|
# can now pass a filename, a module or package name, or a
|
|
# directory (including globbing chars, important for Win32).
|
|
# Made docstring fit in 80 chars wide displays using pydoc.
|
|
#
|
|
|
|
import os
|
|
import importlib.machinery
|
|
import importlib.util
|
|
import sys
|
|
import glob
|
|
import token
|
|
import tokenize
|
|
|
|
__version__ = "1.5"

# Keywords that mark a translatable call when the caller supplies none.
default_keywords = ["_"]
# Comma-separated rendering of the default keywords.
DEFAULTKEYWORDS = ", ".join(default_keywords)

# Separator used when joining string fragments.
EMPTYSTRING = ""

# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = """
msgid ""
msgstr ""
"Content-Type: text/plain; charset=utf-8\\n"
"Content-Transfer-Encoding: utf-8\\n"
"""
|
|
|
|
|
|
def usage(code, msg=""):
    """Print the module help text and an optional message, then exit.

    Args:
        code: Exit status passed to ``sys.exit``.
        msg: Optional extra message printed to stderr after the help text.

    Raises:
        SystemExit: Always, carrying ``code``.
    """
    # The module may have no docstring, in which case ``__doc__`` is None and
    # ``None % globals()`` would raise TypeError before sys.exit() is reached.
    if __doc__:
        print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)
|
|
|
|
|
|
# Number of code points covered by the escape table built by make_escapes().
_ESCAPE_TABLE_SIZE = 256

# Escape table: the index is a code point below 256, the value is the text
# written to the .po file for that character. Filled in by make_escapes().
escapes = []


def make_escapes(pass_iso8859):
    """(Re)build the module-level ``escapes`` table.

    Args:
        pass_iso8859: When true, characters whose code point modulo 128 lies
            in 32..126 are kept as-is; when false, only code points 32..126
            are kept. Every other code point below 256 becomes a three-digit
            octal escape.

    The table is rebuilt from scratch and swapped in place, so calling this
    function repeatedly neither grows the list nor keeps entries from an
    earlier call made with a different flag.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = [
        chr(i) if 32 <= (i % mod) <= 126 else "\\%03o" % i
        for i in range(_ESCAPE_TABLE_SIZE)
    ]
    # Characters with a dedicated C-style spelling override the generic rule.
    table[ord("\\")] = "\\\\"
    table[ord("\t")] = "\\t"
    table[ord("\r")] = "\\r"
    table[ord("\n")] = "\\n"
    table[ord('"')] = '\\"'
    # Mutate in place so every existing reference to the list sees the update.
    escapes[:] = table


def escape(s):
    """Return ``s`` with each character replaced by its .po escape.

    Characters below code point 256 are looked up in ``escapes`` (so
    make_escapes() must have been called first). Characters at or above 256
    have no table entry and are passed through unchanged instead of raising
    IndexError.
    """
    return "".join(
        escapes[ord(ch)] if ord(ch) < _ESCAPE_TABLE_SIZE else ch for ch in s
    )
|
|
|
|
|
|
def safe_eval(s):
    """Evaluate the source text ``s`` and return the resulting value.

    Used to unwrap the quotes of a string-literal token. The expression is
    evaluated with an empty ``__builtins__`` mapping and empty locals.
    """
    restricted_globals = {"__builtins__": {}}
    empty_locals = {}
    return eval(s, restricted_globals, empty_locals)
|
|
|
|
|
|
def normalize(s):
    """Convert a Python string into the quoted form used in .po files.

    A string without newlines becomes one quoted, escaped line. A string
    with newlines becomes an empty ``""`` line followed by one quoted line
    per source line, each but the last ending in an escaped newline.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    pieces = s.split("\n")
    if len(pieces) == 1:
        return '"' + escape(s) + '"'

    if not pieces[-1]:
        # The text ended with a newline: drop the empty tail and put the
        # newline back on the last real line so it gets escaped with it.
        pieces.pop()
        pieces[-1] += "\n"
    escaped = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
|
|
|
|
|
|
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(ch in str for ch in set)
|
|
|
|
|
|
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName().

    Removes a "CVS" entry from ``names`` (in place) and appends to ``list``
    the path, joined onto ``dirname``, of every remaining name that carries
    the Python source suffix.
    """
    global _py_ext
    # Look up the extension for python source files once and cache it.
    if "_py_ext" not in globals():
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]

    # don't recurse into CVS directories
    if "CVS" in names:
        names.remove("CVS")

    # add all *.py files to list
    matches = [
        os.path.join(dirname, entry)
        for entry in names
        if os.path.splitext(entry)[1] == _py_ext
    ]
    list.extend(matches)
|
|
|
|
|
|
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Args:
        name: An existing file or directory path, a glob pattern, or an
            importable module name.

    Returns:
        A list of file paths. For a directory, every file below it that
        carries the Python source suffix, skipping ``CVS`` directories. For
        a file, a one-element list. An empty list when nothing matches.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            file_list = []
            for file in glob.glob(name):
                file_list.extend(getFilesForName(file))
            return file_list

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            spec = None
        # find_spec() returns None, rather than raising, for a module it
        # cannot locate, so guard before reading ``origin``.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        file_list = []
        for dirpath, dirnames, filenames in os.walk(name):
            # don't recurse into CVS directories; pruning ``dirnames`` in
            # place is how os.walk is told to skip a subtree
            if "CVS" in dirnames:
                dirnames.remove("CVS")
            file_list.extend(
                os.path.join(dirpath, filename)
                for filename in filenames
                if os.path.splitext(filename)[1] == py_ext
            )
        return file_list
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
|
|
|
|
|
|
class TokenEater:
    """State machine that collects translatable strings from Python tokens.

    Call the instance with each 5-tuple produced by
    ``tokenize.generate_tokens``, call ``set_filename`` before each file,
    and call ``write`` at the end to emit the collected messages.
    """

    def __init__(self, options):
        self.__options = options
        # message text -> {(filename, lineno): isdocstring}
        self.__messages = {}
        # fragments of the string literals seen inside the current call
        self.__data = []
        self.__lineno = -1
        self.__curfile = None
        self.__freshmodule = 1
        self.__state = self.__waiting

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state handler; only the start row of the
        # token position is passed on.
        handler = self.__state
        handler(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        is_name = ttype == tokenize.NAME
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            if self.__freshmodule:
                # module docstring?
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                # Anything but a comment or blank line (the string itself
                # included) ends the "fresh module" phase.
                if ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class docstring?
            if is_name and tstring in ("class", "def"):
                self.__state = self.__suiteseen
                return
        if is_name and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ":":
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype in (tokenize.NEWLINE, tokenize.INDENT, tokenize.COMMENT):
            return
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
        # Either the docstring was recorded or there was no docstring.
        self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if not (ttype == tokenize.OP and tstring == "("):
            self.__state = self.__waiting
            return
        self.__data = []
        self.__lineno = lineno
        self.__state = self.__openseen

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
            return
        if ttype == tokenize.OP and tstring == ")":
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the list
            # of messages seen. Reset state for the next batch. If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
            return
        if ttype in (
            tokenize.COMMENT,
            tokenize.INDENT,
            tokenize.DEDENT,
            tokenize.NEWLINE,
            tokenize.NL,
        ):
            return
        # warn if we see anything else than STRING or whitespace
        details = {"token": tstring, "file": self.__curfile, "lineno": self.__lineno}
        print(
            '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' % details,
            file=sys.stderr,
        )
        self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        if lineno is None:
            lineno = self.__lineno
        if msg in self.__options.toexclude:
            return
        location = (self.__curfile, lineno)
        self.__messages.setdefault(msg, {})[location] = isdocstring

    def set_filename(self, filename):
        self.__curfile = filename
        self.__freshmodule = 1

    def __write_locations(self, fp, locations):
        # Emit the location comment lines for one message, if enabled.
        options = self.__options
        if not options.writelocations:
            return
        # location comments are different b/w Solaris and GNU:
        if options.locationstyle == options.SOLARIS:
            for filename, lineno in locations:
                fields = {"filename": filename, "lineno": lineno}
                print("# File: %(filename)s, line: %(lineno)d" % fields, file=fp)
        elif options.locationstyle == options.GNU:
            # fit as many locations on one line, as long as the
            # resulting line length doesn't exceeds 'options.width'
            current = "#:"
            for filename, lineno in locations:
                fields = {"filename": filename, "lineno": lineno}
                piece = " %(filename)s:%(lineno)d" % fields
                if len(current) + len(piece) <= options.width:
                    current += piece
                else:
                    print(current, file=fp)
                    current = "#:" + piece
            if len(current) > 2:
                print(current, file=fp)

    def write(self, fp):
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print(pot_header, file=fp)
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        grouped = {}
        for message, places in self.__messages.items():
            group_key = tuple(sorted(places.keys()))
            grouped.setdefault(group_key, []).append((message, places))
        for group_key in sorted(grouped.keys()):
            for message, places in sorted(grouped[group_key]):
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(places.values())
                # 'places' maps (filename, lineno) tuples to flags. Sorting
                # the keys orders them by file name and then by line number.
                self.__write_locations(fp, sorted(places.keys()))
                if isdocstring:
                    print("#, docstring", file=fp)
                print("msgid", normalize(message), file=fp)
                print('msgstr ""\n', file=fp)
|
|
|
|
|
|
def main(source_files, outpath, keywords=None):
    """Extract translatable strings from ``source_files`` into a .pot file.

    Args:
        source_files: Iterable of paths of Python source files to scan; each
            is opened as UTF-8.
        outpath: Path of the .pot file to write (UTF-8).
        keywords: Optional list of extra function names that mark a
            translatable call. ``default_keywords`` is always appended. The
            caller's list is not modified.
    """

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outfile = "messages.pot"
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ""
        docstrings = 0
        nodocstrings = {}

    options = Options()
    options.outfile = outpath

    # calculate all keywords; build a new list so that neither the caller's
    # argument nor the class-level default is mutated
    options.keywords = list(keywords or []) + list(default_keywords)

    # calculate escapes
    make_escapes(options.escape)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename, encoding="utf-8") as fp:
                options.toexclude = fp.readlines()
        except OSError:
            print(
                "Can't read --exclude-file: %s" % options.excludefilename,
                file=sys.stderr,
            )
            sys.exit(1)
    else:
        options.toexclude = []

    # slurp through all the files
    eater = TokenEater(options)
    for filename in source_files:
        if options.verbose:
            print("Working on %s" % filename)
        with open(filename, encoding="utf-8") as fp:
            eater.set_filename(filename)
            try:
                for _token in tokenize.generate_tokens(fp.readline):
                    eater(*_token)
            except tokenize.TokenError as e:
                # Report the tokenizer error and carry on with the next file.
                print(
                    "%s: %s, line %d, column %d" % (e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr,
                )

    with open(options.outfile, "w", encoding="utf-8") as fp:
        eater.write(fp)
|