From: Markus Scherer Date: Fri, 4 Apr 2014 18:01:48 +0000 (+0000) Subject: ICU-10821 ppucd.txt: find & write current-year copyright, escape non-ASCII in heading... X-Git-Tag: milestone-59-0-1~1962 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2436998dd3eaf1f46a88ff42a98002257aef7494;p=icu ICU-10821 ppucd.txt: find & write current-year copyright, escape non-ASCII in heading comments X-SVN-Rev: 35600 --- diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py index 0dd26ce2f6d..05d08791856 100755 --- a/tools/unicode/py/preparseucd.py +++ b/tools/unicode/py/preparseucd.py @@ -40,6 +40,7 @@ import sys _ucd_version = "?" _copyright = "" _terms_of_use = "" +_current_year = datetime.date.today().strftime("%Y") # ISO 15924 script codes --------------------------------------------------- *** @@ -349,6 +350,7 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False, Strips comments, ignores empty and all-comment lines. Returns a tuple (type, line, ...). """ + global _copyright, _terms_of_use for line in in_file: line = line.strip() if not line: continue @@ -361,6 +363,11 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False, yield ("missing", line, fields) continue if want_comments: yield ("comment", line) + if line.startswith("# Copyright"): + if not _copyright and _current_year in line: + _copyright = line + elif "terms of use" in line and not _terms_of_use: + _terms_of_use = line continue comment_start = line.find("#") # inline comment if comment_start >= 0: @@ -416,7 +423,7 @@ _ucd_version_re = re.compile("# *PropertyAliases" + "\\.txt") def ParsePropertyAliases(in_file): - global _copyright, _terms_of_use, _ucd_version + global _ucd_version prop_type_nulls = { "Binary": False, "Catalog": "??", # Must be specified, e.g., in @missing line. @@ -432,10 +439,6 @@ def ParsePropertyAliases(in_file): match = _ucd_version_re.match(line) if match: _ucd_version = match.group(1) - elif line.startswith("# Copyright"): - _copyright = line - elif "terms of use" in line: - _terms_of_use = line else: words = line[1:].lstrip().split() if len(words) == 2 and words[1] == "Properties": @@ -1038,9 +1041,28 @@ def WriteFieldsRangeProps(fields, start, end, props, out_file): out_file.write("\n") +def EscapeNonASCII(s): + i = 0 + while i < len(s): + c = ord(s[i]) + if c <= 0x7f: + i = i + 1 + else: + if c <= 0xffff: + esc = u"\\u%04X" % c + else: + esc = u"\\U%08X" % c + s = s[:i] + esc + s[i+1:] + i = i + len(esc) + return s + + def WritePreparsedUCD(out_file): + global _copyright, _terms_of_use out_file.write("# Preparsed UCD generated by ICU preparseucd.py\n"); - if _copyright: out_file.write(_copyright + "\n") + if not _copyright: + _copyright = "# Copyright (c) 1991-" + _current_year + " Unicode, Inc." + out_file.write(_copyright + "\n") if _terms_of_use: out_file.write(_terms_of_use + "\n") out_file.write("ucd;%s\n\n" % _ucd_version) # Sort property names (props keys) by their normalized forms @@ -1096,7 +1118,7 @@ def WritePreparsedUCD(out_file): # NamesList h1 heading (for [most of] a block). if i_h1 < len(_h1) and start == _h1[i_h1][0]: h = _h1[i_h1] - out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], h[2])) + out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2]))) i_h1 += 1 # Algorithmic-names range. if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]: @@ -1109,7 +1131,7 @@ def WritePreparsedUCD(out_file): i_alg += 1 # NamesList h2 heading. if i_h2 < len(_h2) and start == _h2[i_h2][0]: - out_file.write("# %s\n" % (_h2[i_h2][1])) + out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1])) i_h2 += 1 # Code point/range data. props = _props[i] @@ -1173,10 +1195,10 @@ def HasOneWayMapping(c): def WriteNorm2NFCTextFile(path): - year = datetime.date.today().strftime("%Y") + global _current_year with open(os.path.join(path, "nfc.txt"), "w") as out_file: out_file.write( - """# Copyright (C) 1999-""" + year + + """# Copyright (C) 1999-""" + _current_year + """, International Business Machines # Corporation and others. All Rights Reserved. # @@ -1205,10 +1227,10 @@ def WriteNorm2NFCTextFile(path): def WriteNorm2NFKCTextFile(path): - year = datetime.date.today().strftime("%Y") + global _current_year with open(os.path.join(path, "nfkc.txt"), "w") as out_file: out_file.write( - """# Copyright (C) 1999-""" + year + + """# Copyright (C) 1999-""" + _current_year + """, International Business Machines # Corporation and others. All Rights Reserved. # @@ -1242,11 +1264,11 @@ def WriteNorm2NFKCTextFile(path): def WriteNorm2NFKC_CFTextFile(path): - year = datetime.date.today().strftime("%Y") + global _current_year with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file: out_file.write( """# Unicode Character Database -# Copyright (c) 1991-""" + year + """ Unicode, Inc. +# Copyright (c) 1991-""" + _current_year + """ Unicode, Inc. # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ # @@ -1944,10 +1966,10 @@ def CheckPNamesData(): def WritePNamesDataHeader(out_path): - year = datetime.date.today().strftime("%Y") + global _current_year with open(out_path, "w") as out_file: out_file.write("""/** - * Copyright (C) 2002-""" + year + + * Copyright (C) 2002-""" + _current_year + """, International Business Machines Corporation and * others. All Rights Reserved. * @@ -2066,8 +2088,11 @@ def main(): # Optimize block vs. cp properties. CompactBlocks() # Write the ppucd.txt output file. + # Use US-ASCII so that ICU tests can parse it in the platform charset, + # which may be EBCDIC. + # Fix up non-ASCII data (NamesList.txt headings) to fit. out_path = os.path.join(unidata_path, "ppucd.txt") - with codecs.open(out_path, "w", "UTF-8") as out_file: + with codecs.open(out_path, "w", "US-ASCII") as out_file: WritePreparsedUCD(out_file) out_file.flush()