From: Markus Scherer <markus.icu@gmail.com>
Date: Fri, 4 Apr 2014 18:01:48 +0000 (+0000)
Subject: ICU-10821 ppucd.txt: find & write current-year copyright, escape non-ASCII in heading... 
X-Git-Tag: milestone-59-0-1~1962
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2436998dd3eaf1f46a88ff42a98002257aef7494;p=icu

ICU-10821 ppucd.txt: find & write current-year copyright, escape non-ASCII in heading comments

X-SVN-Rev: 35600
---

diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py
index 0dd26ce2f6d..05d08791856 100755
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -40,6 +40,7 @@ import sys
 _ucd_version = "?"
 _copyright = ""
 _terms_of_use = ""
+_current_year = datetime.date.today().strftime("%Y")
 
 # ISO 15924 script codes --------------------------------------------------- ***
 
@@ -349,6 +350,7 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False,
   Strips comments, ignores empty and all-comment lines.
   Returns a tuple (type, line, ...).
   """
+  global _copyright, _terms_of_use
   for line in in_file:
     line = line.strip()
     if not line: continue
@@ -361,6 +363,11 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False,
           yield ("missing", line, fields)
           continue
       if want_comments: yield ("comment", line)
+      if line.startswith("# Copyright"):
+        if not _copyright and _current_year in line:
+          _copyright = line
+      elif "terms of use" in line and not _terms_of_use:
+        _terms_of_use = line
       continue
     comment_start = line.find("#")  # inline comment
     if comment_start >= 0:
@@ -416,7 +423,7 @@ _ucd_version_re = re.compile("# *PropertyAliases" +
                              "\\.txt")
 
 def ParsePropertyAliases(in_file):
-  global _copyright, _terms_of_use, _ucd_version
+  global _ucd_version
   prop_type_nulls = {
     "Binary": False,
     "Catalog": "??",  # Must be specified, e.g., in @missing line.
@@ -432,10 +439,6 @@ def ParsePropertyAliases(in_file):
       match = _ucd_version_re.match(line)
       if match:
         _ucd_version = match.group(1)
-      elif line.startswith("# Copyright"):
-        _copyright = line
-      elif "terms of use" in line:
-        _terms_of_use = line
       else:
         words = line[1:].lstrip().split()
         if len(words) == 2 and words[1] == "Properties":
@@ -1038,9 +1041,28 @@ def WriteFieldsRangeProps(fields, start, end, props, out_file):
   out_file.write("\n")
 
 
+def EscapeNonASCII(s):
+  i = 0
+  while i < len(s):
+    c = ord(s[i])
+    if c <= 0x7f:
+      i = i + 1
+    else:
+      if c <= 0xffff:
+        esc = u"\\u%04X" % c
+      else:
+        esc = u"\\U%08X" % c
+      s = s[:i] + esc + s[i+1:]
+      i = i + len(esc)
+  return s
+
+
 def WritePreparsedUCD(out_file):
+  global _copyright, _terms_of_use
   out_file.write("# Preparsed UCD generated by ICU preparseucd.py\n");
-  if _copyright: out_file.write(_copyright + "\n")
+  if not _copyright:
+    _copyright = "# Copyright (c) 1991-" + _current_year + " Unicode, Inc."
+  out_file.write(_copyright + "\n")
   if _terms_of_use: out_file.write(_terms_of_use + "\n")
   out_file.write("ucd;%s\n\n" % _ucd_version)
   # Sort property names (props keys) by their normalized forms
@@ -1096,7 +1118,7 @@ def WritePreparsedUCD(out_file):
     # NamesList h1 heading (for [most of] a block).
     if i_h1 < len(_h1) and start == _h1[i_h1][0]:
       h = _h1[i_h1]
-      out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], h[2]))
+      out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
       i_h1 += 1
     # Algorithmic-names range.
     if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
@@ -1109,7 +1131,7 @@ def WritePreparsedUCD(out_file):
       i_alg += 1
     # NamesList h2 heading.
     if i_h2 < len(_h2) and start == _h2[i_h2][0]:
-      out_file.write("# %s\n" % (_h2[i_h2][1]))
+      out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
       i_h2 += 1
     # Code point/range data.
     props = _props[i]
@@ -1173,10 +1195,10 @@ def HasOneWayMapping(c):
 
 
 def WriteNorm2NFCTextFile(path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
   with open(os.path.join(path, "nfc.txt"), "w") as out_file:
     out_file.write(
-        """# Copyright (C) 1999-""" + year +
+        """# Copyright (C) 1999-""" + _current_year +
         """, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@@ -1205,10 +1227,10 @@ def WriteNorm2NFCTextFile(path):
 
 
 def WriteNorm2NFKCTextFile(path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
   with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
     out_file.write(
-        """# Copyright (C) 1999-""" + year +
+        """# Copyright (C) 1999-""" + _current_year +
         """, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@@ -1242,11 +1264,11 @@ def WriteNorm2NFKCTextFile(path):
 
 
 def WriteNorm2NFKC_CFTextFile(path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
   with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
     out_file.write(
         """# Unicode Character Database
-# Copyright (c) 1991-""" + year + """ Unicode, Inc.
+# Copyright (c) 1991-""" + _current_year + """ Unicode, Inc.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 # For documentation, see http://www.unicode.org/reports/tr44/
 #
@@ -1944,10 +1966,10 @@ def CheckPNamesData():
 
 
 def WritePNamesDataHeader(out_path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
   with open(out_path, "w") as out_file:
     out_file.write("""/**
- * Copyright (C) 2002-""" + year +
+ * Copyright (C) 2002-""" + _current_year +
 """, International Business Machines Corporation and
  * others. All Rights Reserved.
  *
@@ -2066,8 +2088,11 @@ def main():
   # Optimize block vs. cp properties.
   CompactBlocks()
   # Write the ppucd.txt output file.
+  # Use US-ASCII so that ICU tests can parse it in the platform charset,
+  # which may be EBCDIC.
+  # Fix up non-ASCII data (NamesList.txt headings) to fit.
   out_path = os.path.join(unidata_path, "ppucd.txt")
-  with codecs.open(out_path, "w", "UTF-8") as out_file:
+  with codecs.open(out_path, "w", "US-ASCII") as out_file:
     WritePreparsedUCD(out_file)
     out_file.flush()