ICU-10821 ppucd.txt: find & write current-year copyright, escape non-ASCII in heading...

author Markus Scherer <markus.icu@gmail.com>

Fri, 4 Apr 2014 18:01:48 +0000 (18:01 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 4 Apr 2014 18:01:48 +0000 (18:01 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 4 Apr 2014 18:01:48 +0000 (18:01 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 4 Apr 2014 18:01:48 +0000 (18:01 +0000)
diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py

index 0dd26ce2f6da384a0ec1d424bc26d6357881eaa7..05d08791856e80c29d67c6d9e755fffa1811ac73 100755 (executable)
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -40,6 +40,7 @@ import sys
  _ucd_version = "?"
  _copyright = ""
  _terms_of_use = ""
+_current_year = datetime.date.today().strftime("%Y")
  
  # ISO 15924 script codes --------------------------------------------------- ***
  
@@ -349,6 +350,7 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False,
    Strips comments, ignores empty and all-comment lines.
    Returns a tuple (type, line, ...).
    """
+  global _copyright, _terms_of_use
    for line in in_file:
      line = line.strip()
      if not line: continue
@@ -361,6 +363,11 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False,
            yield ("missing", line, fields)
            continue
        if want_comments: yield ("comment", line)
+      if line.startswith("# Copyright"):
+        if not _copyright and _current_year in line:
+          _copyright = line
+      elif "terms of use" in line and not _terms_of_use:
+        _terms_of_use = line
        continue
      comment_start = line.find("#")  # inline comment
      if comment_start >= 0:
@@ -416,7 +423,7 @@ _ucd_version_re = re.compile("# *PropertyAliases" +
                               "\\.txt")
  
  def ParsePropertyAliases(in_file):
-  global _copyright, _terms_of_use, _ucd_version
+  global _ucd_version
    prop_type_nulls = {
      "Binary": False,
      "Catalog": "??",  # Must be specified, e.g., in @missing line.
@@ -432,10 +439,6 @@ def ParsePropertyAliases(in_file):
        match = _ucd_version_re.match(line)
        if match:
          _ucd_version = match.group(1)
-      elif line.startswith("# Copyright"):
-        _copyright = line
-      elif "terms of use" in line:
-        _terms_of_use = line
        else:
          words = line[1:].lstrip().split()
          if len(words) == 2 and words[1] == "Properties":
@@ -1038,9 +1041,28 @@ def WriteFieldsRangeProps(fields, start, end, props, out_file):
    out_file.write("\n")
  
  
+def EscapeNonASCII(s):  # Returns s with every character above U+007F replaced by a \uXXXX or \UXXXXXXXX escape.
+  i = 0  # Index of the next character of s to examine.
+  while i < len(s):  # len(s) is re-read on each pass because s grows as escapes are spliced in.
+    c = ord(s[i])
+    if c <= 0x7f:  # ASCII: leave the character unchanged.
+      i = i + 1
+    else:
+      if c <= 0xffff:  # Value fits in four uppercase hex digits.
+        esc = u"\\u%04X" % c
+      else:  # Larger value: eight uppercase hex digits.
+        esc = u"\\U%08X" % c
+      s = s[:i] + esc + s[i+1:]  # Replace the single character at i with its escape text.
+      i = i + len(esc)  # Continue right after the inserted escape, which is all ASCII.
+  return s
+
+
  def WritePreparsedUCD(out_file):
+  global _copyright, _terms_of_use
    out_file.write("# Preparsed UCD generated by ICU preparseucd.py\n");
-  if _copyright: out_file.write(_copyright + "\n")
+  if not _copyright:
+    _copyright = "# Copyright (c) 1991-" + _current_year + " Unicode, Inc."
+  out_file.write(_copyright + "\n")
    if _terms_of_use: out_file.write(_terms_of_use + "\n")
    out_file.write("ucd;%s\n\n" % _ucd_version)
    # Sort property names (props keys) by their normalized forms
@@ -1096,7 +1118,7 @@ def WritePreparsedUCD(out_file):
      # NamesList h1 heading (for [most of] a block).
      if i_h1 < len(_h1) and start == _h1[i_h1][0]:
        h = _h1[i_h1]
-      out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], h[2]))
+      out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], EscapeNonASCII(h[2])))
        i_h1 += 1
      # Algorithmic-names range.
      if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
@@ -1109,7 +1131,7 @@ def WritePreparsedUCD(out_file):
        i_alg += 1
      # NamesList h2 heading.
      if i_h2 < len(_h2) and start == _h2[i_h2][0]:
-      out_file.write("# %s\n" % (_h2[i_h2][1]))
+      out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
        i_h2 += 1
      # Code point/range data.
      props = _props[i]
@@ -1173,10 +1195,10 @@ def HasOneWayMapping(c):
  
  
  def WriteNorm2NFCTextFile(path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
    with open(os.path.join(path, "nfc.txt"), "w") as out_file:
      out_file.write(
-        """# Copyright (C) 1999-""" + year +
+        """# Copyright (C) 1999-""" + _current_year +
          """, International Business Machines
  # Corporation and others.  All Rights Reserved.
  #
@@ -1205,10 +1227,10 @@ def WriteNorm2NFCTextFile(path):
  
  
  def WriteNorm2NFKCTextFile(path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
    with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
      out_file.write(
-        """# Copyright (C) 1999-""" + year +
+        """# Copyright (C) 1999-""" + _current_year +
          """, International Business Machines
  # Corporation and others.  All Rights Reserved.
  #
@@ -1242,11 +1264,11 @@ def WriteNorm2NFKCTextFile(path):
  
  
  def WriteNorm2NFKC_CFTextFile(path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
    with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:
      out_file.write(
          """# Unicode Character Database
-# Copyright (c) 1991-""" + year + """ Unicode, Inc.
+# Copyright (c) 1991-""" + _current_year + """ Unicode, Inc.
  # For terms of use, see http://www.unicode.org/terms_of_use.html
  # For documentation, see http://www.unicode.org/reports/tr44/
  #
@@ -1944,10 +1966,10 @@ def CheckPNamesData():
  
  
  def WritePNamesDataHeader(out_path):
-  year = datetime.date.today().strftime("%Y")
+  global _current_year
    with open(out_path, "w") as out_file:
      out_file.write("""/**
- * Copyright (C) 2002-""" + year +
+ * Copyright (C) 2002-""" + _current_year +
  """, International Business Machines Corporation and
   * others. All Rights Reserved.
   *
@@ -2066,8 +2088,11 @@ def main():
    # Optimize block vs. cp properties.
    CompactBlocks()
    # Write the ppucd.txt output file.
+  # Use US-ASCII so that ICU tests can parse it in the platform charset,
+  # which may be EBCDIC.
+  # Fix up non-ASCII data (NamesList.txt headings) to fit.
    out_path = os.path.join(unidata_path, "ppucd.txt")
-  with codecs.open(out_path, "w", "UTF-8") as out_file:
+  with codecs.open(out_path, "w", "US-ASCII") as out_file:
      WritePreparsedUCD(out_file)
      out_file.flush()
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 4 Apr 2014 18:01:48 +0000 (18:01 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 4 Apr 2014 18:01:48 +0000 (18:01 +0000)