From 4ad12dc3188b508eca1299213cd1e455589d19db Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Thu, 19 Jan 2012 18:51:33 +0000
Subject: [PATCH] ICU-8995 merge idna2nrm.py into preparseucd.py

X-SVN-Rev: 31229
---
 tools/unicode/py/idna2nrm.py    | 91 -------------------------------
 tools/unicode/py/preparseucd.py | 96 ++++++++++++++++++++++++++++++---
 2 files changed, 90 insertions(+), 97 deletions(-)
 delete mode 100755 tools/unicode/py/idna2nrm.py

diff --git a/tools/unicode/py/idna2nrm.py b/tools/unicode/py/idna2nrm.py
deleted file mode 100755
index e1772b428a1..00000000000
--- a/tools/unicode/py/idna2nrm.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/python2.4
-#   Copyright (C) 2010-2011, International Business Machines
-#   Corporation and others.  All Rights Reserved.
-#
-#   file name:  idna2nrm.py
-#   encoding:   US-ASCII
-#   tab size:   8 (not used)
-#   indentation:4
-#
-#   created on: 2010jan28
-#   created by: Markus W. Scherer
-
-"""Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
-
-__author__ = "Markus Scherer"
-
-import re
-
-replacements = [
-  # Several versions of avoiding circular FFFD>FFFD mappings,
-  # depending on the version of the input file.
-  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
-  (re.compile(r"\.\.FFFD"), "..FFFC"),
-  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
-  # Since we switch between checking and not checking for STD3 character
-  # restrictions at runtime, checking the non-LDH ASCII characters in code,
-  # we treat these values here like their regular siblings.
-  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
-  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
-  # For UTS #46, we do not care about "not valid in IDNA2008".
-  (re.compile(r"; *; NV8 +"), ""),
-  # Normal transformations.
-  (re.compile(r"; disallowed"), ">FFFD"),
-  (re.compile(r"; ignored"), ">"),
-  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
-  (re.compile(r"; mapped +; "), ">"),
-  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
-]
-
-in_file = open("IdnaMappingTable.txt", "r")
-out_file = open("uts46.txt", "w")
-
-out_file.write("# Original file:\n")
-for line in in_file:
-  orig_line = line
-  if line.startswith("# For documentation, see"):
-    out_file.write(line)
-    out_file.write(r"""
-# ================================================
-# This file has been reformatted into syntax for the
-# gennorm2 Normalizer2 data generator tool.
-#
-# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
-# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
-# "disallowed" lines map to U+FFFD.
-# "ignored" lines map to an empty string.
-#
-# Characters disallowed under STD3 rules are treated as valid or mapped;
-# they are handled in code.
-# Deviation characters are also handled in code.
-#
-# Use this file as the second gennorm2 input file after nfc.txt.
-# ================================================
-""")
-    continue
-  if line[0] in "#\r\n":
-    out_file.write(line)
-    continue
-  for rep in replacements: line = rep[0].sub(rep[1], line)
-  # Align inline comments at column 40.
-  comment_pos = line.find("#", 1)
-  if comment_pos < 40:
-    line = line[:comment_pos] + ((40 - comment_pos) * ' ') + line[comment_pos:]
-  elif comment_pos > 40:
-    space_pos = comment_pos
-    while space_pos > 0 and line[space_pos - 1] == ' ':
-      space_pos = space_pos - 1
-    if space_pos < 40:
-      # Fewer than 40 characters before the comment:
-      # Align comments at column 40.
-      line = line[:40] + line[comment_pos:]
-    else:
-      # 40 or more characters before the comment:
-      # Keep one space between contents and comment.
-      line = line[:space_pos] + " " + line[comment_pos:]
-  # Write the modified line.
-  out_file.write(line)
-  if "..FFFF" in orig_line and "..FFFC" in line:
-    out_file.write("FFFE..FFFF    >FFFD\n");
-in_file.close()
-out_file.close()
diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py
index cb44009f9c1..4666000cd08 100755
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -18,12 +18,12 @@
 # Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
 #
 # Invoke with three command-line parameters:
-# 1. source folder with UCD files
+# 1. source folder with UCD & idna files
 # 2. ICU source root folder
 # 3. ICU tools root folder
 #
 # Sample invocation:
-#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20111205mod/ucd ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
+#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
 
 import array
 import bisect
@@ -1288,6 +1288,87 @@ def WriteNorm2(path):
   WriteNorm2NFKCTextFile(path)
   WriteNorm2NFKC_CFTextFile(path)
 
+# UTS #46 Normalizer2 input file ------------------------------------------- ***
+
+_idna_replacements = [
+  # (compiled pattern, replacement) pairs. Several versions of avoiding
+  # circular FFFD>FFFD mappings, depending on the version of the input file.
+  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
+  (re.compile(r"\.\.FFFD"), "..FFFC"),
+  (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),  # raw string: \1 is group 1
+  # Since we switch between checking and not checking for STD3 character
+  # restrictions at runtime, checking the non-LDH ASCII characters in code,
+  # we treat these values here like their regular siblings.
+  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
+  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
+  # For UTS #46, we do not care about "not valid in IDNA2008".
+  (re.compile(r"; *; NV8 +"), ""),
+  # Normal transformations.
+  (re.compile(r"; disallowed"), ">FFFD"),
+  (re.compile(r"; ignored"), ">"),
+  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
+  (re.compile(r"; mapped +; "), ">"),
+  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
+]
+
+def IdnaToUTS46TextFile(s, t):
+  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.
+
+  Reads s, writes "uts46.txt" into the folder of t, and returns that path.
+  """
+  # Different input/output file names.
+  dest_path = os.path.dirname(t)
+  t = os.path.join(dest_path, "uts46.txt")
+  # TODO: With Python 2.7+, combine the two with statements into one.
+  with open(s, "r") as in_file:
+    with open(t, "w") as out_file:
+      out_file.write("# Original file:\n")
+      for line in in_file:
+        orig_line = line
+        if line.startswith("# For documentation, see"):
+          out_file.write(line)
+          out_file.write(r"""
+# ================================================
+# This file has been reformatted into syntax for the
+# gennorm2 Normalizer2 data generator tool.
+#
+# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
+# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
+# "disallowed" lines map to U+FFFD.
+# "ignored" lines map to an empty string.
+#
+# Characters disallowed under STD3 rules are treated as valid or mapped;
+# they are handled in code.
+# Deviation characters are also handled in code.
+#
+# Use this file as the second gennorm2 input file after nfc.txt.
+# ================================================
+""")
+          continue
+        if line[0] in "#\r\n":
+          out_file.write(line)
+          continue
+        for pattern, replacement in _idna_replacements:
+          line = pattern.sub(replacement, line)
+        # Align inline comments at column 40.
+        # find() returns -1 when there is no inline comment; leave such lines as is.
+        comment_pos = line.find("#", 1)
+        if 0 <= comment_pos < 40:
+          line = line[:comment_pos].ljust(40) + line[comment_pos:]
+        elif comment_pos > 40:
+          space_pos = len(line[:comment_pos].rstrip(' '))
+          if space_pos < 40:
+            # Fewer than 40 characters before the comment: align it at column 40.
+            line = line[:40] + line[comment_pos:]
+          else:
+            # 40 or more characters before: keep one space before the comment.
+            line = line[:space_pos] + " " + line[comment_pos:]
+        # Write the modified line.
+        out_file.write(line)
+        if "..FFFF" in orig_line and "..FFFC" in line:
+          out_file.write("FFFE..FFFF    >FFFD\n")
+  return t
+
 # Preprocessing ------------------------------------------------------------ ***
 
 _strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
@@ -1431,7 +1512,9 @@ _files = {
   "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
   "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
   "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
-  "WordBreakTest.txt": (PrependBOM, "testdata")
+  "WordBreakTest.txt": (PrependBOM, "testdata"),
+  # From www.unicode.org/Public/idna/<version>/
+  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
 }
 
 # List of lists of files to be parsed in order.
@@ -1447,19 +1530,20 @@ _file_version_re = re.compile("([a-zA-Z0-9]+)" +
 
 def PreprocessFiles(source_files, icu_src_root):
   unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
+  norm2_path = os.path.join(unidata_path, "norm2")
   testdata_path = os.path.join(icu_src_root, "source", "test", "testdata")
   folder_to_path = {
     "unidata": unidata_path,
+    "norm2": norm2_path,
     "testdata": testdata_path
   }
   files_processed = set()
   for source_file in source_files:
     basename = os.path.basename(source_file)
     match = _file_version_re.match(basename)
-    if match:
-      basename = match.group(1) + match.group(2)
-      print "Preprocessing %s" % basename
+    if match: basename = match.group(1) + match.group(2)
     if basename in _files:
+      print "Preprocessing %s" % basename
       if basename in files_processed:
         raise Exception("duplicate file basename %s!" % basename)
       files_processed.add(basename)
-- 
2.40.0