ICU-8995 merge idna2nrm.py into preparseucd.py

author Markus Scherer <markus.icu@gmail.com>

Thu, 19 Jan 2012 18:51:33 +0000 (18:51 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Thu, 19 Jan 2012 18:51:33 +0000 (18:51 +0000)
author Markus Scherer <markus.icu@gmail.com>
Thu, 19 Jan 2012 18:51:33 +0000 (18:51 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Thu, 19 Jan 2012 18:51:33 +0000 (18:51 +0000)
diff --git a/tools/unicode/py/idna2nrm.py b/tools/unicode/py/idna2nrm.py

deleted file mode 100755 (executable)

index e1772b4..0000000
--- a/tools/unicode/py/idna2nrm.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/python2.4
-#   Copyright (C) 2010-2011, International Business Machines
-#   Corporation and others.  All Rights Reserved.
-#
-#   file name:  idna2nrm.py
-#   encoding:   US-ASCII
-#   tab size:   8 (not used)
-#   indentation:4
-#
-#   created on: 2010jan28
-#   created by: Markus W. Scherer
-
-"""Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
-
-__author__ = "Markus Scherer"
-
-import re
-
-replacements = [
-  # Several versions of avoiding circular FFFD>FFFD mappings,
-  # depending on the version of the input file.
-  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
-  (re.compile(r"\.\.FFFD"), "..FFFC"),
-  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
-  # Since we switch between checking and not checking for STD3 character
-  # restrictions at runtime, checking the non-LDH ASCII characters in code,
-  # we treat these values here like their regular siblings.
-  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
-  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
-  # For UTS #46, we do not care about "not valid in IDNA2008".
-  (re.compile(r"; *; NV8 +"), ""),
-  # Normal transformations.
-  (re.compile(r"; disallowed"), ">FFFD"),
-  (re.compile(r"; ignored"), ">"),
-  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
-  (re.compile(r"; mapped +; "), ">"),
-  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
-]
-
-in_file = open("IdnaMappingTable.txt", "r")
-out_file = open("uts46.txt", "w")
-
-out_file.write("# Original file:\n")
-for line in in_file:
-  orig_line = line
-  if line.startswith("# For documentation, see"):
-    out_file.write(line)
-    out_file.write(r"""
-# ================================================
-# This file has been reformatted into syntax for the
-# gennorm2 Normalizer2 data generator tool.
-#
-# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
-# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
-# "disallowed" lines map to U+FFFD.
-# "ignored" lines map to an empty string.
-#
-# Characters disallowed under STD3 rules are treated as valid or mapped;
-# they are handled in code.
-# Deviation characters are also handled in code.
-#
-# Use this file as the second gennorm2 input file after nfc.txt.
-# ================================================
-""")
-    continue
-  if line[0] in "#\r\n":
-    out_file.write(line)
-    continue
-  for rep in replacements: line = rep[0].sub(rep[1], line)
-  # Align inline comments at column 40.
-  comment_pos = line.find("#", 1)
-  if comment_pos < 40:
-    line = line[:comment_pos] + ((40 - comment_pos) * ' ') + line[comment_pos:]
-  elif comment_pos > 40:
-    space_pos = comment_pos
-    while space_pos > 0 and line[space_pos - 1] == ' ':
-      space_pos = space_pos - 1
-    if space_pos < 40:
-      # Fewer than 40 characters before the comment:
-      # Align comments at column 40.
-      line = line[:40] + line[comment_pos:]
-    else:
-      # 40 or more characters before the comment:
-      # Keep one space between contents and comment.
-      line = line[:space_pos] + " " + line[comment_pos:]
-  # Write the modified line.
-  out_file.write(line)
-  if "..FFFF" in orig_line and "..FFFC" in line:
-    out_file.write("FFFE..FFFF    >FFFD\n");
-in_file.close()
-out_file.close()
diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py

index cb44009f9c19432d5e6ca4d8de3fb54f1e4385da..4666000cd0827b9d32cd0c01faf5ae8d5cddf777 100755 (executable)
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -18,12 +18,12 @@
  # Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
  #
  # Invoke with three command-line parameters:
-# 1. source folder with UCD files
+# 1. source folder with UCD & idna files
  # 2. ICU source root folder
  # 3. ICU tools root folder
  #
  # Sample invocation:
-#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20111205mod/ucd ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
+#   ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
  
  import array
  import bisect
@@ -1288,6 +1288,87 @@ def WriteNorm2(path):
    WriteNorm2NFKCTextFile(path)
    WriteNorm2NFKC_CFTextFile(path)
  
+# UTS #46 Normalizer2 input file ------------------------------------------- ***
+
+_idna_replacements = [
+  # (compiled pattern, replacement) pairs, applied in list order to every data line.
+  # First: avoid circular FFFD>FFFD mappings (form depends on the input file version).
+  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
+  (re.compile(r"\.\.FFFD"), "..FFFC"),
+  (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
+  # Since we switch between checking and not checking for STD3 character
+  # restrictions at runtime, checking the non-LDH ASCII characters in code,
+  # we treat these values here like their regular siblings.
+  (re.compile(r"^([^;]+)  ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
+  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
+  # For UTS #46, we do not care about "not valid in IDNA2008".
+  (re.compile(r"; *; NV8 +"), ""),
+  # Normal transformations.
+  (re.compile(r"; disallowed"), ">FFFD"),
+  (re.compile(r"; ignored"), ">"),
+  (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
+  (re.compile(r"; mapped +; "), ">"),
+  (re.compile(r"^([^;]+)  ; deviation +; "), r"# \1deviation >")
+]
+
+def IdnaToUTS46TextFile(s, t):
+  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.
+
+  Reads s, writes uts46.txt into the folder of t, and returns that output path.
+  """
+  # Different input/output file names.
+  t = os.path.join(os.path.dirname(t), "uts46.txt")
+  # TODO: With Python 2.7+, combine the two with statements into one.
+  with open(s, "r") as in_file:
+    with open(t, "w") as out_file:
+      out_file.write("# Original file:\n")
+      for line in in_file:
+        orig_line = line
+        if line.startswith("# For documentation, see"):
+          out_file.write(line)
+          out_file.write(r"""
+# ================================================
+# This file has been reformatted into syntax for the
+# gennorm2 Normalizer2 data generator tool.
+#
+# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
+# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
+# "disallowed" lines map to U+FFFD.
+# "ignored" lines map to an empty string.
+#
+# Characters disallowed under STD3 rules are treated as valid or mapped;
+# they are handled in code.
+# Deviation characters are also handled in code.
+#
+# Use this file as the second gennorm2 input file after nfc.txt.
+# ================================================
+""")
+          continue
+        if line[0] in "#\r\n":
+          out_file.write(line)
+          continue
+        for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
+        # Align inline comments at column 40; a line without one (find() == -1) is kept as is.
+        comment_pos = line.find("#", 1)
+        if 0 <= comment_pos < 40:
+          line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
+                  line[comment_pos:])
+        elif comment_pos > 40:
+          space_pos = len(line[:comment_pos].rstrip(' '))
+          if space_pos < 40:
+            # Fewer than 40 characters before the comment:
+            # Align comments at column 40.
+            line = line[:40] + line[comment_pos:]
+          else:
+            # 40 or more characters before the comment:
+            # Keep one space between contents and comment.
+            line = line[:space_pos] + " " + line[comment_pos:]
+        # Write the modified line.
+        out_file.write(line)
+        if "..FFFF" in orig_line and "..FFFC" in line:
+          out_file.write("FFFE..FFFF    >FFFD\n")
+  return t
+
  # Preprocessing ------------------------------------------------------------ ***
  
  _strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
@@ -1431,7 +1512,9 @@ _files = {
    "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
    "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
    "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
-  "WordBreakTest.txt": (PrependBOM, "testdata")
+  "WordBreakTest.txt": (PrependBOM, "testdata"),
+  # From www.unicode.org/Public/idna/<version>/
+  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
  }
  
  # List of lists of files to be parsed in order.
@@ -1447,19 +1530,20 @@ _file_version_re = re.compile("([a-zA-Z0-9]+)" +
  
  def PreprocessFiles(source_files, icu_src_root):
    unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
+  norm2_path = os.path.join(unidata_path, "norm2")
    testdata_path = os.path.join(icu_src_root, "source", "test", "testdata")
    folder_to_path = {
      "unidata": unidata_path,
+    "norm2": norm2_path,
      "testdata": testdata_path
    }
    files_processed = set()
    for source_file in source_files:
      basename = os.path.basename(source_file)
      match = _file_version_re.match(basename)
-    if match:
-      basename = match.group(1) + match.group(2)
-      print "Preprocessing %s" % basename
+    if match: basename = match.group(1) + match.group(2)
      if basename in _files:
+      print "Preprocessing %s" % basename
        if basename in files_processed:
          raise Exception("duplicate file basename %s!" % basename)
        files_processed.add(basename)
author	Markus Scherer <markus.icu@gmail.com>
	Thu, 19 Jan 2012 18:51:33 +0000 (18:51 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Thu, 19 Jan 2012 18:51:33 +0000 (18:51 +0000)
tools/unicode/py/idna2nrm.py	[deleted file]	patch \| blob \| history
tools/unicode/py/preparseucd.py		patch \| blob \| history