+++ /dev/null
-#!/usr/bin/python2.4
-# Copyright (C) 2010-2011, International Business Machines
-# Corporation and others. All Rights Reserved.
-#
-# file name: idna2nrm.py
-# encoding: US-ASCII
-# tab size: 8 (not used)
-# indentation:4
-#
-# created on: 2010jan28
-# created by: Markus W. Scherer
-
-"""Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
-
-__author__ = "Markus Scherer"
-
-import re
-
-replacements = [
- # Several versions of avoiding circular FFFD>FFFD mappings,
- # depending on the version of the input file.
- (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
- (re.compile(r"\.\.FFFD"), "..FFFC"),
- (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
- # Since we switch between checking and not checking for STD3 character
- # restrictions at runtime, checking the non-LDH ASCII characters in code,
- # we treat these values here like their regular siblings.
- (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
- (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
- # For UTS #46, we do not care about "not valid in IDNA2008".
- (re.compile(r"; *; NV8 +"), ""),
- # Normal transformations.
- (re.compile(r"; disallowed"), ">FFFD"),
- (re.compile(r"; ignored"), ">"),
- (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
- (re.compile(r"; mapped +; "), ">"),
- (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >")
-]
-
-in_file = open("IdnaMappingTable.txt", "r")
-out_file = open("uts46.txt", "w")
-
-out_file.write("# Original file:\n")
-for line in in_file:
- orig_line = line
- if line.startswith("# For documentation, see"):
- out_file.write(line)
- out_file.write(r"""
-# ================================================
-# This file has been reformatted into syntax for the
-# gennorm2 Normalizer2 data generator tool.
-#
-# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
-# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
-# "disallowed" lines map to U+FFFD.
-# "ignored" lines map to an empty string.
-#
-# Characters disallowed under STD3 rules are treated as valid or mapped;
-# they are handled in code.
-# Deviation characters are also handled in code.
-#
-# Use this file as the second gennorm2 input file after nfc.txt.
-# ================================================
-""")
- continue
- if line[0] in "#\r\n":
- out_file.write(line)
- continue
- for rep in replacements: line = rep[0].sub(rep[1], line)
- # Align inline comments at column 40.
- comment_pos = line.find("#", 1)
- if comment_pos < 40:
- line = line[:comment_pos] + ((40 - comment_pos) * ' ') + line[comment_pos:]
- elif comment_pos > 40:
- space_pos = comment_pos
- while space_pos > 0 and line[space_pos - 1] == ' ':
- space_pos = space_pos - 1
- if space_pos < 40:
- # Fewer than 40 characters before the comment:
- # Align comments at column 40.
- line = line[:40] + line[comment_pos:]
- else:
- # 40 or more characters before the comment:
- # Keep one space between contents and comment.
- line = line[:space_pos] + " " + line[comment_pos:]
- # Write the modified line.
- out_file.write(line)
- if "..FFFF" in orig_line and "..FFFC" in line:
- out_file.write("FFFE..FFFF >FFFD\n");
-in_file.close()
-out_file.close()
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with three command-line parameters:
-# 1. source folder with UCD files
+# 1. source folder with UCD & idna files
# 2. ICU source root folder
# 3. ICU tools root folder
#
# Sample invocation:
-# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20111205mod/ucd ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
+# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
import array
import bisect
WriteNorm2NFKCTextFile(path)
WriteNorm2NFKC_CFTextFile(path)
+# UTS #46 Normalizer2 input file ------------------------------------------- ***
+
+_idna_replacements = [
+  # List of (compiled regex, replacement) pairs.
+  # IdnaToUTS46TextFile() applies them to each data line in list order,
+  # so the more specific patterns are listed before the general ones.
+  #
+  # Several versions of avoiding circular FFFD>FFFD mappings,
+  # depending on the version of the input file.
+  (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
+  (re.compile(r"\.\.FFFD"), "..FFFC"),
+  # The replacement must be a raw string: in a normal string literal,
+  # "\1" is the control character U+0001, not a reference to group 1.
+  (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
+  # Since we switch between checking and not checking for STD3 character
+  # restrictions at runtime, checking the non-LDH ASCII characters in code,
+  # we treat these values here like their regular siblings.
+  (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
+  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
+  # For UTS #46, we do not care about "not valid in IDNA2008".
+  (re.compile(r"; *; NV8 +"), ""),
+  # Normal transformations.
+  (re.compile(r"; disallowed"), ">FFFD"),
+  (re.compile(r"; ignored"), ">"),
+  (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
+  (re.compile(r"; mapped +; "), ">"),
+  (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >")
+]
+
+def IdnaToUTS46TextFile(s, t):
+  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.
+
+  Comment lines and empty lines are copied unchanged. Every other line is
+  rewritten with the _idna_replacements regular expressions, and its inline
+  comment is aligned at column 40 where possible.
+
+  Args:
+    s: Path of the IdnaMappingTable.txt input file.
+    t: Proposed output path. Only its directory is used; the output file
+       is always named "uts46.txt" in that directory.
+
+  Returns:
+    The path of the uts46.txt file that was written.
+  """
+  # Different input/output file names.
+  dest_path = os.path.dirname(t)
+  t = os.path.join(dest_path, "uts46.txt")
+  # TODO: With Python 2.7+, combine the two with statements into one.
+  with open(s, "r") as in_file:
+    with open(t, "w") as out_file:
+      out_file.write("# Original file:\n")
+      for line in in_file:
+        orig_line = line
+        if line.startswith("# For documentation, see"):
+          # Insert the explanation of the reformatting right after
+          # the original pointer to the documentation.
+          out_file.write(line)
+          out_file.write(r"""
+# ================================================
+# This file has been reformatted into syntax for the
+# gennorm2 Normalizer2 data generator tool.
+#
+# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
+# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
+# "disallowed" lines map to U+FFFD.
+# "ignored" lines map to an empty string.
+#
+# Characters disallowed under STD3 rules are treated as valid or mapped;
+# they are handled in code.
+# Deviation characters are also handled in code.
+#
+# Use this file as the second gennorm2 input file after nfc.txt.
+# ================================================
+""")
+          continue
+        if line[0] in "#\r\n":
+          # Comment-only and empty lines are copied verbatim.
+          out_file.write(line)
+          continue
+        for pattern, replacement in _idna_replacements:
+          line = pattern.sub(replacement, line)
+        # Align inline comments at column 40.
+        comment_pos = line.find("#", 1)
+        if 0 <= comment_pos < 40:
+          # find() returns -1 when there is no inline comment;
+          # such lines must not be padded.
+          line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
+                  line[comment_pos:])
+        elif comment_pos > 40:
+          # Find the end of the contents before the comment.
+          space_pos = comment_pos
+          while space_pos > 0 and line[space_pos - 1] == ' ':
+            space_pos = space_pos - 1
+          if space_pos < 40:
+            # Fewer than 40 characters before the comment:
+            # Align comments at column 40.
+            line = line[:40] + line[comment_pos:]
+          else:
+            # 40 or more characters before the comment:
+            # Keep one space between contents and comment.
+            line = line[:space_pos] + " " + line[comment_pos:]
+        # Write the modified line.
+        out_file.write(line)
+        if "..FFFF" in orig_line and "..FFFC" in line:
+          # The range was cut back to end at FFFC (see _idna_replacements);
+          # map the remaining FFFE..FFFF explicitly.
+          out_file.write("FFFE..FFFF >FFFD\n")
+  return t
+
# Preprocessing ------------------------------------------------------------ ***
_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
"SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
"UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
"WordBreakProperty.txt": (DontCopy, ParseWordBreak),
- "WordBreakTest.txt": (PrependBOM, "testdata")
+ "WordBreakTest.txt": (PrependBOM, "testdata"),
+ # From www.unicode.org/Public/idna/<version>/
+ "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
}
# List of lists of files to be parsed in order.
def PreprocessFiles(source_files, icu_src_root):
unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
+ norm2_path = os.path.join(unidata_path, "norm2")
testdata_path = os.path.join(icu_src_root, "source", "test", "testdata")
folder_to_path = {
"unidata": unidata_path,
+ "norm2": norm2_path,
"testdata": testdata_path
}
files_processed = set()
for source_file in source_files:
basename = os.path.basename(source_file)
match = _file_version_re.match(basename)
- if match:
- basename = match.group(1) + match.group(2)
- print "Preprocessing %s" % basename
+ if match: basename = match.group(1) + match.group(2)
if basename in _files:
+ print "Preprocessing %s" % basename
if basename in files_processed:
raise Exception("duplicate file basename %s!" % basename)
files_processed.add(basename)