From 4ad12dc3188b508eca1299213cd1e455589d19db Mon Sep 17 00:00:00 2001
From: Markus Scherer
Date: Thu, 19 Jan 2012 18:51:33 +0000
Subject: [PATCH] ICU-8995 merge idna2nrm.py into preparseucd.py

X-SVN-Rev: 31229
---
Review note: this copy differs from the original commit in two added lines
of preparseucd.py. (1) The replacement "\1..FFFC" is now the raw string
r"\1..FFFC", so that \1 is a backreference to group 1 rather than the control
character U+0001. (2) The comment-alignment test is now
"0 <= comment_pos < 40", so a line without an inline "#" (str.find() == -1)
is written unchanged instead of getting spaces spliced in before its last
character. Hunk sizes are unchanged; the blob id on the "index" line of
preparseucd.py no longer matches the patched content.

 tools/unicode/py/idna2nrm.py    | 91 -------------------------------
 tools/unicode/py/preparseucd.py | 96 ++++++++++++++++++++++++++++++---
 2 files changed, 90 insertions(+), 97 deletions(-)
 delete mode 100755 tools/unicode/py/idna2nrm.py

diff --git a/tools/unicode/py/idna2nrm.py b/tools/unicode/py/idna2nrm.py
deleted file mode 100755
index e1772b428a1..00000000000
--- a/tools/unicode/py/idna2nrm.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/python2.4
-# Copyright (C) 2010-2011, International Business Machines
-# Corporation and others. All Rights Reserved.
-#
-# file name: idna2nrm.py
-# encoding: US-ASCII
-# tab size: 8 (not used)
-# indentation:4
-#
-# created on: 2010jan28
-# created by: Markus W. Scherer
-
-"""Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
-
-__author__ = "Markus Scherer"
-
-import re
-
-replacements = [
-  # Several versions of avoiding circular FFFD>FFFD mappings,
-  # depending on the version of the input file.
-  (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
-  (re.compile(r"\.\.FFFD"), "..FFFC"),
-  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
-  # Since we switch between checking and not checking for STD3 character
-  # restrictions at runtime, checking the non-LDH ASCII characters in code,
-  # we treat these values here like their regular siblings.
-  (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
-  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
-  # For UTS #46, we do not care about "not valid in IDNA2008".
-  (re.compile(r"; *; NV8 +"), ""),
-  # Normal transformations.
-  (re.compile(r"; disallowed"), ">FFFD"),
-  (re.compile(r"; ignored"), ">"),
-  (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
-  (re.compile(r"; mapped +; "), ">"),
-  (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >")
-]
-
-in_file = open("IdnaMappingTable.txt", "r")
-out_file = open("uts46.txt", "w")
-
-out_file.write("# Original file:\n")
-for line in in_file:
-  orig_line = line
-  if line.startswith("# For documentation, see"):
-    out_file.write(line)
-    out_file.write(r"""
-# ================================================
-# This file has been reformatted into syntax for the
-# gennorm2 Normalizer2 data generator tool.
-#
-# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
-# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
-# "disallowed" lines map to U+FFFD.
-# "ignored" lines map to an empty string.
-#
-# Characters disallowed under STD3 rules are treated as valid or mapped;
-# they are handled in code.
-# Deviation characters are also handled in code.
-#
-# Use this file as the second gennorm2 input file after nfc.txt.
-# ================================================
-""")
-    continue
-  if line[0] in "#\r\n":
-    out_file.write(line)
-    continue
-  for rep in replacements: line = rep[0].sub(rep[1], line)
-  # Align inline comments at column 40.
-  comment_pos = line.find("#", 1)
-  if comment_pos < 40:
-    line = line[:comment_pos] + ((40 - comment_pos) * ' ') + line[comment_pos:]
-  elif comment_pos > 40:
-    space_pos = comment_pos
-    while space_pos > 0 and line[space_pos - 1] == ' ':
-      space_pos = space_pos - 1
-    if space_pos < 40:
-      # Fewer than 40 characters before the comment:
-      # Align comments at column 40.
-      line = line[:40] + line[comment_pos:]
-    else:
-      # 40 or more characters before the comment:
-      # Keep one space between contents and comment.
-      line = line[:space_pos] + " " + line[comment_pos:]
-  # Write the modified line.
-  out_file.write(line)
-  if "..FFFF" in orig_line and "..FFFC" in line:
-    out_file.write("FFFE..FFFF >FFFD\n");
-in_file.close()
-out_file.close()
diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py
index cb44009f9c1..4666000cd08 100755
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -18,12 +18,12 @@
 # Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
 #
 # Invoke with three command-line parameters:
-# 1. source folder with UCD files
+# 1. source folder with UCD & idna files
 # 2. ICU source root folder
 # 3. ICU tools root folder
 #
 # Sample invocation:
-# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20111205mod/ucd ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
+# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
 
 import array
 import bisect
@@ -1288,6 +1288,87 @@ def WriteNorm2(path):
   WriteNorm2NFKCTextFile(path)
   WriteNorm2NFKC_CFTextFile(path)
 
+# UTS #46 Normalizer2 input file ------------------------------------------- ***
+
+_idna_replacements = [
+  # Several versions of avoiding circular FFFD>FFFD mappings,
+  # depending on the version of the input file.
+  (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
+  (re.compile(r"\.\.FFFD"), "..FFFC"),
+  (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
+  # Since we switch between checking and not checking for STD3 character
+  # restrictions at runtime, checking the non-LDH ASCII characters in code,
+  # we treat these values here like their regular siblings.
+  (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
+  (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
+  # For UTS #46, we do not care about "not valid in IDNA2008".
+  (re.compile(r"; *; NV8 +"), ""),
+  # Normal transformations.
+  (re.compile(r"; disallowed"), ">FFFD"),
+  (re.compile(r"; ignored"), ">"),
+  (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
+  (re.compile(r"; mapped +; "), ">"),
+  (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >")
+]
+
+def IdnaToUTS46TextFile(s, t):
+  """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
+  # Different input/output file names.
+  dest_path = os.path.dirname(t)
+  t = os.path.join(dest_path, "uts46.txt")
+  # TODO: With Python 2.7+, combine the two with statements into one.
+  with open(s, "r") as in_file:
+    with open(t, "w") as out_file:
+      out_file.write("# Original file:\n")
+      for line in in_file:
+        orig_line = line
+        if line.startswith("# For documentation, see"):
+          out_file.write(line)
+          out_file.write(r"""
+# ================================================
+# This file has been reformatted into syntax for the
+# gennorm2 Normalizer2 data generator tool.
+#
+# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
+# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
+# "disallowed" lines map to U+FFFD.
+# "ignored" lines map to an empty string.
+#
+# Characters disallowed under STD3 rules are treated as valid or mapped;
+# they are handled in code.
+# Deviation characters are also handled in code.
+#
+# Use this file as the second gennorm2 input file after nfc.txt.
+# ================================================
+""")
+          continue
+        if line[0] in "#\r\n":
+          out_file.write(line)
+          continue
+        for rep in _idna_replacements: line = rep[0].sub(rep[1], line)
+        # Align inline comments at column 40.
+        comment_pos = line.find("#", 1)
+        if 0 <= comment_pos < 40:
+          line = (line[:comment_pos] + ((40 - comment_pos) * ' ') +
+                  line[comment_pos:])
+        elif comment_pos > 40:
+          space_pos = comment_pos
+          while space_pos > 0 and line[space_pos - 1] == ' ':
+            space_pos = space_pos - 1
+          if space_pos < 40:
+            # Fewer than 40 characters before the comment:
+            # Align comments at column 40.
+            line = line[:40] + line[comment_pos:]
+          else:
+            # 40 or more characters before the comment:
+            # Keep one space between contents and comment.
+            line = line[:space_pos] + " " + line[comment_pos:]
+        # Write the modified line.
+        out_file.write(line)
+        if "..FFFF" in orig_line and "..FFFC" in line:
+          out_file.write("FFFE..FFFF >FFFD\n");
+  return t
+
 # Preprocessing ------------------------------------------------------------ ***
 
 _strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
@@ -1431,7 +1512,9 @@ _files = {
   "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
   "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
   "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
-  "WordBreakTest.txt": (PrependBOM, "testdata")
+  "WordBreakTest.txt": (PrependBOM, "testdata"),
+  # From www.unicode.org/Public/idna//
+  "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2")
 }
 
 # List of lists of files to be parsed in order.
@@ -1447,19 +1530,20 @@ _file_version_re = re.compile("([a-zA-Z0-9]+)" +
 
 def PreprocessFiles(source_files, icu_src_root):
   unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
+  norm2_path = os.path.join(unidata_path, "norm2")
   testdata_path = os.path.join(icu_src_root, "source", "test", "testdata")
   folder_to_path = {
     "unidata": unidata_path,
+    "norm2": norm2_path,
     "testdata": testdata_path
   }
   files_processed = set()
   for source_file in source_files:
     basename = os.path.basename(source_file)
     match = _file_version_re.match(basename)
-    if match:
-      basename = match.group(1) + match.group(2)
-    print "Preprocessing %s" % basename
+    if match: basename = match.group(1) + match.group(2)
     if basename in _files:
+      print "Preprocessing %s" % basename
       if basename in files_processed:
         raise Exception("duplicate file basename %s!" % basename)
       files_processed.add(basename)
-- 
2.40.0