ICU-11449 collation formatVersion 5: reorder single scripts not groups, write new...

author Markus Scherer <markus.icu@gmail.com>

Mon, 5 Jan 2015 23:31:52 +0000 (23:31 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Mon, 5 Jan 2015 23:31:52 +0000 (23:31 +0000)
author Markus Scherer <markus.icu@gmail.com>
Mon, 5 Jan 2015 23:31:52 +0000 (23:31 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Mon, 5 Jan 2015 23:31:52 +0000 (23:31 +0000)
diff --git a/tools/unicode/c/genuca/collationbasedatabuilder.cpp b/tools/unicode/c/genuca/collationbasedatabuilder.cpp

index e700586387aef8cdc6971f456e0069daefc99986..bacb032a969bc7f0aeb9673c6df133205a09558f 100644 (file)
--- a/tools/unicode/c/genuca/collationbasedatabuilder.cpp
+++ b/tools/unicode/c/genuca/collationbasedatabuilder.cpp
@@ -1,6 +1,6 @@
  /*
  *******************************************************************************
-* Copyright (C) 2012-2014, International Business Machines
+* Copyright (C) 2012-2015, International Business Machines
  * Corporation and others.  All Rights Reserved.
  *******************************************************************************
  * collationbasedatabuilder.cpp
@@ -85,7 +85,10 @@ CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
          : CollationDataBuilder(errorCode),
            numericPrimary(0x12000000),
            firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
-          rootElements(errorCode) {
+          rootElements(errorCode),
+          scriptStartsLength(1) {
+    uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
+    uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
  }
  
  CollationBaseDataBuilder::~CollationBaseDataBuilder() {
@@ -309,27 +312,43 @@ CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
  }
  
  void
-CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
-                                             const UnicodeString &groupScripts,
-                                             UErrorCode &errorCode) {
-    if(U_FAILURE(errorCode)) { return; }
-    if(groupScripts.isEmpty()) {
-        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
+CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
+    // The primary weight must be the lowest possible for a two-byte prefix.
+    // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
+    U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
+    p >>= 8;
+    U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
+    p >>= 8;
+    uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
+    if((p & 0xff) == lowestP2) {
+        // The script really starts on a lead byte boundary. Round down to that.
+        p &= 0xff00;
      }
-    if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
-        // Zzzz must not occur.
-        // It is the code used in the API to separate low and high scripts.
-        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
+    // Script starts should be added in ascending order; otherwise we would need to sort them.
+    if(script < UCOL_REORDER_CODE_FIRST) {
+        U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
+    } else {
+        U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
+        script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
+    }
+    if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
+        // Two scripts share a range (e.g., Hira & Kana).
+        scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
+    } else {
+        U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
+        U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
+        scriptsIndex[script] = (uint16_t)scriptStartsLength;
+        scriptStarts[scriptStartsLength++] = (uint16_t)p;
+    }
+    if(script == USCRIPT_UNKNOWN) {
+        // The last script start is for unassigned code points
+        // (with high implicit primary weights).
+        // Add one more entry with the limit of this range,
+        // which is the start of the trailing-weights range.
+        U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
+        scriptStarts[scriptStartsLength++] =
+                (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
      }
-    // Note: We are mostly trusting the input data,
-    // rather than verifying that reordering groups do not intersect
-    // with their lead byte ranges nor their sets of scripts,
-    // and that all script codes are valid.
-    scripts.append((UChar)((firstByte << 8) | lastByte));
-    scripts.append((UChar)groupScripts.length());
-    scripts.append(groupScripts);
  }
  
  void
@@ -337,8 +356,18 @@ CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
      buildMappings(data, errorCode);
      data.numericPrimary = numericPrimary;
      data.compressibleBytes = compressibleBytes;
-    data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
-    data.scriptsLength = scripts.length();
+
+    int32_t numScripts = USCRIPT_CODE_LIMIT;
+    while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
+    // Move the 16 special groups (not all used)
+    // down for contiguous storage of the script and special-group indexes.
+    for(int32_t i = 0; i < 16; ++i) {
+        scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
+    }
+    data.numScripts = numScripts;
+    data.scriptsIndex = scriptsIndex;
+    data.scriptStarts = scriptStarts;
+    data.scriptStartsLength = scriptStartsLength;
      buildFastLatinTable(data, errorCode);
  }
  
diff --git a/tools/unicode/c/genuca/collationbasedatabuilder.h b/tools/unicode/c/genuca/collationbasedatabuilder.h

index aaa6be10fc3597eea5e63a7ea26c3735f6c5476a..214b52173712e7aefb5553e67978c7dcc59a8a9f 100644 (file)
--- a/tools/unicode/c/genuca/collationbasedatabuilder.h
+++ b/tools/unicode/c/genuca/collationbasedatabuilder.h
@@ -1,6 +1,6 @@
  /*
  *******************************************************************************
-* Copyright (C) 2012-2014, International Business Machines
+* Copyright (C) 2012-2015, International Business Machines
  * Corporation and others.  All Rights Reserved.
  *******************************************************************************
  * collationbasedatabuilder.h
@@ -18,6 +18,7 @@
  
  #include "unicode/uniset.h"
  #include "unicode/unistr.h"
+#include "unicode/uscript.h"
  #include "collation.h"
  #include "collationdata.h"
  #include "collationdatabuilder.h"
@@ -66,9 +67,7 @@ public:
      void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
      void addRootElement(int64_t ce, UErrorCode &errorCode);
  
-    void addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
-                            const UnicodeString &groupScripts,
-                            UErrorCode &errorCode);
+    void addScriptStart(int32_t script, uint32_t p);
  
      virtual void build(CollationData &data, UErrorCode &errorCode);
  
@@ -86,7 +85,9 @@ private:
      uint32_t lastHanPrimary;
      int32_t hanStep;
      UVector64 rootElements;
-    UnicodeString scripts;
+    uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16];  // need exactly this many
+    uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16];  // should be safely more than needed
+    int32_t scriptStartsLength;
  };
  
  U_NAMESPACE_END
diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp

index 94793919e5c19a03aa782b051451bd21b33d936a..c0aa67433c2acf36205960a860fc7991ef87d711 100644 (file)
--- a/tools/unicode/c/genuca/genuca.cpp
+++ b/tools/unicode/c/genuca/genuca.cpp
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2000-2014, International Business Machines
+*   Copyright (C) 2000-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -25,6 +25,8 @@
  #include "unicode/utypes.h"
  #include "unicode/errorcode.h"
  #include "unicode/localpointer.h"
+#include "unicode/ucol.h"
+#include "unicode/uscript.h"
  #include "unicode/utf8.h"
  #include "charstr.h"
  #include "cmemory.h"
@@ -46,8 +48,6 @@
  #include "uparse.h"
  #include "writesrc.h"
  
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
  #if UCONFIG_NO_COLLATION
  
  extern "C" int
@@ -83,7 +83,7 @@ static UDataInfo ucaDataInfo={
      0,
  
      { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
-    { 4, 1, 0, 0 },                     // formatVersion
+    { 5, 0, 0, 0 },                     // formatVersion
      { 6, 3, 0, 0 }                      // dataVersion
  };
  
@@ -206,32 +206,151 @@ static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode
      }
  }
  
+// Hardcoded mapping from script sample characters to script codes.
+// Pro: Available without complete and updated UCD scripts data,
+//      easy to add non-script codes specific to collation.
+// Con: Needs manual update for each new script or change in sample character.
  static const struct {
-    const char *name;
-    int32_t code;
-} specialReorderTokens[] = {
-    { "TERMINATOR", -2 },  // -2 means "ignore"
-    { "LEVEL-SEPARATOR", -2 },
-    { "FIELD-SEPARATOR", -2 },
-    { "COMPRESS", -3 },
-    // The standard name is "PUNCT" but FractionalUCA.txt uses the long form.
-    { "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION },
-    { "IMPLICIT", USCRIPT_HAN },  // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
-    { "TRAILING", -2 },  // We do not reorder trailing weights (those after implicits).
-    { "SPECIAL", -2 }  // We must never reorder internal, special CE lead bytes.
+    UChar32 sampleChar;
+    int32_t script;
+} sampleCharsToScripts[] = {
+    { 0x00A0, UCOL_REORDER_CODE_SPACE },
+    { 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
+    { 0x263A, UCOL_REORDER_CODE_SYMBOL },
+    { 0x20AC, UCOL_REORDER_CODE_CURRENCY },
+    { 0x0034, UCOL_REORDER_CODE_DIGIT },
+    { 0x004C, USCRIPT_LATIN },
+    { 0x03A9, USCRIPT_GREEK },
+    { 0x03E2, USCRIPT_COPTIC },
+    { 0x042F, USCRIPT_CYRILLIC },
+    { 0x2C00, USCRIPT_GLAGOLITIC },
+    { 0x1036B, USCRIPT_OLD_PERMIC },
+    { 0x10D3, USCRIPT_GEORGIAN },
+    { 0x0531, USCRIPT_ARMENIAN },
+    { 0x05D0, USCRIPT_HEBREW },
+    { 0x10900, USCRIPT_PHOENICIAN },
+    { 0x0800, USCRIPT_SAMARITAN },
+    { 0x0628, USCRIPT_ARABIC },
+    { 0x0710, USCRIPT_SYRIAC },
+    { 0x0840, USCRIPT_MANDAIC },
+    { 0x078C, USCRIPT_THAANA },
+    { 0x07CA, USCRIPT_NKO },
+    { 0x2D5E, USCRIPT_TIFINAGH },
+    { 0x12A0, USCRIPT_ETHIOPIC },
+    { 0x0905, USCRIPT_DEVANAGARI },
+    { 0x0995, USCRIPT_BENGALI },
+    { 0x0A15, USCRIPT_GURMUKHI },
+    { 0x0A95, USCRIPT_GUJARATI },
+    { 0x0B15, USCRIPT_ORIYA },
+    { 0x0B95, USCRIPT_TAMIL },
+    { 0x0C15, USCRIPT_TELUGU },
+    { 0x0C95, USCRIPT_KANNADA },
+    { 0x0D15, USCRIPT_MALAYALAM },
+    { 0x0D85, USCRIPT_SINHALA },
+    { 0xABC0, USCRIPT_MEITEI_MAYEK },
+    { 0xA800, USCRIPT_SYLOTI_NAGRI },
+    { 0xA882, USCRIPT_SAURASHTRA },
+    { 0x11083, USCRIPT_KAITHI },
+    { 0x11152, USCRIPT_MAHAJANI },
+    { 0x11183, USCRIPT_SHARADA },
+    { 0x11208, USCRIPT_KHOJKI },
+    { 0x112BE, USCRIPT_KHUDAWADI },
+    { 0x11315, USCRIPT_GRANTHA },
+    { 0x11484, USCRIPT_TIRHUTA },
+    { 0x1158E, USCRIPT_SIDDHAM },
+    { 0x1160E, USCRIPT_MODI },
+    { 0x11680, USCRIPT_TAKRI },
+    { 0x1B83, USCRIPT_SUNDANESE },
+    { 0x11005, USCRIPT_BRAHMI },
+    { 0x10A00, USCRIPT_KHAROSHTHI },
+    { 0x0E17, USCRIPT_THAI },
+    { 0x0EA5, USCRIPT_LAO },
+    { 0xAA80, USCRIPT_TAI_VIET },
+    { 0x0F40, USCRIPT_TIBETAN },
+    { 0x1C00, USCRIPT_LEPCHA },
+    { 0xA840, USCRIPT_PHAGS_PA },
+    { 0x1900, USCRIPT_LIMBU },
+    { 0x1703, USCRIPT_TAGALOG },
+    { 0x1723, USCRIPT_HANUNOO },
+    { 0x1743, USCRIPT_BUHID },
+    { 0x1763, USCRIPT_TAGBANWA },
+    { 0x1A00, USCRIPT_BUGINESE },
+    { 0x1BC0, USCRIPT_BATAK },
+    { 0xA930, USCRIPT_REJANG },
+    { 0xA90A, USCRIPT_KAYAH_LI },
+    { 0x1000, USCRIPT_MYANMAR },
+    { 0x11103, USCRIPT_CHAKMA },
+    { 0x1780, USCRIPT_KHMER },
+    { 0x1950, USCRIPT_TAI_LE },
+    { 0x1980, USCRIPT_NEW_TAI_LUE },
+    { 0x1A20, USCRIPT_LANNA },
+    { 0xAA00, USCRIPT_CHAM },
+    { 0x1B05, USCRIPT_BALINESE },
+    { 0xA984, USCRIPT_JAVANESE },
+    { 0x1826, USCRIPT_MONGOLIAN },
+    { 0x1C5A, USCRIPT_OL_CHIKI },
+    { 0x13C4, USCRIPT_CHEROKEE },
+    { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
+    { 0x168F, USCRIPT_OGHAM },
+    { 0x16A0, USCRIPT_RUNIC },
+    { 0x10C00, USCRIPT_ORKHON },
+    { 0xA549, USCRIPT_VAI },
+    { 0xA6A0, USCRIPT_BAMUM },
+    { 0x16AE6, USCRIPT_BASSA_VAH },
+    { 0x1E802, USCRIPT_MENDE },
+    { 0xAC00, USCRIPT_HANGUL },
+    { 0x304B, USCRIPT_HIRAGANA },
+    { 0x30AB, USCRIPT_KATAKANA },
+    { 0x3105, USCRIPT_BOPOMOFO },
+    { 0xA288, USCRIPT_YI },
+    { 0xA4D0, USCRIPT_LISU },
+    { 0x16F00, USCRIPT_MIAO },
+    { 0x118B4, USCRIPT_WARANG_CITI },
+    { 0x11AC0, USCRIPT_PAU_CIN_HAU },
+    { 0x16B1C, USCRIPT_PAHAWH_HMONG },
+    { 0x10280, USCRIPT_LYCIAN },
+    { 0x102A0, USCRIPT_CARIAN },
+    { 0x10920, USCRIPT_LYDIAN },
+    { 0x10300, USCRIPT_OLD_ITALIC },
+    { 0x10330, USCRIPT_GOTHIC },
+    { 0x10414, USCRIPT_DESERET },
+    { 0x10450, USCRIPT_SHAVIAN },
+    { 0x1BC20, USCRIPT_DUPLOYAN },
+    { 0x10480, USCRIPT_OSMANYA },
+    { 0x10500, USCRIPT_ELBASAN },
+    { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
+    { 0x110D0, USCRIPT_SORA_SOMPENG },
+    { 0x16A4F, USCRIPT_MRO },
+    { 0x10000, USCRIPT_LINEAR_B },
+    { 0x10647, USCRIPT_LINEAR_A },
+    { 0x10800, USCRIPT_CYPRIOT },
+    { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
+    { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
+    { 0x10B00, USCRIPT_AVESTAN },
+    { 0x10873, USCRIPT_PALMYRENE },
+    { 0x10896, USCRIPT_NABATAEAN },
+    { 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
+    { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
+    { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
+    { 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
+    { 0x10AD8, USCRIPT_MANICHAEAN },
+    { 0x10380, USCRIPT_UGARITIC },
+    { 0x103A0, USCRIPT_OLD_PERSIAN },
+    { 0x12000, USCRIPT_CUNEIFORM },
+    { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
+    { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
+    { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
+    { 0x5B57, USCRIPT_HAN },
+    { 0xFDD0, USCRIPT_UNKNOWN }  // unassigned-implicit primary weights
  };
  
-int32_t getReorderCode(const char* name) {
-    int32_t code = CollationRuleParser::getReorderCode(name);
-    if (code >= 0) {
-        return code;
-    }
-    for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) {
-        if (0 == strcmp(name, specialReorderTokens[i].name)) {
-            return specialReorderTokens[i].code;
+static int32_t getCharScript(UChar32 c) {
+    for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
+        if(c == sampleCharsToScripts[i].sampleChar) {
+            return sampleCharsToScripts[i].script;
          }
      }
-    return -1;  // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
+    return USCRIPT_INVALID_CODE;  // -1
  }
  
  /**
@@ -333,7 +452,7 @@ static struct {
  };
  
  static int64_t getOptionValue(const char *name) {
-    for (int32_t i = 0; i < LENGTHOF(vt); ++i) {
+    for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
          if(uprv_strcmp(name, vt[i].name) == 0) {
              return vt[i].value;
          }
@@ -341,11 +460,9 @@ static int64_t getOptionValue(const char *name) {
      return 0;
  }
  
-static UnicodeString *leadByteScripts = NULL;
-
  static void readAnOption(
          CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
-    for (int32_t cnt = 0; cnt<LENGTHOF(vt); cnt++) {
+    for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
          int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
          if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
              ActionType what_to_do = vt[cnt].what_to_do;
@@ -491,54 +608,12 @@ static void readAnOption(
                      fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
                  }
              } else if (what_to_do == READLEADBYTETOSCRIPTS) {
-                uint16_t leadByte = (hex2num(*pointer++) * 16);
-                leadByte += hex2num(*pointer++);
-
-                if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) {
-                    // Extend the Hani range to the end of what this implementation uses.
-                    // FractionalUCA.txt assumes a different algorithm for implicit primary weights,
-                    // and different high-lead byte ranges.
-                    leadByteScripts[leadByte] = leadByteScripts[0xdf];
-                    return;
-                }
-
-                UnicodeString scripts;
-                for(;;) {
-                    pointer = skipWhiteSpace(pointer);
-                    if (*pointer == ']') {
-                        break;
-                    }
-                    const char *scriptName = pointer;
-                    char c;
-                    while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; }
-                    if(c == 0) {
-                        fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer);
-                        *status = U_INVALID_FORMAT_ERROR;
-                        return;
-                    }
-                    *pointer = 0;
-                    int32_t reorderCode = getReorderCode(scriptName);
-                    *pointer = c;
-                    if (reorderCode == -3) {  // COMPRESS
-                        builder.setCompressibleLeadByte(leadByte);
-                        continue;
-                    }
-                    if (reorderCode == -2) {
-                        continue;  // Ignore "TERMINATOR" etc.
-                    }
-                    if (reorderCode < 0 || 0xffff < reorderCode) {
-                        fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName);
-                        *status = U_INVALID_FORMAT_ERROR;
-                        return;
-                    }
-                    scripts.append((UChar)reorderCode);
-                }
-                if(!scripts.isEmpty()) {
-                    if(leadByteScripts == NULL) {
-                        leadByteScripts = new UnicodeString[256];
-                    }
-                    leadByteScripts[leadByte] = scripts;
+                if (strstr(pointer, "COMPRESS") != NULL) {
+                    uint16_t leadByte = (hex2num(*pointer++) * 16);
+                    leadByte += hex2num(*pointer++);
+                    builder.setCompressibleLeadByte(leadByte);
                  }
+                // We do not need the list of scripts on this line.
              }
              return;
          }
@@ -730,9 +805,21 @@ parseFractionalUCA(const char *filename,
                  // are only entered into the inverse table,
                  // not into the normal collation data.
                  builder.addRootElements(ces, cesLength, *status);
-                if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) {
-                    // Lead byte for numeric sorting.
-                    builder.setNumericPrimary(p);
+                if(s.length() == 2 && cesLength == 1) {
+                    switch(s[1]) {
+                    case 0x34:
+                        // Lead byte for numeric sorting.
+                        builder.setNumericPrimary(p);
+                        break;
+                    case 0xFF21:
+                        builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
+                        break;
+                    case 0xFF3A:
+                        builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
+                        break;
+                    default:
+                        break;
+                    }
                  }
              } else {
                  UChar32 c = s.char32At(0);
@@ -742,8 +829,29 @@ parseFractionalUCA(const char *filename,
                  // CollationBaseDataBuilder::init() maps them to special CEs.
                  // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
                  if(0xfffd <= c && c <= 0xffff) { continue; }
-                if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) {
-                    continue;
+                if(s.length() >= 2 && c == 0xFDD1) {
+                    UChar32 c2 = s.char32At(1);
+                    int32_t script = getCharScript(c2);
+                    if(script < 0) {
+                        fprintf(stderr,
+                                "Error: Unknown script for first-primary sample character "
+                                "U+%04x on line %u of %s\n"
+                                "    (add the character to genuca.cpp sampleCharsToScripts[])\n",
+                                c2, (int)line, filename);
+                        exit(U_INVALID_FORMAT_ERROR);
+                    }
+                    if(script == USCRIPT_UNKNOWN) {
+                        // FDD1 FDD0, first unassigned-implicit primary
+                        builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
+                        continue;
+                    }
+                    builder.addScriptStart(script, p);
+                    if(script == USCRIPT_HIRAGANA) {
+                        builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
+                    } else if(script == USCRIPT_HAN) {
+                        builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
+                        builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
+                    }
                  }
  
                  if(0xe0000000 <= p && p < 0xf0000000) {
@@ -887,40 +995,6 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder,
          return;
      }
  
-    if(leadByteScripts != NULL) {
-        uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1;
-        do {
-            // Find the range of lead bytes with this set of scripts.
-            const UnicodeString &firstScripts = leadByteScripts[firstLead];
-            if(firstScripts.isEmpty()) {
-                fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead);
-                errorCode = U_INVALID_FORMAT_ERROR;
-                return;
-            }
-            uint32_t lead = firstLead;
-            for(;;) {
-                ++lead;
-                const UnicodeString &scripts = leadByteScripts[lead];
-                // The scripts should either be the same or disjoint.
-                // We do not test if all reordering groups have disjoint sets of scripts.
-                if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; }
-                if(scripts != firstScripts) {
-                    fprintf(stderr,
-                            "[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] "
-                            "but not all scripts match\n",
-                            (int)firstLead, scripts[0], (int)lead);
-                    errorCode = U_INVALID_FORMAT_ERROR;
-                    return;
-                }
-            }
-            // lead is one greater than the last lead byte with the same set of scripts as firstLead.
-            builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode);
-            if(U_FAILURE(errorCode)) { return; }
-            firstLead = lead;
-        } while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE);
-        delete[] leadByteScripts;
-    }
-
      CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
      builder.enableFastLatin();
      builder.build(data, errorCode);
@@ -1168,7 +1242,7 @@ extern "C" int
  main(int argc, char* argv[]) {
      U_MAIN_INIT_ARGS(argc, argv);
  
-    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
+    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
  
      /* error handling, printing usage message */
      if(argc<0) {
author	Markus Scherer <markus.icu@gmail.com>
	Mon, 5 Jan 2015 23:31:52 +0000 (23:31 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Mon, 5 Jan 2015 23:31:52 +0000 (23:31 +0000)
tools/unicode/c/genuca/collationbasedatabuilder.cpp		patch \| blob \| history
tools/unicode/c/genuca/collationbasedatabuilder.h		patch \| blob \| history
tools/unicode/c/genuca/genuca.cpp		patch \| blob \| history