From: Markus Scherer <markus.icu@gmail.com>
Date: Mon, 24 Feb 2014 22:17:04 +0000 (+0000)
Subject: ICU-9101 copy icu/branches/markus/collv2/source/tools/genuca2/genuca2.cpp to tools... 
X-Git-Tag: milestone-59-0-1~2149
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f6de8d8a1cf59eb8837ac09472d797cb443b97ef;p=icu

ICU-9101 copy icu/branches/markus/collv2/source/tools/genuca2/genuca2.cpp to tools/trunk/unicode/c/genuca/genuca.cpp

X-SVN-Rev: 35220
---

diff --git a/tools/unicode/c/genuca/genuca.8.in b/tools/unicode/c/genuca/genuca.8.in
index e8ab27d0175..80fad66f261 100644
--- a/tools/unicode/c/genuca/genuca.8.in
+++ b/tools/unicode/c/genuca/genuca.8.in
@@ -2,90 +2,54 @@
 .\"
 .\" genuca.8: manual page for the genuca utility
 .\"
-.\" Copyright (C) 2000-2001 IBM, Inc. and others.
+.\" Copyright (C) 2000-2014 IBM, Inc. and others.
 .\"
-.TH GENUCA 8 "22 February 2001" "ICU MANPAGE" "ICU @VERSION@ Manual"
+.TH GENUCA 8 "2014-Feb-24" "ICU MANPAGE" "ICU @VERSION@ Manual"
 .SH NAME
 .B genuca
-\- create the UCA data table
+\- create the root collation data file for ICU
 .SH SYNOPSIS
 .B genuca
 [
 .BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
 ]
 [
-.BR "\-V\fP, \fB\-\-version"
-]
-[
 .BR "\-v\fP, \fB\-\-verbose"
 ]
 [
 .BI "\-c\fP, \fB\-\-copyright"
 ]
 [
-.BI "\-s\fP, \fB\-\-sourcedir" " source"
-]
-[
-.BI "\-d\fP, \fB\-\-destdir" " destination"
-]
-[
-.IR file
+.IR path/to/ICU/src/root
 ]
 .SH DESCRIPTION
 .B genuca
-compiles the Unicode Collation Algorithm (UCA) data from
-.I file
-(or from
-.B FractionalUCA.txt
-if
-.I file
-is omitted) into its binary form, the files
+reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and
+writes source and binary data files with the collation root data.
+
+The binary file
 .B ucadata.dat
-and
-.BR invuca.dat .
-These binary files can then be read directly by ICU, or used by
+can then be read directly by ICU, or used by
+.BR icupkg (8)
+or
 .BR pkgdata (8)
 for incorporation into a larger archive or library.
+
+See http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Data_Files
 .SH OPTIONS
 .TP
 .BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
 Print help about usage and exit.
 .TP
-.BR "\-V\fP, \fB\-\-version"
-Print the version of
-.B genuca
-and exit.
-.TP
 .BR "\-v\fP, \fB\-\-verbose"
 Display extra informative messages during execution.
 .TP
 .BI "\-c\fP, \fB\-\-copyright"
 Include a copyright notice into the binary data.
-.TP
-.BI "\-s\fP, \fB\-\-sourcedir" " source"
-Set the source directory to
-.IR source .
-The default source directory is specified by the environment variable
-.BR ICU_DATA .
-.TP
-.BI "\-d\fP, \fB\-\-destdir" " destination"
-Set the destination directory to
-.IR destination .
-The default destination directory is specified by the environment variable
-.BR ICU_DATA .
-.SH ENVIRONMENT
-.TP 10
-.B ICU_DATA
-Specifies the directory containing ICU data. Defaults to
-.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
-Some tools in ICU depend on the presence of the trailing slash. It is thus
-important to make sure that it is present if
-.B ICU_DATA
-is set.
 .SH FILES
 .TP 15
 .B FractionalUCA.txt
-Machine-readable file containing data for the Unicode collation algorithm.
+Machine-readable file containing data for the CLDR root collation order.
 .SH VERSION
 @VERSION@
 .SH COPYRIGHT
diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp
index c1f210c0861..b13193678df 100644
--- a/tools/unicode/c/genuca/genuca.cpp
+++ b/tools/unicode/c/genuca/genuca.cpp
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2000-2013, International Business Machines
+*   Copyright (C) 2000-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -11,502 +11,192 @@
 *   indentation:4
 *
 *   created at the end of XX century
-*   created by: Vladimir Weinstein
+*   created by: Vladimir Weinstein,
+*   modified in 2013-2014 by Markus Scherer
 *
-*   This program reads the Franctional UCA table and generates
+*   This program reads the Fractional UCA table and generates
 *   internal format for UCA table as well as inverse UCA table.
-*   It then writes binary files containing the data: ucadata.dat 
-*   & invuca.dat
-*   Change history:
-*   02/23/2001  grhoten                 Made it into a tool
-*   02/23/2001  weiv                    Moved element & table handling code to i18n
-*   05/09/2001  weiv                    Case bits are now in the CEs, not in front
-*   10/26/2010  sgill                   Support for reordering codes
+*   It then writes the ucadata.icu binary file containing the data.
 */
 
 #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
 
+#include <stdio.h>
 #include "unicode/utypes.h"
-#include "unicode/putil.h"
-#include "unicode/udata.h"
-#include "unicode/uclean.h"
-#include "unicode/uscript.h"
-#include "unicode/ustring.h"
-#include "unicode/utf16.h"
+#include "unicode/errorcode.h"
+#include "unicode/localpointer.h"
 #include "charstr.h"
-#include "ucol_bld.h"
-#include "ucol_imp.h"
-#include "genuca.h"
-#include "uoptions.h"
-#include "uparse.h"
+#include "cmemory.h"
+#include "collation.h"
+#include "collationbasedatabuilder.h"
+#include "collationdata.h"
+#include "collationdatabuilder.h"
+#include "collationdatareader.h"
+#include "collationdatawriter.h"
+#include "collationinfo.h"
+#include "collationrootelements.h"
+#include "collationruleparser.h"
+#include "collationtailoring.h"
+#include "cstring.h"
+#include "normalizer2impl.h"
 #include "toolutil.h"
 #include "unewdata.h"
-#include "cstring.h"
-#include "cmemory.h"
-
-#include <stdio.h>
+#include "uoptions.h"
+#include "uparse.h"
+#include "writesrc.h"
 
 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
 
-/** The maximum UTF-16 length (number of UChars) in a UCA contraction. */
-static const int32_t MAX_UCA_CONTRACTION_LENGTH=4;
-
-// script reordering structures
-typedef struct {
-    uint16_t reorderCode;
-    uint16_t offset;
-} ReorderIndex;
-
-typedef struct {
-    uint16_t LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH;
-    uint16_t* LEAD_BYTE_TO_SCRIPTS_INDEX;
-    uint16_t LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH;
-    uint16_t* LEAD_BYTE_TO_SCRIPTS_DATA;
-    uint16_t LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET;
-    
-    uint16_t SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH;
-    ReorderIndex* SCRIPT_TO_LEAD_BYTES_INDEX;
-    uint16_t SCRIPT_TO_LEAD_BYTES_INDEX_COUNT;
-    uint16_t SCRIPT_TO_LEAD_BYTES_DATA_LENGTH;
-    uint16_t* SCRIPT_TO_LEAD_BYTES_DATA;
-    uint16_t SCRIPT_TO_LEAD_BYTES_DATA_OFFSET;
-} LeadByteConstants;
-
-int ReorderIndexComparer(const void *a, const void *b) {
-    return reinterpret_cast<const ReorderIndex*>(a)->reorderCode - reinterpret_cast<const ReorderIndex*>(b)->reorderCode;    
-}
-
-/*
- * Global - verbosity
- */
-UBool beVerbose = FALSE;
-
-static UVersionInfo UCAVersion;
-
 #if UCONFIG_NO_COLLATION
 
-/* dummy UDataInfo cf. udata.h */
-static UDataInfo dummyDataInfo = {
-    sizeof(UDataInfo),
-    0,
-
-    U_IS_BIG_ENDIAN,
-    U_CHARSET_FAMILY,
-    U_SIZEOF_UCHAR,
-    0,
-
-    { 0, 0, 0, 0 },                 /* dummy dataFormat */
-    { 0, 0, 0, 0 },                 /* dummy formatVersion */
-    { 0, 0, 0, 0 }                  /* dummy dataVersion */
-};
+extern "C" int
+main(int argc, char* argv[]) {  // Stub built when UCONFIG_NO_COLLATION is set: no data is generated.
+    (void)argc;  // unused in the stub
+    (void)argv;  // unused in the stub
+    return 1;  // always exits with status 1
+}
 
 #else
 
-static const UDataInfo ucaDataInfo={
-    sizeof(UDataInfo),
-    0,
+U_NAMESPACE_USE
 
-    U_IS_BIG_ENDIAN,
-    U_CHARSET_FAMILY,
-    sizeof(UChar),
-    0,
+static UBool beVerbose=FALSE, withCopyright=TRUE;
 
-    {UCA_DATA_FORMAT_0, UCA_DATA_FORMAT_1, UCA_DATA_FORMAT_2, UCA_DATA_FORMAT_3},     /* dataFormat="UCol"            */
-    /* 03/26/2002 bumped up version since format has changed */
-    /* 09/16/2002 bumped up version since we went from UColAttributeValue */
-    /*            to int32_t in UColOptionSet */
-    /* 05/13/2003 This one also updated since we added UCA and UCD versions */
-    /*            to header */
-    /* 09/11/2003 Adding information required by data swapper */
-    {UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1, UCA_FORMAT_VERSION_2, UCA_FORMAT_VERSION_3},                 /* formatVersion                */
-    {0, 0, 0, 0}                  /* dataVersion = Unicode Version*/
-};
+static UVersionInfo UCAVersion={ 0, 0, 0, 0 };
 
-static const UDataInfo invUcaDataInfo={
+static UDataInfo ucaDataInfo={
     sizeof(UDataInfo),
     0,
 
     U_IS_BIG_ENDIAN,
     U_CHARSET_FAMILY,
-    sizeof(UChar),
+    U_SIZEOF_UCHAR,
     0,
 
-    {INVUCA_DATA_FORMAT_0, INVUCA_DATA_FORMAT_1, INVUCA_DATA_FORMAT_2, INVUCA_DATA_FORMAT_3},     /* dataFormat="InvC"            */
-    /* 03/26/2002 bumped up version since format has changed */
-    /* 04/29/2003 2.1 format - we have added UCA version to header */
-    {INVUCA_FORMAT_VERSION_0, INVUCA_FORMAT_VERSION_1, INVUCA_FORMAT_VERSION_2, INVUCA_FORMAT_VERSION_3},                 /* formatVersion                */
-    {0, 0, 0, 0}                  /* dataVersion = Unicode Version*/
+    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
+    { 4, 0, 0, 0 },                     // formatVersion
+    { 6, 3, 0, 0 }                      // dataVersion
 };
 
-UCAElements le;
+static char *skipWhiteSpace(char *s) {  // Returns the first position at or after s that is neither a space nor a tab.
+    while(*s == ' ' || *s == '\t') { ++s; }  // a terminating NUL also stops the loop
+    return s;
+}
 
-// returns number of characters read
-int32_t readElement(char **from, char *to, char separator, UErrorCode *status) {
-    if(U_FAILURE(*status)) {
-        return 0;
+static int32_t hex2num(char hex) {  // Returns the value 0..15 of one hex digit (either case), or -1 if hex is not a hex digit.
+    if(hex>='0' && hex <='9') {
+        return hex-'0';
+    } else if(hex>='a' && hex<='f') {
+        return hex-'a'+10;
+    } else if(hex>='A' && hex<='F') {
+        return hex-'A'+10;
+    } else {
+        return -1;  // not a hex digit; parseWeight() tests for the negative value to stop parsing
     }
-    char buffer[1024];
-    int32_t i = 0;
+}
+
+static uint32_t parseWeight(char *&s, const char *separators,
+                            int32_t maxBytes, UErrorCode &errorCode) {  // Parses up to maxBytes hex byte pairs at s; returns them left-aligned in 32 bits and leaves s on the separator.
+    if(U_FAILURE(errorCode)) { return 0; }  // do nothing after an earlier failure
+    uint32_t weight = 0;
+    int32_t numBytes = 0;
     for(;;) {
-        char c = **from;
-        if(c == separator || (separator == ' ' && c == '\t')) {
+        // Check one character after another (|| short-circuits), so that s[1] is not read when s[0] is not a hex digit, e.g. at the terminating NUL.
+        int32_t nibble1, nibble2;
+        if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) {
+            // Stop when we find something other than a pair of hex digits.
             break;
         }
-        if (c == '\0') {
+        if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) {
+            // Too many bytes, or a 00 or 01 byte which is illegal inside a weight.
+            errorCode = U_INVALID_FORMAT_ERROR;
             return 0;
         }
-        if(c != ' ') {
-            *(buffer+i++) = c;
+        weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2;  // append this byte below the ones read so far
+        ++numBytes;
+        s += 2;  // move past the two hex digits
+        if(*s != ' ') {  // only a single space may separate the bytes of one weight
+            break;
         }
-        (*from)++;
+        ++s;  // skip that space and try to read another byte
     }
-    (*from)++;
-    *(buffer + i) = 0;
-    //*to = (char *)malloc(strlen(buffer)+1);
-    strcpy(to, buffer);
-    return i;
-}
-
-int32_t skipUntilWhiteSpace(char **from, UErrorCode *status) {
-    if (U_FAILURE(*status)) {
+    char c = *s;  // the character that ended the weight; it is not consumed here
+    if(c == 0 || strchr(separators, c) == NULL) {  // end of string, or not one of the expected separators
+        errorCode = U_INVALID_FORMAT_ERROR;
         return 0;
     }
-    int32_t count = 0;
-    while (**from != ' ' && **from != '\t' && **from != '\0') {
-        (*from)++;
-        count++;
+    // numBytes==0 is ok, for example in [,,] or [, 82, 05]
+    // Left-align the weight in the 32-bit result by shifting in zero bytes.
+    while(numBytes < 4) {
+        weight <<= 8;
+        ++numBytes;
     }
-    return count;
+    return weight;
 }
 
-int32_t skipWhiteSpace(char **from, UErrorCode *status) {
-    if (U_FAILURE(*status)) {
-        return 0;
-    }
-    int32_t count = 0;
-    while (**from == ' ' || **from == '\t') {
-        (*from)++;
-        count++;
-    }
-    return count;
-}
-
-uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UErrorCode *status) {
-    if(U_FAILURE(*status)) {
-        return 0;
-    }
-    uint32_t value = 0;
-    char primsave = '\0';
-    char secsave = '\0';
-    char tersave = '\0';
-    char *primend = primary+4;
-    if(strlen(primary) > 4) {
-        primsave = *primend;
-        *primend = '\0';
-    }
-    char *secend = secondary+2;
-    if(strlen(secondary) > 2) {
-        secsave = *secend;
-        *secend = '\0';
-    }
-    char *terend = tertiary+2;
-    if(strlen(tertiary) > 2) {
-        tersave = *terend;
-        *terend = '\0';
-    }
-    uint32_t primvalue = (uint32_t)((*primary!='\0')?strtoul(primary, &primend, 16):0);
-    uint32_t secvalue = (uint32_t)((*secondary!='\0')?strtoul(secondary, &secend, 16):0);
-    uint32_t tervalue = (uint32_t)((*tertiary!='\0')?strtoul(tertiary, &terend, 16):0);
-    if(primvalue <= 0xFF) {
-      primvalue <<= 8;
-    }
-
-    value = ((primvalue<<UCOL_PRIMARYORDERSHIFT)&UCOL_PRIMARYORDERMASK)|
-        ((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)|
-        (tervalue&UCOL_TERTIARYORDERMASK);
-
-    if(primsave!='\0') {
-        *primend = primsave;
-    }
-    if(secsave!='\0') {
-        *secend = secsave;
-    }
-    if(tersave!='\0') {
-        *terend = tersave;
-    }
-    return value;
-}
-
-static uint32_t inverseTable[0xFFFF][3];
-static uint32_t inversePos = 0;
-static UChar stringContinue[0xFFFF];
-static uint32_t sContPos = 0;
-
-static void addNewInverse(UCAElements *element, UErrorCode *status) {
-  if(U_FAILURE(*status)) {
-    return;
-  }
-  if(beVerbose && isContinuation(element->CEs[1])) {
-    //printf("+");
-  }
-  inversePos++;
-  inverseTable[inversePos][0] = element->CEs[0];
-  if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
-    inverseTable[inversePos][1] = element->CEs[1];
-  } else {
-    inverseTable[inversePos][1] = 0;
-  }
-  if(element->cSize < 2) {
-    inverseTable[inversePos][2] = element->cPoints[0];
-  } else { /* add a new store of cruft */
-    inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
-    memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
-    sContPos += element->cSize+1;
-  }
-}
-
-static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
-  if(U_FAILURE(*status)) {
-    return;
-  }
-
-  if(beVerbose && isContinuation(element->CEs[1])) {
-    //printf("+");
-  }
-  if(position <= inversePos) {
-    /*move stuff around */
-    uint32_t amountToMove = (inversePos - position+1)*sizeof(inverseTable[0]);
-    uprv_memmove(inverseTable[position+1], inverseTable[position], amountToMove);
-  }
-  inverseTable[position][0] = element->CEs[0];
-  if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
-    inverseTable[position][1] = element->CEs[1];
-  } else {
-    inverseTable[position][1] = 0;
-  }
-  if(element->cSize < 2) {
-    inverseTable[position][2] = element->cPoints[0];
-  } else { /* add a new store of cruft */
-    inverseTable[position][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
-    memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
-    sContPos += element->cSize+1;
-  }
-  inversePos++;
-}
-
-static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
-
-  if(U_FAILURE(*status)) {
-    return;
-  }
-
-      if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */
-        stringContinue[sContPos] = (UChar)inverseTable[position][2];
-        inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos;
-        sContPos++;
-        stringContinue[sContPos++] = 0xFFFF;
-        memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
-        sContPos += element->cSize;
-        stringContinue[sContPos++] = 0xFFFE;
-      } else { /* adding to the already existing continuing table */
-        uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK;
-        uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
-
-        if(contIndex+contSize < sContPos) {
-          /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
-          memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar));
-        }
-
-        stringContinue[contIndex+contSize-1] = 0xFFFF;
-        memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar));
-        sContPos += element->cSize+1;
-        stringContinue[contIndex+contSize+element->cSize] = 0xFFFE;
-
-        inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex;
-      }
-}
-
-/* 
- * Takes two CEs (lead and continuation) and 
- * compares them as CEs should be compared:
- * primary vs. primary, secondary vs. secondary
- * tertiary vs. tertiary
+/**
+ * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10].
+ * Stop with an error, or else with the pointer s after the closing bracket.
  */
-static int32_t compareCEs(uint32_t *source, uint32_t *target) {
-  uint32_t s1 = source[0], s2, t1 = target[0], t2;
-  if(isContinuation(source[1])) {
-    s2 = source[1];
-  } else {
-    s2 = 0;
-  }
-  if(isContinuation(target[1])) {
-    t2 = target[1];
-  } else {
-    t2 = 0;
-  }
-  
-  uint32_t s = 0, t = 0;
-  if(s1 == t1 && s2 == t2) {
-    return 0;
-  }
-  s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 
-  t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 
-  if(s < t) {
-    return -1;
-  } else if(s > t) {
-    return 1;
-  } else {
-    s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
-    t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
-    if(s < t) {
-      return -1;
-    } else if(s > t) {
-      return 1;
-    } else {
-      s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
-      t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
-      if(s < t) {
-        return -1;
-      } else {
-        return 1;
-      }
-    }
-  }
-}
-
-static uint32_t addToInverse(UCAElements *element, UErrorCode *status) {
-  uint32_t position = inversePos;
-  uint32_t saveElement = element->CEs[0];
-  int32_t compResult = 0;
-  element->CEs[0] &= 0xFFFFFF3F;
-  if(element->noOfCEs == 1) {
-    element->CEs[1] = 0;
-  }
-  if(inversePos == 0) {
-    inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0;
-    addNewInverse(element, status);
-  } else if(compareCEs(inverseTable[inversePos], element->CEs) > 0) {
-    while((compResult = compareCEs(inverseTable[--position], element->CEs)) > 0);
-    if(beVerbose) { printf("p:%u ", (int)position); }
-    if(compResult == 0) {
-      addToExistingInverse(element, position, status);
-    } else {
-      insertInverse(element, position+1, status);
-    }
-  } else if(compareCEs(inverseTable[inversePos], element->CEs) == 0) {
-    addToExistingInverse(element, inversePos, status);
-  } else {
-    addNewInverse(element, status);
-  }
-  element->CEs[0] = saveElement;
-  if(beVerbose) { printf("+"); }
-  return inversePos;
-}
-
-static InverseUCATableHeader *assembleInverseTable(UErrorCode *status)
-{
-  InverseUCATableHeader *result = NULL;
-  uint32_t headerByteSize = paddedsize(sizeof(InverseUCATableHeader));
-  uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3;
-  uint32_t contsByteSize = sContPos * sizeof(UChar);
-  uint32_t i = 0;
-
-  result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize);
-  uprv_memset(result, 0, headerByteSize + inverseTableByteSize + contsByteSize);
-  if(result != NULL) {
-    result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize;
-
-    inversePos++;
-    inverseTable[inversePos][0] = 0xFFFFFFFF;
-    inverseTable[inversePos][1] = 0xFFFFFFFF;
-    inverseTable[inversePos][2] = 0x0000FFFF;
-    inversePos++;
-
-    for(i = 2; i<inversePos; i++) {
-      if(compareCEs(inverseTable[i-1], inverseTable[i]) > 0) { 
-        fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i][0]);
-      } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) {
-        fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i-1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]);
-      }
-    }
-
-    result->tableSize = inversePos;
-    result->contsSize = sContPos;
-
-    result->table = headerByteSize;
-    result->conts = headerByteSize + inverseTableByteSize;
-
-    memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize);
-    memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize);
-
-  } else {
-    *status = U_MEMORY_ALLOCATION_ERROR;
-    return NULL;
-  }
-
-  return result; 
-}
-
-
-static void writeOutInverseData(InverseUCATableHeader *data, 
-                  const char *outputDir, 
-                  const char *copyright,
-                  UErrorCode *status)
-{
-    UNewDataMemory *pData;
-    
-    long dataLength;
-
-    UDataInfo invUcaInfo;
-    uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo));
-    uprv_memcpy(invUcaInfo.dataVersion, UCAVersion, U_MAX_VERSION_LENGTH);
-
-    pData=udata_create(outputDir, INVC_DATA_TYPE, INVC_DATA_NAME, &invUcaInfo,
-                       copyright, status);
-
-    if(U_FAILURE(*status)) {
-        fprintf(stderr, "Error: unable to create %s"INVC_DATA_NAME", error %s\n", outputDir, u_errorName(*status));
-        return;
-    }
-
-    /* write the data to the file */
-    if (beVerbose) {
-        printf("Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR,
-                                                                INVC_DATA_NAME,
-                                                                INVC_DATA_TYPE);
-    }
-    udata_writeBlock(pData, data, data->byteSize);
-
-    /* finish up */
-    dataLength=udata_finish(pData, status);
-    if(U_FAILURE(*status)) {
-        fprintf(stderr, "Error: error %d writing the output file\n", *status);
-        return;
-    }
-}
-
-static int32_t hex2num(char hex) {
-    if(hex>='0' && hex <='9') {
-        return hex-'0';
-    } else if(hex>='a' && hex<='f') {
-        return hex-'a'+10;
-    } else if(hex>='A' && hex<='F') {
-        return hex-'A'+10;
+static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) {  // Returns the 64-bit CE for the bracketed text at s: primary in bits 63..32, secondary in 31..16, tertiary in 15..0.
+    if(U_FAILURE(errorCode)) { return 0; }
+    ++s;  // skip over the '['
+    if(s[0] == 'U' && s[1] == '+') {
+        // Read a code point and look up its CE.
+        // We use this especially for implicit primary weights,
+        // so that we can use different algorithms in the FractionalUCA.txt
+        // generator and the parser.
+        // The generator may not even need to compute any implicit primaries at all.
+        s += 2;
+        char *end;
+        unsigned long longCp = uprv_strtoul(s, &end, 16);
+        if(end == s || longCp > 0x10ffff) {  // no hex digits at all, or a value above U+10FFFF
+            errorCode = U_INVALID_FORMAT_ERROR;
+            return 0;
+        }
+        UChar32 c = (UChar32)longCp;
+        int64_t ce = builder.getSingleCE(c, errorCode);  // supplies every weight that is not overridden below
+        if(U_FAILURE(errorCode)) { return 0; }
+        s = end;
+        if(*s == ']') {  // [U+4E00]
+            ++s;
+            return ce;
+        }
+        if(*s != ',') {
+            errorCode = U_INVALID_FORMAT_ERROR;
+            return 0;
+        }
+        // Parse the next weight: it is the tertiary if ']' follows, otherwise the secondary.
+        s = skipWhiteSpace(s + 1);
+        uint32_t w = parseWeight(s, ",]", 2, errorCode);
+        if(U_FAILURE(errorCode)) { return 0; }
+        if(*s == ']') {  // [U+4E00, 10]
+            ++s;
+            // Set the tertiary weight to w.
+            return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16);  // keep bits 63..16, replace the low 16 bits
+        }
+        // Set the secondary weight to w: [U+9F9C, 70, 20]
+        ce = (ce & INT64_C(0xffffffff00000000)) | w;  // keeps bits 63..32; the looked-up tertiary is cleared as well
+        // Parse and set the tertiary weight.
+        s = skipWhiteSpace(s + 1);
+        w = parseWeight(s, "]", 2, errorCode);
+        ++s;  // skip the ']'; NOTE(review): errorCode is not checked before this, so callers must test it
+        return ce | (w >> 16);
     } else {
-        return 0;
+        uint32_t p = parseWeight(s, ",", 4, errorCode);  // explicit weights, e.g. [0A 86, 05, 17]: primary of up to 4 bytes
+        if(U_FAILURE(errorCode)) { return 0; }
+        int64_t ce = (int64_t)p << 32;  // the primary occupies the upper 32 bits
+        s = skipWhiteSpace(s + 1);
+        uint32_t w = parseWeight(s, ",", 2, errorCode);
+        if(U_FAILURE(errorCode)) { return 0; }
+        ce |= w;  // parseWeight() left-aligns, so a secondary of up to 2 bytes lands in bits 31..16
+        s = skipWhiteSpace(s + 1);
+        w = parseWeight(s, "]", 2, errorCode);
+        ++s;  // skip the ']'; NOTE(review): errorCode is not checked before this, so callers must test it
+        return ce | (w >> 16);  // the left-aligned tertiary is moved down into bits 15..0
     }
 }
 
-// static char* CHARACTER_CATEGORY_REORDER_CODES[] = {
-//     "Zs", "Nd", "Sc"
-// };
-// static const uint16_t CHARACTER_CATEGORY_REORDER_CODE_OFFSET = 0x1000;
-// static uint16_t CHARACTER_CATEGORY_REORDER_CODES_VALUE[] = {
-//     U_SPACE_SEPARATOR + CHARACTER_CATEGORY_REORDER_CODE_OFFSET,
-//     U_DECIMAL_DIGIT_NUMBER + CHARACTER_CATEGORY_REORDER_CODE_OFFSET, 
-//     U_CURRENCY_SYMBOL + CHARACTER_CATEGORY_REORDER_CODE_OFFSET
-// };
-
 static const struct {
     const char *name;
     int32_t code;
@@ -514,7 +204,8 @@ static const struct {
     { "TERMINATOR", -2 },  // -2 means "ignore"
     { "LEVEL-SEPARATOR", -2 },
     { "FIELD-SEPARATOR", -2 },
-    { "COMPRESS", -2 },  // TODO: We should parse/store which lead bytes are compressible; there is a ticket for that.
+    { "COMPRESS", -3 },
+    // The standard name is "PUNCT" but FractionalUCA.txt uses the long form.
     { "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION },
     { "IMPLICIT", USCRIPT_HAN },  // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
     { "TRAILING", -2 },  // We do not reorder trailing weights (those after implicits).
@@ -522,11 +213,7 @@ static const struct {
 };
 
 int32_t getReorderCode(const char* name) {
-    int32_t code = ucol_findReorderingEntry(name);
-    if (code >= 0) {
-        return code;
-    }
-    code = u_getPropertyValueEnum(UCHAR_SCRIPT, name);
+    int32_t code = CollationRuleParser::getReorderCode(name);
     if (code >= 0) {
         return code;
     }
@@ -538,985 +225,818 @@ int32_t getReorderCode(const char* name) {
     return -1;  // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
 }
 
-UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, LeadByteConstants *leadByteConstants, UErrorCode *status) {
-    static int itemsToDataBlock = 0;
-    static int scriptDataWritten = 0;
-    char buffer[2048], primary[100], secondary[100], tertiary[100];
-    UChar uBuffer[2048];
-    UChar uBuffer2[2048];
-    UChar leadByte[100], scriptCode[100];
-    int32_t i = 0;
-    unsigned int theValue;
-    char *pointer = NULL;
-    char *commentStart = NULL;
-    char *startCodePoint = NULL;
-    char *endCodePoint = NULL;
-    char *result = fgets(buffer, 2048, data);
-    int32_t buflen = (int32_t)uprv_strlen(buffer);
-    if(U_FAILURE(*status)) {
-        return 0;
-    }
-    *primary = *secondary = *tertiary = '\0';
-    *leadByte = *scriptCode = '\0';
-    if(result == NULL) {
-        if(feof(data)) {
-            return NULL;
-        } else {
-            fprintf(stderr, "empty line but no EOF!\n");
-            *status = U_INVALID_FORMAT_ERROR;
-            return NULL;
-        }
-    }
-    while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
-      buffer[--buflen] = 0;
-    }
+enum ActionType {  // What readAnOption() does with the text that follows a matched vt[] directive name.
+  READCE,  // parse a collation element with parseCE(); the directive must then end with ']'
+  READPRIMARY,  // parse with parseWeight(pointer, "]", 4, ...); no vt[] entry currently selects this
+  READBYTE,  // parse with parseWeight(pointer, "]", 1, ...) and keep the result shifted right by 24
+  READUNIFIEDIDEOGRAPH,  // parse code point ranges and hand them to builder.initHanRanges()
+  READUCAVERSION,  // parse the version string into UCAVersion and compare it with the UCD version
+  READLEADBYTETOSCRIPTS,  // parse a hex lead byte and its reorder-code names into leadByteScripts[]
+  IGNORE  // directive is recognized but its content is skipped
+};
 
-    if(buffer[0] == 0 || buffer[0] == '#') {
-        return NULL; // just a comment, skip whole line
-    }
+static struct {  // Table of recognized "[...]" directives and how to parse each of them.
+    const char *const name;  // directive prefix; readAnOption() matches it against the start of the line
+    int64_t value;  // parsed result, written by readAnOption() and read back by getOptionValue()
+    const ActionType what_to_do;  // parsing action for the rest of the line
+} vt[]  = {  // not const as a whole: the value fields are filled in while the file is read
+    {"[first tertiary ignorable",     0, IGNORE},
+    {"[last tertiary ignorable",      0, IGNORE},
+    {"[first secondary ignorable",    0, READCE},
+    {"[last secondary ignorable",     0, READCE},
+    {"[first primary ignorable",      0, READCE},
+    {"[last primary ignorable",       0, READCE},
+    {"[first variable",               0, READCE},
+    {"[last variable",                0, READCE},
+    {"[first regular",                0, READCE},
+    {"[last regular",                 0, READCE},
+    {"[first implicit",               0, READCE},
+    {"[last implicit",                0, READCE},
+    {"[first trailing",               0, READCE},
+    {"[last trailing",                0, READCE},
+
+    {"[Unified_Ideograph",            0, READUNIFIEDIDEOGRAPH},
+
+    {"[fixed first implicit byte",    0, IGNORE},
+    {"[fixed last implicit byte",     0, IGNORE},
+    {"[fixed first trail byte",       0, IGNORE},
+    {"[fixed last trail byte",        0, IGNORE},
+    {"[fixed first special byte",     0, IGNORE},
+    {"[fixed last special byte",      0, IGNORE},
+    {"[fixed secondary common byte",                  0, READBYTE},
+    {"[fixed last secondary common byte",             0, READBYTE},
+    {"[fixed first ignorable secondary byte",         0, READBYTE},
+    {"[fixed tertiary common byte",                   0, READBYTE},
+    {"[fixed first ignorable tertiary byte",          0, READBYTE},
+    {"[variable top = ",              0, IGNORE},
+    {"[UCA version = ",               0, READUCAVERSION},
+    {"[top_byte",                     0, READLEADBYTETOSCRIPTS},
+    {"[reorderingTokens",             0, IGNORE},
+    {"[categories",                   0, IGNORE},
+    {"[first tertiary in secondary non-ignorable",    0, IGNORE},
+    {"[last tertiary in secondary non-ignorable",     0, IGNORE},
+    {"[first secondary in primary non-ignorable",     0, IGNORE},
+    {"[last secondary in primary non-ignorable",      0, IGNORE},
+};
 
-    UCAElements *element = &le;
-    memset(element, 0, sizeof(*element));
+static int64_t getOptionValue(const char *name) {  // Returns the value stored in vt[] for a directive name.
+    for (int32_t i = 0; i < LENGTHOF(vt); ++i) {
+        if(uprv_strcmp(name, vt[i].name) == 0) {  // whole-string match, unlike the prefix match in readAnOption()
+            return vt[i].value;  // still the initial 0 if that directive was never parsed
+        }
+    }
+    return 0;  // unknown name; indistinguishable from a stored value of 0
+}
 
-    enum ActionType {
-      READCE,
-      READHEX1,
-      READHEX2,
-      READUCAVERSION,
-      READLEADBYTETOSCRIPTS,
-      READSCRIPTTOLEADBYTES,
-      IGNORE,
-    };
+static UnicodeString *leadByteScripts = NULL;  // array of 256 strings indexed by lead byte; allocated on first use in readAnOption()
 
-    // Directives.
-    if(buffer[0] == '[') {
-      uint32_t cnt = 0;
-      static const struct {
-        char name[128];
-        uint32_t *what;
-        ActionType what_to_do;
-      } vt[]  = { {"[first tertiary ignorable",  consts->UCA_FIRST_TERTIARY_IGNORABLE,  READCE},
-                  {"[last tertiary ignorable",   consts->UCA_LAST_TERTIARY_IGNORABLE,   READCE},
-                  {"[first secondary ignorable", consts->UCA_FIRST_SECONDARY_IGNORABLE, READCE},
-                  {"[last secondary ignorable",  consts->UCA_LAST_SECONDARY_IGNORABLE,  READCE},
-                  {"[first primary ignorable",   consts->UCA_FIRST_PRIMARY_IGNORABLE,   READCE},
-                  {"[last primary ignorable",    consts->UCA_LAST_PRIMARY_IGNORABLE,    READCE},
-                  {"[first variable",            consts->UCA_FIRST_VARIABLE,            READCE},
-                  {"[last variable",             consts->UCA_LAST_VARIABLE,             READCE},
-                  {"[first regular",             consts->UCA_FIRST_NON_VARIABLE,        READCE},
-                  {"[last regular",              consts->UCA_LAST_NON_VARIABLE,         READCE},
-                  {"[first implicit",            consts->UCA_FIRST_IMPLICIT,            READCE},
-                  {"[last implicit",             consts->UCA_LAST_IMPLICIT,             READCE},
-                  {"[first trailing",            consts->UCA_FIRST_TRAILING,            READCE},
-                  {"[last trailing",             consts->UCA_LAST_TRAILING,             READCE},
-
-                  {"[fixed top",                    &consts->UCA_PRIMARY_TOP_MIN,       READHEX1},
-                  {"[fixed first implicit byte",    &consts->UCA_PRIMARY_IMPLICIT_MIN,  READHEX1},
-                  {"[fixed last implicit byte",     &consts->UCA_PRIMARY_IMPLICIT_MAX,  READHEX1},
-                  {"[fixed first trail byte",       &consts->UCA_PRIMARY_TRAILING_MIN,  READHEX1},
-                  {"[fixed last trail byte",        &consts->UCA_PRIMARY_TRAILING_MAX,  READHEX1},
-                  {"[fixed first special byte",     &consts->UCA_PRIMARY_SPECIAL_MIN,   READHEX1},
-                  {"[fixed last special byte",      &consts->UCA_PRIMARY_SPECIAL_MAX,   READHEX1},
-                  {"[variable top = ",              &t->options->variableTopValue,      READHEX2},
-                  {"[UCA version = ",               NULL,                               READUCAVERSION},
-                  {"[top_byte",                     NULL,                               READLEADBYTETOSCRIPTS},
-                  {"[reorderingTokens",             NULL,                               READSCRIPTTOLEADBYTES},
-                  {"[categories",                   NULL,                               IGNORE},
-                  {"[first tertiary in secondary non-ignorable",                 NULL,                               IGNORE},
-                  {"[last tertiary in secondary non-ignorable",                 NULL,                               IGNORE},
-                  {"[first secondary in primary non-ignorable",                 NULL,                               IGNORE},
-                  {"[last secondary in primary non-ignorable",                 NULL,                               IGNORE},
-      };
-      for (cnt = 0; cnt<sizeof(vt)/sizeof(vt[0]); cnt++) {
-        uint32_t vtLen = (uint32_t)uprv_strlen(vt[cnt].name);
+static void readAnOption(  // Handles one "[...]" directive line: finds the matching vt[] entry and runs its action.
+        CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {  // buffer is patched and restored while tokenizing; *status is set on syntax errors
+    for (int32_t cnt = 0; cnt<LENGTHOF(vt); cnt++) {  // the first entry whose name is a prefix of the line wins
+        int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
         if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
             ActionType what_to_do = vt[cnt].what_to_do;
+            char *pointer = skipWhiteSpace(buffer + vtLen);  // rest of the line after the directive name
             if (what_to_do == IGNORE) { //vt[cnt].what_to_do == IGNORE
-                return NULL;
-            } else if(what_to_do == READHEX1 || what_to_do == READHEX2) {
-              pointer = buffer+vtLen;
-              int32_t numBytes = readElement(&pointer, primary, ']', status) / 2;
-              if(numBytes != (what_to_do == READHEX1 ? 1 : 2)) {
-                  fprintf(stderr, "Value of \"%s\" has unexpected number of %d bytes\n",
-                          buffer, (int)numBytes);
-                  //*status = U_INVALID_FORMAT_ERROR;
-                  return NULL;
-              }
-              *(vt[cnt].what) = (uint32_t)uprv_strtoul(primary, &pointer, 16);
-              if(*pointer != 0) {
-                  fprintf(stderr, "Value of \"%s\" is not a hexadecimal number\n", buffer);
-                  //*status = U_INVALID_FORMAT_ERROR;
-                  return NULL;
-              }
+                return;
             } else if (what_to_do == READCE) {
-              // TODO: combine & clean up the two CE parsers
-              pointer = strchr(buffer+vtLen, '[');
-              if(pointer) {
-                pointer++;
-                element->sizePrim[0]=readElement(&pointer, primary, ',', status) / 2;
-                element->sizeSec[0]=readElement(&pointer, secondary, ',', status) / 2;
-                element->sizeTer[0]=readElement(&pointer, tertiary, ']', status) / 2;
-                vt[cnt].what[0] = getSingleCEValue(primary, secondary, tertiary, status);
-                if(element->sizePrim[0] > 2 || element->sizeSec[0] > 1 || element->sizeTer[0] > 1) {
-                  uint32_t CEi = 1;
-                  uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
-                    if(2*CEi<element->sizePrim[i]) {
-                        value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
-                        value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
-                    }
-
-                    if(2*CEi+1<element->sizePrim[i]) {
-                        value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
-                        value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
-                    }
-
-                    if(CEi<element->sizeSec[i]) {
-                        value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
-                        value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
+                vt[cnt].value = parseCE(builder, pointer, *status);  // presumably advances pointer past the CE (parseCE's signature is not in view) — TODO confirm
+                if(U_SUCCESS(*status) && *pointer != ']') {  // the directive must close right after the CE
+                    *status = U_INVALID_FORMAT_ERROR;
+                }
+                if(U_FAILURE(*status)) {
+                    fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer);
+                    return;
+                }
+            } else if(what_to_do == READPRIMARY) {  // NOTE(review): no vt[] entry uses READPRIMARY, so this branch looks unused for now
+                vt[cnt].value = parseWeight(pointer, "]", 4, *status);
+                if(U_FAILURE(*status)) {
+                    fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer);
+                    return;
+                }
+            } else if(what_to_do == READBYTE) {
+                vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24;  // keep bits 31..24 of the parsed value as the stored byte
+                if(U_FAILURE(*status)) {
+                    fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer);
+                    return;
+                }
+            } else if(what_to_do == READUNIFIEDIDEOGRAPH) {
+                UVector32 unihan(*status);  // collects start,end pairs, two elements per range
+                if(U_FAILURE(*status)) { return; }
+                for(;;) {
+                    if(*pointer == ']') { break; }  // end of the range list
+                    if(*pointer == 0) {
+                        // Missing ] after ranges.
+                        *status = U_INVALID_FORMAT_ERROR;
+                        return;
                     }
-
-                    if(CEi<element->sizeTer[i]) {
-                        value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
-                        value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
+                    char *s = pointer;
+                    while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; }  // find the end of the current token
+                    char c = *s;
+                    *s = 0;  // terminate the token in place so it can be parsed as its own string
+                    uint32_t start, end;
+                    u_parseCodePointRange(pointer, &start, &end, status);
+                    *s = c;  // restore the character that followed the token
+                    if(U_FAILURE(*status)) {
+                        fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer);
+                        *status = U_INVALID_FORMAT_ERROR;
+                        return;
                     }
-
-                    CEi++;
-
-                    vt[cnt].what[1] = value;
-                    //element->CEs[CEindex++] = value;
-                } else {
-                  vt[cnt].what[1] = 0;
+                    unihan.addElement((UChar32)start, *status);
+                    unihan.addElement((UChar32)end, *status);
+                    pointer = skipWhiteSpace(s);
                 }
-              } else {
-                fprintf(stderr, "Failed to read a CE from line %s\n", buffer);
-              }
-            } else if (what_to_do == READUCAVERSION) { //vt[cnt].what_to_do == READUCAVERSION
-              u_versionFromString(UCAVersion, buffer+vtLen);
-              if(beVerbose) {
-                char uca[U_MAX_VERSION_STRING_LENGTH];
-                u_versionToString(UCAVersion, uca);
-                printf("UCA version %s\n", uca);
-              }
-              UVersionInfo UCDVersion;
-              u_getUnicodeVersion(UCDVersion);
-              if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) {
-                char uca[U_MAX_VERSION_STRING_LENGTH];
-                char ucd[U_MAX_VERSION_STRING_LENGTH];
-                u_versionToString(UCAVersion, uca);
-                u_versionToString(UCDVersion, ucd);
-                // Warning, not error, to permit bootstrapping during a version upgrade.
-                fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
-                // *status = U_INVALID_FORMAT_ERROR;
-                // return NULL;
-              }
-            } else if (what_to_do == READLEADBYTETOSCRIPTS) { //vt[cnt].what_to_do == READLEADBYTETOSCRIPTS
-                pointer = buffer + vtLen;
-                skipWhiteSpace(&pointer, status);
-
-                uint16_t leadByte = (hex2num(*pointer++) * 16) + hex2num(*pointer++);
-                //printf("~~~~ processing lead byte = %02x\n", leadByte);
-                if (leadByte >= leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH) {
-                    fprintf(stderr, "Lead byte larger than allocated table!");
-                    // set status and return
-                    *status = U_INTERNAL_PROGRAM_ERROR;
-                    return NULL;
+                builder.initHanRanges(unihan.getBuffer(), unihan.size(), *status);  // hands over the collected values and their count
+            } else if (what_to_do == READUCAVERSION) {
+                u_versionFromString(UCAVersion, pointer);  // UCAVersion is declared outside this function
+                if(beVerbose) {
+                    char uca[U_MAX_VERSION_STRING_LENGTH];
+                    u_versionToString(UCAVersion, uca);
+                    printf("UCA version %s\n", uca);
                 }
-                skipWhiteSpace(&pointer, status);
-
-                int32_t reorderCodeArray[100];
-                uint32_t reorderCodeArrayCount = 0;
-                char scriptName[100];
-                int32_t elementLength = 0;
-                while ((elementLength = readElement(&pointer, scriptName, ' ', status)) > 0) {
-                    if (scriptName[0] == ']') {
+                UVersionInfo UCDVersion;
+                u_getUnicodeVersion(UCDVersion);
+                if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) {  // only the first two version fields are compared
+                    char uca[U_MAX_VERSION_STRING_LENGTH];
+                    char ucd[U_MAX_VERSION_STRING_LENGTH];
+                    u_versionToString(UCAVersion, uca);
+                    u_versionToString(UCDVersion, ucd);
+                    // Warning, not error, to permit bootstrapping during a version upgrade.
+                    fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
+                }
+            } else if (what_to_do == READLEADBYTETOSCRIPTS) {
+                uint16_t leadByte = (hex2num(*pointer++) * 16);  // high hex digit; NOTE(review): no check that the digits exist before advancing
+                leadByte += hex2num(*pointer++);  // low hex digit
+
+                if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) {
+                    // Extend the Hani range to the end of what this implementation uses.
+                    // FractionalUCA.txt assumes a different algorithm for implicit primary weights,
+                    // and different high-lead byte ranges.
+                    leadByteScripts[leadByte] = leadByteScripts[0xdf];  // NOTE(review): leadByteScripts is still NULL here unless an earlier line stored scripts — confirm input ordering
+                    return;
+                }
+
+                UnicodeString scripts;  // one UChar per reorder code listed for this lead byte
+                for(;;) {
+                    pointer = skipWhiteSpace(pointer);
+                    if (*pointer == ']') {
                         break;
                     }
+                    const char *scriptName = pointer;
+                    char c;
+                    while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; }  // scan to the end of the name
+                    if(c == 0) {
+                        fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer);
+                        *status = U_INVALID_FORMAT_ERROR;
+                        return;
+                    }
+                    *pointer = 0;  // terminate the name in place for the lookup
                     int32_t reorderCode = getReorderCode(scriptName);
+                    *pointer = c;  // restore the character that followed the name
+                    if (reorderCode == -3) {  // COMPRESS
+                        builder.setCompressibleLeadByte(leadByte);
+                        continue;
+                    }
                     if (reorderCode == -2) {
                         continue;  // Ignore "TERMINATOR" etc.
                     }
-                    if (reorderCode < 0) {
-                        printf("Syntax error: unable to parse reorder code from '%s'\n", scriptName);
+                    if (reorderCode < 0 || 0xffff < reorderCode) {  // must fit the (UChar) cast used when appending below
+                        fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName);
                         *status = U_INVALID_FORMAT_ERROR;
-                        return NULL;
+                        return;
                     }
-                    if (reorderCodeArrayCount >= LENGTHOF(reorderCodeArray)) {
-                        printf("reorder code array count is greater than allocated size!\n");
-                        *status = U_INTERNAL_PROGRAM_ERROR;
-                        return NULL;
-                    }
-                    reorderCodeArray[reorderCodeArrayCount++] = reorderCode;
-                }
-                //printf("reorderCodeArrayCount = %d\n", reorderCodeArrayCount);
-                switch (reorderCodeArrayCount) {
-                    case 0:
-                        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte] = 0;
-                        break;
-                    case 1:
-                        // TODO = move 0x8000 into defined constant
-                        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte] = 0x8000 | reorderCodeArray[0];
-                        break;
-                    default:
-                        if (reorderCodeArrayCount + leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET > leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH) {
-                            // Error condition
-                        }
-                        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte] = leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET;
-                        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA[leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET++] = reorderCodeArrayCount;
-                        for (int reorderCodeIndex = 0; reorderCodeIndex < reorderCodeArrayCount; reorderCodeIndex++) {
-                            leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA[leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET++] = reorderCodeArray[reorderCodeIndex];
-                        }
+                    scripts.append((UChar)reorderCode);
                 }
-            } else if (what_to_do == READSCRIPTTOLEADBYTES) { //vt[cnt].what_to_do == READSCRIPTTOLEADBYTES
-                uint16_t leadByteArray[256];
-                uint32_t leadByteArrayCount = 0;
-                char scriptName[100];
-
-                pointer = buffer + vtLen;
-                skipWhiteSpace(&pointer, status);
-                uint32_t scriptNameLength = readElement(&pointer, scriptName, '\t', status);
-                int32_t reorderCode = getReorderCode(scriptName);
-                if (reorderCode >= 0) {
-                    //printf("^^^ processing reorder code = %04x (%s)\n", reorderCode, scriptName);
-                    skipWhiteSpace(&pointer, status);
-
-                    int32_t elementLength = 0;
-                    char leadByteString[100];
-                    while ((elementLength = readElement(&pointer, leadByteString, '=', status)) == 2) {
-                        //printf("\tleadByteArrayCount = %d, elementLength = %d, leadByteString = %s\n", leadByteArrayCount, elementLength, leadByteString);
-                        uint32_t leadByte = (hex2num(leadByteString[0]) * 16) + hex2num(leadByteString[1]);
-                        leadByteArray[leadByteArrayCount++] = (uint16_t) leadByte;
-                        skipUntilWhiteSpace(&pointer, status);
-                    }
-
-                    if (leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT >= leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH) {
-                        //printf("\tError condition\n");
-                        //printf("\tindex count = %d, total index size = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT, sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX) / sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]));
-                        // Error condition
-                        *status = U_INTERNAL_PROGRAM_ERROR;
-                        return NULL;
+                if(!scripts.isEmpty()) {  // lead bytes with no stored reorder codes leave their table entry untouched
+                    if(leadByteScripts == NULL) {
+                        leadByteScripts = new UnicodeString[256];  // allocated on first use
                     }
-                    leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].reorderCode = reorderCode;
-
-                    //printf("\tlead byte count = %d\n", leadByteArrayCount);
-                    //printf("\tlead byte array = ");
-                    //for (int i = 0; i < leadByteArrayCount; i++) {
-                    //    printf("%02x, ", leadByteArray[i]);
-                    //}
-                    //printf("\n");
-
-                    switch (leadByteArrayCount) {
-                        case 0:
-                            leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset = 0;
-                            break;
-                        case 1:
-                            // TODO = move 0x8000 into defined constant
-                            //printf("\t+++++ lead byte = &x\n", leadByteArray[0]);
-                            leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset = 0x8000 | leadByteArray[0];
-                            break;
-                        default:
-                            //printf("\t+++++ lead bytes written to data block - %d\n", itemsToDataBlock++);
-                            //printf("\tlead bytes = ");
-                            //for (int i = 0; i < leadByteArrayCount; i++) {
-                            //    printf("%02x, ", leadByteArray[i]);
-                            //}
-                            //printf("\n");
-                            //printf("\tBEFORE data bytes = ");
-                            //for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) {
-                            //    printf("%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]);
-                            //}
-                            //printf("\n");
-                            //printf("\tdata offset = %d, data length = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_LENGTH);
-                            if ((leadByteArrayCount + leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET) > leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_LENGTH) {
-                                //printf("\tError condition\n");
-                                // Error condition
-                                *status = U_INTERNAL_PROGRAM_ERROR;
-                                return NULL;
-                            }
-                            leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset = leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET;
-                            leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET++] = leadByteArrayCount;
-                            scriptDataWritten++;
-                            memcpy(&leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET],
-                                leadByteArray, leadByteArrayCount * sizeof(leadByteArray[0]));
-                            scriptDataWritten += leadByteArrayCount;
-                            //printf("\tlead byte data written = %d\n", scriptDataWritten);
-                            //printf("\tcurrentIndex.reorderCode = %04x, currentIndex.offset = %04x\n", 
-                            //    leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT.reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT.offset);
-                            leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET += leadByteArrayCount;
-                            //printf("\tdata offset = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET);
-                            //printf("\tAFTER data bytes = ");
-                            //for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) {
-                            //    printf("%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]);
-                            //}
-                            //printf("\n");
-                    }
-                    //if (reorderCode >= 0x1000) {
-                     //   printf("@@@@ reorderCode = %x, offset = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset);
-                     //   for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) {
-                    //        printf("%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]);
-                     //   }
-                    //    printf("\n");
-                   // }
-                    leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT++;
+                    leadByteScripts[leadByte] = scripts;
                 }
             }
-            return NULL;
+            return;  // the matched directive has been handled; stop searching the table
+        }
+    }
+    fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);  // no vt[] prefix matched; *status is left unchanged
+}
+
+static UBool
+readAnElement(FILE *data,
+        CollationBaseDataBuilder &builder,
+        UnicodeString &prefix, UnicodeString &s,
+        int64_t ces[32], int32_t &cesLength,
+        UErrorCode *status) {
+    if(U_FAILURE(*status)) {
+        return FALSE;
+    }
+    char buffer[2048];
+    char *result = fgets(buffer, 2048, data);
+    if(result == NULL) {
+        if(feof(data)) {
+            return FALSE;
+        } else {
+            fprintf(stderr, "empty line but no EOF!\n");
+            *status = U_INVALID_FORMAT_ERROR;
+            return FALSE;
         }
-      }
-      fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
-      //*status = U_INVALID_FORMAT_ERROR;
-      return NULL;
+    }
+    int32_t buflen = (int32_t)uprv_strlen(buffer);
+    while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
+      buffer[--buflen] = 0;
+    }
+
+    if(buffer[0] == 0 || buffer[0] == '#') {
+        return FALSE; // just a comment, skip whole line
     }
 
-    startCodePoint = buffer;
-    endCodePoint = strchr(startCodePoint, ';');
+    // Directives.
+    if(buffer[0] == '[') {
+        readAnOption(builder, buffer, status);
+        return FALSE;
+    }
 
-    if(endCodePoint == 0) {
+    char *startCodePoint = buffer;
+    char *endCodePoint = strchr(startCodePoint, ';');
+    if(endCodePoint == NULL) {
         fprintf(stderr, "error - line with no code point!\n");
         *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
-        return NULL;
+        return FALSE;
     } else {
-        *(endCodePoint) = 0;
+        *endCodePoint = 0;
     }
 
     char *pipePointer = strchr(buffer, '|');
     if (pipePointer != NULL) {
         // Read the prefix string which precedes the actual string.
         *pipePointer = 0;
-        element->prefixSize =
+        UChar *prefixChars = prefix.getBuffer(32);
+        int32_t prefixSize =
             u_parseString(startCodePoint,
-                          element->prefixChars, LENGTHOF(element->prefixChars),
+                          prefixChars, prefix.getCapacity(),
                           NULL, status);
         if(U_FAILURE(*status)) {
+            prefix.releaseBuffer(0);
             fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n",
                     startCodePoint, u_errorName(*status));
             *status = U_INVALID_FORMAT_ERROR;
-            return NULL;
+            return FALSE;
         }
-        element->prefix = element->prefixChars;
+        prefix.releaseBuffer(prefixSize);
         startCodePoint = pipePointer + 1;
     }
 
     // Read the string which gets the CE(s) assigned.
-    element->cSize =
+    UChar *uchars = s.getBuffer(32);
+    int32_t cSize =
         u_parseString(startCodePoint,
-                      element->uchars, LENGTHOF(element->uchars),
+                      uchars, s.getCapacity(),
                       NULL, status);
     if(U_FAILURE(*status)) {
+        s.releaseBuffer(0);
         fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n",
                 startCodePoint, u_errorName(*status));
         *status = U_INVALID_FORMAT_ERROR;
-        return NULL;
+        return FALSE;
     }
-    element->cPoints = element->uchars;
+    s.releaseBuffer(cSize);
 
-    startCodePoint = endCodePoint+1;
+    char *pointer = endCodePoint + 1;
 
-    commentStart = strchr(startCodePoint, '#');
+    char *commentStart = strchr(pointer, '#');
     if(commentStart == NULL) {
-        commentStart = strlen(startCodePoint) + startCodePoint;
+        commentStart = strchr(pointer, 0);
     }
 
-    i = 0;
-    uint32_t CEindex = 0;
-    element->noOfCEs = 0;
+    cesLength = 0;
     for(;;) {
-        endCodePoint = strchr(startCodePoint, ']');
-        if(endCodePoint == NULL || endCodePoint >= commentStart) {
+        pointer = skipWhiteSpace(pointer);
+        if(pointer == commentStart) {
             break;
         }
-        pointer = strchr(startCodePoint, '[');
-        pointer++;
-
-        element->sizePrim[i]=readElement(&pointer, primary, ',', status) / 2;
-        element->sizeSec[i]=readElement(&pointer, secondary, ',', status) / 2;
-        element->sizeTer[i]=readElement(&pointer, tertiary, ']', status) / 2;
-
-
-        /* I want to get the CEs entered right here, including continuation */
-        element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, status);
-
-        uint32_t CEi = 1;
-        while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) {
-          uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
-            if(2*CEi<element->sizePrim[i]) {
-                value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
-                value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
-            }
-
-            if(2*CEi+1<element->sizePrim[i]) {
-                value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
-                value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
-            }
-
-            if(CEi<element->sizeSec[i]) {
-                value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
-                value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
-            }
-
-            if(CEi<element->sizeTer[i]) {
-                value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
-                value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
-            }
-
-            CEi++;
-
-            element->CEs[CEindex++] = value;
+        if(cesLength >= 31) {
+            fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer);
+            *status = U_INVALID_FORMAT_ERROR;
+            return FALSE;
         }
-
-      startCodePoint = endCodePoint+1;
-      i++;
-    }
-    element->noOfCEs = CEindex;
-#if 0
-    element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]);
-#endif
-    // we don't want any strange stuff after useful data!
-    if (pointer == NULL) {
-        /* huh? Did we get ']' without the '['? Pair your brackets! */
-        *status=U_INVALID_FORMAT_ERROR;
-    }
-    else {
-        while(pointer < commentStart)  {
-            if(*pointer != ' ' && *pointer != '\t')
-            {
-                *status=U_INVALID_FORMAT_ERROR;
-                break;
-            }
-            pointer++;
+        ces[cesLength++] = parseCE(builder, pointer, *status);
+        if(U_FAILURE(*status)) {
+            fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
+                    buffer, u_errorName(*status));
+            return FALSE;
         }
     }
-    if(element->cSize == 1 && element->cPoints[0] == 0xfffe) {
+
+    if(s.length() == 1 && s[0] == 0xfffe) {
         // UCA 6.0 gives U+FFFE a special minimum weight using the
         // byte 02 which is the merge-sort-key separator and illegal for any
         // other characters.
     } else {
         // Rudimentary check for valid bytes in CE weights.
-        // For a more comprehensive check see cintltst /tscoll/citertst/TestCEValidity
-        for (i = 0; i < (int32_t)CEindex; ++i) {
-            uint32_t value = element->CEs[i];
-            uint8_t bytes[4] = {
-                (uint8_t)(value >> 24),
-                (uint8_t)(value >> 16),
-                (uint8_t)(value >> 8),
-                (uint8_t)(value & UCOL_NEW_TERTIARYORDERMASK)
-            };
-            for (int j = 0; j < 4; ++j) {
-                if (0 != bytes[j] && bytes[j] < 3) {
-                    fprintf(stderr, "Warning: invalid UCA weight byte %02X for %s\n", bytes[j], buffer);
-                    return NULL;
+        // For a more comprehensive check see CollationTest::TestRootElements(),
+        // intltest collate/CollationTest/TestRootElements
+        for (int32_t i = 0; i < cesLength; ++i) {
+            int64_t ce = ces[i];
+            UBool isCompressible = FALSE;
+            for (int j = 7; j >= 0; --j) {
+                uint8_t b = (uint8_t)(ce >> (j * 8));
+                if(j <= 1) { b &= 0x3f; }  // tertiary bytes use 6 bits
+                if (b == 1) {
+                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer);
+                    return FALSE;
+                }
+                if ((j == 7 || j == 3 || j == 1) && b == 2) {
+                    fprintf(stderr, "Warning: invalid UCA weight lead byte 02 for %s\n", buffer);
+                    return FALSE;
+                }
+                if (j == 7) {
+                    isCompressible = builder.isCompressibleLeadByte(b);
+                } else if (j == 6) {
+                    // Primary second bytes 03 and FF are compression terminators.
+                    // 02, 03 and FF are usable when the lead byte is not compressible.
+                    // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
+                    if (isCompressible && (b <= 3 || b == 0xff)) {
+                        fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
+                                b, buffer);
+                        return FALSE;
+                    }
                 }
-            }
-            // Primary second bytes 03 and FF are compression terminators.
-            if (!isContinuation(value) && (bytes[1] == 3 || bytes[1] == 0xFF)) {
-                fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
-                        bytes[1], buffer);
-                return NULL;
             }
         }
     }
 
-    if(U_FAILURE(*status)) {
-        fprintf(stderr, "problem putting stuff in hash table %s\n", u_errorName(*status));
-        *status = U_INTERNAL_PROGRAM_ERROR;
-        return NULL;
-    }
-
-    return element;
+    return TRUE;
 }
 
-
-void writeOutData(UCATableHeader *data,
-                  UCAConstants *consts,
-                  LeadByteConstants *leadByteConstants,
-                  UChar contractions[][MAX_UCA_CONTRACTION_LENGTH],
-                  uint32_t noOfcontractions,
-                  const char *outputDir,
-                  const char *copyright,
-                  UErrorCode *status)
+static void  // Reads all elements from the file into builder, then sets code point order ranges.
+parseFractionalUCA(const char *filename,
+                   CollationBaseDataBuilder &builder,
+                   UErrorCode *status)
 {
-    if(U_FAILURE(*status)) {
+    if(U_FAILURE(*status)) { return; }  // nothing to do if the caller already has an error
+    FILE *data = fopen(filename, "r");
+    if(data == NULL) {
+        fprintf(stderr, "Couldn't open file: %s\n", filename);
+        *status = U_FILE_ACCESS_ERROR;
         return;
     }
+    uint32_t line = 0;  // incremented once per readAnElement() call; used in messages
 
-    uint32_t size = data->size;
+    UChar32 maxCodePoint = 0;  // highest first code point of a normal mapping; bounds the range scan below
+    while(!feof(data)) {
+        if(U_FAILURE(*status)) {
+            fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
+                (int)*status, u_errorName(*status), (unsigned int)line, filename);
+            exit(*status);
+        }
 
-    data->UCAConsts = data->size;
-    data->size += paddedsize(sizeof(UCAConstants));
+        line++;
 
-    if(noOfcontractions != 0) {
-      uprv_memset(&contractions[noOfcontractions][0], 0, MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR);
-      noOfcontractions++;
+        UnicodeString prefix;
+        UnicodeString s;
+        int64_t ces[32] = { 0 };  // zeroed so that ces[0] is defined even if no CE was parsed
+        int32_t cesLength = 0;
+        if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) {
+            // we have read the line, now do something sensible with the read data!
+            uint32_t p = (uint32_t)(ces[0] >> 32);  // upper 32 bits of the first CE
+
+            if(s.length() > 1 && s[0] == 0xFDD0) {
+                // FractionalUCA.txt contractions starting with U+FDD0
+                // are only entered into the inverse table,
+                // not into the normal collation data.
+                builder.addRootElements(ces, cesLength, *status);
+                if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) {
+                    // Lead byte for numeric sorting.
+                    builder.setNumericPrimary(p);
+                }
+            } else {
+                UChar32 c = s.char32At(0);
+                if(c > maxCodePoint) { maxCodePoint = c; }
+
+                // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary.
+                // CollationBaseDataBuilder::init() maps them to special CEs.
+                // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
+                if(0xfffd <= c && c <= 0xffff) { continue; }
+                if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) {
+                    continue;
+                }
 
+                if(0xe0000000 <= p && p < 0xf0000000) {
+                    fprintf(stderr,
+                            "Error: Unexpected mapping to an implicit or trailing primary"
+                            " on line %u of %s.\n",
+                            (unsigned int)line, filename);
+                    exit(U_INVALID_FORMAT_ERROR);
+                }
 
-      data->contractionUCACombos = data->size;
-      data->contractionUCACombosWidth = (uint8_t)MAX_UCA_CONTRACTION_LENGTH;
-      data->contractionUCACombosSize = noOfcontractions;
-      data->size += paddedsize((noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR));
+                builder.add(prefix, s, ces, cesLength, *status);
+            }
+        }
     }
-    data->scriptToLeadByte = data->size;
-    //printf("@@@@ script to lead byte offset = 0x%x (%d)\n", data->size, data->size);
-    data->size +=
-        sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT) +                                                       // index table header
-        leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT * sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]) +    // index table
-        sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET) +                                                       // data table header
-        leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET * sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[0]);        // data table
-    data->leadByteToScript = data->size;
-    //printf("@@@@ lead byte to script offset = 0x%x (%d)\n", data->size, data->size);
-    data->size +=
-        sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH) +                                                      // index table header
-        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[0]) +   // index table
-        sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET) +                                                       // data table header
-        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET * sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA[0]);        // data table
-
-    UNewDataMemory *pData;
-    
-    long dataLength;
-    UDataInfo ucaInfo;
-    uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo));
-    uprv_memcpy(ucaInfo.dataVersion, UCAVersion, U_MAX_VERSION_LENGTH);
-
-    pData=udata_create(outputDir, UCA_DATA_TYPE, UCA_DATA_NAME, &ucaInfo,
-                       copyright, status);
 
-    if(U_FAILURE(*status)) {
-        fprintf(stderr, "Error: unable to create %s"UCA_DATA_NAME", error %s\n", outputDir, u_errorName(*status));
+    int32_t numRanges = 0;
+    int32_t numRangeCodePoints = 0;
+    UChar32 rangeFirst = U_SENTINEL;  // U_SENTINEL (negative) means "no range in progress"
+    UChar32 rangeLast = U_SENTINEL;
+    uint32_t rangeFirstPrimary = 0;
+    uint32_t rangeLastPrimary = 0;
+    int32_t rangeStep = -1;
+
+    // Detect ranges of characters in primary code point order,
+    // with 3-byte primaries and
+    // with consistent "step" differences between adjacent primaries.
+    // This relies on the FractionalUCA generator using the same primary-weight incrementation.
+    // Start at U+0180: No ranges for common Latin characters.
+    // Go one beyond maxCodePoint in case a range ends there.
+    for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) {
+        int32_t action;  // <0: finish range only, 0: continue range, >0: finish and start a new one
+        uint32_t p = builder.getLongPrimaryIfSingleCE(c);
+        if(p != 0) {
+            // p is a "long" (three-byte) primary.
+            if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) {
+                // Find the offset between the two primaries.
+                int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries(
+                    rangeLastPrimary, p, builder.isCompressiblePrimary(p));
+                if(rangeFirst == rangeLast && step >= 2) {
+                    // c == rangeFirst + 1, store the "step" between range primaries.
+                    rangeStep = step;
+                    rangeLast = c;
+                    rangeLastPrimary = p;
+                    action = 0;  // continue range
+                } else if(rangeStep == step) {
+                    // Continue the range with the same "step" difference.
+                    rangeLast = c;
+                    rangeLastPrimary = p;
+                    action = 0;  // continue range
+                } else {
+                    action = 1;  // maybe finish range, start a new one
+                }
+            } else {
+                action = 1;  // maybe finish range, start a new one
+            }
+        } else {
+            action = -1;  // maybe finish range, do not start a new one
+        }
+        if(action != 0 && rangeFirst >= 0) {
+            // Finish a range.
+            // Set offset CE32s for a long range, leave single CEs for a short range.
+            UBool didSetRange = builder.maybeSetPrimaryRange(
+                rangeFirst, rangeLast,
+                rangeFirstPrimary, rangeStep, *status);
+            if(U_FAILURE(*status)) {
+                fprintf(stderr,
+                        "failure setting code point order range U+%04lx..U+%04lx "
+                        "%08lx..%08lx step %d - %s\n",
+                        (unsigned long)rangeFirst, (unsigned long)rangeLast,
+                        (unsigned long)rangeFirstPrimary, (unsigned long)rangeLastPrimary,
+                        (int)rangeStep, u_errorName(*status));
+            } else if(didSetRange) {
+                int32_t rangeLength = rangeLast - rangeFirst + 1;
+                if(beVerbose) {
+                    printf("* set code point order range U+%04lx..U+%04lx [%d] "
+                            "%08lx..%08lx step %d\n",
+                            (unsigned long)rangeFirst, (unsigned long)rangeLast,
+                            (int)rangeLength,
+                            (unsigned long)rangeFirstPrimary, (unsigned long)rangeLastPrimary,
+                            (int)rangeStep);
+                }
+                ++numRanges;
+                numRangeCodePoints += rangeLength;
+            }
+            rangeFirst = U_SENTINEL;
+            rangeStep = -1;
+        }
+        if(action > 0) {
+            // Start a new range.
+            rangeFirst = rangeLast = c;
+            rangeFirstPrimary = rangeLastPrimary = p;
+        }
+    }
+    printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints);
+
+    // Idea: Probably best to work in two passes.
+    // Pass 1 for reading all data, setting isCompressible flags (and reordering groups)
+    // and finding ranges.
+    // Then set the ranges in a newly initialized builder
+    // for optimal compression (makes sure that adjacent blocks can overlap easily).
+    // Then set all mappings outside the ranges.
+    //
+    // In the first pass, we could store mappings in a simple list,
+    // with single-character/single-long-primary-CE mappings in a UTrie2;
+    // or store the mappings in a temporary builder;
+    // or we could just parse the input file again in the second pass.
+    //
+    // Ideally set/copy U+0000..U+017F before setting anything else,
+    // then set default Han/Hangul, then set the ranges, then copy non-range mappings.
+    // It should be easy to copy mappings from an un-built builder to a new one.
+    // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions.
+
+    if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
+        fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
+        fclose(data);
         return;
     }
 
-    /* write the data to the file */
     if (beVerbose) {
-        printf("Writing out UCA table: %s%c%s.%s\n", outputDir,
-                                                        U_FILE_SEP_CHAR,
-                                                        U_ICUDATA_NAME "_" UCA_DATA_NAME,
-                                                        UCA_DATA_TYPE);
+        printf("\nLines read: %u\n", (unsigned int)line);
     }
-    udata_writeBlock(pData, data, size);
 
-    // output the constants here
-    udata_writeBlock(pData, consts, sizeof(UCAConstants));
+    fclose(data);
 
-    if (beVerbose) {
-        printf("first tertiary ignorable = %x %x\n", consts->UCA_FIRST_TERTIARY_IGNORABLE[0], consts->UCA_FIRST_TERTIARY_IGNORABLE[1]);
-        printf("last tertiary ignorable = %x %x\n", consts->UCA_LAST_TERTIARY_IGNORABLE[0], consts->UCA_LAST_TERTIARY_IGNORABLE[1]);
-        printf("first secondary ignorable = %x %x\n", consts->UCA_FIRST_SECONDARY_IGNORABLE[0], consts->UCA_FIRST_SECONDARY_IGNORABLE[1]);
-        printf("contractionUCACombosSize = %d\n", data->contractionUCACombosSize);
-        printf("contractionSize = %d\n", data->contractionSize);
-        printf("number of UCA contractions = %d\n", noOfcontractions);
+    return;
+}
+
+static void  // Builds the base collation data from builder and writes it out via udata_create()/udata_finish().
+buildAndWriteBaseData(CollationBaseDataBuilder &builder,
+                      const char *path, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return; }  // nothing to do if the caller already has an error
+
+    if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) {
+        fprintf(stderr, "error: unexpected [fixed secondary common byte]\n");
+        errorCode = U_INVALID_FORMAT_ERROR;
+        return;
     }
-    
-    if(noOfcontractions != 0) {
-      udata_writeBlock(pData, contractions, noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR);
-      udata_writePadding(pData, paddedsize((noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR)) - noOfcontractions*MAX_UCA_CONTRACTION_LENGTH*U_SIZEOF_UCHAR);
+    if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) {
+        fprintf(stderr, "error: unexpected [fixed tertiary common byte]\n");
+        errorCode = U_INVALID_FORMAT_ERROR;
+        return;
     }
 
-    // output the script to lead bytes table here
-    if (beVerbose) {
-        printf("Writing Script to Lead Byte Data\n");
-        printf("\tindex table size = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT);
-        printf("\tdata block size = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET);
+    if(leadByteScripts != NULL) {
+        uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1;
+        do {
+            // Find the range of lead bytes with this set of scripts.
+            const UnicodeString &firstScripts = leadByteScripts[firstLead];
+            if(firstScripts.isEmpty()) {
+                fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead);
+                errorCode = U_INVALID_FORMAT_ERROR;
+                return;
+            }
+            uint32_t lead = firstLead;
+            for(;;) {
+                ++lead;
+                const UnicodeString &scripts = leadByteScripts[lead];
+                // The scripts should either be the same or disjoint.
+                // We do not test if all reordering groups have disjoint sets of scripts.
+                if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; }
+                if(scripts != firstScripts) {
+                    fprintf(stderr,
+                            "[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] "
+                            "but not all scripts match\n",
+                            (int)firstLead, scripts[0], (int)lead);
+                    errorCode = U_INVALID_FORMAT_ERROR;
+                    return;
+                }
+            }
+            // lead is one greater than the last lead byte with the same set of scripts as firstLead.
+            builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode);
+            if(U_FAILURE(errorCode)) { return; }
+            firstLead = lead;
+        } while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE);
+        delete[] leadByteScripts;  // NOTE(review): not reached on the early error returns above
+    }
+
+    CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
+    builder.enableFastLatin();
+    builder.build(data, errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "builder.build() failed: %s\n",
+                u_errorName(errorCode));
+        return;
     }
-    udata_write16(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT);
-    udata_write16(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET);
-//     printf("#### Script to Lead Byte Index Before Sort\n");
-//     for (int reorderCodeIndex = 0; reorderCodeIndex < leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; reorderCodeIndex++) {
-//         printf("\t%04x = %04x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].offset);
-//     }
-    qsort(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT, sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]), ReorderIndexComparer);
-    udata_writeBlock(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT * sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]));
-//     printf("#### Script to Lead Byte Index After Sort\n");
-//     for (int reorderCodeIndex = 0; reorderCodeIndex < leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; reorderCodeIndex++) {
-//         printf("\t%04x = %04x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].offset);
-//     }
-    
-    // write out the script to lead bytes data block
-    udata_writeBlock(pData, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET * sizeof(*leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA));
-    
-    if (beVerbose) {
-        printf("Writing Lead Byte To Script Data\n");
-        printf("\tindex table size = %x\n", leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH);
-        printf("\tdata block size = %x\n", leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET);
+
+    // The CollationSettings constructor gives us the properly encoded
+    // default options, so that we need not duplicate them here.
+    CollationSettings settings;
+
+    UVector32 rootElements(errorCode);
+    for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) {
+        rootElements.addElement(0, errorCode);  // reserve the index slots; filled in below
     }
-    // output the header info
-    udata_write16(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH);
-    udata_write16(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET);
-    
-    // output the index table
-    udata_writeBlock(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX, 
-        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX)[0]);
-//     for (int leadByte = 0; leadByte < leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH; leadByte++) {
-//         printf("\t%02x = %04x\n", leadByte, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte]);
-//     }
-
-    // output the data
-    udata_writeBlock(pData, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA, 
-        leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET * sizeof(*leadByteConstants->LEAD_BYTE_TO_SCRIPTS_DATA));
-
-    
-    /* finish up */
-    dataLength=udata_finish(pData, status);
-    if(U_FAILURE(*status)) {
-        fprintf(stderr, "Error: error %d writing the output file\n", *status);
+    builder.buildRootElementsTable(rootElements, errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n",
+                u_errorName(errorCode));
         return;
     }
-}
+    int32_t index = CollationRootElements::IX_COUNT;  // first element after the index slots
+    rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX);
 
-enum {
-    /*
-     * Maximum number of UCA contractions we can store.
-     * May need to be increased for a new Unicode version.
-     */
-    MAX_UCA_CONTRACTIONS=2048
-};
+    while(index < rootElements.size() && (rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; }
+    rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX);
 
-static int32_t
-write_uca_table(const char *filename,
-                const char *outputDir,
-                const char *copyright,
-                UErrorCode *status)
-{
-    FILE *data = fopen(filename, "r");
-    if(data == NULL) {
-        fprintf(stderr, "Couldn't open file: %s\n", filename);
-        return -1;
+    while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
+        ++index;
     }
-    uint32_t line = 0;
-    UCAElements *element = NULL;
-    UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
-    /* test for NULL */
-    if(myD == NULL) {
-        *status = U_MEMORY_ALLOCATION_ERROR;
-        fclose(data);
-        return 0;
+    rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX);
+
+    rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE,
+                              CollationRootElements::IX_COMMON_SEC_AND_TER_CE);
+
+    int32_t secTerBoundaries = (int32_t)((uint32_t)getOptionValue("[fixed last secondary common byte") << 24);
+    secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16;
+    secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte");
+    rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES);
+
+    LocalMemory<uint8_t> buffer;
+    int32_t capacity = 1000000;  // fixed size of the serialization buffer
+    uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
+    if(dest == NULL) {
+        fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
+                (long)capacity);
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
     }
-    uprv_memset(myD, 0, sizeof(UCATableHeader));
-    UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
-    /* test for NULL */
-    if(opts == NULL) {
-        *status = U_MEMORY_ALLOCATION_ERROR;
-        uprv_free(myD);
-        fclose(data);
-        return 0;
+    int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
+    int32_t totalSize = CollationDataWriter::writeBase(
+            data, settings,
+            rootElements.getBuffer(), rootElements.size(),
+            indexes, dest, capacity,
+            errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n",
+                (long)capacity, u_errorName(errorCode));
+        return;
     }
-    uprv_memset(opts, 0, sizeof(UColOptionSet));
-    UChar contractions[MAX_UCA_CONTRACTIONS][MAX_UCA_CONTRACTION_LENGTH];
-    uprv_memset(contractions, 0, sizeof(contractions));
-    uint32_t noOfContractions = 0;
-    UCAConstants consts;
-    uprv_memset(&consts, 0, sizeof(consts));
-#if 0
-    UCAConstants consts = {
-      UCOL_RESET_TOP_VALUE,
-      UCOL_FIRST_PRIMARY_IGNORABLE,
-      UCOL_LAST_PRIMARY_IGNORABLE,
-      UCOL_LAST_PRIMARY_IGNORABLE_CONT,
-      UCOL_FIRST_SECONDARY_IGNORABLE,
-      UCOL_LAST_SECONDARY_IGNORABLE,
-      UCOL_FIRST_TERTIARY_IGNORABLE,
-      UCOL_LAST_TERTIARY_IGNORABLE,
-      UCOL_FIRST_VARIABLE,
-      UCOL_LAST_VARIABLE,
-      UCOL_FIRST_NON_VARIABLE,
-      UCOL_LAST_NON_VARIABLE,
-
-      UCOL_NEXT_TOP_VALUE,
-/*
-      UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
-      UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
-      UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
-      UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
-      UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
-      UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
-      UCOL_NEXT_FIRST_VARIABLE,
-      UCOL_NEXT_LAST_VARIABLE,
-*/
-
-      PRIMARY_IMPLICIT_MIN,
-      PRIMARY_IMPLICIT_MAX
-    };
-#endif
-
-    //printf("Allocating LeadByteConstants\n");
-    LeadByteConstants leadByteConstants;
-    uprv_memset(&leadByteConstants, 0x00, sizeof(LeadByteConstants));
-    
-    leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH = 256;
-    leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX = (ReorderIndex*) uprv_malloc(leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH * sizeof(ReorderIndex));
-    uprv_memset(leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX, 0x00, leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH * sizeof(ReorderIndex));
-    leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA_LENGTH = 1024;
-    leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA = (uint16_t*) uprv_malloc(leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA_LENGTH * sizeof(uint16_t));
-    uprv_memset(leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA, 0x00, leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA_LENGTH * sizeof(uint16_t));
-    //printf("\tFinished Allocating LeadByteConstants\n");
-    
-    leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH = 256;
-    leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX = (uint16_t*) uprv_malloc(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(uint16_t));
-    uprv_memset(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX, 0x8000 | USCRIPT_INVALID_CODE, leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof(uint16_t));
-    leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH = 1024;
-    leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET = 1;     // offset by 1 to leave zero location for those lead bytes with no reorder codes
-    leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA = (uint16_t*) uprv_malloc(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH * sizeof(uint16_t));
-    uprv_memset(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA, 0x00, leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH * sizeof(uint16_t));
-
-    uprv_memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF);
-
-    opts->variableTopValue = 0;
-    opts->strength = UCOL_TERTIARY;
-    opts->frenchCollation = UCOL_OFF;
-    opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/
-    opts->caseFirst = UCOL_OFF;         /* who goes first, lower case or uppercase */
-    opts->caseLevel = UCOL_OFF;         /* do we have an extra case level */
-    opts->normalizationMode = UCOL_OFF; /* attribute for normalization */
-    opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */
-    opts->numericCollation = UCOL_OFF;
-    myD->jamoSpecial = FALSE;
-
-    tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, LEAD_SURROGATE_TAG, status);
-    if(U_FAILURE(*status))
-    {
-        fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status));
-        uprv_free(opts);
-        uprv_free(myD);
-        fclose(data);
-        return -1;
+    printf("*** CLDR root collation part sizes ***\n");
+    CollationInfo::printSizes(totalSize, indexes);
+    printf("*** CLDR root collation size:   %6ld (with file header but no copyright string)\n",
+           (long)totalSize + 32);  // 32 bytes = DataHeader rounded up to 16-byte boundary
+
+    CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
+    UNewDataMemory *pData=udata_create(path, "icu", "ucadata", &ucaDataInfo,
+                                       withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n",
+                path, u_errorName(errorCode));
+        return;
     }
 
-    // * set to zero
-    struct {
-        UChar32 start;
-        UChar32 end;
-        int32_t value;
-    } ranges[] =
-    {
-        {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) },  //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
-        //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24)  },  //1 LEAD_SURROGATE_TAG, already set in utrie_open() /* D800-DBFF*/
-        {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) },  //2 TRAIL_SURROGATE DC00-DFFF
-        // Now directly handled in the collation code by the swapCJK function.
-        //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //3 CJK_IMPLICIT_TAG,   /* 0x3400-0x4DB5*/
-        //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //4 CJK_IMPLICIT_TAG,   /* 0x4E00-0x9FA5*/
-        //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //5 CJK_IMPLICIT_TAG,   /* 0xF900-0xFA2D*/
-        //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //6 CJK_IMPLICIT_TAG,   /* 0x20000-0x2A6D6*/
-        //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //7 CJK_IMPLICIT_TAG,   /* 0x2F800-0x2FA1D*/
-    };
-    uint32_t i = 0;
-
-    for(i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++) {
-      /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
-      utrie_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value, TRUE);
+    udata_writeBlock(pData, dest, totalSize);
+    long dataLength = udata_finish(pData, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode));
+        return;
     }
 
+    if(dataLength != (long)totalSize) {  // sanity check: bytes written must equal bytes serialized
+        fprintf(stderr,
+                "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n",
+                dataLength, (long)totalSize);
+        errorCode=U_INTERNAL_PROGRAM_ERROR;
+    }
+}
 
-    int32_t surrogateCount = 0;
-    while(!feof(data)) {
-        if(U_FAILURE(*status)) {
-            fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
-                *status, u_errorName(*status), (int)line, filename);
-            exit(*status);
-        }
-
-        line++;
-        if(beVerbose) {
-          printf("%u ", (int)line);
+/**
+ * Adds each lead surrogate (U+D800..U+DBFF) to the bmp set if any of the 1024
+ * associated supplementary code points is in the supp set.
+ * bmp and supp can be one and the same set; only bmp is modified.
+ */
+static void
+setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) {
+    UChar32 c = 0x10000;  // first supplementary code point associated with the current lead surrogate
+    for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {  // each lead surrogate covers 0x400 code points
+        if(supp.containsSome(c, c + 0x3ff)) {  // any member in [c, c+0x3ff] is enough
+            bmp.add(lead);
         }
-        element = readAnElement(data, t, &consts, &leadByteConstants, status);
-        if(element != NULL) {
-            // we have read the line, now do something sensible with the read data!
+    }
+}
 
-            // if element is a contraction, we want to add it to contractions[]
-            int32_t length = (int32_t)element->cSize;
-            if(length > 1 && element->cPoints[0] != 0xFDD0) { // this is a contraction
-              if(U16_IS_LEAD(element->cPoints[0]) && U16_IS_TRAIL(element->cPoints[1]) && length == 2) {
-                surrogateCount++;
-              } else {
-                if(noOfContractions>=MAX_UCA_CONTRACTIONS) {
-                  fprintf(stderr,
-                          "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTIONS in genuca.cpp. "
-                          "Exiting...\n",
-                          (int)MAX_UCA_CONTRACTIONS);
-                  exit(U_BUFFER_OVERFLOW_ERROR);
-                }
-                if(length > MAX_UCA_CONTRACTION_LENGTH) {
-                  fprintf(stderr,
-                          "\nLine %d: Contraction of length %d is too long. Please increase MAX_UCA_CONTRACTION_LENGTH in genuca.cpp. "
-                          "Exiting...\n",
-                          (int)line, (int)length);
-                  exit(U_BUFFER_OVERFLOW_ERROR);
-                }
-                UChar *t = &contractions[noOfContractions][0];
-                u_memcpy(t, element->cPoints, length);
-                t += length;
-                for(; length < MAX_UCA_CONTRACTION_LENGTH; ++length) {
-                    *t++ = 0;
+static int32_t  // returns the number of bits[] entries in use (at least 2), or 0 on failure
+makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256],  // index[]: one entry per block of 32 BMP code points
+                    UErrorCode &errorCode) {  // set to U_BUFFER_OVERFLOW_ERROR when a new word is needed but bits[] already holds 256
+    if(U_FAILURE(errorCode)) { return 0; }
+    bits[0] = 0;  // no bits set
+    bits[1] = 0xffffffff;  // all bits set
+    int32_t bitsLength = 2;  // the two predefined words above are always present
+    int32_t i = 0;  // index[] position of the current 32-code-point block
+    for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) {
+        if(set.containsNone(c, c + 0x1f)) {
+            index[i] = 0;  // block entirely outside the set: share bits[0]
+        } else if(set.contains(c, c + 0x1f)) {
+            index[i] = 1;  // block entirely inside the set: share bits[1]
+        } else {
+            uint32_t b = 0;  // mixed block: bit j is set iff code point c + j is in the set
+            for(int32_t j = 0; j <= 0x1f; ++j) {
+                if(set.contains(c + j)) {
+                    b |= (uint32_t)1 << j;
                 }
-                noOfContractions++;
-              }
             }
-            else {
-                // TODO (claireho): does this work? Need more tests
-                // The following code is to handle the UCA pre-context rules
-                // for L/l with middle dot. We share the structures for contractionCombos.
-                // The format for pre-context character is
-                // contractions[0]: codepoint in element->cPoints[0]
-                // contractions[1]: '\0' to differentiate from a contraction
-                // contractions[2]: prefix char
-                if (element->prefixSize>0) {
-                    if(length > 1 || element->prefixSize > 1) {
-                        fprintf(stderr,
-                                "\nLine %d: Character with prefix, "
-                                "either too many characters or prefix too long.\n",
-                                (int)line);
-                        exit(U_INTERNAL_PROGRAM_ERROR);
-                    }
-                    if(noOfContractions>=MAX_UCA_CONTRACTIONS) {
-                      fprintf(stderr,
-                              "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTIONS in genuca.cpp. "
-                              "Exiting...\n",
-                              (int)MAX_UCA_CONTRACTIONS);
-                      exit(U_BUFFER_OVERFLOW_ERROR);
-                    }
-                    UChar *t = &contractions[noOfContractions][0];
-                    t[0]=element->cPoints[0];
-                    t[1]=0;
-                    t[2]=element->prefixChars[0];
-                    t += 3;
-                    for(length = 3; length < MAX_UCA_CONTRACTION_LENGTH; ++length) {
-                        *t++ = 0;
+            int32_t k;  // position of b in bits[], either found or appended below
+            for(k = 2;; ++k) {  // linear search; a mixed word is neither 0 nor 0xffffffff, so entries 0 and 1 are skipped
+                if(k == bitsLength) {
+                    // new bit combination: append it unless bits[] is already full
+                    if(bitsLength == 256) {
+                        errorCode = U_BUFFER_OVERFLOW_ERROR;
+                        return 0;
                     }
-                    noOfContractions++;
+                    bits[bitsLength++] = b;
+                    break;
+                }
+                if(bits[k] == b) {
+                    // duplicate bit combination: reuse the existing entry
+                    break;
                 }
             }
-
-            /* we're first adding to inverse, because addAnElement will reverse the order */
-            /* of code points and stuff... we don't want that to happen */
-            if((element->CEs[0] >> 24) != 2) {
-                // Add every element except for the special minimum-weight character U+FFFE
-                // which has 02 weights.
-                // If we had 02 weights in the invuca table, then tailoring primary
-                // after an ignorable would try to put a weight before 02 which is not valid.
-                // We could fix this in a complicated way in the from-rule-string builder,
-                // but omitting this special element from invuca is simple and effective.
-                addToInverse(element, status);
-            }
-            if(!(length > 1 && element->cPoints[0] == 0xFDD0)) {
-              uprv_uca_addAnElement(t, element, status);
-            }
+            index[i] = k;  // k is at most 255 here, so it fits in uint8_t
         }
     }
+    return bitsLength;
+}
 
-    if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
-        fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
-        uprv_uca_closeTempTable(t);
-        uprv_free(opts);
-        uprv_free(myD);
-        fclose(data);
-        return -1;
-    }
-/*    {
-        uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL);
-    }*/
-
-    if (beVerbose) {
-        printf("\nLines read: %u\n", (int)line);
-        printf("Surrogate count: %i\n", (int)surrogateCount);
-        printf("Raw data breakdown:\n");
-        /*printf("Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
-        printf("Number of contractions: %u\n", (int)noOfContractions);
-        printf("Contraction image size: %u\n", (int)t->image->contractionSize);
-        printf("Expansions size: %i\n", (int)t->expansions->position);
-    }
-
-
-    /* produce canonical closure for table */
-    /* first set up constants for implicit calculation */
-    uprv_uca_initImplicitConstants(status);
-    /* do the closure */
-    UnicodeSet closed;
-    int32_t noOfClosures = uprv_uca_canonicalClosure(t, NULL, &closed, status);
-    if(noOfClosures != 0) {
-        fprintf(stderr, "Warning: %i canonical closures occured!\n", (int)noOfClosures);
-        UnicodeString pattern;
-        std::string utf8;
-        closed.toPattern(pattern, TRUE).toUTF8String(utf8);
-        fprintf(stderr, "UTF-8 pattern string: %s\n", utf8.c_str());
-    }
-
-    /* test */
-    UCATableHeader *myData = uprv_uca_assembleTable(t, status);  
-
-    if (beVerbose) {
-        printf("Compacted data breakdown:\n");
-        /*printf("Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
-        printf("Number of contractions: %u\n", (int)noOfContractions);
-        printf("Contraction image size: %u\n", (int)t->image->contractionSize);
-        printf("Expansions size: %i\n", (int)t->expansions->position);
-    }
+// TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values,
+// use that rather than properties APIs.
+// Then consider moving related logic for the unsafeBwdSet back from the loader into this builder.
 
-    if(U_FAILURE(*status)) {
-        fprintf(stderr, "Error creating table: %s\n", u_errorName(*status));
-        uprv_uca_closeTempTable(t);
-        uprv_free(opts);
-        uprv_free(myD);
-        fclose(data);
-        return -1;
+/**
+ * Builds data for the FCD check fast path and writes it as C++ source (collationfcd.cpp).
+ * For details see the CollationFCD class comments.
+ */
+static void
+buildAndWriteFCDData(const char *path, UErrorCode &errorCode) {  // path is handed to usrc_create(); errorCode is in/out
+    UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode);  // lccc!=0, plus all trail surrogates
+    UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode);  // tccc!=0
+    if(U_FAILURE(errorCode)) { return; }
+    setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet);
+    // The following supp(lccc)->lead(tccc) should be unnecessary
+    // after the previous supp(tccc)->lead(tccc)
+    // because there should not be any characters with lccc!=0 and tccc=0.
+    // It is safe and harmless.
+    setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet);
+    setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet);
+    uint8_t lcccIndex[0x800], tcccIndex[0x800];  // one index byte per 32 BMP code points (0x10000 / 0x20)
+    uint32_t lcccBits[256], tcccBits[256];  // 256 is the capacity that makeBMPFoldedBitSet() enforces
+    int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode);
+    int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode);
+    printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4);  // 0x800 index bytes + 4 bytes per bits word
+    printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4);  // printed even if the build failed (length is then 0)
+
+    if(U_FAILURE(errorCode)) { return; }  // covers failures from both makeBMPFoldedBitSet() calls
+
+    FILE *f=usrc_create(path, "collationfcd.cpp",
+                        "icu/tools/unicode/c/genuca/genuca.cpp");  // presumably recorded in the output as its generator -- confirm in usrc_create()
+    if(f==NULL) {
+        errorCode=U_FILE_ACCESS_ERROR;
+        return;
     }
+    fputs("#include \"unicode/utypes.h\"\n\n", f);
+    fputs("#if !UCONFIG_NO_COLLATION\n\n", f);
+    fputs("#include \"collationfcd.h\"\n\n", f);
+    fputs("U_NAMESPACE_BEGIN\n\n", f);
+    usrc_writeArray(f,
+        "const uint8_t CollationFCD::lcccIndex[%ld]={\n",
+        lcccIndex, 8, 0x800,  // presumably (array, element width in bits, length) -- confirm against usrc_writeArray()
+        "\n};\n\n");
+    usrc_writeArray(f,
+        "const uint32_t CollationFCD::lcccBits[%ld]={\n",
+        lcccBits, 32, lcccBitsLength,  // only the entries actually in use are written
+        "\n};\n\n");
+    usrc_writeArray(f,
+        "const uint8_t CollationFCD::tcccIndex[%ld]={\n",
+        tcccIndex, 8, 0x800,
+        "\n};\n\n");
+    usrc_writeArray(f,
+        "const uint32_t CollationFCD::tcccBits[%ld]={\n",
+        tcccBits, 32, tcccBitsLength,
+        "\n};\n\n");
+    fputs("U_NAMESPACE_END\n\n", f);
+    fputs("#endif  // !UCONFIG_NO_COLLATION\n", f);
+    fclose(f);  // NOTE(review): write/close errors are not checked here
+}
 
-    /* populate the version info struct with version info*/
-    myData->version[0] = UCOL_BUILDER_VERSION;
-    myData->version[1] = UCAVersion[0];
-    myData->version[2] = UCAVersion[1];
-    myData->version[3] = UCAVersion[2];
-    /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
-    // Removed this macro. Instead, we use the fields below
-    //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
-    //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
-    uprv_memcpy(myData->UCAVersion, UCAVersion, sizeof(UVersionInfo));
-    u_getUnicodeVersion(myData->UCDVersion);
-
-    writeOutData(myData, &consts, &leadByteConstants, contractions, noOfContractions, outputDir, copyright, status);
-
-    InverseUCATableHeader *inverse = assembleInverseTable(status);
-    uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo));
-    writeOutInverseData(inverse, outputDir, copyright, status);
-
-    uprv_uca_closeTempTable(t);
-    uprv_free(myD);
-    uprv_free(opts);
-
-    uprv_free(myData);
-    uprv_free(inverse);
-    
-    uprv_free(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_INDEX);
-    uprv_free(leadByteConstants.LEAD_BYTE_TO_SCRIPTS_DATA);
-    uprv_free(leadByteConstants.SCRIPT_TO_LEAD_BYTES_INDEX);
-    uprv_free(leadByteConstants.SCRIPT_TO_LEAD_BYTES_DATA);
-    
-    fclose(data);
-
-    return 0;
+static void  // top-level driver: parse FractionalUCA.txt, then write the binary root data and the FCD source file
+parseAndWriteCollationRootData(
+        const char *fracUCAPath,  // main() passes <root>/source/data/unidata/FractionalUCA.txt
+        const char *binaryDataPath,  // main() passes <root>/source/data/in/coll
+        const char *sourceCodePath,  // main() passes <root>/source/i18n
+        UErrorCode &errorCode) {  // in/out; nothing is done if it already indicates a failure
+    if(U_FAILURE(errorCode)) { return; }
+    CollationBaseDataBuilder builder(errorCode);
+    builder.init(errorCode);
+    parseFractionalUCA(fracUCAPath, builder, &errorCode);  // this callee takes the error code by pointer
+    buildAndWriteBaseData(builder, binaryDataPath, errorCode);  // NOTE(review): no check between steps; presumably each callee returns early on failure -- confirm
+    buildAndWriteFCDData(sourceCodePath, errorCode);
 }
 
-#endif /* #if !UCONFIG_NO_COLLATION */
+// ------------------------------------------------------------------------- ***
 
 enum {
     HELP_H,
     HELP_QUESTION_MARK,
-    COPYRIGHT,
-    VERSION,
     VERBOSE,
-    ICUDATADIR
+    COPYRIGHT  // these constants index options[] in main(); keep both lists in the same order
 };
 
-/* Keep these values in sync with the above enums */
 static UOption options[]={
     UOPTION_HELP_H,
     UOPTION_HELP_QUESTION_MARK,
-    UOPTION_COPYRIGHT,
-    UOPTION_VERSION,
     UOPTION_VERBOSE,
-    UOPTION_ICUDATADIR
+    UOPTION_COPYRIGHT  // entry order must match the HELP_H..COPYRIGHT enum that main() uses to index this array
 };
 
-int main(int argc, char* argv[]) {
-    uprv_memset(&UCAVersion, 0, 4);
-
+extern "C" int
+main(int argc, char* argv[]) {
     U_MAIN_INIT_ARGS(argc, argv);
+
     argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
 
     /* error handling, printing usage message */
@@ -1525,108 +1045,61 @@ int main(int argc, char* argv[]) {
             "error in command line argument \"%s\"\n",
             argv[-argc]);
     }
-    if(argc<2 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
+    if( argc<2 ||
+        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
+    ) {
+        /*
+         * Broken into chunks because the C89 standard says the minimum
+         * required supported string length is 509 bytes.
+         */
+        fprintf(stderr,
+            "Usage: %s [-options] path/to/ICU/src/root\n"
+            "\n"
+            "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n"
+            "writes source and binary data files with the collation root data.\n"
+            "\n",
+            argv[0]);
         fprintf(stderr,
-            "usage: %s [-options] path/to/ICU/src/root\n"
-            "\tRead in UCA collation text data and write out the binary collation data\n"
-            "options:\n"
+            "Options:\n"
             "\t-h or -? or --help  this usage text\n"
-            "\t-V or --version     show a version message\n"
-            "\t-c or --copyright   include a copyright notice\n"
-            "\t-v or --verbose     turn on verbose output\n"
-            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
-            "\t                    followed by path, defaults to %s\n",
-            argv[0], u_getDataDirectory());
-        return argc<2 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
-    }
-    if(options[VERSION].doesOccur) {
-        printf("genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
-#if UCONFIG_NO_COLLATION
-            0, 0
-#else
-            UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1
-#endif
-            );
-        printf(U_COPYRIGHT_STRING"\n");
-        exit(0);
+            "\t-v or --verbose     verbose output\n"
+            "\t-c or --copyright   include a copyright notice\n");
+        return (argc>=0 && (options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur)) ? U_ZERO_ERROR : U_ILLEGAL_ARGUMENT_ERROR;  // usage without an explicit help request means a bad option or a missing root path
     }
 
-    /* get the options values */
-    beVerbose = options[VERBOSE].doesOccur;
-
-    const char *copyright = NULL;
-    if (options[COPYRIGHT].doesOccur) {
-        copyright = U_COPYRIGHT_STRING;
-    }
+    beVerbose=options[VERBOSE].doesOccur;
+    withCopyright=options[COPYRIGHT].doesOccur;
 
-    if (options[ICUDATADIR].doesOccur) {
-        u_setDataDirectory(options[ICUDATADIR].value);
-    }
-    /* Initialize ICU */
     IcuToolErrorCode errorCode("genuca");
-    u_init(errorCode);
-    if (errorCode.isFailure() && errorCode.get() != U_FILE_ACCESS_ERROR) {
-        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
-            argv[0], errorCode.errorName());
-        exit(errorCode.reset());
-    }
-    errorCode.reset();
 
     CharString icuSrcRoot(argv[1], errorCode);
 
-    CharString icuSourceData(icuSrcRoot, errorCode);
-    icuSourceData.appendPathPart("source", errorCode);
-    icuSourceData.appendPathPart("data", errorCode);
-
-    CharString srcDir(icuSourceData, errorCode);
-    srcDir.appendPathPart("unidata", errorCode);
-
-    CharString destDir(icuSourceData, errorCode);
-    destDir.appendPathPart("in", errorCode);
-    destDir.appendPathPart("coll", errorCode);
+    CharString icuSource(icuSrcRoot, errorCode);
+    icuSource.appendPathPart("source", errorCode);
 
-    CharString ucaFile(srcDir, errorCode);
-    ucaFile.appendPathPart("FractionalUCA.txt", errorCode);
+    CharString icuSourceData(icuSource, errorCode);
+    icuSourceData.appendPathPart("data", errorCode);
 
-    if(errorCode.isFailure()) {
-        fprintf(stderr, "genuca: unable to build file paths - %s\n",
-                errorCode.errorName());
-        return errorCode.reset();
-    }
+    CharString fracUCAPath(icuSourceData, errorCode);
+    fracUCAPath.appendPathPart("unidata", errorCode);
+    fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode);
 
-#if UCONFIG_NO_COLLATION
+    CharString sourceDataInColl(icuSourceData, errorCode);
+    sourceDataInColl.appendPathPart("in", errorCode);
+    sourceDataInColl.appendPathPart("coll", errorCode);
 
-    UNewDataMemory *pData;
-    const char *msg;
-    
-    msg = "genuca writes dummy " UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
-    fprintf(stderr, "%s\n", msg);
-    pData = udata_create(destDir.data(), UCA_DATA_TYPE, UCA_DATA_NAME, &dummyDataInfo,
-                         NULL, errorCode);
-    udata_writeBlock(pData, msg, strlen(msg));
-    udata_finish(pData, errorCode);
-
-    msg = "genuca writes dummy " INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
-    fprintf(stderr, "%s\n", msg);
-    pData = udata_create(destDir.data(), INVC_DATA_TYPE, INVC_DATA_NAME, &dummyDataInfo,
-                         NULL, errorCode);
-    udata_writeBlock(pData, msg, strlen(msg));
-    udata_finish(pData, errorCode);
-
-    return errorCode.reset();
+    CharString sourceI18n(icuSource, errorCode);
+    sourceI18n.appendPathPart("i18n", errorCode);
 
-#else
+    errorCode.assertSuccess();
 
-    return write_uca_table(ucaFile.data(), destDir.data(), copyright, errorCode);
+    parseAndWriteCollationRootData(
+        fracUCAPath.data(),
+        sourceDataInColl.data(),
+        sourceI18n.data(),
+        errorCode);
 
-#endif
+    return errorCode;
 }
 
-/*
- * Hey, Emacs, please set the following:
- *
- * Local Variables:
- * indent-tabs-mode: nil
- * End:
- *
- */
+#endif  // UCONFIG_NO_COLLATION
diff --git a/tools/unicode/c/genuca/genuca.h b/tools/unicode/c/genuca/genuca.h
deleted file mode 100644
index a98e6699428..00000000000
--- a/tools/unicode/c/genuca/genuca.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
-*******************************************************************************
-*
-*   Copyright (C) 2000-2004, International Business Machines
-*   Corporation and others.  All Rights Reserved.
-*
-*******************************************************************************
-*   file name:  genuca.h
-*   encoding:   US-ASCII
-*   tab size:   8 (not used)
-*   indentation:4
-*
-*   created at the end of XX century
-*   created by: Vladimir Weinstein
-*
-*   This program reads the Franctional UCA table and generates
-*   internal format for UCA table as well as inverse UCA table.
-*   It then writes binary files containing the data: ucadata.dat 
-*   & invuca.dat
-*/
-
-#ifndef UCADATA_H
-#define UCADATA_H
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_COLLATION
-
-#include "ucol_elm.h"
-#include <stdio.h>
-#include <string.h>
-#include "unicode/utypes.h"
-#include "unicode/uchar.h"
-#include "ucol_imp.h"
-#include "uhash.h"
-#include "unewdata.h"
-
-
-void deleteElement(void *element);
-int32_t readElement(char **from, char *to, char separator, UErrorCode *status);
-uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UBool caseBit, UErrorCode *status);
-void printOutTable(UCATableHeader *myData, UErrorCode *status);
-UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UErrorCode *status);
-
-#endif /* #if !UCONFIG_NO_COLLATION */
-
-#endif
diff --git a/tools/unicode/c/genuca/genuca.vcproj b/tools/unicode/c/genuca/genuca.vcproj
index 0056316e727..9c6919350ae 100644
--- a/tools/unicode/c/genuca/genuca.vcproj
+++ b/tools/unicode/c/genuca/genuca.vcproj
@@ -402,10 +402,6 @@
 			Name="Header Files"
 			Filter="h;hpp;hxx;hm;inl"
 			>
-			<File
-				RelativePath=".\genuca.h"
-				>
-			</File>
 		</Filter>
 		<Filter
 			Name="Resource Files"