ICU-12526 genuca: add new script sample characters, more readable error output

author Markus Scherer <markus.icu@gmail.com>

Fri, 6 May 2016 23:19:36 +0000 (23:19 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 6 May 2016 23:19:36 +0000 (23:19 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 6 May 2016 23:19:36 +0000 (23:19 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 6 May 2016 23:19:36 +0000 (23:19 +0000)
diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp

index b152cdbd35bc71a47b4924e543db6b14af1e59eb..7332c4d8b1e775bb371f15a97560b66a50213912 100644 (file)
--- a/tools/unicode/c/genuca/genuca.cpp
+++ b/tools/unicode/c/genuca/genuca.cpp
@@ -236,6 +236,7 @@ static const struct {
      { 0x078C, USCRIPT_THAANA },
      { 0x07CA, USCRIPT_NKO },
      { 0x07D8, USCRIPT_NKO },
+    { 0x2D30, USCRIPT_TIFINAGH },
      { 0x2D5E, USCRIPT_TIFINAGH },
      { 0x12A0, USCRIPT_ETHIOPIC },
      { 0x0905, USCRIPT_DEVANAGARI },
@@ -258,6 +259,7 @@ static const struct {
      { 0x112BE, USCRIPT_KHUDAWADI },
      { 0x1128F, USCRIPT_MULTANI },
      { 0x11315, USCRIPT_GRANTHA },
+    { 0x11412, USCRIPT_NEWA },
      { 0x11484, USCRIPT_TIRHUTA },
      { 0x1158E, USCRIPT_SIDDHAM },
      { 0x1160E, USCRIPT_MODI },
@@ -266,10 +268,12 @@ static const struct {
      { 0x1B83, USCRIPT_SUNDANESE },
      { 0x11005, USCRIPT_BRAHMI },
      { 0x10A00, USCRIPT_KHAROSHTHI },
+    { 0x11C0E, USCRIPT_BHAIKSUKI },
      { 0x0E17, USCRIPT_THAI },
      { 0x0EA5, USCRIPT_LAO },
      { 0xAA80, USCRIPT_TAI_VIET },
      { 0x0F40, USCRIPT_TIBETAN },
+    { 0x11C72, USCRIPT_MARCHEN },
      { 0x1C00, USCRIPT_LEPCHA },
      { 0xA840, USCRIPT_PHAGS_PA },
      { 0x1900, USCRIPT_LIMBU },
@@ -293,6 +297,7 @@ static const struct {
      { 0x1826, USCRIPT_MONGOLIAN },
      { 0x1C5A, USCRIPT_OL_CHIKI },
      { 0x13C4, USCRIPT_CHEROKEE },
+    { 0x104B5, USCRIPT_OSAGE },
      { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
      { 0x168F, USCRIPT_OGHAM },
      { 0x16A0, USCRIPT_RUNIC },
@@ -302,6 +307,7 @@ static const struct {
      { 0xA6A0, USCRIPT_BAMUM },
      { 0x16AE6, USCRIPT_BASSA_VAH },
      { 0x1E802, USCRIPT_MENDE },
+    { 0x1E909, USCRIPT_ADLAM, },
      { 0xAC00, USCRIPT_HANGUL },
      { 0x304B, USCRIPT_HIRAGANA },
      { 0x30AB, USCRIPT_KATAKANA },
@@ -350,6 +356,7 @@ static const struct {
      { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
      { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
      { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS },
+    { 0x18229, USCRIPT_TANGUT },
      { 0x5B57, USCRIPT_HAN },
      { 0xFDD0, USCRIPT_UNKNOWN }  // unassigned-implicit primary weights
  };
@@ -632,7 +639,7 @@ static void readAnOption(
  }
  
  static UBool
-readAnElement(FILE *data,
+readAnElement(char *line,
          CollationBaseDataBuilder &builder,
          UnicodeString &prefix, UnicodeString &s,
          int64_t ces[32], int32_t &cesLength,
@@ -640,79 +647,69 @@ readAnElement(FILE *data,
      if(U_FAILURE(*status)) {
          return FALSE;
      }
-    char buffer[30000];
-    char *result = fgets(buffer, sizeof(buffer), data);
-    if(result == NULL) {
-        if(feof(data)) {
-            return FALSE;
-        } else {
-            fprintf(stderr, "empty line but no EOF!\n");
-            *status = U_INVALID_FORMAT_ERROR;
-            return FALSE;
-        }
-    }
-    int32_t buflen = (int32_t)uprv_strlen(buffer);
-    while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
-      buffer[--buflen] = 0;
+    int32_t lineLength = (int32_t)uprv_strlen(line);
+    while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) {
+      line[--lineLength] = 0;
      }
  
-    if(buflen >= 3 && buffer[0] == (char)0xef &&
-            buffer[1] == (char)0xbb && buffer[2] == (char)0xbf) {
+    if(lineLength >= 3 && line[0] == (char)0xef &&
+            line[1] == (char)0xbb && line[2] == (char)0xbf) {
          // U+FEFF UTF-8 signature byte sequence.
          // Ignore, assuming it is at the start of the file.
-        buflen -= 3;
-        uprv_memmove(buffer, buffer + 3, buflen + 1);  // +1: including NUL terminator
+        line += 3;
+        lineLength -= 3;
      }
-    if(buffer[0] == 0 || buffer[0] == '#') {
+    if(line[0] == 0 || line[0] == '#') {
          return FALSE; // just a comment, skip whole line
      }
  
      // Directives.
-    if(buffer[0] == '[') {
-        readAnOption(builder, buffer, status);
+    if(line[0] == '[') {
+        readAnOption(builder, line, status);
          return FALSE;
      }
  
-    char *startCodePoint = buffer;
+    CharString input;
+    char *startCodePoint = line;
      char *endCodePoint = strchr(startCodePoint, ';');
      if(endCodePoint == NULL) {
-        fprintf(stderr, "error - line with no code point!\n");
+        fprintf(stderr, "error - line with no code point:\n%s\n", line);
          *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
          return FALSE;
-    } else {
-        *endCodePoint = 0;
      }
  
-    char *pipePointer = strchr(buffer, '|');
+    char *pipePointer = strchr(line, '|');
      if (pipePointer != NULL) {
          // Read the prefix string which precedes the actual string.
-        *pipePointer = 0;
+        input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status);
          UChar *prefixChars = prefix.getBuffer(32);
          int32_t prefixSize =
-            u_parseString(startCodePoint,
+            u_parseString(input.data(),
                            prefixChars, prefix.getCapacity(),
                            NULL, status);
          if(U_FAILURE(*status)) {
              prefix.releaseBuffer(0);
-            fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n",
-                    startCodePoint, u_errorName(*status));
+            fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n",
+                    input.data(), line, u_errorName(*status));
              *status = U_INVALID_FORMAT_ERROR;
              return FALSE;
          }
          prefix.releaseBuffer(prefixSize);
          startCodePoint = pipePointer + 1;
+        input.clear();
      }
  
      // Read the string which gets the CE(s) assigned.
+    input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status);
      UChar *uchars = s.getBuffer(32);
      int32_t cSize =
-        u_parseString(startCodePoint,
+        u_parseString(input.data(),
                        uchars, s.getCapacity(),
                        NULL, status);
      if(U_FAILURE(*status)) {
          s.releaseBuffer(0);
-        fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n",
-                startCodePoint, u_errorName(*status));
+        fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n",
+                input.data(), line, u_errorName(*status));
          *status = U_INVALID_FORMAT_ERROR;
          return FALSE;
      }
@@ -732,14 +729,14 @@ readAnElement(FILE *data,
              break;
          }
          if(cesLength >= 31) {
-            fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer);
+            fprintf(stderr, "Error: Too many CEs on line '%s'\n", line);
              *status = U_INVALID_FORMAT_ERROR;
              return FALSE;
          }
          ces[cesLength++] = parseCE(builder, pointer, *status);
          if(U_FAILURE(*status)) {
              fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
-                    buffer, u_errorName(*status));
+                    line, u_errorName(*status));
              return FALSE;
          }
      }
@@ -759,11 +756,11 @@ readAnElement(FILE *data,
                  uint8_t b = (uint8_t)(ce >> (j * 8));
                  if(j <= 1) { b &= 0x3f; }  // tertiary bytes use 6 bits
                  if (b == 1) {
-                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer);
+                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line);
                      return FALSE;
                  }
                  if (j == 7 && b == 2) {
-                    fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", buffer);
+                    fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line);
                      return FALSE;
                  }
                  if (j == 7) {
@@ -774,7 +771,7 @@ readAnElement(FILE *data,
                      // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
                      if (isCompressible && (b <= 3 || b == 0xff)) {
                          fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
-                                b, buffer);
+                                b, line);
                          return FALSE;
                      }
                  }
@@ -797,23 +794,34 @@ parseFractionalUCA(const char *filename,
          *status = U_FILE_ACCESS_ERROR;
          return;
      }
-    uint32_t line = 0;
+    int32_t lineNumber = 0;
+    char buffer[30000];
  
      UChar32 maxCodePoint = 0;
      while(!feof(data)) {
          if(U_FAILURE(*status)) {
              fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
-                *status, u_errorName(*status), (int)line, filename);
+                *status, u_errorName(*status), (int)lineNumber, filename);
              exit(*status);
          }
  
-        line++;
+        lineNumber++;
+        char *line = fgets(buffer, sizeof(buffer), data);
+        if(line == NULL) {
+            if(feof(data)) {
+                break;
+            } else {
+                fprintf(stderr, "no more input line and also no EOF!\n");
+                *status = U_INVALID_FORMAT_ERROR;
+                return;
+            }
+        }
  
          UnicodeString prefix;
          UnicodeString s;
          int64_t ces[32];
          int32_t cesLength = 0;
-        if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) {
+        if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) {
              // we have read the line, now do something sensible with the read data!
              uint32_t p = (uint32_t)(ces[0] >> 32);
  
@@ -852,9 +860,10 @@ parseFractionalUCA(const char *filename,
                      if(script < 0) {
                          fprintf(stderr,
                                  "Error: Unknown script for first-primary sample character "
-                                "U+%04x on line %u of %s\n"
+                                "U+%04X on line %u of %s:\n"
+                                "%s\n"
                                  "    (add the character to genuca.cpp sampleCharsToScripts[])\n",
-                                c2, (int)line, filename);
+                                c2, (int)lineNumber, filename, line);
                          exit(U_INVALID_FORMAT_ERROR);
                      }
                      if(script == USCRIPT_UNKNOWN) {
@@ -874,8 +883,9 @@ parseFractionalUCA(const char *filename,
                  if(0xe0000000 <= p && p < 0xf0000000) {
                      fprintf(stderr,
                              "Error: Unexpected mapping to an implicit or trailing primary"
-                            " on line %u of %s.\n",
-                            (int)line, filename);
+                            " on line %u of %s:\n"
+                            "%s\n",
+                            (int)lineNumber, filename, line);
                      exit(U_INVALID_FORMAT_ERROR);
                  }
  
@@ -988,7 +998,7 @@ parseFractionalUCA(const char *filename,
      }
  
      if (beVerbose) {
-        printf("\nLines read: %u\n", (int)line);
+        printf("\nLines read: %u\n", (int)lineNumber);
      }
  
      fclose(data);
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 6 May 2016 23:19:36 +0000 (23:19 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 6 May 2016 23:19:36 +0000 (23:19 +0000)