ICU-12766 aix/escaper: WIP. Support u8

author Steven R. Loomis <srl@icu-project.org>

Wed, 15 Mar 2017 16:58:45 +0000 (16:58 +0000)

committer Steven R. Loomis <srl@icu-project.org>

Wed, 15 Mar 2017 16:58:45 +0000 (16:58 +0000)
author Steven R. Loomis <srl@icu-project.org>
Wed, 15 Mar 2017 16:58:45 +0000 (16:58 +0000)
committer Steven R. Loomis <srl@icu-project.org>
Wed, 15 Mar 2017 16:58:45 +0000 (16:58 +0000)
diff --git a/icu4c/source/Doxyfile.in b/icu4c/source/Doxyfile.in

index 8104be3a047fe8c4dca0116d07bdd363cf16f6ae..ec08ddd618aa401d4f900540bafcd79d7dc62157 100644 (file)
--- a/icu4c/source/Doxyfile.in
+++ b/icu4c/source/Doxyfile.in
@@ -194,7 +194,7 @@ EXPAND_ONLY_PREDEF     = YES
  SEARCH_INCLUDES        = YES
  INCLUDE_PATH           = 
  INCLUDE_FILE_PATTERNS  = 
-PREDEFINED             = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END=  U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE= U_FINAL= UCONFIG_ENABLE_PLUGINS=1
+PREDEFINED             = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END=  U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE= U_FINAL= UCONFIG_ENABLE_PLUGINS=1 U_CHAR16_IS_TYPEDEF=0 U_CPLUSPLUS_VERSION=11 U_NO_NULLPTR_T=0
  EXPAND_AS_DEFINED      = 
  SKIP_FUNCTION_MACROS   = YES
  #---------------------------------------------------------------------------
diff --git a/icu4c/source/extra/uconv/uconv.cpp b/icu4c/source/extra/uconv/uconv.cpp

index 3bc807c819d10528b32e8d036cf2d941f228c633..ba5d06af3ac21cc76d85df135ec2ab8135b259e6 100644 (file)
--- a/icu4c/source/extra/uconv/uconv.cpp
+++ b/icu4c/source/extra/uconv/uconv.cpp
@@ -659,7 +659,7 @@ ConvertFile::convertFile(const char *pname,
          parse.line = -1;
  
          if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
-            t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err);
+            t = Transliterator::createFromRules(UnicodeString(u"Uconv"), str, UTRANS_FORWARD, parse, err);
          } else {
              t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err);
          }
diff --git a/icu4c/source/tools/escapesrc/escapesrc.cpp b/icu4c/source/tools/escapesrc/escapesrc.cpp

index eba8c71f195148be662b371e20f698fe2e72685e..b249587f9b70adaeb4d91674f1972a0a9feae970 100644 (file)
--- a/icu4c/source/tools/escapesrc/escapesrc.cpp
+++ b/icu4c/source/tools/escapesrc/escapesrc.cpp
@@ -13,6 +13,19 @@
  // with caution:
  #include "unicode/utf8.h"
  
+// Character constants written as numeric code values instead of character
+// literals. Each value is the ASCII code of the character named in its comment.
+// NOTE(review): presumably numeric so the values do not depend on the
+// compiler's execution character set — confirm against the target platforms.
+static const char
+  kSPACE = 0x20,  // ' '
+  kTAB   = 0x09,  // horizontal tab
+  kLF    = 0x0A,  // line feed
+  kCR    = 0x0D,  // carriage return
+  kHASH  = 0x23,  // '#'
+  kSLASH = 0x2f,  // '/' (forward slash)
+  kSTAR  = 0x2A,  // '*'
+  kL_U   = 0x75,  // 'u'
+  kU_U   = 0x55,  // 'U'
+  kQUOT  = 0x27,  // single quote
+  kDBLQ  = 0x22;  // double quote
+
  std::string prog;
  
  void usage() {
@@ -39,6 +52,7 @@ int cleanup(const std::string &outfile) {
    return 0;
  }
  
+#if 0
  inline bool hasNonAscii(const char *line, size_t len) {
    const unsigned char *uline = reinterpret_cast<const unsigned char*>(line);
    for(size_t i=0;i<len; i++) {
@@ -48,14 +62,15 @@ inline bool hasNonAscii(const char *line, size_t len) {
    }
    return false;
  }
+#endif
  
  inline const char *skipws(const char *p, const char *e) {
    for(;p<e;p++) {
      switch(*p) {
-    case ' ':
-    case '\t':
-    case '\n':
-    case '\r':
+    case kSPACE:
+    case kTAB:
+    case kLF:
+    case kCR:
        break;
      default:
        return p; // non ws
@@ -64,6 +79,7 @@ inline const char *skipws(const char *p, const char *e) {
    return p;
  }
  
+#if 0
  inline bool isCommentOrEmpty(const char* line, size_t len) {
    const char *p = line;
    const char *e = line+len;
@@ -73,13 +89,13 @@ inline bool isCommentOrEmpty(const char* line, size_t len) {
    }
    p++;
    switch(*p) {
-  case '#': return true; // #directive
-  case '/':
+  case kHASH: return true; // #directive
+  case kSLASH:
      p++;
      if(p==e) return false; // single slash
      switch(*p) {
-    case '/': // '/ /'
-    case '*': // '/ *'
+    case kSLASH: // '/ /'
+    case kSTAR: // '/ *'
        return true; // start of comment
      default: return false; // something else
      }
@@ -87,6 +103,82 @@ inline bool isCommentOrEmpty(const char* line, size_t len) {
    }
    /*NOTREACHED*/
  }
+#endif
+
+/**
+ * Append one byte to outstr as a four-character hexadecimal escape
+ * (backslash, 'x', two upper-case hex digits).
+ *
+ * @param outstr string to append to
+ * @param byte   byte value to escape
+ */
+void appendByte(std::string &outstr,
+                uint8_t byte) {
+    // Four visible characters plus the terminating NUL.
+    char tmp2[5];
+    snprintf(tmp2, sizeof(tmp2), "\\x%02X", static_cast<unsigned int>(byte));
+    outstr += tmp2;
+}
+
+/**
+ * Read a hexadecimal code point from linestr and append its UTF-8 encoding to
+ * outstr, one \xNN escape per byte.
+ *
+ * @param outstr  string receiving the escaped bytes
+ * @param linestr line being processed
+ * @param pos     on entry, index of the character just before the first hex
+ *                digit; on success, index of the last hex digit consumed.
+ *                Left unchanged when the digits do not fit in linestr.
+ * @param chars   number of hex digits to read (callers pass 4 or 8)
+ * @return true on failure
+ */
+bool appendUtf8(std::string &outstr,
+                const std::string &linestr,
+                size_t &pos,
+                size_t chars) {
+  char tmp[9];
+  // The digits occupy linestr[pos+1 .. pos+chars]: they must fit in tmp
+  // (with its terminator) and must all lie inside linestr.
+  if(chars >= sizeof(tmp) ||
+     chars > linestr.size() ||
+     pos >= linestr.size() - chars) {
+    fprintf(stderr, "Truncated or oversized escape sequence\n");
+    return true;
+  }
+  for(size_t i=0;i<chars;i++) {
+    tmp[i] = linestr[++pos];
+  }
+  tmp[chars] = 0;
+
+  // %X requires an unsigned int target; convert to UChar32 afterwards.
+  unsigned int parsed = 0;
+  if(sscanf(tmp, "%X", &parsed) != 1) {
+    fprintf(stderr, "Not a hexadecimal escape: '%s'\n", tmp);
+    return true;
+  }
+  UChar32 ch = static_cast<UChar32>(parsed);
+
+  // now to append \\x%% etc
+  uint8_t bytesNeeded = U8_LENGTH(ch);
+  if(bytesNeeded == 0) {
+    fprintf(stderr, "Illegal code point U+%X\n", parsed);
+    return true;
+  }
+  uint8_t bytes[4];
+  uint8_t *s = bytes;
+  size_t i = 0;
+  U8_APPEND_UNSAFE(s, i, ch);
+  for(size_t t = 0; t<i; t++) {
+    appendByte(outstr, s[t]);
+  }
+  return false;
+}
+
+/**
+ * Rewrite one u8"..." literal inside linestr as a plain "..." literal whose
+ * contents are written entirely as \xNN byte escapes.
+ *
+ * @param linestr string to mutate. Already escaped into \u format.
+ * @param origpos beginning, points to 'u8"'
+ * @param endpos end, points to the closing "
+ * @return false for no-problem, true for failure!
+ */
+bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
+  // Escape sequences are introduced by a backslash (code value 0x5C).
+  static const char kBackslash = 0x5C;
+
+  size_t pos = origpos + 3;
+  std::string outstr;
+  outstr += (kDBLQ);
+  for(;pos<endpos;pos++) {
+    char c = linestr[pos];
+    if(c == kBackslash) {
+      if(pos+1 >= endpos) {
+        fprintf(stderr, "Dangling backslash at end of u8 string\n");
+        return true;
+      }
+      char c2 = linestr[++pos];
+      switch(c2) {
+      case kBackslash:
+      case kQUOT:
+      case kDBLQ:
+        // escaped backslash or quote: emit the character itself
+        appendByte(outstr, c2);
+        break;
+      case kL_U:
+        // \uXXXX: pos advances over the four hex digits
+        if(appendUtf8(outstr, linestr, pos, 4)) return true;
+        break;
+      case kU_U:
+        // \UXXXXXXXX: pos advances over the eight hex digits
+        if(appendUtf8(outstr, linestr, pos, 8)) return true;
+        break;
+      default:
+        // Report rather than silently dropping the escape from the output.
+        fprintf(stderr, "Unsupported escape sequence in u8 string\n");
+        return true;
+      }
+    } else {
+      appendByte(outstr, c);
+    }
+  }
+  outstr += (kDBLQ);
+
+  // Replace everything from the 'u' through the closing quote.
+  linestr.replace(origpos, (endpos-origpos+1), outstr);
+  
+  return false; // OK
+}
  
  /**
   * fix the string at the position
@@ -94,18 +186,46 @@ inline bool isCommentOrEmpty(const char* line, size_t len) {
   * true = had err
   */
  bool fixAt(std::string &linestr, size_t pos) {
+  size_t origpos = pos;
+  
    if(linestr[pos] != 'u') {
      fprintf(stderr, "Not a 'u'?");
      return true;
    }
  
-  char quote = linestr[pos+1];
+  pos++; // past 'u'
  
-  //printf("u%c…%c\n", quote, quote);
+  bool utf8 = false;
+  
+  if(linestr[pos] == '8') { // u8"
+    utf8 = true;
+    pos++;
+  }
+  
+  char quote = linestr[pos];
+
+  if(quote != '\'' && quote != '\"') {
+    fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
+    return true;
+  }
+
+  if(quote == '\'' && utf8) {
+    fprintf(stderr, "Cannot do u8'...'\n");
+    return true;
+  }
+
+  pos ++;
  
+  //printf("u%c…%c\n", quote, quote);
  
-  for(pos += 2; pos < linestr.size(); pos++) {
-    if(linestr[pos] == quote) return false; // end of quote
+  for(; pos < linestr.size(); pos++) {
+    if(linestr[pos] == quote) {
+      if(utf8) {
+        return fixu8(linestr, origpos, pos); // fix u8"..."
+      } else {
+        return false; // end of quote
+      }
+    }
      if(linestr[pos] == '\\') {
        pos++;
        if(linestr[pos] == quote) continue; // quoted quote
@@ -156,19 +276,20 @@ bool fixLine(int /*no*/, std::string &linestr) {
    size_t len = linestr.size();
  
    // no u' in the line?
-  if(!strstr(line, "u'") && !strstr(line, "u\"")) {
+  if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
      return false; // Nothing to do. No u' or u" detected
    }
  
-  // Quick Check: all ascii?
-  if(!hasNonAscii(line, len)) {
-    return false; // ASCII
-  }
+  // lines such as u8"\u0308" are all ASCII.
+  // // Quick Check: all ascii?
+  // if(!hasNonAscii(line, len)) {
+  //   return false; // ASCII
+  // }
  
-  // comment or empty line?
-  if(isCommentOrEmpty(line, len)) {
-    return false; // Comment or just empty
-  }
+  // // comment or empty line?
+  // if(isCommentOrEmpty(line, len)) {
+  //   return false; // Comment or just empty
+  // }
  
    // start from the end and find all u" cases
    size_t pos = len = linestr.size();
@@ -188,6 +309,14 @@ bool fixLine(int /*no*/, std::string &linestr) {
      pos--;
    }
  
+  // reset and find all u8" cases
+  pos = len = linestr.size();
+  while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
+    if(fixAt(linestr, pos)) return true;
+    if(pos == 0) break;
+    pos--;
+  }
+
    //fprintf(stderr, "%d - fixed\n", no);
    return false;
  }
diff --git a/icu4c/source/tools/escapesrc/expect-simple.cpp b/icu4c/source/tools/escapesrc/expect-simple.cpp

index 684bf114d299bd208a1c00d4b3b21b1546174718..20d2bd8144137506fe1cc8cff7f66bde2cdfad31 100644 (file)
--- a/icu4c/source/tools/escapesrc/expect-simple.cpp
+++ b/icu4c/source/tools/escapesrc/expect-simple.cpp
@@ -8,3 +8,9 @@ u"\U000219F2";
   u"sa\u0127\u0127a";
   u'\u6587'; u"\U000219F2";
  
+"\x20\x5C\x75\x30\x33\x30\x31";
+"\x5C\x75\x30\x33\x30\x38\x20";
+"\x73\x61\x5C\x75\x30\x31\x32\x37\x5C\x75\x30\x31\x32\x37\x61";
+"\x5C\x75\x36\x35\x38\x37";
+"\x5C\x55\x30\x30\x30\x32\x31\x39\x46\x32";
+"\x73\x61\x5C\x75\x30\x31\x32\x37\x5C\x75\x30\x31\x32\x37\x61";
diff --git a/icu4c/source/tools/escapesrc/test-simple.cpp b/icu4c/source/tools/escapesrc/test-simple.cpp

index 9799c4f600db02b0fdedcc055613afd301de106f..b2fc953e475799fda50b2ebbfb00383c5cdddeb7 100644 (file)
--- a/icu4c/source/tools/escapesrc/test-simple.cpp
+++ b/icu4c/source/tools/escapesrc/test-simple.cpp
@@ -8,3 +8,9 @@ u"𡧲";
   u"saħħa";
   u'文'; u"𡧲";
  
+u8" \u0301";
+u8"\u0308 ";
+u8"saħħa";
+u8"文";
+u8"𡧲";
+u8"saħ\u0127a";
author	Steven R. Loomis <srl@icu-project.org>
	Wed, 15 Mar 2017 16:58:45 +0000 (16:58 +0000)
committer	Steven R. Loomis <srl@icu-project.org>
	Wed, 15 Mar 2017 16:58:45 +0000 (16:58 +0000)
icu4c/source/Doxyfile.in		patch \| blob \| history
icu4c/source/extra/uconv/uconv.cpp		patch \| blob \| history
icu4c/source/tools/escapesrc/escapesrc.cpp		patch \| blob \| history
icu4c/source/tools/escapesrc/expect-simple.cpp		patch \| blob \| history
icu4c/source/tools/escapesrc/test-simple.cpp		patch \| blob \| history