ICU-12766 cleanup and add test case for escaper

author Steven R. Loomis <srl@icu-project.org>

Wed, 15 Mar 2017 02:16:35 +0000 (02:16 +0000)

committer Steven R. Loomis <srl@icu-project.org>

Wed, 15 Mar 2017 02:16:35 +0000 (02:16 +0000)
author Steven R. Loomis <srl@icu-project.org>
Wed, 15 Mar 2017 02:16:35 +0000 (02:16 +0000)
committer Steven R. Loomis <srl@icu-project.org>
Wed, 15 Mar 2017 02:16:35 +0000 (02:16 +0000)
diff --git a/.gitattributes b/.gitattributes

index cb3fd4f3530577b1d91606643ed11a3c2cfe05a0..8c0e8f6e89858607df4cd1c889f63c49ae888bc0 100644 (file)
--- a/.gitattributes
+++ b/.gitattributes
@@ -124,6 +124,9 @@ icu4c/source/test/testdata/importtest.bin -text
  icu4c/source/test/testdata/old_e_testtypes.res -text
  icu4c/source/test/testdata/old_l_testtypes.res -text
  icu4c/source/test/testdata/uni-text.bin -text
+icu4c/source/tools/escapesrc/expect-simple.cpp -text
+icu4c/source/tools/escapesrc/test-nochange.cpp -text
+icu4c/source/tools/escapesrc/test-simple.cpp -text
  icu4c/source/tools/genbrk/genbrk.vcxproj -text
  icu4c/source/tools/genccode/genccode.vcxproj -text
  icu4c/source/tools/gencfu/gencfu.vcxproj -text
diff --git a/icu4c/source/Makefile.in b/icu4c/source/Makefile.in

index ef2657fc2dc0e6b99f25328f50fd02d5609e3ed7..9ba5eab6e2bf6398dc64a1edfcb9ceeca2058daa 100644 (file)
--- a/icu4c/source/Makefile.in
+++ b/icu4c/source/Makefile.in
@@ -140,7 +140,10 @@ $(LIBDIR) $(BINDIR):
  
  ## Recursive targets
  all-recursive install-recursive clean-recursive distclean-recursive dist-recursive check-recursive check-exhaustive-recursive: $(LIBDIR) $(BINDIR)
+ifneq ($(NEED_ESCAPING),)
+       @echo "building tools/escapesrc (Needed for this platform with NEED_ESCAPING)"
         @(cd tools/escapesrc && $(MAKE) RECURSIVE=YES $$local_target) || exit
+endif
         @dot_seen=no; \
         target=`echo $@ | sed s/-recursive//`; \
         list='$(LOCAL_SUBDIRS)'; for subdir in $$list; do \
diff --git a/icu4c/source/tools/Makefile.in b/icu4c/source/tools/Makefile.in

index 1cbef0005d49d677c2f1ab5c5ae833f23f95db5f..8fb7876de764a1cad5e899d47859f0df1867645f 100644 (file)
--- a/icu4c/source/tools/Makefile.in
+++ b/icu4c/source/tools/Makefile.in
@@ -17,7 +17,7 @@ subdir = tools
  
  SUBDIRS = toolutil ctestfw makeconv genrb genbrk \
  gencnval gensprep icuinfo genccode gencmn icupkg pkgdata \
-gentest gennorm2 gencfu gendict
+gentest gennorm2 gencfu gendict escapesrc
  
  ## List of phony targets
  .PHONY : all all-local all-recursive install install-local     \
diff --git a/icu4c/source/tools/escapesrc/Makefile.in b/icu4c/source/tools/escapesrc/Makefile.in

index 71ffdbca07c4e18c4c722b99cfd884f354ccca9d..8a0e2c1f803ee810222838f1d9d4f1b7411ec0f4 100644 (file)
--- a/icu4c/source/tools/escapesrc/Makefile.in
+++ b/icu4c/source/tools/escapesrc/Makefile.in
@@ -5,6 +5,7 @@
  ## others. All Rights Reserved.
  ## Steven R. Loomis
  
+# To avoid recursion
  SKIP_ESCAPING=YES
  
  ## Source directory information
@@ -25,7 +26,7 @@ SECTION = 8
  #MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
  
  ## Extra files to remove for 'make clean'
-CLEANFILES = *~ $(DEPS) $(MAN_FILES)
+CLEANFILES = *~ $(DEPS) $(MAN_FILES) ./output-*.cpp
  
  ## Target information
  TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
@@ -74,6 +75,12 @@ distclean-local: clean-local
         $(RMV) Makefile
  
  check-local: all-local
+       @echo Testing test-nochange.cpp
+       @$(INVOKE) $(TARGET) $(srcdir)/test-nochange.cpp ./output-nochange.cpp
+       @-diff -I '#line.*' $(srcdir)/test-nochange.cpp ./output-nochange.cpp || (echo >&2 'warning: diff failed or not found' ; true)
+       @echo Testing test-simple.cpp
+       @$(INVOKE) $(TARGET) $(srcdir)/test-simple.cpp ./output-simple.cpp
+       @-diff -I '#line.*' $(srcdir)/expect-simple.cpp ./output-simple.cpp   || (echo >&2 'warning: diff failed or not found' ; true)
  
  Makefile: $(srcdir)/Makefile.in  $(top_builddir)/config.status
         cd $(top_builddir) \
diff --git a/icu4c/source/tools/escapesrc/escapesrc.cpp b/icu4c/source/tools/escapesrc/escapesrc.cpp

index e4af76f45035e810c8a2713f5b59c03c2cb05416..dde8a1de900abd905a5310cacd2a6d15f2b8ffa0 100644 (file)
--- a/icu4c/source/tools/escapesrc/escapesrc.cpp
+++ b/icu4c/source/tools/escapesrc/escapesrc.cpp
@@ -6,6 +6,9 @@
  #include <stdlib.h>
  #include <unistd.h>
  #include <errno.h>
+#include <string.h>
+#include <iostream>
+#include <fstream>
  
  // with caution:
  #include "unicode/utf8.h"
@@ -112,7 +115,6 @@ bool fixAt(std::string &linestr, size_t pos) {
      } else {
        // Proceed to decode utf-8
        const uint8_t *s = (const uint8_t*) (linestr.c_str());
-      const uint8_t *b = s;
        int32_t i = pos;
        int32_t length = linestr.size();
        UChar32 c;
@@ -123,24 +125,22 @@ bool fixAt(std::string &linestr, size_t pos) {
          U8_NEXT(s, i, length, c);
        }
        if(c<0) {
-        fprintf(stderr, "Illegal utf-8 sequence %04X pos %d\n", c, pos);
+        fprintf(stderr, "Illegal utf-8 sequence\n");
          return true;
        }
  
        size_t seqLen = (i-pos);
  
-      printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);
+      //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);
  
+      char newSeq[] = "\\U0000FFFD";
        if( c <= 0xFFFF) {
-        char newSeq[] = "\\uFFFD";
          sprintf(newSeq, "\\u%04X", c);
-        linestr.replace(pos, seqLen, newSeq);
-        //pos += seqLen; // advance
-        pos += strlen(newSeq) - 1;
        } else {
-        fprintf(stderr, "%s: Error: not implemented yet: surrogate pairs for U+%04X\n", prog.c_str(), c);
-        return true;
+        sprintf(newSeq, "\\U%08X", c);
        }
+      linestr.replace(pos, seqLen, newSeq);
+      pos += strlen(newSeq) - 1;
      }
    }
  
@@ -151,91 +151,85 @@ bool fixAt(std::string &linestr, size_t pos) {
   * false = no err
   * true = had err
   */
-bool fixLine(int no, std::string &linestr) {
+bool fixLine(int /*no*/, std::string &linestr) {
    const char *line = linestr.c_str();
    size_t len = linestr.size();
-  // Quick Check: all ascii?
  
+  // no u' in the line?
+  if(!strstr(line, "u'") && !strstr(line, "u\"")) {
+    return false; // Nothing to do. No u' or u" detected
+  }
+
+  // Quick Check: all ascii?
    if(!hasNonAscii(line, len)) {
      return false; // ASCII
    }
  
+  // comment or empty line?
    if(isCommentOrEmpty(line, len)) {
      return false; // Comment or just empty
    }
  
-  if(!strnstr(line, "u'", len) && !strnstr(line, "u\"", len)) {
-    return false; // Nothing to do. No u' or u" detected
-  }
-
    // start from the end and find all u" cases
    size_t pos = len = linestr.size();
-  while((pos = linestr.rfind("u\"", pos)) != std::string::npos) {
-    printf("found doublequote at %d\n", pos);
+  while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
+    //printf("found doublequote at %d\n", pos);
      if(fixAt(linestr, pos)) return true;
+    if(pos == 0) break;
      pos--;
    }
  
    // reset and find all u' cases
    pos = len = linestr.size();
-  while((pos = linestr.rfind("u'", pos)) != std::string::npos) {
-    printf("found singlequote at %d\n", pos);
+  while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
+    //printf("found singlequote at %d\n", pos);
      if(fixAt(linestr, pos)) return true;
+    if(pos == 0) break;
      pos--;
    }
  
-  fprintf(stderr, "%d - fixed\n", no);
+  //fprintf(stderr, "%d - fixed\n", no);
    return false;
  }
  
  int convert(const std::string &infile, const std::string &outfile) {
-  fprintf(stderr, "%s: %s -> %s\n", prog.c_str(), infile.c_str(), outfile.c_str());
+  fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());
  
-  FILE *inf = fopen(infile.c_str(), "rb");
-  if(!inf) {
+  std::ifstream inf;
+  
+  inf.open(infile, std::ios::in);
+
+  if(!inf.is_open()) {
      fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
      cleanup(outfile);
      return 1;
    }
  
-  FILE *outf = fopen(outfile.c_str(), "w");
+  std::ofstream outf;
+
+  outf.open(outfile, std::ios::out);
  
-  if(!outf) {
+  if(!outf.is_open()) {
      fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
-    fclose(inf);
      return 1;
    }
  
-  // TODO: any platform variations of this?
-  fprintf(outf, "#line 1 \"%s\"\n", infile.c_str());
+  // TODO: any platform variations of #line?
+  outf << "#line 1 \"" << infile << "\"" << '\n';
  
-  size_t len;
-  char *line;
    int no = 0;
    std::string linestr;
-  while((line = fgetln(inf, &len))!= NULL) {
+  while( getline( inf, linestr)) {
      no++;
-    linestr.assign(line, len);
      if(fixLine(no, linestr)) {
-      fclose(inf);
-      fclose(outf);
+      outf.close();
        fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
        cleanup(outfile);
        return 1;
      }
-    len = linestr.size(); // size may have changed.
-    
-    if(fwrite(linestr.c_str(), 1, linestr.size(), outf) != len) {
-      fclose(inf);
-      fclose(outf);
-      fprintf(stderr, "%s: short write to  %s:%d\n", prog.c_str(), outfile.c_str(), no);
-      cleanup(outfile);
-      return 1;
-    }
+    outf << linestr << '\n';
    }
  
-  fclose(inf);
-  fclose(outf);
    return 0;
  }
  
diff --git a/icu4c/source/tools/escapesrc/expect-simple.cpp b/icu4c/source/tools/escapesrc/expect-simple.cpp

new file mode 100644 (file)

index 0000000..684bf11
--- /dev/null
+++ b/icu4c/source/tools/escapesrc/expect-simple.cpp
@@ -0,0 +1,10 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+u"sa\u0127\u0127a";
+u'\u6587';
+u"\U000219F2";
+
+ u"sa\u0127\u0127a";
+ u'\u6587'; u"\U000219F2";
+
diff --git a/icu4c/source/tools/escapesrc/test-nochange.cpp b/icu4c/source/tools/escapesrc/test-nochange.cpp

new file mode 100644 (file)

index 0000000..8c0d04b
--- /dev/null
+++ b/icu4c/source/tools/escapesrc/test-nochange.cpp
@@ -0,0 +1,5 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// This is a source file with no changes needed in it.
+// In fact, the only non-ASCII character is the comment line at top.
diff --git a/icu4c/source/tools/escapesrc/test-simple.cpp b/icu4c/source/tools/escapesrc/test-simple.cpp

new file mode 100644 (file)

index 0000000..9799c4f
--- /dev/null
+++ b/icu4c/source/tools/escapesrc/test-simple.cpp
@@ -0,0 +1,10 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+u"saħħa";
+u'文';
+u"𡧲";
+
+ u"saħħa";
+ u'文'; u"𡧲";
+
author	Steven R. Loomis <srl@icu-project.org>
	Wed, 15 Mar 2017 02:16:35 +0000 (02:16 +0000)
committer	Steven R. Loomis <srl@icu-project.org>
	Wed, 15 Mar 2017 02:16:35 +0000 (02:16 +0000)
.gitattributes		patch | blob | history
icu4c/source/Makefile.in		patch | blob | history
icu4c/source/tools/Makefile.in		patch | blob | history
icu4c/source/tools/escapesrc/Makefile.in		patch | blob | history
icu4c/source/tools/escapesrc/escapesrc.cpp		patch | blob | history
icu4c/source/tools/escapesrc/expect-simple.cpp	[new file with mode: 0644]	patch | blob
icu4c/source/tools/escapesrc/test-nochange.cpp	[new file with mode: 0644]	patch | blob
icu4c/source/tools/escapesrc/test-simple.cpp	[new file with mode: 0644]	patch | blob