From: Steven R. Loomis Date: Wed, 15 Mar 2017 02:16:35 +0000 (+0000) Subject: ICU-12766 cleanup and add test case for escaper X-Git-Tag: release-59-rc~99^2~11 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=383d3eead14b40824e54fe031d21275c468d802a;p=icu ICU-12766 cleanup and add test case for escaper X-SVN-Rev: 39810 --- diff --git a/.gitattributes b/.gitattributes index cb3fd4f3530..8c0e8f6e898 100644 --- a/.gitattributes +++ b/.gitattributes @@ -124,6 +124,9 @@ icu4c/source/test/testdata/importtest.bin -text icu4c/source/test/testdata/old_e_testtypes.res -text icu4c/source/test/testdata/old_l_testtypes.res -text icu4c/source/test/testdata/uni-text.bin -text +icu4c/source/tools/escapesrc/expect-simple.cpp -text +icu4c/source/tools/escapesrc/test-nochange.cpp -text +icu4c/source/tools/escapesrc/test-simple.cpp -text icu4c/source/tools/genbrk/genbrk.vcxproj -text icu4c/source/tools/genccode/genccode.vcxproj -text icu4c/source/tools/gencfu/gencfu.vcxproj -text diff --git a/icu4c/source/Makefile.in b/icu4c/source/Makefile.in index ef2657fc2dc..9ba5eab6e2b 100644 --- a/icu4c/source/Makefile.in +++ b/icu4c/source/Makefile.in @@ -140,7 +140,10 @@ $(LIBDIR) $(BINDIR): ## Recursive targets all-recursive install-recursive clean-recursive distclean-recursive dist-recursive check-recursive check-exhaustive-recursive: $(LIBDIR) $(BINDIR) +ifneq ($(NEED_ESCAPING),) + @echo "building tools/escapesrc (Needed for this platform with NEED_ESCAPING)" @(cd tools/escapesrc && $(MAKE) RECURSIVE=YES $$local_target) || exit +endif @dot_seen=no; \ target=`echo $@ | sed s/-recursive//`; \ list='$(LOCAL_SUBDIRS)'; for subdir in $$list; do \ diff --git a/icu4c/source/tools/Makefile.in b/icu4c/source/tools/Makefile.in index 1cbef0005d4..8fb7876de76 100644 --- a/icu4c/source/tools/Makefile.in +++ b/icu4c/source/tools/Makefile.in @@ -17,7 +17,7 @@ subdir = tools SUBDIRS = toolutil ctestfw makeconv genrb genbrk \ gencnval gensprep icuinfo genccode gencmn icupkg pkgdata \ -gentest gennorm2 gencfu gendict +gentest gennorm2 gencfu gendict escapesrc ## List of phony targets .PHONY : all all-local all-recursive install install-local \ diff --git a/icu4c/source/tools/escapesrc/Makefile.in b/icu4c/source/tools/escapesrc/Makefile.in index 71ffdbca07c..8a0e2c1f803 100644 --- a/icu4c/source/tools/escapesrc/Makefile.in +++ b/icu4c/source/tools/escapesrc/Makefile.in @@ -5,6 +5,7 @@ ## others. All Rights Reserved. ## Steven R. Loomis +# To avoid recursion SKIP_ESCAPING=YES ## Source directory information @@ -25,7 +26,7 @@ SECTION = 8 #MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) ## Extra files to remove for 'make clean' -CLEANFILES = *~ $(DEPS) $(MAN_FILES) +CLEANFILES = *~ $(DEPS) $(MAN_FILES) ./output-*.cpp ## Target information TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) @@ -74,6 +75,12 @@ distclean-local: clean-local $(RMV) Makefile check-local: all-local + @echo Testing test-nochange.cpp + @$(INVOKE) $(TARGET) $(srcdir)/test-nochange.cpp ./output-nochange.cpp + @-diff -I '#line.*' $(srcdir)/test-nochange.cpp ./output-nochange.cpp || (echo >&2 'warning: diff failed or not found' ; true) + @echo Testing test-simple.cpp + @$(INVOKE) $(TARGET) $(srcdir)/test-simple.cpp ./output-simple.cpp + @-diff -I '#line.*' $(srcdir)/expect-simple.cpp ./output-simple.cpp || (echo >&2 'warning: diff failed or not found' ; true) Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status cd $(top_builddir) \ diff --git a/icu4c/source/tools/escapesrc/escapesrc.cpp b/icu4c/source/tools/escapesrc/escapesrc.cpp index e4af76f4503..dde8a1de900 100644 --- a/icu4c/source/tools/escapesrc/escapesrc.cpp +++ b/icu4c/source/tools/escapesrc/escapesrc.cpp @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include // with caution: #include "unicode/utf8.h" @@ -112,7 +115,6 @@ bool fixAt(std::string &linestr, size_t pos) { } else { // Proceed to decode utf-8 const uint8_t *s = (const uint8_t*) (linestr.c_str()); - const uint8_t *b = s; int32_t i = pos; int32_t length = linestr.size(); UChar32 c; @@ -123,24 +125,22 @@ bool fixAt(std::string &linestr, size_t pos) { U8_NEXT(s, i, length, c); } if(c<0) { - fprintf(stderr, "Illegal utf-8 sequence %04X pos %d\n", c, pos); + fprintf(stderr, "Illegal utf-8 sequence\n"); return true; } size_t seqLen = (i-pos); - printf("U+%04X pos %d [len %d]\n", c, pos, seqLen); + //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen); + char newSeq[] = "\\U0000FFFD"; if( c <= 0xFFFF) { - char newSeq[] = "\\uFFFD"; sprintf(newSeq, "\\u%04X", c); - linestr.replace(pos, seqLen, newSeq); - //pos += seqLen; // advance - pos += strlen(newSeq) - 1; } else { - fprintf(stderr, "%s: Error: not implemented yet: surrogate pairs for U+%04X\n", prog.c_str(), c); - return true; + sprintf(newSeq, "\\U%08X", c); } + linestr.replace(pos, seqLen, newSeq); + pos += strlen(newSeq) - 1; } } @@ -151,91 +151,85 @@ bool fixAt(std::string &linestr, size_t pos) { * false = no err * true = had err */ -bool fixLine(int no, std::string &linestr) { +bool fixLine(int /*no*/, std::string &linestr) { const char *line = linestr.c_str(); size_t len = linestr.size(); - // Quick Check: all ascii? + // no u' in the line? + if(!strstr(line, "u'") && !strstr(line, "u\"")) { + return false; // Nothing to do. No u' or u" detected + } + + // Quick Check: all ascii? if(!hasNonAscii(line, len)) { return false; // ASCII } + // comment or empty line? if(isCommentOrEmpty(line, len)) { return false; // Comment or just empty } - if(!strnstr(line, "u'", len) && !strnstr(line, "u\"", len)) { - return false; // Nothing to do. No u' or u" detected - } - // start from the end and find all u" cases size_t pos = len = linestr.size(); - while((pos = linestr.rfind("u\"", pos)) != std::string::npos) { - printf("found doublequote at %d\n", pos); + while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { + //printf("found doublequote at %d\n", pos); if(fixAt(linestr, pos)) return true; + if(pos == 0) break; pos--; } // reset and find all u' cases pos = len = linestr.size(); - while((pos = linestr.rfind("u'", pos)) != std::string::npos) { - printf("found singlequote at %d\n", pos); + while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { + //printf("found singlequote at %d\n", pos); if(fixAt(linestr, pos)) return true; + if(pos == 0) break; pos--; } - fprintf(stderr, "%d - fixed\n", no); + //fprintf(stderr, "%d - fixed\n", no); return false; } int convert(const std::string &infile, const std::string &outfile) { - fprintf(stderr, "%s: %s -> %s\n", prog.c_str(), infile.c_str(), outfile.c_str()); + fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); - FILE *inf = fopen(infile.c_str(), "rb"); - if(!inf) { + std::ifstream inf; + + inf.open(infile, std::ios::in); + + if(!inf.is_open()) { fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); cleanup(outfile); return 1; } - FILE *outf = fopen(outfile.c_str(), "w"); + std::ofstream outf; + + outf.open(outfile, std::ios::out); - if(!outf) { + if(!outf.is_open()) { fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); - fclose(inf); return 1; } - // TODO: any platform variations of this? - fprintf(outf, "#line 1 \"%s\"\n", infile.c_str()); + // TODO: any platform variations of #line? + outf << "#line 1 \"" << infile << "\"" << '\n'; - size_t len; - char *line; int no = 0; std::string linestr; - while((line = fgetln(inf, &len))!= NULL) { + while( getline( inf, linestr)) { no++; - linestr.assign(line, len); if(fixLine(no, linestr)) { - fclose(inf); - fclose(outf); + outf.close(); fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); cleanup(outfile); return 1; } - len = linestr.size(); // size may have changed. - - if(fwrite(linestr.c_str(), 1, linestr.size(), outf) != len) { - fclose(inf); - fclose(outf); - fprintf(stderr, "%s: short write to %s:%d\n", prog.c_str(), outfile.c_str(), no); - cleanup(outfile); - return 1; - } + outf << linestr << '\n'; } - fclose(inf); - fclose(outf); return 0; } diff --git a/icu4c/source/tools/escapesrc/expect-simple.cpp b/icu4c/source/tools/escapesrc/expect-simple.cpp new file mode 100644 index 00000000000..684bf114d29 --- /dev/null +++ b/icu4c/source/tools/escapesrc/expect-simple.cpp @@ -0,0 +1,10 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +u"sa\u0127\u0127a"; +u'\u6587'; +u"\U000219F2"; + + u"sa\u0127\u0127a"; + u'\u6587'; u"\U000219F2"; + diff --git a/icu4c/source/tools/escapesrc/test-nochange.cpp b/icu4c/source/tools/escapesrc/test-nochange.cpp new file mode 100644 index 00000000000..8c0d04b8099 --- /dev/null +++ b/icu4c/source/tools/escapesrc/test-nochange.cpp @@ -0,0 +1,5 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// This is a source file with no changes needed in it. +// In fact, the only non-ASCII character is the comment line at top. diff --git a/icu4c/source/tools/escapesrc/test-simple.cpp b/icu4c/source/tools/escapesrc/test-simple.cpp new file mode 100644 index 00000000000..9799c4f600d --- /dev/null +++ b/icu4c/source/tools/escapesrc/test-simple.cpp @@ -0,0 +1,10 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +u"saħħa"; +u'文'; +u"𡧲"; + + u"saħħa"; + u'文'; u"𡧲"; +