From: Andy Heninger Date: Fri, 2 May 2014 22:02:59 +0000 (+0000) Subject: ICU-10835 Add tool for generation of regular expressions casing data X-Git-Tag: milestone-59-0-1~1924 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8807332753bb4e0c64b044bbfc8311b6331c0b4e;p=icu ICU-10835 Add tool for generation of regular expressions casing data X-SVN-Rev: 35682 --- diff --git a/tools/unicode/c/genregexcasing/Makefile b/tools/unicode/c/genregexcasing/Makefile new file mode 100644 index 00000000000..d22b24c3b48 --- /dev/null +++ b/tools/unicode/c/genregexcasing/Makefile @@ -0,0 +1,12 @@ +# Copyright (C) 2014, International Business Machines +# Corporation and others. All Rights Reserved. + +# Edit the following two lines to reflect the location of your ICU sources & build (if out-of-source) +ICU_HOME=$(HOME)/icu/icu/trunk/source +ICU_BUILD=$(ICU_HOME) + +data: genregexcasing + LD_LIBRARY_PATH=$(ICU_BUILD)/lib:$(ICU_BUILD)/stubdata ICU_DATA=$(ICU_BUILD)/data/out ./genregexcasing + +genregexcasing: genregexcasing.cpp + clang++ genregexcasing.cpp -std=c++0x -g -I $(ICU_HOME)/common -I $(ICU_HOME)/i18n -I $(ICU_HOME)/io -L$(ICU_BUILD)/lib -L$(ICU_BUILD)/stubdata -licuuc -licui18n -licudata -o genregexcasing diff --git a/tools/unicode/c/genregexcasing/genregexcasing.cpp b/tools/unicode/c/genregexcasing/genregexcasing.cpp new file mode 100644 index 00000000000..110749e6f06 --- /dev/null +++ b/tools/unicode/c/genregexcasing/genregexcasing.cpp @@ -0,0 +1,140 @@ +/* +******************************************************************************* +* +* Copyright (C) 2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +*/ + +// file name: genregexcasing.cpp +// +// Program to generate the casing data for use by ICU regular expressions. +// The data declarations output when running this program are to be copied +// into the file i18n/regexcmp.h +// +// See the function RegexCompile::findCaseInsensitiveStarters() for more explanation. + +#include "unicode/uniset.h" +#include "unicode/usetiter.h" +#include "iostream" +#include +#include +#include +#include + +using namespace std; + +std::string sstring(const UnicodeString &us) { + string retString; + us.toUTF8String(retString); + return retString; +} + +int main() { + + std::map> cmap; + + for (UChar32 cp = 0; cp<=0x10ffff; cp++) { + UnicodeSet s(cp, cp); + s.closeOver(USET_CASE_INSENSITIVE); + + UnicodeSetIterator setIter(s); + while (setIter.next()) { + if (!setIter.isString()) { + continue; + } + const UnicodeString &str = setIter.getString(); + + cout << "Got a string for \"" << sstring(UnicodeString(cp)) << "\" [\\u" << hex << cp << "]\n"; + cout << " \"" << sstring(str) << "\" ["; + for (int32_t j=0; j data; + for (const auto &mapPair: cmap) { + Item dataForCP; + dataForCP.fCP = mapPair.first; + dataForCP.fStrIndex = outString.length(); + for (UChar32 valCP: mapPair.second) { + outString.append(valCP); + dataForCP.fCount++; + } + data.push_back(dataForCP); + } + + std::cout << " static const UChar32 RECaseFixCodePoints[] = {" ; + int items=0; + for (const Item &d: data) { + if (items++ % 10 == 0) { + std::cout << "\n "; + } + std::cout << "0x" << d.fCP << ", "; + } + std::cout << "0x110000};\n\n"; + + std::cout << " static const int16_t RECaseFixStringOffsets[] = {"; + items = 0; + for (const Item &d: data) { + if (items++ % 10 == 0) { + std::cout << "\n "; + } + std::cout << "0x" << d.fStrIndex << ", "; + } + std::cout << "0};\n\n"; + + std::cout << " static const int16_t RECaseFixCounts[] = {"; + items = 0; + for (const Item &d: data) { + if (items++ % 10 == 0) { + std::cout << "\n "; + } + std::cout << "0x" << d.fCount << ", "; + } + std::cout << "0};\n\n"; + + std::cout << " static const UChar RECaseFixData[] = {"; + for (int i=0; i