From a45f7faf63637550a51e205901e73fb6d7157228 Mon Sep 17 00:00:00 2001
From: Andy Heninger <andy.heninger@gmail.com>
Date: Fri, 2 May 2014 22:51:53 +0000
Subject: [PATCH] ICU-10835 Improve performance of case insensitive find
 operations.

X-SVN-Rev: 35683
---
 icu4c/source/i18n/regexcmp.cpp          | 126 ++++++++++++++++++++----
 icu4c/source/i18n/regexcmp.h            |   7 +-
 icu4c/source/test/intltest/regextst.cpp |  36 +++++++
 icu4c/source/test/intltest/regextst.h   |   3 +-
 icu4c/source/test/testdata/regextst.txt |  15 ++-
 5 files changed, 165 insertions(+), 22 deletions(-)

diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp
index cd29deacc7d..0816eecfff5 100644
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@@ -2375,6 +2375,105 @@ UBool RegexCompile::compileInlineInterval() {
 
 
 
+//------------------------------------------------------------------------------
+//
+//   caseInsensitiveStart  given a single code point from a pattern string, determine the 
+//                         set of characters that could potentially begin a case-insensitive 
+//                         match of a string beginning with that character, using full Unicode
+//                         case insensitive matching.
+//
+//          This is used in optimizing find().
+//
+//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
+//          misses cases like this:
+//             A string from the pattern begins with 'ss' (although all we know
+//                 in this context is that it begins with 's')
+//             The pattern could match a string beginning with a German sharp-s
+//
+//           To the ordinary case closure for a character c, we add all other
+//           characters cx where the case closure of cx incudes a string form that begins
+//           with the original character c.
+//
+//           This function could be made smarter. The full pattern string is available
+//           and it would be possible to verify that the extra characters being added
+//           to the starting set fully match, rather than having just a first-char of the
+//           folded form match.
+//
+//------------------------------------------------------------------------------
+void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
+
+// Machine Generated below.
+// It may need updating with new versions of Unicode.
+// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
+// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
+
+// Machine Generated Data. Do not hand edit.
+    static const UChar32 RECaseFixCodePoints[] = {
+        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 
+        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 
+        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 
+        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 
+        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
+
+    static const int16_t RECaseFixStringOffsets[] = {
+        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 
+        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 
+        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 
+        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 
+        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
+
+    static const int16_t RECaseFixCounts[] = {
+        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 
+        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 
+        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
+        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
+        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
+
+    static const UChar RECaseFixData[] = {
+        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 
+        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 
+        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 
+        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 
+        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 
+        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 
+        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 
+        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 
+        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 
+        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 
+        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
+
+// End of machine generated data.
+
+    if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
+        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+        starterChars->set(caseFoldedC, caseFoldedC);
+
+        int32_t i;
+        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
+            // Simple linear search through the sorted list of interesting code points.
+        }
+
+        if (RECaseFixCodePoints[i] == c) {
+            int32_t dataIndex = RECaseFixStringOffsets[i];
+            int32_t numCharsToAdd = RECaseFixCounts[i];
+            UChar32 cpToAdd = 0;
+            for (int32_t j=0; j<numCharsToAdd; j++) {
+                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
+                starterChars->add(cpToAdd);
+            }
+        }
+
+        starterChars->closeOver(USET_CASE_INSENSITIVE);
+        starterChars->removeAllStrings();
+    } else {
+        // Not a cased character. Just return it alone.
+        starterChars->set(c, c);
+    }
+}
+
+
+
+
 //------------------------------------------------------------------------------
 //
 //   matchStartType    Determine how a match can start.
@@ -2565,17 +2664,12 @@ void   RegexCompile::matchStartType() {
             if (currentLen == 0) {
                 UChar32  c = URX_VAL(op);
                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
-
-                    // Disable optimizations on first char of match.
-                    // TODO: Compute the set of chars that case fold to this char, or to
-                    //       a string that begins with this char.
-                    //       For simple case folding, this code worked:
-                    //   UnicodeSet s(c, c);
-                    //   s.closeOver(USET_CASE_INSENSITIVE);
-                    //   fRXPat->fInitialChars->addAll(s);
-
-                    fRXPat->fInitialChars->clear();
-                    fRXPat->fInitialChars->complement();
+                    UnicodeSet starters(c, c);
+                    starters.closeOver(USET_CASE_INSENSITIVE);
+                    // findCaseInsensitiveStarters(c, &starters);
+                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
+                    //   The expanded folding can't match the pattern.
+                    fRXPat->fInitialChars->addAll(starters);
                 } else {
                     // Char has no case variants.  Just add it as-is to the
                     //   set of possible starting chars.
@@ -2698,14 +2792,8 @@ void   RegexCompile::matchStartType() {
                     //   characters for this pattern.
                     int32_t stringStartIdx = URX_VAL(op);
                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
-                    UnicodeSet s(c, c);
-
-                    // TODO:  compute correct set of starting chars for full case folding.
-                    //        For the moment, say any char can start.
-                    // s.closeOver(USET_CASE_INSENSITIVE);
-                    s.clear();
-                    s.complement();
-
+                    UnicodeSet s;
+                    findCaseInsensitiveStarters(c, &s);
                     fRXPat->fInitialChars->addAll(s);
                     numInitialStrings += 2;  // Matching on an initial string not possible.
                 }
diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h
index 0041beb6b6f..022b120198b 100644
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@@ -1,7 +1,7 @@
 //
 //  regexcmp.h
 //
-//  Copyright (C) 2002-2012, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2014, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for the class RegexCompile
@@ -22,6 +22,7 @@
 #include "unicode/parseerr.h"
 #include "uhash.h"
 #include "uvector.h"
+#include "uvectr32.h"
 
 
 
@@ -115,6 +116,10 @@ private:
     UChar32     scanNamedChar();
     UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
 
+public:   // Public for testing only.
+    static void findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
+private:
+
 
     UErrorCode                    *fStatus;
     RegexPattern                  *fRXPat;
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp
index 9c09d4f6c55..18155e90323 100644
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@@ -28,8 +28,10 @@
 #include "unicode/ucnv.h"
 #include "unicode/uniset.h"
 #include "unicode/uregex.h"
+#include "unicode/usetiter.h"
 #include "unicode/ustring.h"
 #include "regextst.h"
+#include "regexcmp.h"
 #include "uvector.h"
 #include "util.h"
 #include <stdlib.h>
@@ -135,6 +137,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
         case 22: name = "Bug10459";
             if (exec) Bug10459();
             break;
+        case 23: name = "TestCaseInsensitiveStarters";
+            if (exec) TestCaseInsensitiveStarters();
+            break;
 
         default: name = "";
             break; //needed to end loop
@@ -5267,5 +5272,36 @@ void RegexTest::Bug10459() {
     utext_close(utext_txt);
 }
 
+void RegexTest::TestCaseInsensitiveStarters() {
+    // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
+    //  become stale because of new Unicode characters.
+    // If it is stale, rerun the generation tool
+    //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
+    // and replace the embedded data in i18n/regexcmp.cpp
+
+    for (UChar32 cp=0; cp<=0x10ffff; cp++) {
+        if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
+            continue;
+        }
+        UnicodeSet s(cp, cp);
+        s.closeOver(USET_CASE_INSENSITIVE);
+        UnicodeSetIterator setIter(s);
+        while (setIter.next()) {
+            if (!setIter.isString()) {
+                continue;
+            }
+            const UnicodeString &str = setIter.getString();
+            UChar32 firstChar = str.char32At(0);
+            UnicodeSet starters;
+            RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
+            if (!starters.contains(cp)) {
+                errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
+                return;
+            }
+        }
+    }
+}
+
+
 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
 
diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h
index 6b59be45f15..974e1355ae5 100644
--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 2002-2013, International Business Machines Corporation and
+ * Copyright (c) 2002-2014, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 
@@ -48,6 +48,7 @@ public:
     virtual void Bug9283();
     virtual void CheckInvBufSize();
     virtual void Bug10459();
+    virtual void TestCaseInsensitiveStarters();
     
     // The following functions are internal to the regexp tests.
     virtual void assertUText(const char *expected, UText *actual, const char *file, int line);
diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt
index 36c384d136b..f70cd0256bd 100644
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@@ -519,9 +519,15 @@
 '((((((((((a))))))))))\10' i    "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
 
 "(?:(?i)a)b"                    "<0>Ab</0>"
-"ab(?i)cd"	                 "<0>abCd</0>"
+"ab(?i)cd"	                    "<0>abCd</0>"
 "ab$cd"                         "abcd"
 
+"ssl"                      i    "abc<0>Ãl</0>xyz"
+"ssl"                      i    "abc<0>áºl</0>xyz"
+"FIND"                     i    "can <0>ï¬nd</0> ?"  # fi ligature, \ufb01
+"ï¬nd"                      i    "can <0>FIND</0> ?"
+"ÏÌÍ"                        i    "xxx<0>á¿§</0>xxx"    # Composed char (match string) decomposes when case-folded (pattern)
+
 # White space handling
 "a b"                           "ab"
 "abc "                          "abc"
@@ -1172,6 +1178,13 @@
 "(?<=a{1,})bc"           E       "aaaa<0>bc</0>def"   # U_REGEX_LOOK_BEHIND_LIMIT error.
 "(?<=(?:){11})bc"                "<0>bc</0>"          # Empty (?:) expression.
 
+# Bug 10835
+#   Match Start Set not being correctly computed for case insensitive patterns.
+#   (Test here is to dump the compiled pattern & manually check the start set.)
+
+"(private|secret|confidential|classified|restricted)"  i   "hmm, <0><1>Classified</1></0> stuff"
+"(private|secret|confidential|classified|restricted)"      "hmm, Classified stuff"
+
 # Bug 10844
 
 "^([\w\d:]+)$"                  "<0><1>DiesIst1Beispiel:text</1></0>"
-- 
2.40.0