ICU-11891 UnicodeRegex change supplementary escapes to Java regex syntax

author Markus Scherer <markus.icu@gmail.com>

Mon, 20 Sep 2021 22:34:28 +0000 (15:34 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Tue, 21 Sep 2021 00:05:18 +0000 (17:05 -0700)
author Markus Scherer <markus.icu@gmail.com>
Mon, 20 Sep 2021 22:34:28 +0000 (15:34 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 21 Sep 2021 00:05:18 +0000 (17:05 -0700)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java

index e4260e43c4df705ebfb3b5fd6b7353d877abfb77..93b91381c4b1eebc2edc9cba080b3fc4c7a62c03 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java
@@ -40,6 +40,8 @@ import com.ibm.icu.util.Freezable;
   * @author markdavis
   */
  public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
+    private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})");
+
      // Note: we don't currently have any state, but intend to in the future,
      // particularly for the regex style supported.
  
@@ -75,7 +77,7 @@ public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringT
       * <p>Not thread-safe; create a separate copy for different threads.
       * <p>In the future, we may extend this to support other regex packages.
       *
-     * @regex A modified Java regex pattern, as in the input to
+     * @param regex A modified Java regex pattern, as in the input to
       *        Pattern.compile(), except that all "character classes" are
       *        processed as if they were UnicodeSet patterns. Example:
       *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
@@ -208,7 +210,7 @@ public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringT
       */
      public String compileBnf(List<String> lines) {
          Map<String, String> variables = getVariables(lines);
-        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
+        Set<String> unused = new LinkedHashSet<>(variables.keySet());
          // brute force replacement; do twice to allow for different order
          // later on can optimize
          for (int i = 0; i < 2; ++i) {
@@ -343,7 +345,12 @@ public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringT
              pos.setIndex(i);
              UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
              x.complement().complement(); // hack to fix toPattern
-            result.append(x.toPattern(false));
+            String pattern = x.toPattern(false);
+            // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex.
+            if (pattern.contains("\\U")) {
+                pattern = SUPP_ESCAPE.matcher(pattern).replaceAll("\\\\x{$1}");
+            }
+            result.append(pattern);
              i = pos.getIndex() - 1; // allow for the loop increment
              return i;
          } catch (Exception e) {
@@ -370,7 +377,7 @@ public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringT
      };
  
      private Map<String, String> getVariables(List<String> lines) {
-        Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
+        Map<String, String> variables = new TreeMap<>(LongestFirst);
          String variable = null;
          StringBuffer definition = new StringBuffer();
          int count = 0;
diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java

index 88e18b160a2d2f90cdf8172bc00677bac1459b26..ea0afe876af0ad16884be7beff4306e4fec97f96 100644 (file)
--- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
+++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
@@ -66,7 +66,8 @@ public class RegexUtilitiesTest extends TestFmwk {
          UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
          boolean skip = TestFmwk.getExhaustiveness() < 10;
          for (int cp = 0; cp < 0x110000; ++cp) {
-            if (cp > 0xFF && skip && (cp % 37 != 0)) {
+            // Do always test U+1FFFE to cover UnicodeSet escaping a supplementary noncharacter.
+            if (cp > 0xFF && skip && (cp % 37 != 0) && cp != 0x1fffe) {
                  continue;
              }
              String cpString = UTF16.valueOf(cp);
@@ -82,7 +83,9 @@ public class RegexUtilitiesTest extends TestFmwk {
              String expected = "[" + s + "]";  // Try this first for faster testing.
              boolean ok = pattern.equals(expected);
              if (!ok) {
-                expected = new UnicodeSet(expected).toPattern(false);
+                // Escape like in UnicodeSet, and change supplementary escapes to Java regex syntax.
+                expected = new UnicodeSet(expected).toPattern(false).
+                        replaceAll("\\\\U00([0-9a-fA-F]{6})", "\\\\x{$1}");
                  ok = pattern.equals(expected);
              }
              assertTrue("Doubled character works " + hex.transform(s), ok);
author	Markus Scherer <markus.icu@gmail.com>
	Mon, 20 Sep 2021 22:34:28 +0000 (15:34 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Tue, 21 Sep 2021 00:05:18 +0000 (17:05 -0700)
icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java		patch | blob | history
icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java		patch | blob | history