ICU-21524 UnicodeSet.hasStrings(): no complement in toPattern()

author Markus Scherer <markus.icu@gmail.com>

Tue, 19 Oct 2021 00:19:15 +0000 (17:19 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Tue, 19 Oct 2021 19:24:15 +0000 (12:24 -0700)
author Markus Scherer <markus.icu@gmail.com>
Tue, 19 Oct 2021 00:19:15 +0000 (17:19 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Tue, 19 Oct 2021 19:24:15 +0000 (12:24 -0700)
diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp

index 967ea2ecdb22c241c8d58bcbe5dcb96468b469d8..92a81a1a02d1f008a67ac0347557474e38063fb6 100644 (file)
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -2095,7 +2095,10 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
      //             getRangeEnd(last) == MAX_VALUE)
      // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
      // If limit == len then len is even and the last range ends with MAX_VALUE.
-    if (len >= 4 && list[0] == 0 && limit == len) {
+    //
+    // *But* do not write the inverse (complement) if there are strings.
+    // Since ICU 70, '^' is a code point complement that removes all strings, so "[^...]" would not round-trip.
+    if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
          // Emit the inverse
          result.append(u'^');
          // Offsetting the inversion list index by one lets us
diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c

index a444ce5c937e97eb9f9eeaa42863b47685148a1e..f5528d058019075136e71cfe3fe1c8928b2e43c5 100644 (file)
--- a/icu4c/source/test/cintltst/usettest.c
+++ b/icu4c/source/test/cintltst/usettest.c
@@ -137,7 +137,7 @@ static void TestAPI() {
      uset_removeString(set, STR_ab, STR_ab_LEN);
      expect(set, "acd{bc}", "bfg{ab}", NULL);
  
-    /* [^acd{bc}] */
+    /* [[^acd]{bc}] */
      uset_complement(set);
      expect(set, "bef{bc}", "acd{ac}", NULL);
  
@@ -436,8 +436,8 @@ static void expectItems(const USet* set,
                  strlen(items)==0 ? "TRUE" : "FALSE");
      }
  
-    /* Don't test patterns starting with "[^" */
-    if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
+    /* Don't test patterns starting with "[^" or "[\\u0000". */
+    if ((u_strlen(ustr) > 2 && ustr[1] == u'^') || uset_contains(set, 0)) {
          return;
      }
  
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp

index 7a0641f3a530960f60607df9e9279e28da889fd9..b4bee76010759bc5a6259745826801e329f37e0d 100644 (file)
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4147,6 +4147,24 @@ void UnicodeSetTest::TestPatternCodePointComplement() {
                  notBasic.contains(U'🚲'));
      }
  
+    // When there are strings, we must not use the complement for a more compact toPattern().
+    {
+        UnicodeSet set;
+        set.add(0,  u'Y').add(u'b', u'q').add(u'x', 0x10ffff);
+        UnicodeString pattern;
+        set.toPattern(pattern, true);
+        UnicodeSet set2(pattern, errorCode);
+        checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
+        assertEquals("set(with 0 & max, only code points).toPattern()", u"[^Z-ar-w]", pattern);
+
+        set.add("ch").add("ss");
+        set.toPattern(pattern, true);
+        set2 = UnicodeSet(pattern, errorCode);
+        checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
+        assertEquals("set(with 0 & max, with strings).toPattern()",
+                u"[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
+    }
+
      // The complement() API behavior does not change under this ticket.
      {
          UnicodeSet notBasic(u"[:Basic_Emoji:]", errorCode);
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index e677fcaad89d15ce0050859e036d8af3f59fca5d..d799b03aaa8693045e3419ca65be769b65019a6f 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -818,7 +818,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
              //             getRangeEnd(last) == MAX_VALUE)
              // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
              // If limit == len then len is even and the last range ends with MAX_VALUE.
-            if (len >= 4 && list[0] == 0 && limit == len) {
+            //
+            // *But* do not write the inverse (complement) if there are strings.
+            // Since ICU 70, '^' is a code point complement that removes all strings, so "[^...]" would not round-trip.
+            if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
                  // Emit the inverse
                  result.append('^');
                  // Offsetting the inversion list index by one lets us
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java

index a94b9fe7e6af23ffcb70c3f8ff51aacc3e2673f3..e31d92bcba46603f9a55388eadd76186ba9bc02a 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -2971,6 +2971,23 @@ public class UnicodeSetTest extends TestFmwk {
                      notBasic.contains("🚲"));
          }
  
+        // When there are strings, we must not use the complement for a more compact toPattern().
+        {
+            UnicodeSet set = new UnicodeSet();
+            set.add(0,  'Y').add('b', 'q').add('x', 0x10ffff);
+            String pattern = set.toPattern(true);
+            UnicodeSet set2 = new UnicodeSet(pattern);
+            checkEqual(set, set2, "set(with 0 & max, only code points) pattern round-trip");
+            assertEquals("set(with 0 & max, only code points).toPattern()", "[^Z-ar-w]", pattern);
+
+            set.add("ch").add("ss");
+            pattern = set.toPattern(true);
+            set2 = new UnicodeSet(pattern);
+            checkEqual(set, set2, "set(with 0 & max, with strings) pattern round-trip");
+            assertEquals("set(with 0 & max, with strings).toPattern()",
+                    "[\\u0000-Yb-qx-\\U0010FFFF{ch}{ss}]", pattern);
+        }
+
          // The complement() API behavior does not change under this ticket.
          {
              UnicodeSet notBasic = new UnicodeSet("[:Basic_Emoji:]").complement();
author	Markus Scherer <markus.icu@gmail.com>
	Tue, 19 Oct 2021 00:19:15 +0000 (17:19 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Tue, 19 Oct 2021 19:24:15 +0000 (12:24 -0700)
icu4c/source/common/uniset.cpp		patch | blob | history
icu4c/source/test/cintltst/usettest.c		patch | blob | history
icu4c/source/test/intltest/usettest.cpp		patch | blob | history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch | blob | history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java		patch | blob | history