ICU-12725 Update u_isIDStart and u_isIDPart to TR31

author Frank Tang <ftang@chromium.org>

Thu, 19 Jan 2023 02:00:35 +0000 (18:00 -0800)

committer Frank Yung-Fong Tang <ftang@google.com>

Wed, 25 Jan 2023 20:02:53 +0000 (12:02 -0800)
author Frank Tang <ftang@chromium.org>
Thu, 19 Jan 2023 02:00:35 +0000 (18:00 -0800)
committer Frank Yung-Fong Tang <ftang@google.com>
Wed, 25 Jan 2023 20:02:53 +0000 (12:02 -0800)
diff --git a/icu4c/source/common/uchar.cpp b/icu4c/source/common/uchar.cpp

index 7789a3b88a6c597d058584471956d70e4d2c6967..49564069b718b1049dcde546a05b7f400a87b497 100644 (file)
--- a/icu4c/source/common/uchar.cpp
+++ b/icu4c/source/common/uchar.cpp
@@ -304,30 +304,6 @@ u_ispunct(UChar32 c) {
      return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
  }
  
-/* Checks if the Unicode character can start a Unicode identifier.*/
-U_CAPI UBool U_EXPORT2
-u_isIDStart(UChar32 c) {
-    /* same as u_isalpha() */
-    uint32_t props;
-    GET_PROPS(c, props);
-    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0);
-}
-
-/* Checks if the Unicode character can be a Unicode identifier part other than starting the
- identifier.*/
-U_CAPI UBool U_EXPORT2
-u_isIDPart(UChar32 c) {
-    uint32_t props;
-    GET_PROPS(c, props);
-    return (UBool)(
-           (CAT_MASK(props)&
-            (U_GC_ND_MASK|U_GC_NL_MASK|
-             U_GC_L_MASK|
-             U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK)
-           )!=0 ||
-           u_isIDIgnorable(c));
-}
-
  /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
  U_CAPI UBool U_EXPORT2
  u_isIDIgnorable(UChar32 c) {
diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h

index 6bb68e62a9d7fc6bd0685386f37e25ecf08e4314..186ac754af441e2dbbbe7081c6b30cc6494a2a39 100644 (file)
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@@ -3837,9 +3837,8 @@ u_getPropertyValueEnum(UProperty property,
  
  /**
   * Determines if the specified character is permissible as the
- * first character in an identifier according to Unicode
- * (The Unicode Standard, Version 3.0, chapter 5.16 Identifiers).
- * True for characters with general categories "L" (letters) and "Nl" (letter numbers).
+ * first character in an identifier, i.e. has the ID_Start property according to
+ * Unicode® Standard Annex #31, UNICODE IDENTIFIER AND PATTERN SYNTAX.
   *
   * Same as java.lang.Character.isUnicodeIdentifierStart().
   * Same as UCHAR_ID_START
@@ -3856,12 +3855,9 @@ U_CAPI UBool U_EXPORT2
  u_isIDStart(UChar32 c);
  
  /**
- * Determines if the specified character is permissible
- * in an identifier according to Java.
- * True for characters with general categories "L" (letters),
- * "Nl" (letter numbers), "Nd" (decimal digits),
- * "Mc" and "Mn" (combining marks), "Pc" (connecting punctuation), and
- * u_isIDIgnorable(c).
+ * Determines if the specified character is permissible in an identifier
+ * at a position other than the first, i.e. has the ID_Continue property
+ * according to Unicode® Standard Annex #31, UNICODE IDENTIFIER AND PATTERN SYNTAX.
   *
   * Same as java.lang.Character.isUnicodeIdentifierPart().
   * Almost the same as Unicode's ID_Continue (UCHAR_ID_CONTINUE)
@@ -3869,7 +3865,8 @@ u_isIDStart(UChar32 c);
   * u_isIDIgnorable(c).
   *
   * @param c the code point to be tested
- * @return true if the code point may occur in an identifier according to Java
+ * @return true if the code point may occur in an identifier at a position
+ * other than the first character.
   *
   * @see UCHAR_ID_CONTINUE
   * @see u_isIDStart
diff --git a/icu4c/source/common/uprops.cpp b/icu4c/source/common/uprops.cpp

index 26e950b876b1033c07f9709bf3ad15a6a5b4aa4f..429e3a1d1ca5226ef8d1536daa18ddb2a79b8e42 100644 (file)
--- a/icu4c/source/common/uprops.cpp
+++ b/icu4c/source/common/uprops.cpp
@@ -423,6 +423,19 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
      }
  }
  
+/* Checks if the Unicode character can start a Unicode identifier: returns whether c has the UCHAR_ID_START binary property. */
+U_CAPI UBool U_EXPORT2
+u_isIDStart(UChar32 c) {
+    return u_hasBinaryProperty(c, UCHAR_ID_START);
+}
+
+/* Checks if the Unicode character can be a Unicode identifier part other than starting the
+ identifier: returns whether c has the UCHAR_ID_CONTINUE binary property. */
+U_CAPI UBool U_EXPORT2
+u_isIDPart(UChar32 c) {
+    return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE);
+}
+
  U_CAPI UBool U_EXPORT2
  u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which) {
      if (s == nullptr && length != 0) { return false; }
diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c

index f8af66d16a43d6c214076fbf41fee69a063052ec..f433b149e30d6509f301233364af255a84c6a1c0 100644 (file)
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@@ -895,10 +895,10 @@ static void TestIdentifier()
      const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
      const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
      const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
-    const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
-    const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
-    const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
-    const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
+    const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061, 0x1885, 0x212e, 0x309b};
+    const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019, 0x2e2f};
+    const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045, 0x1886, 0x212e, 0x309c};
+    const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020, 0x2019, 0x2e2f};
      const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
      const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
  
diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt

index 0a33ab038e9f6371e0d3ac3bec31c158eb5da0d4..28a2dc705ff326125c40e10b30cdc3700e943589 100644 (file)
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -351,7 +351,7 @@ group: uniset_core
  group: icu_utility_with_props
      util_props.o
    deps
-    icu_utility uchar ucase
+    icu_utility uchar ucase uprops
  
  group: icu_utility
      util.o
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java

index 6e456c0d40fcab2c4008effb5bc7e3c0748ad787..bf58c8d1e34bf8ae006c2f849316d6459056161d 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
@@ -4550,20 +4550,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
       */
      public static boolean isUnicodeIdentifierPart(int ch)
      {
-        // if props == 0, it will just fall through and return false
-        // cat == format
-        return ((1 << getType(ch))
-                & ((1 << UCharacterCategory.UPPERCASE_LETTER)
-                        | (1 << UCharacterCategory.LOWERCASE_LETTER)
-                        | (1 << UCharacterCategory.TITLECASE_LETTER)
-                        | (1 << UCharacterCategory.MODIFIER_LETTER)
-                        | (1 << UCharacterCategory.OTHER_LETTER)
-                        | (1 << UCharacterCategory.LETTER_NUMBER)
-                        | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
-                        | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
-                        | (1 << UCharacterCategory.COMBINING_SPACING_MARK)
-                        | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
-                        || isIdentifierIgnorable(ch);
+        return hasBinaryProperty(ch, UProperty.ID_CONTINUE);  // single code point
      }
  
      /**
@@ -4588,15 +4575,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
       */
      public static boolean isUnicodeIdentifierStart(int ch)
      {
-        /*int cat = getType(ch);*/
-        // if props == 0, it will just fall through and return false
-        return ((1 << getType(ch))
-                & ((1 << UCharacterCategory.UPPERCASE_LETTER)
-                        | (1 << UCharacterCategory.LOWERCASE_LETTER)
-                        | (1 << UCharacterCategory.TITLECASE_LETTER)
-                        | (1 << UCharacterCategory.MODIFIER_LETTER)
-                        | (1 << UCharacterCategory.OTHER_LETTER)
-                        | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
+        return hasBinaryProperty(ch, UProperty.ID_START);  // single code point
      }
  
      /**
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java

index d0c99cfeb7225e9f7652f43d70526ad4f853f47d..51111c1cf562e990b287da9a960c6df8a87666ea 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
@@ -623,15 +623,14 @@ public final class UCharacterTest extends TestFmwk
      @Test
      public void TestIdentifier()
      {
-        int unicodeidstart[] = {0x0250, 0x0000e2, 0x000061};
-        int nonunicodeidstart[] = {0x2000, 0x00000a, 0x002019};
-        int unicodeidpart[] = {0x005f, 0x000032, 0x000045};
-        int nonunicodeidpart[] = {0x2030, 0x0000a3, 0x000020};
+        int unicodeidstart[] = {0x0250, 0x0000e2, 0x000061, 0x001885, 0x00212e, 0x00309b};
+        int nonunicodeidstart[] = {0x2000, 0x00000a, 0x002019, 0x002e2f};
+        int unicodeidpart[] = {0x005f, 0x000032, 0x000045, 0x001886, 0x00212e, 0x00309c};
+        int nonunicodeidpart[] = {0x2030, 0x0000a3, 0x000020, 0x002019, 0x002e2f};
          int idignore[] = {0x0006, 0x0010, 0x206b};
          int nonidignore[] = {0x0075, 0x0000a3, 0x000061};
  
-        int size = unicodeidstart.length;
-        for (int i = 0; i < size; i ++)
+        for (int i = 0; i < unicodeidstart.length; i ++)
          {
              if (!UCharacter.isUnicodeIdentifierStart(unicodeidstart[i]))
              {
@@ -639,6 +638,9 @@ public final class UCharacterTest extends TestFmwk
                      " expected to be a unicode identifier start character");
                  break;
              }
+        }
+        for (int i = 0; i < nonunicodeidstart.length; i ++)
+        {
              if (UCharacter.isUnicodeIdentifierStart(nonunicodeidstart[i]))
              {
                  errln("FAIL \\u" + hex(nonunicodeidstart[i]) +
@@ -646,12 +648,18 @@ public final class UCharacterTest extends TestFmwk
                          "character");
                  break;
              }
+        }
+        for (int i = 0; i < unicodeidpart.length; i ++)
+        {
              if (!UCharacter.isUnicodeIdentifierPart(unicodeidpart[i]))
              {
                  errln("FAIL \\u" + hex(unicodeidpart[i]) +
                      " expected to be a unicode identifier part character");
                  break;
              }
+        }
+        for (int i = 0; i < nonunicodeidpart.length; i ++)
+        {
              if (UCharacter.isUnicodeIdentifierPart(nonunicodeidpart[i]))
              {
                  errln("FAIL \\u" + hex(nonunicodeidpart[i]) +
@@ -659,23 +667,24 @@ public final class UCharacterTest extends TestFmwk
                          "character");
                  break;
              }
+         }
+        for (int i = 0; i < idignore.length; i ++)
+        {
              if (!UCharacter.isIdentifierIgnorable(idignore[i]))
              {
                  errln("FAIL \\u" + hex(idignore[i]) +
                          " expected to be a ignorable unicode character");
                  break;
              }
+        }
+        for (int i = 0; i < nonidignore.length; i ++)
+        {
              if (UCharacter.isIdentifierIgnorable(nonidignore[i]))
              {
                  errln("FAIL \\u" + hex(nonidignore[i]) +
                      " expected not to be a ignorable unicode character");
                  break;
              }
-            logln("Ok    \\u" + hex(unicodeidstart[i]) + " and \\u" +
-                    hex(nonunicodeidstart[i]) + " and \\u" +
-                    hex(unicodeidpart[i]) + " and \\u" +
-                    hex(nonunicodeidpart[i]) + " and \\u" +
-                    hex(idignore[i]) + " and \\u" + hex(nonidignore[i]));
          }
      }
author	Frank Tang <ftang@chromium.org>
	Thu, 19 Jan 2023 02:00:35 +0000 (18:00 -0800)
committer	Frank Yung-Fong Tang <ftang@google.com>
	Wed, 25 Jan 2023 20:02:53 +0000 (12:02 -0800)
icu4c/source/common/uchar.cpp		patch \| blob \| history
icu4c/source/common/unicode/uchar.h		patch \| blob \| history
icu4c/source/common/uprops.cpp		patch \| blob \| history
icu4c/source/test/cintltst/cucdtst.c		patch \| blob \| history
icu4c/source/test/depstest/dependencies.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java		patch \| blob \| history