ICU-13331 fix Java AlphabeticIndex.addIndexExemplars() for missing index exemplars...

author Markus Scherer <markus.icu@gmail.com>

Fri, 16 Feb 2018 19:48:49 +0000 (19:48 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 16 Feb 2018 19:48:49 +0000 (19:48 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 16 Feb 2018 19:48:49 +0000 (19:48 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 16 Feb 2018 19:48:49 +0000 (19:48 +0000)
diff --git a/icu4c/source/i18n/alphaindex.cpp b/icu4c/source/i18n/alphaindex.cpp

index d877cb2a9911990f141de3c95b5d94be72f20457..d36a2cc6de3bb0eca9eca09c1f5c7551234d3bc2 100644 (file)
--- a/icu4c/source/i18n/alphaindex.cpp
+++ b/icu4c/source/i18n/alphaindex.cpp
@@ -725,7 +725,7 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
      }
  
      // question: should we add auxiliary exemplars?
-    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
+    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) {
          exemplars.add(0x61, 0x7A);
      }
      if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
@@ -740,14 +740,9 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
          // cut down to small list
          // make use of the fact that Ethiopic is allocated in 8's, where
          // the base is 0 mod 8.
-        UnicodeSet ethiopic(
-            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
-        UnicodeSetIterator it(ethiopic);
-        while (it.next() && !it.isString()) {
-            if ((it.getCodepoint() & 0x7) != 0) {
-                exemplars.remove(it.getCodepoint());
-            }
-        }
+        UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
+        ethiopic.retainAll(exemplars);
+        exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic);
      }
  
      // Upper-case any that aren't already so.
diff --git a/icu4c/source/test/intltest/alphaindextst.cpp b/icu4c/source/test/intltest/alphaindextst.cpp

index a3ebd1114a84ff3cd3237f3ce3e015c76153094d..667e0435a862419c6d74853ef90a29b74b5cc85f 100644 (file)
--- a/icu4c/source/test/intltest/alphaindextst.cpp
+++ b/icu4c/source/test/intltest/alphaindextst.cpp
@@ -22,6 +22,7 @@
  #include "unicode/localpointer.h"
  #include "unicode/tblcoll.h"
  #include "unicode/uniset.h"
+#include "unicode/uscript.h"
  
  #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
  
@@ -66,6 +67,7 @@ void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char*
      TESTCASE_AUTO(TestChineseZhuyin);
      TESTCASE_AUTO(TestJapaneseKanji);
      TESTCASE_AUTO(TestChineseUnihan);
+    TESTCASE_AUTO(testHasBuckets);
      TESTCASE_AUTO_END;
  }
  
@@ -724,4 +726,23 @@ void AlphabeticIndexTest::TestChineseUnihan() {
      assertEquals("getBucketIndex(U+7527)", 101, bucketIndex);
  }
  
+void AlphabeticIndexTest::testHasBuckets() {
+    checkHasBuckets(Locale("am"), USCRIPT_ETHIOPIC);
+    checkHasBuckets(Locale("haw"), USCRIPT_LATIN);
+    checkHasBuckets(Locale("hy"), USCRIPT_ARMENIAN);
+    checkHasBuckets(Locale("vai"), USCRIPT_VAI);
+}
+
+void AlphabeticIndexTest::checkHasBuckets(const Locale &locale, UScriptCode script) {
+    IcuTestErrorCode errorCode(*this, "checkHasBuckets");
+    AlphabeticIndex aindex(locale, errorCode);
+    LocalPointer<AlphabeticIndex::ImmutableIndex> index(aindex.buildImmutableIndex(errorCode));
+    UnicodeString loc = locale.getName();
+    assertTrue(loc + u" at least 3 buckets", index->getBucketCount() >= 3);
+    const AlphabeticIndex::Bucket *bucket = index->getBucket(1);
+    assertEquals(loc + u" real bucket", U_ALPHAINDEX_NORMAL, bucket->getLabelType());
+    assertEquals(loc + u" expected script", script,
+            uscript_getScript(bucket->getLabel().char32At(0), errorCode));
+}
+
  #endif
diff --git a/icu4c/source/test/intltest/alphaindextst.h b/icu4c/source/test/intltest/alphaindextst.h

index 6bbe153f6fa3e9ae961a61c01e46528caf014c60..a785fca08e7f0857ba9715dbbf332e2fa5ff106f 100644 (file)
--- a/icu4c/source/test/intltest/alphaindextst.h
+++ b/icu4c/source/test/intltest/alphaindextst.h
@@ -13,6 +13,7 @@
  #ifndef ALPHAINDEXTST_H
  #define ALPHAINDEXTST_H
  
+#include "unicode/uscript.h"
  #include "intltest.h"
  
  class AlphabeticIndexTest: public IntlTest {
@@ -49,6 +50,9 @@ public:
      void TestChineseZhuyin();
      void TestJapaneseKanji();
      void TestChineseUnihan();
+
+    void testHasBuckets();
+    void checkHasBuckets(const Locale &locale, UScriptCode script);
  };
  
  #endif
diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java

index 6bbf99916dea08321fa82ebc3f2fb087832f3c99..3dbe3c076a560e63f2878c7e7b6c1a1861118f96 100644 (file)
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
@@ -523,7 +523,7 @@ public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
       */
      private void addIndexExemplars(ULocale locale) {
          UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
-        if (exemplars != null) {
+        if (exemplars != null && !exemplars.isEmpty()) {
              initialLabels.addAll(exemplars);
              return;
          }
@@ -534,7 +534,7 @@ public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
  
          exemplars = exemplars.cloneAsThawed();
          // question: should we add auxiliary exemplars?
-        if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
+        if (exemplars.containsSome('a', 'z') || exemplars.isEmpty()) {
              exemplars.addAll('a', 'z');
          }
          if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
@@ -549,13 +549,9 @@ public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
              // cut down to small list
              // make use of the fact that Ethiopic is allocated in 8's, where
              // the base is 0 mod 8.
-            UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
-            UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
-            while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
-                if ((it.codepoint & 0x7) != 0) {
-                    exemplars.remove(it.codepoint);
-                }
-            }
+            UnicodeSet ethiopic = new UnicodeSet("[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]");
+            ethiopic.retainAll(exemplars);
+            exemplars.remove('ሀ', 0x137F).addAll(ethiopic);
          }
  
          // Upper-case any that aren't already so.
diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java

index 6cc274a563d600c1ae4d8353d9b7d575f1d9d501..00e3f58e331edd9909bbdba5af3989f554771941 100644 (file)
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java
@@ -1160,4 +1160,24 @@ public class AlphabeticIndexTest extends TestFmwk {
          assertEquals("Wrong bucket label", "inflow", index.getInflowLabel());
          assertEquals("Bucket size not 1", 1, inflowBucket.size());
      }
+
+    @Test
+    public void testHasBuckets() {
+        checkHasBuckets(new Locale("am"), UScript.ETHIOPIC);
+        checkHasBuckets(new Locale("haw"), UScript.LATIN);
+        checkHasBuckets(new Locale("hy"), UScript.ARMENIAN);
+        checkHasBuckets(new Locale("vai"), UScript.VAI);
+    }
+
+    private void checkHasBuckets(Locale locale, int script) {
+        AlphabeticIndex.ImmutableIndex index =
+                new AlphabeticIndex<String>(locale).buildImmutableIndex();
+        String loc = locale.toString();
+        assertTrue(loc + " at least 3 buckets", index.getBucketCount() >= 3);
+        AlphabeticIndex.Bucket bucket = index.getBucket(1);
+        assertEquals(loc + " real bucket", AlphabeticIndex.Bucket.LabelType.NORMAL,
+                bucket.getLabelType());
+        assertEquals(loc + " expected script", script,
+                UScript.getScript(bucket.getLabel().codePointAt(0)));
+    }
  }
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 16 Feb 2018 19:48:49 +0000 (19:48 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 16 Feb 2018 19:48:49 +0000 (19:48 +0000)
icu4c/source/i18n/alphaindex.cpp		patch | blob | history
icu4c/source/test/intltest/alphaindextst.cpp		patch | blob | history
icu4c/source/test/intltest/alphaindextst.h		patch | blob | history
icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java		patch | blob | history
icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java		patch | blob | history