ICU-20916 LocaleMatcher distinguish between equivalent locales

author Markus Scherer <markus.icu@gmail.com>

Fri, 20 Dec 2019 00:09:10 +0000 (00:09 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 20 Dec 2019 17:36:57 +0000 (09:36 -0800)
author Markus Scherer <markus.icu@gmail.com>
Fri, 20 Dec 2019 00:09:10 +0000 (00:09 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 20 Dec 2019 17:36:57 +0000 (09:36 -0800)
diff --git a/icu4c/source/common/localematcher.cpp b/icu4c/source/common/localematcher.cpp

index d975fe759b4ce1911e2793567b1f0e7267aecdda..0723bc1d4597975a534d8ba6e043649ed6a36b3c 100644 (file)
--- a/icu4c/source/common/localematcher.cpp
+++ b/icu4c/source/common/localematcher.cpp
@@ -22,7 +22,7 @@
  #include "uhash.h"
  #include "uvector.h"
  
-#define UND_LSR LSR("und", "", "")
+#define UND_LSR LSR("und", "", "", LSR::EXPLICIT_LSR)
  
  /**
   * Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher.
@@ -393,26 +393,27 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
          // 3. Remaining locales in builder order.
          // In Java, we use a LinkedHashMap for both map & ordered lists.
          // In C++, we use separate structures.
-        // We over-allocate arrays of LSRs and indexes for simplicity.
-        // We reserve slots at the array starts for the default and paradigm locales,
-        // plus enough for all supported locales.
-        // If there are few paradigm locales and few duplicate supported LSRs,
-        // then the amount of wasted space is small.
+        //
+        // We allocate arrays of LSRs and indexes,
+        // with as many slots as supported locales, for simplicity.
+        // We write the default and paradigm LSRs starting from the front of the arrays,
+        // and others starting from the back.
+        // At the end we reverse the non-paradigm LSRs.
+        // We end up wasting as many array slots as there are duplicate supported LSRs,
+        // but the amount of wasted space is small as long as there are few duplicates.
          supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong,
                                               supportedLocalesLength, &errorCode);
          if (U_FAILURE(errorCode)) { return; }
-        int32_t paradigmLimit = 1 + localeDistance.getParadigmLSRsLength();
-        int32_t suppLSRsCapacity = paradigmLimit + supportedLocalesLength;
          supportedLSRs = static_cast<const LSR **>(
-            uprv_malloc(suppLSRsCapacity * sizeof(const LSR *)));
+            uprv_malloc(supportedLocalesLength * sizeof(const LSR *)));
          supportedIndexes = static_cast<int32_t *>(
-            uprv_malloc(suppLSRsCapacity * sizeof(int32_t)));
+            uprv_malloc(supportedLocalesLength * sizeof(int32_t)));
          if (supportedLSRs == nullptr || supportedIndexes == nullptr) {
              errorCode = U_MEMORY_ALLOCATION_ERROR;
              return;
          }
          int32_t paradigmIndex = 0;
-        int32_t otherIndex = paradigmLimit;
+        int32_t otherIndex = supportedLocalesLength;
          if (idef >= 0) {
              uhash_puti(supportedLsrToIndex, const_cast<LSR *>(defLSR), idef + 1, &errorCode);
              supportedLSRs[0] = defLSR;
@@ -446,21 +447,32 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
                          supportedLSRs[paradigmIndex] = &lsr;
                          supportedIndexes[paradigmIndex++] = i;
                      } else {
-                        supportedLSRs[otherIndex] = &lsr;
-                        supportedIndexes[otherIndex++] = i;
+                        supportedLSRs[--otherIndex] = &lsr;
+                        supportedIndexes[otherIndex] = i;
                      }
                  }
              }
              if (U_FAILURE(errorCode)) { return; }
          }
-        // Squeeze out unused array slots.
-        if (paradigmIndex < paradigmLimit && paradigmLimit < otherIndex) {
-            uprv_memmove(supportedLSRs + paradigmIndex, supportedLSRs + paradigmLimit,
-                         (otherIndex - paradigmLimit) * sizeof(const LSR *));
-            uprv_memmove(supportedIndexes + paradigmIndex, supportedIndexes + paradigmLimit,
-                         (otherIndex - paradigmLimit) * sizeof(int32_t));
+        // Reverse the non-paradigm LSRs to be in order, right after the paradigm LSRs.
+        // First fill the unused slots between paradigm LSRs and other LSRs.
+        // This gap is as large as the number of locales with duplicate LSRs.
+        int32_t i = paradigmIndex;
+        int32_t j = supportedLocalesLength - 1;
+        while (i < otherIndex && otherIndex <= j) {
+            supportedLSRs[i] = supportedLSRs[j];
+            supportedIndexes[i++] = supportedIndexes[j--];
          }
-        supportedLSRsLength = otherIndex - (paradigmLimit - paradigmIndex);
+        // Swap remaining non-paradigm LSRs in place.
+        while (i < j) {
+            const LSR *tempLSR = supportedLSRs[i];
+            supportedLSRs[i] = supportedLSRs[j];
+            supportedLSRs[j] = tempLSR;
+            int32_t tempIndex = supportedIndexes[i];
+            supportedIndexes[i++] = supportedIndexes[j];
+            supportedIndexes[j--] = tempIndex;
+        }
+        supportedLSRsLength = supportedLocalesLength - (otherIndex - paradigmIndex);
      }
  
      if (def != nullptr && (idef < 0 || def != supportedLocales[idef])) {
@@ -662,7 +674,7 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai
      if (U_FAILURE(errorCode)) { return -1; }
      int32_t desiredIndex = 0;
      int32_t bestSupportedLsrIndex = -1;
-    for (int32_t bestDistance = thresholdDistance;;) {
+    for (int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);;) {
          // Quick check for exact maximized LSR.
          // Returns suppIndex+1 where 0 means not found.
          if (supportedLsrToIndex != nullptr) {
@@ -677,16 +689,17 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai
              }
          }
          int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance(
-                desiredLSR, supportedLSRs, supportedLSRsLength, bestDistance, favorSubtag);
+                desiredLSR, supportedLSRs, supportedLSRsLength, bestShiftedDistance, favorSubtag);
          if (bestIndexAndDistance >= 0) {
-            bestDistance = bestIndexAndDistance & 0xff;
+            bestShiftedDistance = LocaleDistance::getShiftedDistance(bestIndexAndDistance);
              if (remainingIter != nullptr) {
                  remainingIter->rememberCurrent(desiredIndex, errorCode);
                  if (U_FAILURE(errorCode)) { return -1; }
              }
-            bestSupportedLsrIndex = bestIndexAndDistance >= 0 ? bestIndexAndDistance >> 8 : -1;
+            bestSupportedLsrIndex = bestIndexAndDistance >= 0 ?
+                LocaleDistance::getIndex(bestIndexAndDistance) : -1;
          }
-        if ((bestDistance -= demotionPerDesiredLocale) <= 0) {
+        if ((bestShiftedDistance -= LocaleDistance::shiftDistance(demotionPerDesiredLocale)) <= 0) {
              break;
          }
          if (remainingIter == nullptr || !remainingIter->hasNext()) {
@@ -708,11 +721,12 @@ double LocaleMatcher::internalMatch(const Locale &desired, const Locale &support
      LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode);
      if (U_FAILURE(errorCode)) { return 0; }
      const LSR *pSuppLSR = &suppLSR;
-    int32_t distance = localeDistance.getBestIndexAndDistance(
+    int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
              getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
              &pSuppLSR, 1,
-            thresholdDistance, favorSubtag) & 0xff;
-    return (100 - distance) / 100.0;
+            LocaleDistance::shiftDistance(thresholdDistance), favorSubtag);
+    double distance = LocaleDistance::getDistanceDouble(indexAndDistance);
+    return (100.0 - distance) / 100.0;
  }
  
  U_NAMESPACE_END
diff --git a/icu4c/source/common/locdistance.cpp b/icu4c/source/common/locdistance.cpp

index 800d0eacf2b605f59f2c6455fb38867ac0706b14..50633cc828922a85053755f9c3cb8ff8c75b331e 100644 (file)
--- a/icu4c/source/common/locdistance.cpp
+++ b/icu4c/source/common/locdistance.cpp
@@ -97,17 +97,23 @@ LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
      // a mere region difference for one desired locale
      // is as good as a perfect match for the next following desired locale.
      // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
-    LSR en("en", "Latn", "US");
-    LSR enGB("en", "Latn", "GB");
+    LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
+    LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
      const LSR *p_enGB = &enGB;
-    defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1,
-            50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff;
+    int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
+            shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE);
+    defaultDemotionPerDesiredLocale  = getDistanceFloor(indexAndDistance);
  }
  
  int32_t LocaleDistance::getBestIndexAndDistance(
          const LSR &desired,
          const LSR **supportedLSRs, int32_t supportedLSRsLength,
-        int32_t threshold, ULocMatchFavorSubtag favorSubtag) const {
+        int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
+    // Round up the shifted threshold (if fraction bits are not 0)
+    // for comparison with un-shifted distances until we need fraction bits.
+    // (If we simply shifted non-zero fraction bits away, then we might ignore a language
+    // when it's really still a micro distance below the threshold.)
+    int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
      BytesTrie iter(trie);
      // Look up the desired language only once for all supported LSRs.
      // Its "distance" is either a match point value of 0, or a non-match negative value.
@@ -153,7 +159,7 @@ int32_t LocaleDistance::getBestIndexAndDistance(
          if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
              distance >>= 2;
          }
-        if (distance >= threshold) {
+        if (distance >= roundedThreshold) {
              continue;
          }
  
@@ -171,7 +177,7 @@ int32_t LocaleDistance::getBestIndexAndDistance(
              scriptDistance &= ~DISTANCE_IS_FINAL;
          }
          distance += scriptDistance;
-        if (distance >= threshold) {
+        if (distance >= roundedThreshold) {
              continue;
          }
  
@@ -180,7 +186,7 @@ int32_t LocaleDistance::getBestIndexAndDistance(
          } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
              distance += defaultRegionDistance;
          } else {
-            int32_t remainingThreshold = threshold - distance;
+            int32_t remainingThreshold = roundedThreshold - distance;
              if (minRegionDistance >= remainingThreshold) {
                  continue;
              }
@@ -196,15 +202,23 @@ int32_t LocaleDistance::getBestIndexAndDistance(
                      partitionsForRegion(supported),
                      remainingThreshold);
          }
-        if (distance < threshold) {
-            if (distance == 0) {
-                return slIndex << 8;
+        int32_t shiftedDistance = shiftDistance(distance);
+        if (shiftedDistance == 0) {
+            // Distinguish between equivalent but originally unequal locales via an
+            // additional micro distance.
+            shiftedDistance |= (desired.flags ^ supported.flags);
+        }
+        if (shiftedDistance < shiftedThreshold) {
+            if (shiftedDistance == 0) {
+                return slIndex << INDEX_SHIFT;
              }
              bestIndex = slIndex;
-            threshold = distance;
+            shiftedThreshold = shiftedDistance;
          }
      }
-    return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;
+    return bestIndex >= 0 ?
+            (bestIndex << INDEX_SHIFT) | shiftedThreshold :
+            INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
  }
  
  int32_t LocaleDistance::getDesSuppScriptDistance(
@@ -352,11 +366,14 @@ int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue)
  }
  
  UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
-    // Linear search for a very short list (length 6 as of 2019).
-    // If there are many paradigm LSRs we should use a hash set.
+    // Linear search for a very short list (length 6 as of 2019),
+    // because we look for equivalence not equality, and
+    // because it's easy.
+    // If there are many paradigm LSRs we should use a hash set
+    // with custom comparator and hasher.
      U_ASSERT(paradigmLSRsLength <= 15);
      for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
-        if (lsr == paradigmLSRs[i]) { return true; }
+        if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
      }
      return false;
  }
diff --git a/icu4c/source/common/locdistance.h b/icu4c/source/common/locdistance.h

index 7439f51c56bf8c3e328c3b99eb17839d50d1d884..0ee3d0e63e94b6266a0731e45c8a4786d1d96fce 100644 (file)
--- a/icu4c/source/common/locdistance.h
+++ b/icu4c/source/common/locdistance.h
@@ -26,19 +26,36 @@ class LocaleDistance final : public UMemory {
  public:
      static const LocaleDistance *getSingleton(UErrorCode &errorCode);
  
+    static int32_t shiftDistance(int32_t distance) {
+        return distance << DISTANCE_SHIFT;
+    }
+
+    static int32_t getShiftedDistance(int32_t indexAndDistance) {
+        return indexAndDistance & DISTANCE_MASK;
+    }
+
+    static double getDistanceDouble(int32_t indexAndDistance) {
+        double shiftedDistance = getShiftedDistance(indexAndDistance);
+        return shiftedDistance / (1 << DISTANCE_SHIFT);
+    }
+
+    static int32_t getIndex(int32_t indexAndDistance) {
+        // assert indexAndDistance >= 0;
+        return indexAndDistance >> INDEX_SHIFT;
+    }
+
      /**
       * Finds the supported LSR with the smallest distance from the desired one.
       * Equivalent LSR subtags must be normalized into a canonical form.
       *
-     * <p>Returns the index of the lowest-distance supported LSR in bits 31..8
+     * <p>Returns the index of the lowest-distance supported LSR in the high bits
       * (negative if none has a distance below the threshold),
-     * and its distance (0..ABOVE_THRESHOLD) in bits 7..0.
+     * and its shifted distance (see shiftDistance(); 0..ABOVE_THRESHOLD unshifted) in the low bits.
       */
      int32_t getBestIndexAndDistance(const LSR &desired,
                                      const LSR **supportedLSRs, int32_t supportedLSRsLength,
-                                    int32_t threshold, ULocMatchFavorSubtag favorSubtag) const;
-
-    int32_t getParadigmLSRsLength() const { return paradigmLSRsLength; }
+                                    int32_t shiftedThreshold,
+                                    ULocMatchFavorSubtag favorSubtag) const;
  
      UBool isParadigmLSR(const LSR &lsr) const;
  
@@ -51,6 +68,20 @@ public:
      }
  
  private:
+    // The distance is shifted left to gain some fraction bits.
+    static constexpr int32_t DISTANCE_SHIFT = 3;
+    static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
+    // 7 bits for 0..100
+    static constexpr int32_t DISTANCE_INT_SHIFT = 7;
+    static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
+    static constexpr int32_t DISTANCE_MASK = 0x3ff;
+    // static constexpr int32_t MAX_INDEX = 0x1fffff;  // avoids sign bit
+    static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
+
+    static int32_t getDistanceFloor(int32_t indexAndDistance) {
+        return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
+    }
+
      LocaleDistance(const LocaleDistanceData &data);
      LocaleDistance(const LocaleDistance &other) = delete;
      LocaleDistance &operator=(const LocaleDistance &other) = delete;
diff --git a/icu4c/source/common/loclikelysubtags.cpp b/icu4c/source/common/loclikelysubtags.cpp

index d7f5e124c2c790d2b4f0d0b94e71c0961a34b59d..27f10b3fb92c7fc471f70e28fc5705d477127761 100644 (file)
--- a/icu4c/source/common/loclikelysubtags.cpp
+++ b/icu4c/source/common/loclikelysubtags.cpp
@@ -250,7 +250,8 @@ struct XLikelySubtagsData {
          for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
              lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
                            strings.get(lsrSubtagIndexes[i + 1]),
-                          strings.get(lsrSubtagIndexes[i + 2]));
+                          strings.get(lsrSubtagIndexes[i + 2]),
+                          LSR::IMPLICIT_LSR);
          }
  
          if (partitionsLength > 0) {
@@ -275,7 +276,8 @@ struct XLikelySubtagsData {
              for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
                  paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
                                     strings.get(paradigmSubtagIndexes[i + 1]),
-                                   strings.get(paradigmSubtagIndexes[i + 2]));
+                                   strings.get(paradigmSubtagIndexes[i + 2]),
+                                   LSR::DONT_CARE_FLAGS);
              }
              distanceData.paradigms = paradigms;
          }
@@ -383,7 +385,7 @@ LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &error
      const char *name = locale.getName();
      if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') {  // name.startsWith("@x=")
          // Private use language tag x-subtag-subtag...
-        return LSR(name, "", "");
+        return LSR(name, "", "", LSR::EXPLICIT_LSR);
      }
      return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
                              locale.getVariant(), errorCode);
@@ -407,26 +409,31 @@ LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, c
      if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
          switch (c1) {
          case 'A':
-            return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, errorCode);
+            return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
+                       LSR::EXPLICIT_LSR, errorCode);
          case 'B':
-            return LSR(PSEUDO_BIDI_PREFIX, language, script, region, errorCode);
+            return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
+                       LSR::EXPLICIT_LSR, errorCode);
          case 'C':
-            return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, errorCode);
+            return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
+                       LSR::EXPLICIT_LSR, errorCode);
          default:  // normal locale
              break;
          }
      }
  
      if (variant[0] == 'P' && variant[1] == 'S') {
+        int32_t lsrFlags = *region == 0 ?
+            LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
          if (uprv_strcmp(variant, "PSACCENT") == 0) {
              return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
-                       *region == 0 ? "XA" : region, errorCode);
+                       *region == 0 ? "XA" : region, lsrFlags, errorCode);
          } else if (uprv_strcmp(variant, "PSBIDI") == 0) {
              return LSR(PSEUDO_BIDI_PREFIX, language, script,
-                       *region == 0 ? "XB" : region, errorCode);
+                       *region == 0 ? "XB" : region, lsrFlags, errorCode);
          } else if (uprv_strcmp(variant, "PSCRACK") == 0) {
              return LSR(PSEUDO_CRACKED_PREFIX, language, script,
-                       *region == 0 ? "XC" : region, errorCode);
+                       *region == 0 ? "XC" : region, lsrFlags, errorCode);
          }
          // else normal locale
      }
@@ -448,7 +455,7 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
          region = "";
      }
      if (*script != 0 && *region != 0 && *language != 0) {
-        return LSR(language, script, region);  // already maximized
+        return LSR(language, script, region, LSR::EXPLICIT_LSR);  // already maximized
      }
  
      uint32_t retainOldMask = 0;
@@ -535,7 +542,7 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
      if (retainOldMask == 0) {
          // Quickly return a copy of the lookup-result LSR
          // without new allocation of the subtags.
-        return LSR(result.language, result.script, result.region);
+        return LSR(result.language, result.script, result.region, result.flags);
      }
      if ((retainOldMask & 4) == 0) {
          language = result.language;
@@ -546,7 +553,8 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
      if ((retainOldMask & 1) == 0) {
          region = result.region;
      }
-    return LSR(language, script, region);
+    // retainOldMask flags = LSR explicit-subtag flags
+    return LSR(language, script, region, retainOldMask);
  }
  
  int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
@@ -615,9 +623,9 @@ LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn
      boolean favorRegionOk = false;
      if (result.script.equals(value00.script)) { //script is default
          if (result.region.equals(value00.region)) {
-            return new LSR(result.language, "", "");
+            return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
          } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
-            return new LSR(result.language, "", result.region);
+            return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
          } else {
              favorRegionOk = true;
          }
@@ -627,9 +635,9 @@ LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn
      // Maybe do later, but for now use the straightforward code.
      LSR result2 = maximize(languageIn, scriptIn, "");
      if (result2.equals(result)) {
-        return new LSR(result.language, result.script, "");
+        return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
      } else if (favorRegionOk) {
-        return new LSR(result.language, "", result.region);
+        return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
      }
      return result;
  }
diff --git a/icu4c/source/common/lsr.cpp b/icu4c/source/common/lsr.cpp

index 0c28eeda1bc7b6642458c4eda84fce5fcaedb991..a5e10ef17670f5336c6f1a5dfd11c9fa432b9619 100644 (file)
--- a/icu4c/source/common/lsr.cpp
+++ b/icu4c/source/common/lsr.cpp
@@ -14,9 +14,10 @@
  
  U_NAMESPACE_BEGIN
  
-LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode) :
+LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
+         UErrorCode &errorCode) :
          language(nullptr), script(nullptr), region(r),
-        regionIndex(indexForRegion(region)) {
+        regionIndex(indexForRegion(region)), flags(f) {
      if (U_SUCCESS(errorCode)) {
          CharString langScript;
          langScript.append(prefix, errorCode).append(lang, errorCode).append('\0', errorCode);
@@ -32,7 +33,8 @@ LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCo
  
  LSR::LSR(LSR &&other) U_NOEXCEPT :
          language(other.language), script(other.script), region(other.region), owned(other.owned),
-        regionIndex(other.regionIndex), hashCode(other.hashCode) {
+        regionIndex(other.regionIndex), flags(other.flags),
+        hashCode(other.hashCode) {
      if (owned != nullptr) {
          other.language = other.script = "";
          other.owned = nullptr;
@@ -50,6 +52,7 @@ LSR &LSR::operator=(LSR &&other) U_NOEXCEPT {
      script = other.script;
      region = other.region;
      regionIndex = other.regionIndex;
+    flags = other.flags;
      owned = other.owned;
      hashCode = other.hashCode;
      if (owned != nullptr) {
@@ -60,7 +63,7 @@ LSR &LSR::operator=(LSR &&other) U_NOEXCEPT {
      return *this;
  }
  
-UBool LSR::operator==(const LSR &other) const {
+UBool LSR::isEquivalentTo(const LSR &other) const {
      return
          uprv_strcmp(language, other.language) == 0 &&
          uprv_strcmp(script, other.script) == 0 &&
@@ -69,6 +72,16 @@ UBool LSR::operator==(const LSR &other) const {
          (regionIndex > 0 || uprv_strcmp(region, other.region) == 0);
  }
  
+UBool LSR::operator==(const LSR &other) const {
+    return
+        uprv_strcmp(language, other.language) == 0 &&
+        uprv_strcmp(script, other.script) == 0 &&
+        regionIndex == other.regionIndex &&
+        // Compare regions if both are ill-formed (and their indexes are 0).
+        (regionIndex > 0 || uprv_strcmp(region, other.region) == 0) &&
+        flags == other.flags;
+}
+
  int32_t LSR::indexForRegion(const char *region) {
      int32_t c = region[0];
      int32_t a = c - '0';
@@ -90,10 +103,10 @@ int32_t LSR::indexForRegion(const char *region) {
  
  LSR &LSR::setHashCode() {
      if (hashCode == 0) {
-        hashCode =
-            (ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language))) * 37 +
-            ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script)))) * 37 +
-            regionIndex;
+        int32_t h = ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language)));
+        h = h * 37 + ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script)));
+        h = h * 37 + regionIndex;
+        hashCode = h * 37 + flags;
      }
      return *this;
  }
diff --git a/icu4c/source/common/lsr.h b/icu4c/source/common/lsr.h

index db6cf938f47d021dbc06c9ded8c86d0781ff6e5a..d535e5b0376cd3c88c364ea57deddf13034a669e 100644 (file)
--- a/icu4c/source/common/lsr.h
+++ b/icu4c/source/common/lsr.h
@@ -16,26 +16,35 @@ U_NAMESPACE_BEGIN
  struct LSR final : public UMemory {
      static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26;
  
+    static constexpr int32_t EXPLICIT_LSR = 7;
+    static constexpr int32_t EXPLICIT_LANGUAGE = 4;
+    static constexpr int32_t EXPLICIT_SCRIPT = 2;
+    static constexpr int32_t EXPLICIT_REGION = 1;
+    static constexpr int32_t IMPLICIT_LSR = 0;
+    static constexpr int32_t DONT_CARE_FLAGS = 0;
+
      const char *language;
      const char *script;
      const char *region;
      char *owned = nullptr;
      /** Index for region, 0 if ill-formed. @see indexForRegion */
      int32_t regionIndex = 0;
+    int32_t flags = 0;
      /** Only set for LSRs that will be used in a hash table. */
      int32_t hashCode = 0;
  
      LSR() : language("und"), script(""), region("") {}
  
      /** Constructor which aliases all subtag pointers. */
-    LSR(const char *lang, const char *scr, const char *r) :
+    LSR(const char *lang, const char *scr, const char *r, int32_t f) :
              language(lang),  script(scr), region(r),
-            regionIndex(indexForRegion(region)) {}
+            regionIndex(indexForRegion(region)), flags(f) {}
      /**
       * Constructor which prepends the prefix to the language and script,
       * copies those into owned memory, and aliases the region.
       */
-    LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode);
+    LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
+        UErrorCode &errorCode);
      LSR(LSR &&other) U_NOEXCEPT;
      LSR(const LSR &other) = delete;
      inline ~LSR() {
@@ -55,6 +64,7 @@ struct LSR final : public UMemory {
       */
      static int32_t indexForRegion(const char *region);
  
+    UBool isEquivalentTo(const LSR &other) const;
      UBool operator==(const LSR &other) const;
  
      inline UBool operator!=(const LSR &other) const {
diff --git a/icu4c/source/test/testdata/localeMatcherTest.txt b/icu4c/source/test/testdata/localeMatcherTest.txt

index 21c9b601410e07f0d50527d486e28d0206ba8db7..649c95baea58f566ac8cadaa92b5145c345ef9f1 100644 (file)
--- a/icu4c/source/test/testdata/localeMatcherTest.txt
+++ b/icu4c/source/test/testdata/localeMatcherTest.txt
@@ -1052,9 +1052,9 @@ en >> en-DE
  ar-EG >> ar-SY
  pt-BR >> pt
  ar-XB >> ar-XB
-ar-PSBIDI >> ar-XB  # These are equivalent.
+ar-PSBIDI >> ar-PSBIDI
  en-XA >> en-XA
-en-PSACCENT >> en-XA  # These are equivalent.
+en-PSACCENT >> en-PSACCENT
  ar-PSCRACK >> ar-PSCRACK
  
  @favor=script
@@ -1063,9 +1063,9 @@ en >> en-DE
  ar-EG >> ar-SY
  pt-BR >> pt
  ar-XB >> ar-XB
-ar-PSBIDI >> ar-XB  # These are equivalent.
+ar-PSBIDI >> ar-PSBIDI
  en-XA >> en-XA
-en-PSACCENT >> en-XA  # These are equivalent.
+en-PSACCENT >> en-PSACCENT
  ar-PSCRACK >> ar-PSCRACK
  
  ** test: BestMatchForTraditionalChinese
@@ -1544,50 +1544,44 @@ zh-TW, en >> en-US
  zh-Hant-CN, en >> en-US
  zh-Hans, en >> zh-Hans-CN
  
-** test: return first among likely-subtags equivalent locales
-# Was: more specific script should win in case regions are identical
-# with some different results.
+** test: return most originally similar among likely-subtags equivalent locales
  @supported=af, af-Latn, af-Arab
  af >> af
  af-ZA >> af
-af-Latn-ZA >> af
-af-Latn >> af
+af-Latn-ZA >> af-Latn
+af-Latn >> af-Latn
  
  @favor=script
  af >> af
  af-ZA >> af
-af-Latn-ZA >> af
-af-Latn >> af
+af-Latn-ZA >> af-Latn
+af-Latn >> af-Latn
  
-# Was: more specific region should win
-# with some different results.
  @supported=nl, nl-NL, nl-BE
  @favor=
  nl >> nl
  nl-Latn >> nl
-nl-Latn-NL >> nl
-nl-NL >> nl
+nl-Latn-NL >> nl-NL
+nl-NL >> nl-NL
  
  @favor=script
  nl >> nl
  nl-Latn >> nl
-nl-Latn-NL >> nl
-nl-NL >> nl
+nl-Latn-NL >> nl-NL
+nl-NL >> nl-NL
  
-# Was: more specific region wins over more specific script
-# with some different results.
  @supported=nl, nl-Latn, nl-NL, nl-BE
  @favor=
  nl >> nl
-nl-Latn >> nl
-nl-NL >> nl
-nl-Latn-NL >> nl
+nl-Latn >> nl-Latn
+nl-NL >> nl-NL
+nl-Latn-NL >> nl-Latn
  
  @favor=script
  nl >> nl
-nl-Latn >> nl
-nl-NL >> nl
-nl-Latn-NL >> nl
+nl-Latn >> nl-Latn
+nl-NL >> nl-NL
+nl-Latn-NL >> nl-Latn
  
  ** test: region may replace matched if matched is enclosing
  @supported=es-419, es
@@ -1670,22 +1664,22 @@ ja-Jpan-JP, en-GB >> ja
  ** test: pick best maximized tag
  @supported=ja, ja-Jpan-US, ja-JP, en, ru
  ja-Jpan, ru >> ja
-ja-JP, ru >> ja
+ja-JP, ru >> ja-JP
  ja-US, ru >> ja-Jpan-US
  
  @favor=script
  ja-Jpan, ru >> ja
-ja-JP, ru >> ja
+ja-JP, ru >> ja-JP
  ja-US, ru >> ja-Jpan-US
  
  ** test: termination: pick best maximized match
  @supported=ja, ja-Jpan, ja-JP, en, ru
-ja-Jpan-JP, ru >> ja
-ja-Jpan, ru >> ja
+ja-Jpan-JP, ru >> ja-Jpan
+ja-Jpan, ru >> ja-Jpan
  
  @favor=script
-ja-Jpan-JP, ru >> ja
-ja-Jpan, ru >> ja
+ja-Jpan-JP, ru >> ja-Jpan
+ja-Jpan, ru >> ja-Jpan
  
  ** test: same language over exact, but distinguish when user is explicit
  @supported=fr, en-GB, ja, es-ES, es-MX
@@ -1900,7 +1894,7 @@ zh-TW >> zh
  ** test: testGetBestMatchWithMinMatchScore
  @supported=fr-FR, fr, fr-CA, en
  @default=und
-fr >> fr-FR # First likely-subtags equivalent match is chosen.
+fr >> fr
  @supported=en, fr, fr-CA
  fr-FR >> fr # Parent match is chosen.
  @supported=en, fr-CA
@@ -1930,7 +1924,7 @@ ru >> und
  
  @favor=script
  @supported=fr-FR, fr, fr-CA, en
-fr >> fr-FR
+fr >> fr
  @supported=en, fr, fr-CA
  fr-FR >> fr
  @supported=en, fr-CA
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java

index d1dc775d1832b583771b872d3708130c3dc82f15..95c289814f2e1b4bf18087d671ddecd9f31e4419 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java
@@ -7,6 +7,13 @@ import java.util.Objects;
  public final class LSR {
      public static final int REGION_INDEX_LIMIT = 1001 + 26 * 26;
  
+    public static final int EXPLICIT_LSR = 7;
+    public static final int EXPLICIT_LANGUAGE = 4;
+    public static final int EXPLICIT_SCRIPT = 2;
+    public static final int EXPLICIT_REGION = 1;
+    public static final int IMPLICIT_LSR = 0;
+    public static final int DONT_CARE_FLAGS = 0;
+
      public static final boolean DEBUG_OUTPUT = false;
  
      public final String language;
@@ -14,12 +21,14 @@ public final class LSR {
      public final String region;
      /** Index for region, negative if ill-formed. @see indexForRegion */
      final int regionIndex;
+    public final int flags;
  
-    public LSR(String language, String script, String region) {
+    public LSR(String language, String script, String region, int flags) {
          this.language = language;
          this.script = script;
          this.region = region;
          regionIndex = indexForRegion(region);
+        this.flags = flags;
      }
  
      /**
@@ -57,6 +66,13 @@ public final class LSR {
          }
          return result.toString();
      }
+
+    public boolean isEquivalentTo(LSR other) {
+        return language.equals(other.language)
+                && script.equals(other.script)
+                && region.equals(other.region);
+    }
+
      @Override
      public boolean equals(Object obj) {
          LSR other;
@@ -65,10 +81,12 @@ public final class LSR {
                  && obj.getClass() == this.getClass()
                  && language.equals((other = (LSR) obj).language)
                  && script.equals(other.script)
-                && region.equals(other.region));
+                && region.equals(other.region)
+                && flags == other.flags);
      }
+
      @Override
      public int hashCode() {
-        return Objects.hash(language, script, region);
+        return Objects.hash(language, script, region, flags);
      }
  }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java

index 79fe285bcecc86fefb5819689868c409d500920c..028cd8a2badc72e3a363a7af5b965aba5abc8f54 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java
@@ -34,6 +34,17 @@ public class LocaleDistance {
      private static final int DISTANCE_IS_FINAL = 0x100;
      private static final int DISTANCE_IS_FINAL_OR_SKIP_SCRIPT =
              DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
+
+    // The distance is shifted left to gain some fraction bits.
+    private static final int DISTANCE_SHIFT = 3;
+    private static final int DISTANCE_FRACTION_MASK = 7;
+    // 7 bits for 0..100
+    private static final int DISTANCE_INT_SHIFT = 7;
+    private static final int INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
+    private static final int DISTANCE_MASK = 0x3ff;
+    // private static final int MAX_INDEX = 0x1fffff;  // avoids sign bit
+    private static final int INDEX_NEG_1 = 0xfffffc00;
+
      // Indexes into array of distances.
      public static final int IX_DEF_LANG_DISTANCE = 0;
      public static final int IX_DEF_SCRIPT_DISTANCE = 1;
@@ -67,6 +78,28 @@ public class LocaleDistance {
      private final int minRegionDistance;
      private final int defaultDemotionPerDesiredLocale;
  
+    public static final int shiftDistance(int distance) {
+        return distance << DISTANCE_SHIFT;
+    }
+
+    public static final int getShiftedDistance(int indexAndDistance) {
+        return indexAndDistance & DISTANCE_MASK;
+    }
+
+    public static final double getDistanceDouble(int indexAndDistance) {
+        double shiftedDistance = getShiftedDistance(indexAndDistance);
+        return shiftedDistance / (1 << DISTANCE_SHIFT);
+    }
+
+    private static final int getDistanceFloor(int indexAndDistance) {
+        return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
+    }
+
+    public static final int getIndex(int indexAndDistance) {
+        assert indexAndDistance >= 0;
+        return indexAndDistance >> INDEX_SHIFT;
+    }
+
      // VisibleForTesting
      public static final class Data {
          public byte[] trie;
@@ -121,7 +154,8 @@ public class LocaleDistance {
                  String[] paradigms = value.getStringArray();
                  paradigmLSRs = new HashSet<>(paradigms.length / 3);
                  for (int i = 0; i < paradigms.length; i += 3) {
-                    paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2]));
+                    paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2],
+                            LSR.DONT_CARE_FLAGS));
                  }
              } else {
                  paradigmLSRs = Collections.emptySet();
@@ -168,10 +202,11 @@ public class LocaleDistance {
          // a mere region difference for one desired locale
          // is as good as a perfect match for the next following desired locale.
          // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
-        LSR en = new LSR("en", "Latn", "US");
-        LSR enGB = new LSR("en", "Latn", "GB");
-        defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, new LSR[] { enGB },
-                50, FavorSubtag.LANGUAGE) & 0xff;
+        LSR en = new LSR("en", "Latn", "US", LSR.EXPLICIT_LSR);
+        LSR enGB = new LSR("en", "Latn", "GB", LSR.EXPLICIT_LSR);
+        int indexAndDistance = getBestIndexAndDistance(en, new LSR[] { enGB },
+                shiftDistance(50), FavorSubtag.LANGUAGE);
+        defaultDemotionPerDesiredLocale  = getDistanceFloor(indexAndDistance);
  
          if (DEBUG_OUTPUT) {
              System.out.println("*** locale distance");
@@ -187,20 +222,26 @@ public class LocaleDistance {
              int threshold, FavorSubtag favorSubtag) {
          LSR supportedLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(supported);
          LSR desiredLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(desired);
-        return getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR },
-                threshold, favorSubtag) & 0xff;
+        int indexAndDistance = getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR },
+                shiftDistance(threshold), favorSubtag);
+        return getDistanceFloor(indexAndDistance);
      }
  
      /**
       * Finds the supported LSR with the smallest distance from the desired one.
       * Equivalent LSR subtags must be normalized into a canonical form.
       *
-     * <p>Returns the index of the lowest-distance supported LSR in bits 31..8
+     * <p>Returns the index of the lowest-distance supported LSR in the high bits
       * (negative if none has a distance below the threshold),
-     * and its distance (0..ABOVE_THRESHOLD) in bits 7..0.
+     * and its shifted distance (0..shiftDistance(ABOVE_THRESHOLD)) in the low bits.
       */
      public int getBestIndexAndDistance(LSR desired, LSR[] supportedLSRs,
-            int threshold, FavorSubtag favorSubtag) {
+            int shiftedThreshold, FavorSubtag favorSubtag) {
+        // Round up the shifted threshold (if fraction bits are not 0)
+        // for comparison with un-shifted distances until we need fraction bits.
+        // (If we simply shifted non-zero fraction bits away, then we might ignore a language
+        // when it's really still a micro distance below the threshold.)
+        int roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
          BytesTrie iter = new BytesTrie(trie);
          // Look up the desired language only once for all supported LSRs.
          // Its "distance" is either a match point value of 0, or a non-match negative value.
@@ -246,7 +287,7 @@ public class LocaleDistance {
              if (favorSubtag == FavorSubtag.SCRIPT) {
                  distance >>= 2;
              }
-            if (distance >= threshold) {
+            if (distance >= roundedThreshold) {
                  continue;
              }
  
@@ -264,7 +305,7 @@ public class LocaleDistance {
                  scriptDistance &= ~DISTANCE_IS_FINAL;
              }
              distance += scriptDistance;
-            if (distance >= threshold) {
+            if (distance >= roundedThreshold) {
                  continue;
              }
  
@@ -273,7 +314,7 @@ public class LocaleDistance {
              } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
                  distance += defaultRegionDistance;
              } else {
-                int remainingThreshold = threshold - distance;
+                int remainingThreshold = roundedThreshold - distance;
                  if (minRegionDistance >= remainingThreshold) {
                      continue;
                  }
@@ -289,15 +330,23 @@ public class LocaleDistance {
                          partitionsForRegion(supported),
                          remainingThreshold);
              }
-            if (distance < threshold) {
-                if (distance == 0) {
-                    return slIndex << 8;
+            int shiftedDistance = shiftDistance(distance);
+            if (shiftedDistance == 0) {
+                // Distinguish between equivalent but originally unequal locales via an
+                // additional micro distance.
+                shiftedDistance |= (desired.flags ^ supported.flags);
+            }
+            if (shiftedDistance < shiftedThreshold) {
+                if (shiftedDistance == 0) {
+                    return slIndex << INDEX_SHIFT;
                  }
                  bestIndex = slIndex;
-                threshold = distance;
+                shiftedThreshold = shiftedDistance;
              }
          }
-        return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;
+        return bestIndex >= 0 ?
+                (bestIndex << INDEX_SHIFT) | shiftedThreshold :
+                INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
      }
  
      private static final int getDesSuppScriptDistance(BytesTrie iter, long startState,
@@ -439,7 +488,17 @@ public class LocaleDistance {
      }
  
      public boolean isParadigmLSR(LSR lsr) {
-        return paradigmLSRs.contains(lsr);
+        // Linear search for a very short list (length 6 as of 2019),
+        // because we look for equivalence not equality, and
+        // HashSet does not support customizing equality.
+        // If there are many paradigm LSRs we should revisit this.
+        assert paradigmLSRs.size() <= 15;
+        for (LSR plsr : paradigmLSRs) {
+            if (lsr.isEquivalentTo(plsr)) {
+                return true;
+            }
+        }
+        return false;
      }
  
      // VisibleForTesting
@@ -455,9 +514,6 @@ public class LocaleDistance {
          return defaultDemotionPerDesiredLocale;
      }
  
-    // TODO: When we build data offline,
-    // write test code to compare the loaded table with the builder output.
-    // Fail if different, with instructions for how to update the data file.
      // VisibleForTesting
      public Map<String, Integer> testOnlyGetDistanceTable() {
          Map<String, Integer> map = new TreeMap<>();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java

index de42587b3178696190e729d264092a5ce116bf56..1938170e74ea4a4daf0a6e79c7bbbe9c12fc4691 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
@@ -87,7 +87,8 @@ public final class XLikelySubtags {
              String[] lsrSubtags = getValue(likelyTable, "lsrs", value).getStringArray();
              LSR[] lsrs = new LSR[lsrSubtags.length / 3];
              for (int i = 0, j = 0; i < lsrSubtags.length; i += 3, ++j) {
-                lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2]);
+                lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2],
+                        LSR.IMPLICIT_LSR);
              }
  
              return new Data(languageAliases, regionAliases, trie, lsrs);
@@ -185,7 +186,7 @@ public final class XLikelySubtags {
              String tag = locale.toLanguageTag();
              assert tag.startsWith("x-");
              // Private use language tag x-subtag-subtag...
-            return new LSR(tag, "", "");
+            return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
          }
          return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
                  locale.getVariant());
@@ -195,7 +196,7 @@ public final class XLikelySubtags {
          String tag = locale.toLanguageTag();
          if (tag.startsWith("x-")) {
              // Private use language tag x-subtag-subtag...
-            return new LSR(tag, "", "");
+            return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
          }
          return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
                  locale.getVariant());
@@ -209,29 +210,34 @@ public final class XLikelySubtags {
              switch (region.charAt(1)) {
              case 'A':
                  return new LSR(PSEUDO_ACCENTS_PREFIX + language,
-                        PSEUDO_ACCENTS_PREFIX + script, region);
+                        PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
              case 'B':
                  return new LSR(PSEUDO_BIDI_PREFIX + language,
-                        PSEUDO_BIDI_PREFIX + script, region);
+                        PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
              case 'C':
                  return new LSR(PSEUDO_CRACKED_PREFIX + language,
-                        PSEUDO_CRACKED_PREFIX + script, region);
+                        PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
              default:  // normal locale
                  break;
              }
          }
  
          if (variant.startsWith("PS")) {
+            int lsrFlags = region.isEmpty() ?
+                    LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
              switch (variant) {
              case "PSACCENT":
                  return new LSR(PSEUDO_ACCENTS_PREFIX + language,
-                        PSEUDO_ACCENTS_PREFIX + script, region.isEmpty() ? "XA" : region);
+                        PSEUDO_ACCENTS_PREFIX + script,
+                        region.isEmpty() ? "XA" : region, lsrFlags);
              case "PSBIDI":
                  return new LSR(PSEUDO_BIDI_PREFIX + language,
-                        PSEUDO_BIDI_PREFIX + script, region.isEmpty() ? "XB" : region);
+                        PSEUDO_BIDI_PREFIX + script,
+                        region.isEmpty() ? "XB" : region, lsrFlags);
              case "PSCRACK":
                  return new LSR(PSEUDO_CRACKED_PREFIX + language,
-                        PSEUDO_CRACKED_PREFIX + script, region.isEmpty() ? "XC" : region);
+                        PSEUDO_CRACKED_PREFIX + script,
+                        region.isEmpty() ? "XC" : region, lsrFlags);
              default:  // normal locale
                  break;
              }
@@ -257,7 +263,7 @@ public final class XLikelySubtags {
              region = "";
          }
          if (!script.isEmpty() && !region.isEmpty() && !language.isEmpty()) {
-            return new LSR(language, script, region);  // already maximized
+            return new LSR(language, script, region, LSR.EXPLICIT_LSR);  // already maximized
          }
  
          int retainOldMask = 0;
@@ -340,6 +346,7 @@ public final class XLikelySubtags {
          }
  
          if (retainOldMask == 0) {
+            assert result.flags == LSR.IMPLICIT_LSR;
              return result;
          }
          if ((retainOldMask & 4) == 0) {
@@ -351,7 +358,8 @@ public final class XLikelySubtags {
          if ((retainOldMask & 1) == 0) {
              region = result.region;
          }
-        return new LSR(language, script, region);
+        // retainOldMask flags = LSR explicit-subtag flags
+        return new LSR(language, script, region, retainOldMask);
      }
  
      private static final int trieNext(BytesTrie iter, String s, int i) {
@@ -411,9 +419,9 @@ public final class XLikelySubtags {
          boolean favorRegionOk = false;
          if (result.script.equals(value00.script)) { //script is default
              if (result.region.equals(value00.region)) {
-                return new LSR(result.language, "", "");
+                return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
              } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
-                return new LSR(result.language, "", result.region);
+                return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
              } else {
                  favorRegionOk = true;
              }
@@ -423,9 +431,9 @@ public final class XLikelySubtags {
          // Maybe do later, but for now use the straightforward code.
          LSR result2 = maximize(languageIn, scriptIn, "");
          if (result2.equals(result)) {
-            return new LSR(result.language, result.script, "");
+            return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
          } else if (favorRegionOk) {
-            return new LSR(result.language, "", result.region);
+            return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
          }
          return result;
      }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java

index e0fd56909d208683b23887900a7852fc0655b963..a333355c4c04ab1644899db089d82ff39328c90a 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
@@ -64,7 +64,7 @@ import com.ibm.icu.impl.locale.XLikelySubtags;
   * @stable ICU 4.4
   */
  public final class LocaleMatcher {
-    private static final LSR UND_LSR = new LSR("und","","");
+    private static final LSR UND_LSR = new LSR("und","","", LSR.EXPLICIT_LSR);
      // In ULocale, "und" and "" make the same object.
      private static final ULocale UND_ULOCALE = new ULocale("und");
      // In Locale, "und" and "" make different objects.
@@ -680,6 +680,9 @@ public final class LocaleMatcher {
                  builder.demotion == Demotion.NONE ? 0 :
                      LocaleDistance.INSTANCE.getDefaultDemotionPerDesiredLocale();  // null or REGION
          favorSubtag = builder.favor;
+        if (TRACE_MATCHER) {
+            System.err.printf("new LocaleMatcher: %s\n", toString());
+        }
      }
  
      private static final void putIfAbsent(Map<LSR, Integer> lsrToIndex, LSR lsr, int i) {
@@ -938,26 +941,34 @@ public final class LocaleMatcher {
      private int getBestSuppIndex(LSR desiredLSR, LsrIterator remainingIter) {
          int desiredIndex = 0;
          int bestSupportedLsrIndex = -1;
-        for (int bestDistance = thresholdDistance;;) {
+        StringBuilder sb = null;
+        if (TRACE_MATCHER) {
+            sb = new StringBuilder("LocaleMatcher desired:");
+        }
+        for (int bestShiftedDistance = LocaleDistance.shiftDistance(thresholdDistance);;) {
+            if (TRACE_MATCHER) {
+                sb.append(' ').append(desiredLSR);
+            }
              // Quick check for exact maximized LSR.
              Integer index = supportedLsrToIndex.get(desiredLSR);
              if (index != null) {
                  int suppIndex = index;
                  if (TRACE_MATCHER) {
-                    System.err.printf("Returning %s: desiredLSR=supportedLSR\n",
-                            supportedULocales[suppIndex]);
+                    System.err.printf("%s --> best=%s: desiredLSR=supportedLSR\n",
+                            sb, supportedULocales[suppIndex]);
                  }
                  if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); }
                  return suppIndex;
              }
              int bestIndexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
-                    desiredLSR, supportedLSRs, bestDistance, favorSubtag);
+                    desiredLSR, supportedLSRs, bestShiftedDistance, favorSubtag);
              if (bestIndexAndDistance >= 0) {
-                bestDistance = bestIndexAndDistance & 0xff;
+                bestShiftedDistance = LocaleDistance.getShiftedDistance(bestIndexAndDistance);
                  if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); }
-                bestSupportedLsrIndex = bestIndexAndDistance >> 8;
+                bestSupportedLsrIndex = LocaleDistance.getIndex(bestIndexAndDistance);
              }
-            if ((bestDistance -= demotionPerDesiredLocale) <= 0) {
+            if ((bestShiftedDistance -= LocaleDistance.shiftDistance(demotionPerDesiredLocale))
+                    <= 0) {
                  break;
              }
              if (remainingIter == null || !remainingIter.hasNext()) {
@@ -968,14 +979,14 @@ public final class LocaleMatcher {
          }
          if (bestSupportedLsrIndex < 0) {
              if (TRACE_MATCHER) {
-                System.err.printf("Returning default %s: no good match\n", defaultULocale);
+                System.err.printf("%s --> best=default %s: no good match\n", sb, defaultULocale);
              }
              return -1;
          }
          int suppIndex = supportedIndexes[bestSupportedLsrIndex];
          if (TRACE_MATCHER) {
-            System.err.printf("Returning %s: best matching supported locale\n",
-                    supportedULocales[suppIndex]);
+            System.err.printf("%s --> best=%s: best matching supported locale\n",
+                    sb, supportedULocales[suppIndex]);
          }
          return suppIndex;
      }
@@ -1000,11 +1011,16 @@ public final class LocaleMatcher {
      @Deprecated
      public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) {
          // Returns the inverse of the distance: That is, 1-distance(desired, supported).
-        int distance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
+        int indexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
                  getMaximalLsrOrUnd(desired),
                  new LSR[] { getMaximalLsrOrUnd(supported) },
-                thresholdDistance, favorSubtag) & 0xff;
-        return (100 - distance) / 100.0;
+                LocaleDistance.shiftDistance(thresholdDistance), favorSubtag);
+        double distance = LocaleDistance.getDistanceDouble(indexAndDistance);
+        if (TRACE_MATCHER) {
+            System.err.printf("LocaleMatcher distance(desired=%s, supported=%s)=%g\n",
+                Objects.toString(desired), Objects.toString(supported), distance);
+        }
+        return (100.0 - distance) / 100.0;
      }
  
      /**
@@ -1032,16 +1048,17 @@ public final class LocaleMatcher {
      @Override
      public String toString() {
          StringBuilder s = new StringBuilder().append("{LocaleMatcher");
-        if (supportedULocales.length > 0) {
-            s.append(" supported={").append(supportedULocales[0].toString());
-            for (int i = 1; i < supportedULocales.length; ++i) {
-                s.append(", ").append(supportedULocales[i].toString());
+        // Supported languages in the order that we try to match them.
+        if (supportedLSRs.length > 0) {
+            s.append(" supportedLSRs={").append(supportedLSRs[0].toString());
+            for (int i = 1; i < supportedLSRs.length; ++i) {
+                s.append(", ").append(supportedLSRs[i].toString());
              }
              s.append('}');
          }
          s.append(" default=").append(Objects.toString(defaultULocale));
          if (favorSubtag != null) {
-            s.append(" distance=").append(favorSubtag.toString());
+            s.append(" favor=").append(favorSubtag.toString());
          }
          if (thresholdDistance >= 0) {
              s.append(String.format(" threshold=%d", thresholdDistance));
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt

index 21c9b601410e07f0d50527d486e28d0206ba8db7..649c95baea58f566ac8cadaa92b5145c345ef9f1 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt
@@ -1052,9 +1052,9 @@ en >> en-DE
  ar-EG >> ar-SY
  pt-BR >> pt
  ar-XB >> ar-XB
-ar-PSBIDI >> ar-XB  # These are equivalent.
+ar-PSBIDI >> ar-PSBIDI
  en-XA >> en-XA
-en-PSACCENT >> en-XA  # These are equivalent.
+en-PSACCENT >> en-PSACCENT
  ar-PSCRACK >> ar-PSCRACK
  
  @favor=script
@@ -1063,9 +1063,9 @@ en >> en-DE
  ar-EG >> ar-SY
  pt-BR >> pt
  ar-XB >> ar-XB
-ar-PSBIDI >> ar-XB  # These are equivalent.
+ar-PSBIDI >> ar-PSBIDI
  en-XA >> en-XA
-en-PSACCENT >> en-XA  # These are equivalent.
+en-PSACCENT >> en-PSACCENT
  ar-PSCRACK >> ar-PSCRACK
  
  ** test: BestMatchForTraditionalChinese
@@ -1544,50 +1544,44 @@ zh-TW, en >> en-US
  zh-Hant-CN, en >> en-US
  zh-Hans, en >> zh-Hans-CN
  
-** test: return first among likely-subtags equivalent locales
-# Was: more specific script should win in case regions are identical
-# with some different results.
+** test: return most originally similar among likely-subtags equivalent locales
  @supported=af, af-Latn, af-Arab
  af >> af
  af-ZA >> af
-af-Latn-ZA >> af
-af-Latn >> af
+af-Latn-ZA >> af-Latn
+af-Latn >> af-Latn
  
  @favor=script
  af >> af
  af-ZA >> af
-af-Latn-ZA >> af
-af-Latn >> af
+af-Latn-ZA >> af-Latn
+af-Latn >> af-Latn
  
-# Was: more specific region should win
-# with some different results.
  @supported=nl, nl-NL, nl-BE
  @favor=
  nl >> nl
  nl-Latn >> nl
-nl-Latn-NL >> nl
-nl-NL >> nl
+nl-Latn-NL >> nl-NL
+nl-NL >> nl-NL
  
  @favor=script
  nl >> nl
  nl-Latn >> nl
-nl-Latn-NL >> nl
-nl-NL >> nl
+nl-Latn-NL >> nl-NL
+nl-NL >> nl-NL
  
-# Was: more specific region wins over more specific script
-# with some different results.
  @supported=nl, nl-Latn, nl-NL, nl-BE
  @favor=
  nl >> nl
-nl-Latn >> nl
-nl-NL >> nl
-nl-Latn-NL >> nl
+nl-Latn >> nl-Latn
+nl-NL >> nl-NL
+nl-Latn-NL >> nl-Latn
  
  @favor=script
  nl >> nl
-nl-Latn >> nl
-nl-NL >> nl
-nl-Latn-NL >> nl
+nl-Latn >> nl-Latn
+nl-NL >> nl-NL
+nl-Latn-NL >> nl-Latn
  
  ** test: region may replace matched if matched is enclosing
  @supported=es-419, es
@@ -1670,22 +1664,22 @@ ja-Jpan-JP, en-GB >> ja
  ** test: pick best maximized tag
  @supported=ja, ja-Jpan-US, ja-JP, en, ru
  ja-Jpan, ru >> ja
-ja-JP, ru >> ja
+ja-JP, ru >> ja-JP
  ja-US, ru >> ja-Jpan-US
  
  @favor=script
  ja-Jpan, ru >> ja
-ja-JP, ru >> ja
+ja-JP, ru >> ja-JP
  ja-US, ru >> ja-Jpan-US
  
  ** test: termination: pick best maximized match
  @supported=ja, ja-Jpan, ja-JP, en, ru
-ja-Jpan-JP, ru >> ja
-ja-Jpan, ru >> ja
+ja-Jpan-JP, ru >> ja-Jpan
+ja-Jpan, ru >> ja-Jpan
  
  @favor=script
-ja-Jpan-JP, ru >> ja
-ja-Jpan, ru >> ja
+ja-Jpan-JP, ru >> ja-Jpan
+ja-Jpan, ru >> ja-Jpan
  
  ** test: same language over exact, but distinguish when user is explicit
  @supported=fr, en-GB, ja, es-ES, es-MX
@@ -1900,7 +1894,7 @@ zh-TW >> zh
  ** test: testGetBestMatchWithMinMatchScore
  @supported=fr-FR, fr, fr-CA, en
  @default=und
-fr >> fr-FR # First likely-subtags equivalent match is chosen.
+fr >> fr
  @supported=en, fr, fr-CA
  fr-FR >> fr # Parent match is chosen.
  @supported=en, fr-CA
@@ -1930,7 +1924,7 @@ ru >> und
  
  @favor=script
  @supported=fr-FR, fr, fr-CA, en
-fr >> fr-FR
+fr >> fr
  @supported=en, fr, fr-CA
  fr-FR >> fr
  @supported=en, fr-CA
diff --git a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java

index 813d6f8f81badb05ed7670052cd707b7ca50ad66..6a7f6b8c6400c69bd66378a451a339fa7b7471d8 100644 (file)
--- a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java
+++ b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java
@@ -139,10 +139,11 @@ public class LikelySubtagsBuilder {
          Map<LSR, Integer> lsrIndexes = new LinkedHashMap<>();
          // Reserve index 0 as "no value":
          // The runtime lookup returns 0 for an intermediate match with no value.
-        lsrIndexes.put(new LSR("", "", ""), 0);  // arbitrary LSR
+        lsrIndexes.put(new LSR("", "", "", LSR.DONT_CARE_FLAGS), 0);  // arbitrary LSR
          // Reserve index 1 for SKIP_SCRIPT:
          // The runtime lookup returns 1 for an intermediate match with a value.
-        lsrIndexes.put(new LSR("skip", "script", ""), 1);  // looks good when printing the data
+        // This LSR looks good when printing the data.
+        lsrIndexes.put(new LSR("skip", "script", "", LSR.DONT_CARE_FLAGS), 1);
          // We could prefill the lsrList with common locales to give them small indexes,
          // and see if that improves performance a little.
          for (Map.Entry<String, Map<String, Map<String, LSR>>> ls :  langTable.entrySet()) {
@@ -251,7 +252,7 @@ public class LikelySubtagsBuilder {
              }
          }
          // hack
-        set(result, "und", "Latn", "", new LSR("en", "Latn", "US"));
+        set(result, "und", "Latn", "", new LSR("en", "Latn", "US", LSR.DONT_CARE_FLAGS));
  
          // hack, ensure that if und-YY => und-Xxxx-YY, then we add Xxxx=>YY to the table
          // <likelySubtag from="und_GH" to="ak_Latn_GH"/>
@@ -294,7 +295,9 @@ public class LikelySubtagsBuilder {
          String lang = parts[0];
          String p2 = parts.length < 2 ? "" : parts[1];
          String p3 = parts.length < 3 ? "" : parts[2];
-        return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3);
+        return p2.length() < 4 ?
+                new LSR(lang, "", p2, LSR.DONT_CARE_FLAGS) :
+                new LSR(lang, p2, p3, LSR.DONT_CARE_FLAGS);
      }
  
      private static void set(Map<String, Map<String, Map<String, LSR>>> langTable,
diff --git a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java

index a104c35ef0723e21eb62792ac0df71bd43f0c4ad..43b3cf856bc134f01d5f7ae5844d7c45bbb6ccac 100644 (file)
--- a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java
+++ b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java
@@ -487,7 +487,10 @@ public final class LocaleDistanceBuilder {
          Set<LSR> paradigmLSRs = new HashSet<>();  // could be TreeSet if LSR were Comparable
          for (String paradigm : paradigms) {
              ULocale pl = new ULocale(paradigm);
-            paradigmLSRs.add(XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl));
+            LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl);
+            // Clear the LSR flags to make the data equality test in
+            // LocaleDistanceTest happy.
+            paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS));
          }
  
          TerritoryContainment tc = new TerritoryContainment(supplementalData);
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 20 Dec 2019 00:09:10 +0000 (00:09 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 20 Dec 2019 17:36:57 +0000 (09:36 -0800)
icu4c/source/common/localematcher.cpp		patch \| blob \| history
icu4c/source/common/locdistance.cpp		patch \| blob \| history
icu4c/source/common/locdistance.h		patch \| blob \| history
icu4c/source/common/loclikelysubtags.cpp		patch \| blob \| history
icu4c/source/common/lsr.cpp		patch \| blob \| history
icu4c/source/common/lsr.h		patch \| blob \| history
icu4c/source/test/testdata/localeMatcherTest.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt		patch \| blob \| history
icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java		patch \| blob \| history
icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java		patch \| blob \| history