]> granicus.if.org Git - icu/commitdiff
ICU-21284 Correctly normalize Unit Identifiers
author Hugo van der Merwe <17109322+hugovdm@users.noreply.github.com>
Mon, 8 Feb 2021 21:16:57 +0000 (21:16 +0000)
committer Hugo van der Merwe <17109322+hugovdm@users.noreply.github.com>
Tue, 9 Feb 2021 11:52:05 +0000 (12:52 +0100)
See #1527

15 files changed:
icu4c/source/data/misc/units.txt
icu4c/source/i18n/measunit_extra.cpp
icu4c/source/i18n/measunit_impl.h
icu4c/source/i18n/units_data.cpp
icu4c/source/i18n/units_data.h
icu4c/source/i18n/units_router.cpp
icu4c/source/test/intltest/measfmttest.cpp
icu4c/source/test/intltest/units_data_test.cpp
icu4j/main/classes/core/src/com/ibm/icu/impl/units/SingleUnitImpl.java
icu4j/main/classes/core/src/com/ibm/icu/impl/units/UnitsData.java
icu4j/main/shared/data/icudata.jar
icu4j/main/shared/data/icutzdata.jar
icu4j/main/shared/data/testdata.jar
icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/MeasureUnitTest.java
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_supplemental.txt

index f8c86457d2908840f276d065c2bfa926cd7cf842..efdb824207bcbf32856149c9d8d08b0f3da5e9f3 100644 (file)
@@ -1904,50 +1904,140 @@ units:table(nofallback){
         }
     }
     unitQuantities{
-        ampere{"electric-current"}
-        ampere-per-meter{"magnetic-field-strength"}
-        ampere-per-square-meter{"current-density"}
-        bit{"digital"}
-        candela{"luminous-intensity"}
-        candela-per-square-meter{"illuminance"}
-        candela-square-meter-per-square-meter{"luminous-flux"}
-        cubic-meter{"volume"}
-        cubic-meter-per-kilogram{"specific-volume"}
-        cubic-meter-per-meter{"consumption"}
-        cubic-second-square-ampere-per-kilogram-square-meter{"electric-conductance"}
-        em{"typewidth"}
-        item{"substance-amount"}
-        item-per-cubic-meter{"concentration"}
-        item-per-kilogram{"concentration-mass"}
-        kelvin{"temperature"}
-        kilogram{"mass"}
-        kilogram-meter-per-square-second{"force"}
-        kilogram-per-cubic-meter{"mass-density"}
-        kilogram-per-kilogram{"mass-fraction"}
-        kilogram-per-meter-square-second{"pressure"}
-        kilogram-per-square-meter-square-second{"pressure-per-length"}
-        kilogram-per-square-second-ampere{"magnetic-induction"}
-        kilogram-square-meter-per-cubic-second{"power"}
-        kilogram-square-meter-per-cubic-second-ampere{"voltage"}
-        kilogram-square-meter-per-cubic-second-square-ampere{"electric-resistance"}
-        kilogram-square-meter-per-square-second{"energy"}
-        kilogram-square-meter-per-square-second-ampere{"magnetic-flux"}
-        kilogram-square-meter-per-square-second-square-ampere{"electric-inductance"}
-        meter{"length"}
-        meter-per-second{"speed"}
-        meter-per-square-second{"acceleration"}
-        pixel{"graphics"}
-        pixel-per-meter{"resolution"}
-        portion{"portion"}
-        pow4-second-square-ampere-per-kilogram-square-meter{"electric-capacitance"}
-        revolution{"angle"}
-        revolution-per-meter{"wave-number"}
-        revolution-per-second{"frequency"}
-        second{"duration"}
-        second-ampere{"electric-charge"}
-        square-meter{"area"}
-        square-meter-per-square-second{"dose"}
-        square-revolution{"solid-angle"}
-        year{"year-duration"}
+        {
+            candela{"luminous-intensity"}
+        }
+        {
+            candela-per-square-meter{"illuminance"}
+        }
+        {
+            candela-square-meter-per-square-meter{"luminous-flux"}
+        }
+        {
+            kilogram{"mass"}
+        }
+        {
+            kilogram-per-kilogram{"mass-fraction"}
+        }
+        {
+            kilogram-per-cubic-meter{"mass-density"}
+        }
+        {
+            kilogram-per-meter-square-second{"pressure"}
+        }
+        {
+            kilogram-per-square-second-ampere{"magnetic-induction"}
+        }
+        {
+            kilogram-meter-per-square-second{"force"}
+        }
+        {
+            kilogram-square-meter-per-cubic-second{"power"}
+        }
+        {
+            kilogram-square-meter-per-cubic-second-ampere{"voltage"}
+        }
+        {
+            kilogram-square-meter-per-cubic-second-square-ampere{"electric-resistance"}
+        }
+        {
+            kilogram-square-meter-per-square-second{"energy"}
+        }
+        {
+            kilogram-square-meter-per-square-second-ampere{"magnetic-flux"}
+        }
+        {
+            kilogram-square-meter-per-square-second-square-ampere{"electric-inductance"}
+        }
+        {
+            cubic-meter{"volume"}
+        }
+        {
+            cubic-meter-per-kilogram{"specific-volume"}
+        }
+        {
+            cubic-meter-per-meter{"consumption"}
+        }
+        {
+            square-meter{"area"}
+        }
+        {
+            square-meter-per-square-second{"dose"}
+        }
+        {
+            meter{"length"}
+        }
+        {
+            meter-per-second{"speed"}
+        }
+        {
+            meter-per-square-second{"acceleration"}
+        }
+        {
+            kilogram-per-square-meter-square-second{"pressure-per-length"}
+        }
+        {
+            pow4-second-square-ampere-per-kilogram-square-meter{"electric-capacitance"}
+        }
+        {
+            cubic-second-square-ampere-per-kilogram-square-meter{"electric-conductance"}
+        }
+        {
+            second{"duration"}
+        }
+        {
+            second-ampere{"electric-charge"}
+        }
+        {
+            year{"year-duration"}
+        }
+        {
+            ampere{"electric-current"}
+        }
+        {
+            ampere-per-square-meter{"current-density"}
+        }
+        {
+            ampere-per-meter{"magnetic-field-strength"}
+        }
+        {
+            kelvin{"temperature"}
+        }
+        {
+            square-revolution{"solid-angle"}
+        }
+        {
+            revolution{"angle"}
+        }
+        {
+            revolution-per-meter{"wave-number"}
+        }
+        {
+            revolution-per-second{"frequency"}
+        }
+        {
+            item{"substance-amount"}
+        }
+        {
+            item-per-kilogram{"concentration-mass"}
+        }
+        {
+            item-per-cubic-meter{"concentration"}
+        }
+        {
+            portion{"portion"}
+        }
+        {
+            bit{"digital"}
+        }
+        {
+            pixel{"graphics"}
+        }
+        {
+            pixel-per-meter{"resolution"}
+        }
+        {
+            em{"typewidth"}
+        }
     }
 }
index 9556ae9680bf3beffe50b27fb37e8db17b2759ca..06bf90baf2e23b7907cd8666900c4e587bef0c9e 100644 (file)
@@ -138,14 +138,17 @@ const struct UnitPrefixStrings {
  * A ResourceSink that collects simple unit identifiers from the keys of the
  * convertUnits table into an array, and adds these values to a TrieBuilder,
  * with associated values being their index into this array plus a specified
- * offset, to a trie.
+ * offset.
  *
  * Example code:
  *
  *     UErrorCode status = U_ZERO_ERROR;
  *     BytesTrieBuilder b(status);
- *     const char *unitIdentifiers[200];
- *     SimpleUnitIdentifiersSink identifierSink(unitIdentifiers, 200, b, kTrieValueOffset);
+ *     const int32_t ARR_SIZE = 200;
+ *     const char *unitIdentifiers[ARR_SIZE];
+ *     int32_t unitCategories[ARR_SIZE];
+ *     SimpleUnitIdentifiersSink identifierSink(gSerializedUnitCategoriesTrie, unitIdentifiers,
+ *                                              unitCategories, ARR_SIZE, b, kTrieValueOffset);
  *     LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
  *     ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
  */
@@ -153,20 +156,27 @@ class SimpleUnitIdentifiersSink : public icu::ResourceSink {
   public:
     /**
      * Constructor.
-     * @param out Array of char* to which the simple unit identifiers will be
-     *     saved.
-     * @param outSize The size of `out`.
+     * @param quantitiesTrieData The data for constructing a quantitiesTrie,
+     *     which maps from a simple unit identifier to an index into the
+     *     gCategories array.
+     * @param out Array of char* to which pointers to the simple unit
+     *     identifiers will be saved. (Does not take ownership.)
+     * @param outCategories Array of int32_t to which category indexes will be
+     *     saved: this corresponds to simple unit IDs saved to `out`, mapping
+     *     from the ID to the value produced by the quantitiesTrie (which is an
+     *     index into the gCategories array).
+     * @param outSize The size of `out` and `outCategories`.
      * @param trieBuilder The trie builder to which the simple unit identifier
      *     should be added. The trie builder must outlive this resource sink.
      * @param trieValueOffset This is added to the index of the identifier in
      *     the `out` array, before adding to `trieBuilder` as the value
      *     associated with the identifier.
      */
-    explicit SimpleUnitIdentifiersSink(const char **out, int32_t outSize, BytesTrieBuilder &trieBuilder,
-                                       int32_t trieValueOffset)
-        : outArray(out), outSize(outSize), trieBuilder(trieBuilder), trieValueOffset(trieValueOffset),
-          outIndex(0) {
-    }
+    explicit SimpleUnitIdentifiersSink(StringPiece quantitiesTrieData, const char **out,
+                                       int32_t *outCategories, int32_t outSize,
+                                       BytesTrieBuilder &trieBuilder, int32_t trieValueOffset)
+        : outArray(out), outCategories(outCategories), outSize(outSize), trieBuilder(trieBuilder),
+          trieValueOffset(trieValueOffset), quantitiesTrieData(quantitiesTrieData), outIndex(0) {}
 
     /**
      * Adds the table keys found in value to the output vector.
@@ -186,30 +196,120 @@ class SimpleUnitIdentifiersSink : public icu::ResourceSink {
             return;
         }
 
+        BytesTrie quantitiesTrie(quantitiesTrieData.data());
+
         // Collect keys from the table resource.
-        const char *key;
-        for (int32_t i = 0; table.getKeyAndValue(i, key, value); ++i) {
+        const char *simpleUnitID;
+        for (int32_t i = 0; table.getKeyAndValue(i, simpleUnitID, value); ++i) {
             U_ASSERT(i < table.getSize());
             U_ASSERT(outIndex < outSize);
-            if (uprv_strcmp(key, "kilogram") == 0) {
+            if (uprv_strcmp(simpleUnitID, "kilogram") == 0) {
                 // For parsing, we use "gram", the prefixless metric mass unit. We
                 // thus ignore the SI Base Unit of Mass: it exists due to being the
                 // mass conversion target unit, but not needed for MeasureUnit
                 // parsing.
                 continue;
             }
-            outArray[outIndex] = key;
-            trieBuilder.add(key, trieValueOffset + outIndex, status);
+            outArray[outIndex] = simpleUnitID;
+            trieBuilder.add(simpleUnitID, trieValueOffset + outIndex, status);
+
+            // Find the base target unit for this simple unit
+            ResourceTable table = value.getTable(status);
+            if (U_FAILURE(status)) { return; }
+            if (!table.findValue("target", value)) {
+                status = U_INVALID_FORMAT_ERROR;
+                break;
+            }
+            int32_t len;
+            const UChar* uTarget = value.getString(len, status);
+            CharString target;
+            target.appendInvariantChars(uTarget, len, status);
+            if (U_FAILURE(status)) { return; }
+            quantitiesTrie.reset();
+            UStringTrieResult result = quantitiesTrie.next(target.data(), target.length());
+            if (!USTRINGTRIE_HAS_VALUE(result)) {
+                status = U_INVALID_FORMAT_ERROR;
+                break;
+            }
+            outCategories[outIndex] = quantitiesTrie.getValue();
+
             outIndex++;
         }
     }
 
   private:
     const char **outArray;
+    int32_t *outCategories;
     int32_t outSize;
     BytesTrieBuilder &trieBuilder;
     int32_t trieValueOffset;
 
+    StringPiece quantitiesTrieData;
+
+    int32_t outIndex;
+};
+
+/**
+ * A ResourceSink that collects information from `unitQuantities` in the `units`
+ * resource to provide key->value lookups from base unit to category, as well as
+ * preserving ordering information for these categories. See `units.txt`.
+ *
+ * For example: "kilogram" -> "mass", "meter-per-second" -> "speed".
+ *
+ * In C++ unitQuantity values are collected in order into a UChar* array, while
+ * unitQuantity keys are added to a TrieBuilder, with associated values
+ * being the index into the aforementioned UChar* array.
+ */
+class CategoriesSink : public icu::ResourceSink {
+  public:
+    /**
+     * Constructor.
+     * @param out Array of UChar* to which unitQuantity values will be saved.
+     *     The pointers stored are not owned: they point directly at the
+     *     resource strings in static memory.
+     * @param outSize The size of the `out` array.
+     * @param trieBuilder The trie builder to which the keys (base units) of
+     *     each unitQuantity will be added, each with value being the offset
+     *     into `out`.
+     */
+    explicit CategoriesSink(const UChar **out, int32_t &outSize, BytesTrieBuilder &trieBuilder)
+        : outQuantitiesArray(out), outSize(outSize), trieBuilder(trieBuilder), outIndex(0) {}
+
+    /**
+     * Reads an array resource whose elements are single-entry tables
+     * (base unit -> quantity). For each element, the quantity string pointer
+     * is stored in the output array and the base unit key is added to the
+     * trie builder with the element's output index as its value.
+     *
+     * Sets U_INDEX_OUTOFBOUNDS_ERROR if the array does not fit into the
+     * remaining output space, and U_INVALID_FORMAT_ERROR if an element is not
+     * a table with exactly one entry. Processing stops at the first failure,
+     * so `outIndex` only ever counts fully recorded entries.
+     */
+    void put(const char * /*key*/, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) {
+        ResourceArray array = value.getArray(status);
+        if (U_FAILURE(status)) {
+            return;
+        }
+
+        if (outIndex + array.getSize() > outSize) {
+            status = U_INDEX_OUTOFBOUNDS_ERROR;
+            return;
+        }
+
+        for (int32_t i = 0; array.getValue(i, value); ++i) {
+            U_ASSERT(outIndex < outSize);
+            ResourceTable table = value.getTable(status);
+            if (U_FAILURE(status)) {
+                return;
+            }
+            if (table.getSize() != 1) {
+                status = U_INVALID_FORMAT_ERROR;
+                return;
+            }
+            const char *key;
+            // `value` is reused here to hold the single entry's value.
+            if (!table.getKeyAndValue(0, key, value)) {
+                status = U_INVALID_FORMAT_ERROR;
+                return;
+            }
+            int32_t uTmpLen;
+            const UChar *quantity = value.getString(uTmpLen, status);
+            if (U_FAILURE(status)) {
+                // Do not record a half-read entry or advance outIndex.
+                return;
+            }
+            outQuantitiesArray[outIndex] = quantity;
+            trieBuilder.add(key, outIndex, status);
+            if (U_FAILURE(status)) {
+                return;
+            }
+            outIndex++;
+        }
+    }
+
+  private:
+    const UChar **outQuantitiesArray;
+    int32_t &outSize;
+    BytesTrieBuilder &trieBuilder;
+
     int32_t outIndex;
 };
 
@@ -222,11 +322,34 @@ icu::UInitOnce gUnitExtrasInitOnce = U_INITONCE_INITIALIZER;
 // by SingleUnitImpl::getSimpleUnitID().)
 const char **gSimpleUnits = nullptr;
 
+// Maps from the value associated with each simple unit ID to an index into the
+// gCategories array.
+int32_t *gSimpleUnitCategories = nullptr;
+
 char *gSerializedUnitExtrasStemTrie = nullptr;
 
+// Array of UChar* pointing at the unit categories (aka "quantities", aka
+// "types"), as found in the `unitQuantities` resource. The array memory itself
+// is owned by this pointer, but the individual UChar* in that array point at
+// static memory.
+const UChar **gCategories = nullptr;
+// Number of items in `gCategories`.
+int32_t gCategoriesCount = 0;
+// TODO: rather save an index into gCategories?
+const char *kConsumption = "consumption";
+size_t kConsumptionLen = strlen("consumption");
+// Serialized BytesTrie for mapping from base units to indices into gCategories.
+char *gSerializedUnitCategoriesTrie = nullptr;
+
 UBool U_CALLCONV cleanupUnitExtras() {
+    uprv_free(gSerializedUnitCategoriesTrie);
+    gSerializedUnitCategoriesTrie = nullptr;
+    uprv_free(gCategories);
+    gCategories = nullptr;
     uprv_free(gSerializedUnitExtrasStemTrie);
     gSerializedUnitExtrasStemTrie = nullptr;
+    uprv_free(gSimpleUnitCategories);
+    gSimpleUnitCategories = nullptr;
     uprv_free(gSimpleUnits);
     gSimpleUnits = nullptr;
     gUnitExtrasInitOnce.reset();
@@ -235,6 +358,36 @@ UBool U_CALLCONV cleanupUnitExtras() {
 
 void U_CALLCONV initUnitExtras(UErrorCode& status) {
     ucln_i18n_registerCleanup(UCLN_I18N_UNIT_EXTRAS, cleanupUnitExtras);
+    LocalUResourceBundlePointer unitsBundle(ures_openDirect(nullptr, "units", &status));
+
+    // Collect unitQuantities information into gSerializedUnitCategoriesTrie and gCategories.
+    const char *CATEGORY_TABLE_NAME = "unitQuantities";
+    LocalUResourceBundlePointer unitQuantities(
+        ures_getByKey(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, nullptr, &status));
+    if (U_FAILURE(status)) { return; }
+    gCategoriesCount = unitQuantities.getAlias()->fSize;
+    size_t quantitiesMallocSize = sizeof(UChar *) * gCategoriesCount;
+    gCategories = static_cast<const UChar **>(uprv_malloc(quantitiesMallocSize));
+    if (gCategories == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    uprv_memset(gCategories, 0, quantitiesMallocSize);
+    BytesTrieBuilder quantitiesBuilder(status);
+    CategoriesSink categoriesSink(gCategories, gCategoriesCount, quantitiesBuilder);
+    ures_getAllItemsWithFallback(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, categoriesSink, status);
+    StringPiece resultQuantities = quantitiesBuilder.buildStringPiece(USTRINGTRIE_BUILD_FAST, status);
+    if (U_FAILURE(status)) { return; }
+    // Copy the result into the global constant pointer
+    size_t numBytesQuantities = resultQuantities.length();
+    gSerializedUnitCategoriesTrie = static_cast<char *>(uprv_malloc(numBytesQuantities));
+    if (gSerializedUnitCategoriesTrie == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    uprv_memcpy(gSerializedUnitCategoriesTrie, resultQuantities.data(), numBytesQuantities);
+
+    // Build the BytesTrie that Parser needs for parsing unit identifiers.
 
     BytesTrieBuilder b(status);
     if (U_FAILURE(status)) { return; }
@@ -270,11 +423,8 @@ void U_CALLCONV initUnitExtras(UErrorCode& status) {
 
     // Add sanctioned simple units by offset: simple units all have entries in
     // units/convertUnits resources.
-    // TODO(ICU-21059): confirm whether this is clean enough, or whether we need to
-    // filter units' validity list instead.
-    LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
     LocalUResourceBundlePointer convertUnits(
-        ures_getByKey(unitsBundle.getAlias(), "convertUnits", NULL, &status));
+        ures_getByKey(unitsBundle.getAlias(), "convertUnits", nullptr, &status));
     if (U_FAILURE(status)) { return; }
 
     // Allocate enough space: with identifierSink below skipping kilogram, we're
@@ -287,9 +437,17 @@ void U_CALLCONV initUnitExtras(UErrorCode& status) {
         return;
     }
     uprv_memset(gSimpleUnits, 0, arrayMallocSize);
+    arrayMallocSize = sizeof(int32_t) * simpleUnitsCount;
+    gSimpleUnitCategories = static_cast<int32_t *>(uprv_malloc(arrayMallocSize));
+    if (gSimpleUnitCategories == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    uprv_memset(gSimpleUnitCategories, 0, arrayMallocSize);
 
     // Populate gSimpleUnits and build the associated trie.
-    SimpleUnitIdentifiersSink identifierSink(gSimpleUnits, simpleUnitsCount, b, kSimpleUnitOffset);
+    SimpleUnitIdentifiersSink identifierSink(resultQuantities, gSimpleUnits, gSimpleUnitCategories,
+                                             simpleUnitsCount, b, kSimpleUnitOffset);
     ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
 
     // Build the CharsTrie
@@ -648,6 +806,23 @@ compareSingleUnits(const void* /*context*/, const void* left, const void* right)
     return (*realLeft)->compareTo(**realRight);
 }
 
+// Looks up the given base unit identifier in the serialized unit-categories
+// trie and returns the associated value, which is an index into the
+// gCategories array (the "unitQuantity", aka "type" or "category"). If
+// initialization fails, returns -1 with that failure left in `status`; if the
+// identifier has no value in the trie, returns -1 and sets `status` to
+// U_UNSUPPORTED_ERROR.
+int32_t getUnitCategoryIndex(StringPiece baseUnitIdentifier, UErrorCode &status) {
+    umtx_initOnce(gUnitExtrasInitOnce, &initUnitExtras, status);
+    if (U_SUCCESS(status)) {
+        BytesTrie categoriesTrie(gSerializedUnitCategoriesTrie);
+        UStringTrieResult lookup =
+            categoriesTrie.next(baseUnitIdentifier.data(), baseUnitIdentifier.length());
+        if (USTRINGTRIE_HAS_VALUE(lookup)) {
+            return categoriesTrie.getValue();
+        }
+        status = U_UNSUPPORTED_ERROR;
+    }
+    return -1;
+}
+
 } // namespace
 
 U_CAPI int32_t U_EXPORT2
@@ -672,6 +847,33 @@ umeas_getPrefixBase(UMeasurePrefix unitPrefix) {
     return 10;
 }
 
+// Returns the "unitQuantity" (category) name for the given base unit
+// identifier as a CharString. On failure an empty CharString is returned and
+// `status` is set to U_INVALID_FORMAT_ERROR. If `status` already indicates a
+// failure on entry, an empty CharString is returned and `status` is left
+// untouched.
+CharString U_I18N_API getUnitQuantity(StringPiece baseUnitIdentifier, UErrorCode &status) {
+    CharString result;
+    U_ASSERT(result.length() == 0);
+    if (U_FAILURE(status)) {
+        return result;
+    }
+    // Use a local status so a failed trie lookup can still fall through to the
+    // special case below without polluting the caller's status.
+    UErrorCode localStatus = U_ZERO_ERROR;
+    int32_t idx = getUnitCategoryIndex(baseUnitIdentifier, localStatus);
+    if (U_FAILURE(localStatus)) {
+        // TODO(icu-units#130): support inverting any unit, with correct
+        // fallback logic: inversion and fallback may depend on presence or
+        // absence of a usage for that category.
+        //
+        // Compare as StringPieces (length-aware): a StringPiece's data() is
+        // not guaranteed to be NUL-terminated, so strcmp() on it could read
+        // past the end of the identifier.
+        if (baseUnitIdentifier == StringPiece("meter-per-cubic-meter")) {
+            result.append(kConsumption, (int32_t)kConsumptionLen, status);
+            return result;
+        }
+        status = U_INVALID_FORMAT_ERROR;
+        return result;
+    }
+    // Defensive range check before indexing into gCategories.
+    if (idx < 0 || idx >= gCategoriesCount) {
+        status = U_INVALID_FORMAT_ERROR;
+        return result;
+    }
+    result.appendInvariantChars(gCategories[idx], u_strlen(gCategories[idx]), status);
+    return result;
+}
+
 // In ICU4J, this is MeasureUnit.getSingleUnitImpl().
 SingleUnitImpl SingleUnitImpl::forMeasureUnit(const MeasureUnit& measureUnit, UErrorCode& status) {
     MeasureUnitImpl temp;
@@ -743,6 +945,10 @@ void SingleUnitImpl::appendNeutralIdentifier(CharString &result, UErrorCode &sta
     result.append(StringPiece(this->getSimpleUnitID()), status);
 }
 
+// Returns the gSimpleUnitCategories entry at this unit's `index`.
+// NOTE(review): no null or bounds check is done here; `index` is documented as
+// -1 for the dimensionless unit, which would index before the start of the
+// array -- confirm callers never reach this for such units.
+int32_t SingleUnitImpl::getUnitCategoryIndex() const {
+    return gSimpleUnitCategories[index];
+}
+
 MeasureUnitImpl::MeasureUnitImpl(const MeasureUnitImpl &other, UErrorCode &status) {
     *this = other.copy(status);
 }
index bb8575c84430ce74d5f7148bef64745bce1e7aa4..0f5aac488d2022ad396aa217053d9bbaed78ae3f 100644 (file)
@@ -41,6 +41,20 @@ struct U_I18N_API MeasureUnitImplWithIndex : public UMemory {
         : index(index), unitImpl(unitImpl) {}
 };
 
+/**
+ * Looks up the "unitQuantity" (aka "type" or "category") of a base unit
+ * identifier. The category is returned as a CharString.
+ *
+ * This only supports base units: other units must be resolved to base units
+ * before passing to this function, otherwise `status` will be set to
+ * U_INVALID_FORMAT_ERROR.
+ *
+ * Categories are found in `unitQuantities` in the `units` resource (see
+ * `units.txt`).
+ */
+CharString U_I18N_API getUnitQuantity(StringPiece baseUnitIdentifier, UErrorCode &status);
+
 /**
  * A struct representing a single unit (optional SI or binary prefix, and dimensionality).
  */
@@ -70,10 +84,20 @@ struct U_I18N_API SingleUnitImpl : public UMemory {
      */
     void appendNeutralIdentifier(CharString &result, UErrorCode &status) const;
 
+    /**
+     * Returns the index of this unit's "quantity" in unitQuantities (in
+     * measunit_extra.cpp). The value of this index determines sort order for
+     * normalization of unit identifiers.
+     */
+    int32_t getUnitCategoryIndex() const;
+
     /**
      * Compare this SingleUnitImpl to another SingleUnitImpl for the sake of
      * sorting and coalescing.
      *
+     * Sort order of units is specified by UTS #35
+     * (https://unicode.org/reports/tr35/tr35-info.html#Unit_Identifier_Normalization).
+     *
      * Takes the sign of dimensionality into account, but not the absolute
      * value: per-meter is not considered the same as meter, but meter is
      * considered the same as square-meter.
@@ -90,6 +114,16 @@ struct U_I18N_API SingleUnitImpl : public UMemory {
         if (dimensionality > 0 && other.dimensionality < 0) {
             return -1;
         }
+        // Sort by official quantity order
+        int32_t thisQuantity = this->getUnitCategoryIndex();
+        int32_t otherQuantity = other.getUnitCategoryIndex();
+        if (thisQuantity < otherQuantity) {
+            return -1;
+        }
+        if (thisQuantity > otherQuantity) {
+            return 1;
+        }
+        // If quantity order didn't help, then we go by index.
         if (index < other.index) {
             return -1;
         }
@@ -128,7 +162,8 @@ struct U_I18N_API SingleUnitImpl : public UMemory {
 
     /**
      * Simple unit index, unique for every simple unit, -1 for the dimensionless
-     * unit. This is an index into a string list in measunit_extra.cpp.
+     * unit. This is an index into a string list in measunit_extra.cpp, as
+     * loaded by SimpleUnitIdentifiersSink.
      *
      * The default value is -1, meaning the dimensionless unit:
      * isDimensionless() will return true, until index is changed.
index 2d94a851a18eb21f990b3fcea19e2ae774c94b88..61f537479fa9086aa3be98781f4607b26d3f8084 100644 (file)
@@ -364,29 +364,6 @@ int32_t UnitPreferenceMetadata::compareTo(const UnitPreferenceMetadata &other, b
     return cmp;
 }
 
-CharString U_I18N_API getUnitCategory(const char *baseUnitIdentifier, UErrorCode &status) {
-    CharString result;
-    LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
-    LocalUResourceBundlePointer unitQuantities(
-        ures_getByKey(unitsBundle.getAlias(), "unitQuantities", NULL, &status));
-    int32_t categoryLength;
-    if (U_FAILURE(status)) { return result; }
-    const UChar *uCategory =
-        ures_getStringByKey(unitQuantities.getAlias(), baseUnitIdentifier, &categoryLength, &status);
-    if (U_FAILURE(status)) {
-        // TODO(icu-units#130): support inverting any unit, with correct
-        // fallback logic: inversion and fallback may depend on presence or
-        // absence of a usage for that category.
-        if (uprv_strcmp(baseUnitIdentifier, "meter-per-cubic-meter") == 0) {
-            status = U_ZERO_ERROR;
-            result.append("consumption", status);
-            return result;
-        }
-    }
-    result.appendInvariantChars(uCategory, categoryLength, status);
-    return result;
-}
-
 // TODO: this may be unnecessary. Fold into ConversionRates class? Or move to anonymous namespace?
 void U_I18N_API getAllConversionRates(MaybeStackVector<ConversionRateInfo> &result, UErrorCode &status) {
     LocalUResourceBundlePointer unitsBundle(ures_openDirect(NULL, "units", &status));
index b6fe8e88de3c2ef71cbe7e2f793d5537133d138b..2c19b9434bd02b49180544e46902112c782a8ec9 100644 (file)
 U_NAMESPACE_BEGIN
 namespace units {
 
-/**
- * Looks up the unit category of a base unit identifier.
- *
- * Only supports base units, other units must be resolved to base units before
- * passing to this function.
- *
- * Categories are found in `unitQuantities` in the `units` resource (see
- * `units.txt`).
- *
- * TODO(hugovdm): if we give units_data.cpp access to the functionality of
- * `extractCompoundBaseUnit` which is currently in units_converter.cpp, we could
- * support all units for which there is a category. Does it make sense to move
- * that function to units_data.cpp?
- */
-CharString U_I18N_API getUnitCategory(const char *baseUnitIdentifier, UErrorCode &status);
-
 /**
  * Encapsulates "convertUnits" information from units resources, specifying how
  * to convert from one unit to another.
index 882077bc2ddcd4977c5677d7cf204ca4461e528d..9fc389d4395d2fb790238b8838aafdf6105c3459 100644 (file)
@@ -53,13 +53,18 @@ UnitsRouter::UnitsRouter(MeasureUnit inputUnit, StringPiece region, StringPiece
     MeasureUnitImpl inputUnitImpl = MeasureUnitImpl::forMeasureUnitMaybeCopy(inputUnit, status);
     MeasureUnit baseUnit =
         (extractCompoundBaseUnit(inputUnitImpl, conversionRates, status)).build(status);
-    CharString category = getUnitCategory(baseUnit.getIdentifier(), status);
+    CharString category = getUnitQuantity(baseUnit.getIdentifier(), status);
+    if (U_FAILURE(status)) {
+        return;
+    }
 
     const UnitPreference *const *unitPreferences;
     int32_t preferencesCount = 0;
-    prefs.getPreferencesFor(category.data(), usage, region, unitPreferences, preferencesCount, status);
+    prefs.getPreferencesFor(category.toStringPiece(), usage, region, unitPreferences, preferencesCount,
+                            status);
 
     for (int i = 0; i < preferencesCount; ++i) {
+        U_ASSERT(unitPreferences[i] != nullptr);
         const auto &preference = *unitPreferences[i];
 
         MeasureUnitImpl complexTargetUnitImpl =
index 3af2e1eb05273b8f15b83e07dbcd097700002592..3f429cd3fab978f02004af3ce97bacb10f0b7d6b 100644 (file)
@@ -3672,13 +3672,12 @@ void MeasureFormatTest::TestIdentifiers() {
         {"kilometer-per-second-per-megaparsec", "kilometer-per-megaparsec-second"},
 
         // TODO(ICU-21284): Add more test cases once the proper ranking is available.
-        // TODO(ICU-21284,icu-units#70): These cases are the wrong way around:
-        {"pound-force-foot", "foot-pound-force"},
-        {"foot-pound-force", "foot-pound-force"},
-        {"kilowatt-hour", "hour-kilowatt"},
-        {"hour-kilowatt", "hour-kilowatt"},
-        {"newton-meter", "meter-newton"},
-        {"meter-newton", "meter-newton"},
+        {"newton-meter", "newton-meter"},
+        {"meter-newton", "newton-meter"},
+        {"pound-force-foot", "pound-force-foot"},
+        {"foot-pound-force", "pound-force-foot"},
+        {"kilowatt-hour", "kilowatt-hour"},
+        {"hour-kilowatt", "kilowatt-hour"},
 
         // Testing prefixes are parsed and produced correctly (ensures no
         // collisions in the enum values)
index 405c4e4a5b8ef3da84fc77133c69d9976985e1be..a5ffa7f1170dccc4762e8129e3b3263e050ca474 100644 (file)
@@ -5,7 +5,9 @@
 
 #if !UCONFIG_NO_FORMATTING
 
+#include "measunit_impl.h"
 #include "units_data.h"
+
 #include "intltest.h"
 
 using namespace ::icu::units;
@@ -51,9 +53,10 @@ void UnitsDataTest::testGetUnitCategory() {
 
     IcuTestErrorCode status(*this, "testGetUnitCategory");
     for (const auto &t : testCases) {
-        CharString category = getUnitCategory(t.unit, status);
-        status.errIfFailureAndReset("getUnitCategory(%s)", t.unit);
-        assertEquals("category", t.expectedCategory, category.data());
+        CharString category = getUnitQuantity(t.unit, status);
+        if (!status.errIfFailureAndReset("getUnitCategory(%s)", t.unit)) {
+            assertEquals("category", t.expectedCategory, category.data());
+        }
     }
 }
 
index fd35f44de3eab3fe8b10740459677ee1260932a7..dd66d742a82a24d064391dadc2ba8c6591253d63 100644 (file)
@@ -5,6 +5,9 @@ package com.ibm.icu.impl.units;
 
 import com.ibm.icu.util.MeasureUnit;
 
+// TODO: revisit documentation in this file. E.g. we don't do dimensionless
+// units in Java? We use null instead.
+
 /**
  * A class representing a single unit (optional SI or binary prefix, and dimensionality).
  */
@@ -84,6 +87,9 @@ public class SingleUnitImpl {
      * Compare this SingleUnitImpl to another SingleUnitImpl for the sake of
      * sorting and coalescing.
      * <p>
+     * Sort order of units is specified by UTS #35
+     * (https://unicode.org/reports/tr35/tr35-info.html#Unit_Identifier_Normalization).
+     * <p>
      * Takes the sign of dimensionality into account, but not the absolute
      * value: per-meter is not considered the same as meter, but meter is
      * considered the same as square-meter.
@@ -100,6 +106,16 @@ public class SingleUnitImpl {
         if (dimensionality > 0 && other.dimensionality < 0) {
             return -1;
         }
+        // Sort by official quantity order
+        int thisCategoryIndex = UnitsData.getCategoryIndexOfSimpleUnit(index);
+        int otherCategoryIndex = UnitsData.getCategoryIndexOfSimpleUnit(other.index);
+        if (thisCategoryIndex < otherCategoryIndex) {
+            return -1;
+        }
+        if (thisCategoryIndex > otherCategoryIndex) {
+            return 1;
+        }
+        // If quantity order didn't help, then we go by index.
         if (index < other.index) {
             return -1;
         }
@@ -158,6 +174,7 @@ public class SingleUnitImpl {
         this.unitPrefix = unitPrefix;
     }
 
+    // TODO: unused? Delete?
     public int getIndex() {
         return index;
     }
index 19ef8f70702325935c49aa6552eda7b73a7921e9..910fa92aa48571b1113e08a499d228cd46951148 100644 (file)
@@ -1,11 +1,11 @@
 // © 2020 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 
-
 package com.ibm.icu.impl.units;
 
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 
 import com.ibm.icu.impl.ICUData;
 import com.ibm.icu.impl.ICUResourceBundle;
@@ -17,33 +17,38 @@ import com.ibm.icu.util.UResourceBundle;
  * Responsible for all units data operations (retriever, analysis, extraction certain data ... etc.).
  */
 public class UnitsData {
-    private volatile static String[] simpleUnits = null;
+    // TODO(icu-units#122): this class can use static initialization to load the
+    // data once, and provide access to it via static methods. (Partial change
+    // has been done already.)
+
+    // Array of simple unit IDs.
+    private static String[] simpleUnits = null;
+
+    // Maps from each simple unit's index (its position in simpleUnits) to a
+    // category index number.
+    private static int[] simpleUnitCategories = null;
+
     private ConversionRates conversionRates;
     private UnitPreferences unitPreferences;
-    /**
-     * Pairs of categories and the corresponding base units.
-     */
-    private Categories categories;
+
 
     public UnitsData() {
         this.conversionRates = new ConversionRates();
         this.unitPreferences = new UnitPreferences();
-        this.categories = new Categories();
     }
 
     public static String[] getSimpleUnits() {
-        if (simpleUnits != null) {
-            return simpleUnits;
-        }
+        return simpleUnits;
+    }
 
+    static {
         // Read simple units
         ICUResourceBundle resource;
         resource = (ICUResourceBundle) UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, "units");
         SimpleUnitIdentifiersSink sink = new SimpleUnitIdentifiersSink();
         resource.getAllItemsWithFallback("convertUnits", sink);
         simpleUnits = sink.simpleUnits;
-
-        return simpleUnits;
+        simpleUnitCategories = sink.simpleUnitCategories;
     }
 
     public ConversionRates getConversionRates() {
@@ -54,6 +59,10 @@ public class UnitsData {
         return unitPreferences;
     }
 
+    public static int getCategoryIndexOfSimpleUnit(int simpleUnitIndex) {
+        return simpleUnitCategories[simpleUnitIndex];
+    }
+
     /**
      * @param measureUnit An instance of MeasureUnitImpl.
      * @return the corresponding category.
@@ -70,7 +79,8 @@ public class UnitsData {
             return "consumption";
         }
 
-        return this.categories.mapFromUnitToCategory.get(baseUnitIdentifier);
+        int index = Categories.baseUnitToIndex.get(baseUnitIdentifier);
+        return Categories.indexToCategory[index];
     }
 
     public UnitPreferences.UnitPreference[] getPreferencesFor(String category, String usage, String region) {
@@ -79,6 +89,7 @@ public class UnitsData {
 
     public static class SimpleUnitIdentifiersSink extends UResource.Sink {
         String[] simpleUnits = null;
+        int[] simpleUnitCategories = null;
 
         @Override
         public void put(UResource.Key key, UResource.Value value, boolean noFallback) {
@@ -87,6 +98,7 @@ public class UnitsData {
 
             UResource.Table simpleUnitsTable = value.getTable();
             ArrayList<String> simpleUnits = new ArrayList<>();
+            ArrayList<Integer> simpleUnitCategories = new ArrayList<>();
             for (int i = 0; simpleUnitsTable.getKeyAndValue(i, key, value); i++) {
                 if (key.toString().equals("kilogram")) {
 
@@ -97,10 +109,28 @@ public class UnitsData {
                     continue;
                 }
 
+                // Find the base target unit for this simple unit
+                UResource.Table table = value.getTable();
+                if (!table.findValue("target", value)) {
+                    // TODO: is there a more idiomatic way to deal with Resource
+                    // Sink data errors in ICU4J? For now we just assert-fail,
+                    // and otherwise skip bad data:
+                    assert false : "Could not find \"target\" for simple unit: " + key;
+                    continue;
+                }
+                String target = value.getString();
+
                 simpleUnits.add(key.toString());
+                simpleUnitCategories.add(Categories.baseUnitToIndex.get(target));
             }
 
             this.simpleUnits = simpleUnits.toArray(new String[0]);
+            this.simpleUnitCategories = new int[simpleUnitCategories.size()];
+            Iterator<Integer> iter = simpleUnitCategories.iterator();
+            for (int i = 0; i < this.simpleUnitCategories.length; i++)
+            {
+                this.simpleUnitCategories[i] = iter.next().intValue();
+            }
         }
     }
 
@@ -138,50 +168,71 @@ public class UnitsData {
         public static final String DEFAULT_USAGE = "default";
     }
 
+    // Deals with base units and categories, e.g. "meter-per-second" --> "speed".
     public static class Categories {
-
         /**
-         * Contains the map between units in their base units into their category.
-         * For example:  meter-per-second --> "speed"
+         * Maps from base unit to an index value: an index into the
+         * indexToCategory array.
          */
-        HashMap<String, String> mapFromUnitToCategory;
+        static HashMap<String, Integer> baseUnitToIndex;
 
+        /**
+         * Our official array of category strings - categories are identified by
+         * indices into this array.
+         */
+        static String[] indexToCategory;
 
-        public Categories() {
+        static {
             // Read unit Categories
             ICUResourceBundle resource;
             resource = (ICUResourceBundle) UResourceBundle.getBundleInstance(ICUData.ICU_BASE_NAME, "units");
             CategoriesSink sink = new CategoriesSink();
             resource.getAllItemsWithFallback(Constants.CATEGORY_TABLE_NAME, sink);
-            this.mapFromUnitToCategory = sink.getMapFromUnitToCategory();
+            baseUnitToIndex = sink.mapFromUnitToIndex;
+            indexToCategory = sink.categories.toArray(new String[0]);
         }
     }
 
+    /**
+     * A Resource Sink that collects information from `unitQuantities` in the
+     * `units` resource to provide key->value lookups from base unit to
+     * category, as well as preserving ordering information for these
+     * categories. See `units.txt`.
+     *
+     * For example: "kilogram" -> "mass", "meter-per-second" -> "speed".
+     *
+     * In Java unitQuantity values are collected in order into an ArrayList,
+     * while unitQuantity key-to-index lookups are handled with a HashMap.
+     */
     public static class CategoriesSink extends UResource.Sink {
         /**
          * Contains the map between units in their base units into their category.
          * For example:  meter-per-second --> "speed"
          */
-        HashMap<String, String> mapFromUnitToCategory;
+        HashMap<String, Integer> mapFromUnitToIndex;
+        ArrayList<String> categories;
 
         public CategoriesSink() {
-            mapFromUnitToCategory = new HashMap<>();
+            mapFromUnitToIndex = new HashMap<>();
+            categories = new ArrayList<>();
         }
 
         @Override
         public void put(UResource.Key key, UResource.Value value, boolean noFallback) {
             assert (key.toString().equals(Constants.CATEGORY_TABLE_NAME));
-            assert (value.getType() == UResourceBundle.TABLE);
-
-            UResource.Table categoryTable = value.getTable();
-            for (int i = 0; categoryTable.getKeyAndValue(i, key, value); i++) {
-                assert (value.getType() == UResourceBundle.STRING);
-                mapFromUnitToCategory.put(key.toString(), value.toString());
+            assert (value.getType() == UResourceBundle.ARRAY);
+
+            UResource.Array categoryArray = value.getArray();
+            for (int i=0; categoryArray.getValue(i, value); i++) {
+                assert (value.getType() == UResourceBundle.TABLE);
+                UResource.Table table = value.getTable();
+                assert (table.getSize() == 1)
+                    : "expecting single-entry table, got size: " + table.getSize();
+                table.getKeyAndValue(0, key, value);
+                assert value.getType() == UResourceBundle.STRING : "expecting category string";
+                mapFromUnitToIndex.put(key.toString(), categories.size());
+                categories.add(value.toString());
             }
         }
-
-        public HashMap<String, String> getMapFromUnitToCategory() {
-            return mapFromUnitToCategory;
-        }
     }
 }
index 4d9cc237296221ef13055dc2f9cab4134fd32328..3251490129b30cd7eb0c1a18ae13e6d2bb26fea3 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e738e530bcd2dcafff1de1d603c79d5a1edc04c095ca52366259c354f19e56ed
-size 13306751
+oid sha256:f4a144335f9c6c6a6df5a95d882d8841de82be4e86db650c643c67ac84ef8f84
+size 13306908
index 08986d63d9e77e2ff7104a900a6671334bcb85f7..5b649fdf1be76d54c0211e6564165bbe3c019979 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:19f02ee2a2dc722a729fa9258175a738fc6021d252769b85c023a927135c7c26
+oid sha256:09736746668a9d57494331b4533ae8ba1e38f55f433f5ecd9026e1c57735a413
 size 95080
index 1315090b2fbc4adc51c473623bea824980784f37..b65c1cc48aa760a0c66425832233c47894c0a3c7 100644 (file)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1970fbcc18ec8a8b86702fe73ffbba842e9379bd973edbfb4e189ac6ac6d2a83
+oid sha256:056761b1169f3ba2b2c63e3f71c8bce2e61a7a80d7e21bcd9c38e98fbd3414a0
 size 723496
index 9a0fdd16eac9770bf66ff266795877d0c5deb489..1220f562a36b39337d3c2c249cd364046724e6fc 100644 (file)
@@ -3497,13 +3497,12 @@ public class MeasureUnitTest extends TestFmwk {
             new TestCase("kilometer-per-second-per-megaparsec", "kilometer-per-megaparsec-second"),
 
             // TODO(ICU-21284): Add more test cases once the proper ranking is available.
-            // TODO(ICU-21284,icu-units#70): These cases are the wrong way around:
-            new TestCase("pound-force-foot", "foot-pound-force"),
-            new TestCase("foot-pound-force", "foot-pound-force"),
-            new TestCase("kilowatt-hour", "hour-kilowatt"),
-            new TestCase("hour-kilowatt", "hour-kilowatt"),
-            new TestCase("newton-meter", "meter-newton"),
-            new TestCase("meter-newton", "meter-newton"),
+            new TestCase("newton-meter", "newton-meter"),
+            new TestCase("meter-newton", "newton-meter"),
+            new TestCase("pound-force-foot", "pound-force-foot"),
+            new TestCase("foot-pound-force", "pound-force-foot"),
+            new TestCase("kilowatt-hour", "kilowatt-hour"),
+            new TestCase("hour-kilowatt", "kilowatt-hour"),
 
             // Testing prefixes are parsed and produced correctly (ensures no
             // collisions in the enum values)
index 5edbe7074b5f684ec4aad3e383f93b76a4bf593b..01b1caf12f11c2f22b74881b68b657e9b1e3b057 100644 (file)
     ; /unitConstants/$1 ; values="$2"
 
 //supplementalData/unitQuantities/unitQuantity[@baseUnit="(%W)"][@quantity="(%W)"](?:[@status="%W"])?
-    ; /unitQuantities/$1 ; values="$2"
+    ; /unitQuantities/<FIFO>/$1 ; values="$2"
 
 //supplementalData/convertUnits/convertUnit[@source="(%W)"][@baseUnit="(%W)"](?:[@systems="%W"])?
     ; /convertUnits/$1/target ; values=$2