granicus.if.org Git - icu/commitdiff
ICU-22100 Remove unicode blocks from Japanese ML phrase breaking
author allenwtsu <allenwtsu@google.com>
Mon, 9 Jan 2023 05:56:18 +0000 (05:56 +0000)
committer Frank Yung-Fong Tang <ftang@google.com>
Tue, 10 Jan 2023 01:38:51 +0000 (17:38 -0800)
See #2278

icu4c/source/common/mlbe.cpp
icu4c/source/common/mlbe.h
icu4c/source/data/brkitr/adaboost/jaml.txt
icu4c/source/test/testdata/rbbitst.txt
icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index 3ccf470e5b1aecfb6aaf34b53c5c69e73df93503..79b163299eb8fe093114a7a881b7614d167124e5 100644 (file)
 
 U_NAMESPACE_BEGIN
 
-Element::Element() : length(0) {}
-
-void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
-    character = ch;
-    U_ASSERT(idx.length() <= 3);
-    length = idx.length();
-    idx.extract(0, length, ublock);
-    ublock[length] = '\0';
-}
-
-UChar32 Element::getCharacter() const {
-    return character;
-}
-
-char16_t* Element::getUblock() const {
-    return (char16_t*)ublock;
-}
-
-uint16_t Element::getLength() const {
-    return length;
-}
-
 MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
                                  const UnicodeSet &closePunctuationSet, UErrorCode &status)
     : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
@@ -56,12 +34,8 @@ MlBreakEngine::~MlBreakEngine() {}
 
 namespace {
     const char16_t INVALID = u'|';
-    const int32_t MAX_FEATURE = 26;
-    const int32_t MAX_FEATURE_LENGTH = 14;
-
-    bool isValid(const Element& element) {
-        return element.getLength() != 1 || element.getUblock()[0] != INVALID;
-    }
+    const int32_t MAX_FEATURE = 13;
+    const int32_t MAX_FEATURE_LENGTH = 11;
 
     void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
         if (U_FAILURE(status)) {
@@ -74,11 +48,6 @@ namespace {
         U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
         result.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
     }
-
-    void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
-        U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
-        str.extract(feature, MAX_FEATURE_LENGTH, status);  // NUL-terminates
-    }
 }
 
 int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
@@ -98,12 +67,11 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
         return 0;
     }
     int32_t numBreaks = 0;
-    UChar32 ch;
     UnicodeString index;
     // The ML model groups six char to evaluate if the 4th char is a breakpoint.
     // Like a sliding window, the elementList removes the first char and appends the new char from
     // inString in each iteration so that its size always remains at six.
-    Element elementList[6];
+    UChar32 elementList[6];
 
     int32_t codeUts = initElementList(inString, elementList, status);
     int32_t length = inString.countChar32();
@@ -117,12 +85,10 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
         evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
         if (i + 1 >= inString.countChar32()) break;
         // Remove the first element and append a new element
-        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
-        ch = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
-        index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
-        elementList[5].setCharAndUblock(ch, index);
-        if (ch != INVALID) {
-            codeUts += U16_LENGTH(ch);
+        uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32));
+        elementList[5] = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
+        if (elementList[5] != INVALID) {
+            codeUts += U16_LENGTH(elementList[5]);
         }
     }
     if (U_FAILURE(status)) return 0;
@@ -176,7 +142,7 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
     return correctedNumBreaks;
 }
 
-void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
                                          UVector32 &boundary, UErrorCode &status) const {
     char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
     if (U_FAILURE(status)) {
@@ -186,12 +152,12 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3
     UChar32 arr[4] = {-1, -1, -1, -1};
     int32_t length = 0, listLength = 0;
 
-    const UChar32 w1 = elementList[0].getCharacter();
-    const UChar32 w2 = elementList[1].getCharacter();
-    const UChar32 w3 = elementList[2].getCharacter();
-    const UChar32 w4 = elementList[3].getCharacter();
-    const UChar32 w5 = elementList[4].getCharacter();
-    const UChar32 w6 = elementList[5].getCharacter();
+    const UChar32 w1 = elementList[0];
+    const UChar32 w2 = elementList[1];
+    const UChar32 w3 = elementList[2];
+    const UChar32 w4 = elementList[3];
+    const UChar32 w5 = elementList[4];
+    const UChar32 w6 = elementList[5];
 
     length = 1;
     if (w1 != INVALID) {
@@ -259,82 +225,6 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3
         arr[2] = w6;
         concatChar(u"TW4:", arr, length, featureList[listLength++], status);
     }
-    if (isValid(elementList[0])) {
-        writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
-                                                  elementList[0].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[1])) {
-        writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
-                                                  elementList[1].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[2])) {
-        writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
-                                                  elementList[2].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[3])) {
-        writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
-                                                  elementList[3].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[4])) {
-        writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
-                                                  elementList[4].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[5])) {
-        writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
-                                                  elementList[5].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[1]) && isValid(elementList[2])) {
-        writeString(UnicodeString(u"BB1:")
-                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
-                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[2]) && isValid(elementList[3])) {
-        writeString(UnicodeString(u"BB2:")
-                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
-                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[3]) && isValid(elementList[4])) {
-        writeString(UnicodeString(u"BB3:")
-                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
-                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
-        writeString(UnicodeString(u"TB1:")
-                        .append(elementList[0].getUblock(), 0, elementList[0].getLength())
-                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
-                        .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
-        writeString(UnicodeString(u"TB2:")
-                        .append(elementList[1].getUblock(), 0, elementList[1].getLength())
-                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
-                        .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
-        writeString(UnicodeString(u"TB3:")
-                        .append(elementList[2].getUblock(), 0, elementList[2].getLength())
-                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
-                        .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
-                    featureList[listLength++], status);
-    }
-    if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
-        writeString(UnicodeString(u"TB4:")
-                        .append(elementList[3].getUblock(), 0, elementList[3].getLength())
-                        .append(elementList[4].getUblock(), 0, elementList[4].getLength())
-                        .append(elementList[5].getUblock(), 0, elementList[5].getLength()),
-                    featureList[listLength++], status);
-    }
     if (U_FAILURE(status)) {
         return;
     }
@@ -351,7 +241,7 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3
     }
 }
 
-int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
+int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList,
                                          UErrorCode &status) const {
     if (U_FAILURE(status)) {
         return 0;
@@ -363,52 +253,29 @@ int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* e
     if (length > 0) {
         w3 = inString.char32At(0);
         index += U16_LENGTH(w3);
+        if (length > 1) {
+            w4 = inString.char32At(index);
+            index += U16_LENGTH(w4);
+            if (length > 2) {
+                w5 = inString.char32At(index);
+                index += U16_LENGTH(w5);
+                if (length > 3) {
+                    w6 = inString.char32At(index);
+                    index += U16_LENGTH(w6);
+                }
+            }
+        }
     }
-    if (length > 1) {
-        w4 = inString.char32At(index);
-        index += U16_LENGTH(w4);
-    }
-    if (length > 2) {
-        w5 = inString.char32At(index);
-        index += U16_LENGTH(w5);
-    }
-    if (length > 3) {
-        w6 = inString.char32At(index);
-        index += U16_LENGTH(w6);
-    }
-
-    const UnicodeString b1(INVALID);
-    const UnicodeString b2(b1);
-    const UnicodeString b3(getUnicodeBlock(w3, status));
-    const UnicodeString b4(getUnicodeBlock(w4, status));
-    const UnicodeString b5(getUnicodeBlock(w5, status));
-    const UnicodeString b6(getUnicodeBlock(w6, status));
-
-    elementList[0].setCharAndUblock(w1, b1);
-    elementList[1].setCharAndUblock(w2, b2);
-    elementList[2].setCharAndUblock(w3, b3);
-    elementList[3].setCharAndUblock(w4, b4);
-    elementList[4].setCharAndUblock(w5, b5);
-    elementList[5].setCharAndUblock(w6, b6);
+    elementList[0] = w1;
+    elementList[1] = w2;
+    elementList[2] = w3;
+    elementList[3] = w4;
+    elementList[4] = w5;
+    elementList[5] = w6;
 
     return index;
 }
 
-UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
-    if (U_FAILURE(status)) {
-        return UnicodeString(INVALID);
-    }
-
-    UBlockCode block = ublock_getCode(ch);
-    if (block == UBLOCK_NO_BLOCK || block == UBLOCK_INVALID_CODE) {
-        return UnicodeString(INVALID);
-    } else {
-        UnicodeString empty;
-        // Same as sprintf("%03d", block)
-        return ICU_Utility::appendNumber(empty, (int32_t)block, 10, 3);
-    }
-}
-
 void MlBreakEngine::loadMLModel(UErrorCode &error) {
     // BudouX's model consists of pairs of the feature and its score.
     // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
index 8943fa3414f88e23e6649b1ae2704d19202da36a..2f0edd6c4f26e4564dc42326b6e92a36a2353723 100644 (file)
@@ -13,51 +13,6 @@ U_NAMESPACE_BEGIN
 
 #if !UCONFIG_NO_BREAK_ITERATION
 
-/**
- * A class used to encapsulate a character and its unicode block index
- */
-class Element : public UMemory {
-   public:
-    /**
-     * Default constructor.
-     */
-    Element();
-
-    /**
-     * Set the character and its unicode block.
-     *
-     * @param ch A unicode character.
-     * @param ublock The unicode block of the character.
-     */
-    void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
-
-    /**
-     * Get the unicode character.
-     *
-     * @return The unicode character.
-     */
-    UChar32 getCharacter() const;
-
-    /**
-     * Get the unicode character's unicode block.
-     *
-     * @return The unicode block.
-     */
-    char16_t* getUblock() const;
-
-    /**
-     * Get the length of the unicode block.
-     *
-     * @return The unicode block length.
-     */
-    uint16_t getLength() const;
-
-   private:
-    UChar32 character;
-    char16_t ublock[4];
-    uint16_t length;
-};
-
 /**
  * A machine learning break engine for the phrase breaking in Japanese.
  */
@@ -104,38 +59,27 @@ class MlBreakEngine : public UMemory {
      */
     void loadMLModel(UErrorCode &error);
 
-    /**
-     * Get the character's unicode block code defined in UBlockCode.
-     *
-     * @param ch A character.
-     * @param error Information on any errors encountered.
-     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
-     * is less than 3 digits.
-     *
-     */
-    UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
-
     /**
      * Initialize the element list from the input string.
      *
      * @param inString A input string to be segmented.
-     * @param elementList A list to store the first six characters and their unicode block codes.
+     * @param elementList A list to store the first six characters.
      * @param status Information on any errors encountered.
      * @return The number of code units of the first six characters in inString.
      */
-    int32_t initElementList(const UnicodeString &inString, Element* elementList,
+    int32_t initElementList(const UnicodeString &inString, UChar32* elementList,
                             UErrorCode &status) const;
 
     /**
      * Evaluate whether the index is a potential breakpoint.
      *
-     * @param elementList A list including 6 elements for the breakpoint evaluation.
+     * @param elementList A list including six elements for the breakpoint evaluation.
      * @param index The breakpoint index to be evaluated.
      * @param numBreaks The accumulated number of breakpoints.
      * @param boundary A vector including the index of the breakpoint.
      * @param status Information on any errors encountered.
      */
-    void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+    void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
                             UVector32 &boundary, UErrorCode &status) const;
 
     UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
index 0500ff73fbfb7a34ede61c3097d1649523f11b21..4ddea6c78b2ab277b1c0fbf1aa4f2419e50dff87 100644 (file)
 // License & terms of use: http://www.unicode.org/copyright.html
 jaml {
     modelKeys {
-        "BB2:062071",
-        "UB3:061",
-        "UB3:071",
-        "TB2:062062062",
-        "TB4:062062062",
-        "UB3:063",
-        "UB4:071",
-        "BB3:062062",
-        "UB4:062",
-        "BB1:062071",
-        "BB1:062061",
-        "UB4:061",
-        "TB1:071071062",
-        "TB3:062063063",
-        "UB2:061",
-        "TB1:062071062",
-        "TB3:062062062",
-        "BB2:063063",
-        "UW3:は",
-        "UW3:に",
-        "TB3:062071062",
-        "UW3:が",
-        "UW4:こ",
-        "UB5:061",
-        "UW3:と",
-        "TB4:063063063",
-        "UW4:て",
-        "TB2:062062061",
+        "UW3:、",
         "UW3:。",
-        "UW4:お",
-        "UW3:の",
-        "BB3:071071",
-        "BB3:062071",
-        "UW3:お",
-        "UW3:し",
-        "UW4:、",
         "UW4:の",
-        "UW3:を",
+        "UW4:、",
+        "UW3:の",
         "UW4:。",
-        "UW3:、",
-        "UW5:で",
-        "UW4:あ",
-        "BB2:062062",
+        "UW3:に",
+        "UW5:。",
+        "UW4:て",
+        "UW3:は",
+        "UW4:に",
+        "UW3:を",
+        "UW5:、",
+        "UW2:、",
+        "UW3:が",
+        "UW2:。",
+        "UW4:で",
+        "UW3:と",
+        "UW4:は",
+        "UW4:が",
+        "UW4:る",
         "UW4:っ",
-        "UW5:っ",
         "UW3:も",
-        "UW5:う",
-        "UW3:「",
         "UW5:な",
-        "UW4:そ",
-        "UW4:る",
-        "UW3:っ",
-        "UW4:「",
-        "UW4:い",
-        "BB2:087087",
-        "UB4:087",
-        "UW5:に",
-        "BW3:もの",
-        "UW5:し",
-        "UW6:う",
-        "BW2:とい",
-        "UW4:に",
+        "UW3:で",
         "UW3:る",
-        "TB2:071062071",
-        "UW4:で",
-        "UW5:が",
-        "BB1:071071",
-        "UW5:は",
-        "UW4:は",
-        "UW4:れ",
-        "UW5:き",
-        "BB2:071062",
-        "BB2:071071",
-        "UW3:・",
-        "BB2:071087",
-        "BB2:061062",
-        "TB1:062061062",
-        "UW3:れ",
-        "BB2:087062",
-        "TB2:087087087",
+        "UW5:で",
+        "UW4:を",
+        "UW4:か",
+        "UW3:っ",
+        "UW2:の",
+        "UW4:と",
+        "UW5:っ",
+        "UW4:な",
+        "UW3:て",
+        "UW4:た",
+        "UW4:こ",
+        "UW6:に",
         "UW4:ら",
-        "TB1:071071071",
-        "UB2:071",
-        "TB1:062062087",
-        "UW5:す",
-        "UW5:ん",
-        "UW3:で",
-        "UW4:が",
-        "UW3:こ",
-        "TB4:071062062",
+        "UW3:た",
+        "UW2:を",
         "UW3:ら",
-        "UW6:に",
         "UW6:。",
-        "UW3:た",
-        "TB1:061071071",
-        "UW5:く",
-        "UB1:063",
-        "UW1:そ",
+        "UW4:し",
+        "UW3:な",
+        "UW2:に",
+        "UW4:い",
+        "UW4:り",
+        "UW6:う",
         "UW3:う",
-        "BW3:とい",
-        "BW3:とこ",
-        "UW3:ま",
-        "BW3:こと",
-        "UW2:っ",
-        "UW5:・",
-        "TB3:062062061",
-        "UW3:き",
-        "UW4:ん",
-        "UB3:062",
         "UW3:く",
-        "UW3:」",
-        "UW5:あ",
-        "BB2:062087",
-        "BW3:いう",
-        "UW5:れ",
-        "UW2:一",
-        "UW3:,",
-        "UW1:に",
-        "UW2:と",
-        "TB2:071071062",
-        "TB2:071071071",
-        "UW5:を",
-        "UW4:り",
-        "BW1:から",
-        "UW3:ち",
-        "BW3:いい",
+        "UW4:れ",
         "UW2:は",
-        "UW6:た",
-        "TB1:063063062",
-        "UW4:1",
-        "UW4:や",
-        "UW2:ん",
-        "UW3:]",
-        "UW4:ほ",
-        "TB3:062087087",
-        "BW2:であ",
         "UW4:だ",
-        "BB3:071062",
-        "TB1:087087087",
-        "BW3:・・",
-        "BW3:とき",
-        "UW4:を",
-        "UW3:て",
-        "UW4:か",
-        "UW2:そ",
-        "TB4:071071062",
-        "TB2:062061071",
-        "UW2:を",
-        "UW4:ご",
-        "UW2:で",
-        "TB3:071071071",
-        "BB1:087087",
+        "UW4:う",
+        "UW3:い",
+        "UW6:い",
+        "UW4:ん",
+        "UW2:か",
+        "UW4:ー",
+        "UW6:を",
+        "UW2:も",
+        "UW5:き",
+        "UW3:り",
+        "UW6:で",
+        "UW2:る",
+        "UW2:と",
+        "UW3:]",
+        "UW4:そ",
+        "UW3:,",
+        "UW4:も",
+        "UW4:く",
+        "UW3:か",
+        "BW2:とい",
+        "UW4:お",
+        "UW4:ま",
+        "UW6:が",
+        "UW4:き",
         "UW2:し",
-        "UW4:出",
+        "UW2:て",
+        "UW3:!",
         "UW2:ま",
-        "UW4:,",
+        "UW5:に",
+        "UW3:や",
+        "UW6:て",
+        "BW3:もの",
+        "UW6:の",
+        "UW2:ん",
+        "UW2:が",
+        "UW5:が",
+        "BW1:いう",
+        "UW2:で",
+        "UW5:す",
+        "UW3:?",
         "UW5:と",
-        "UW4:ど",
-        "BW3:して",
-        "UW1:で",
-        "BB2:061071",
-        "BW3:ため",
-        "BW2:とし",
-        "BW2:ない",
-        "BW2:てい",
-        "UW3:間",
-        "UW3:!",
-        "UW5:ー",
-        "UW4:す",
-        "UW4:!",
-        "BW1:とが",
-        "UW5:の",
-        "TB4:062062071",
-        "TB2:061071071",
-        "UW6:・",
+        "UW6:は",
         "UW3:.",
-        "UW2:て",
-        "UW3:笑",
-        "UW2:こ",
-        "UW5:も",
+        "UW4:「",
+        "UW3:ば",
+        "UW5:ん",
+        "BW3:いう",
+        "UW4:す",
+        "BW1:から",
+        "UW3:ど",
+        "UW5:し",
+        "UW2:っ",
+        "UW4:思",
+        "UW3:…",
+        "UW5:る",
+        "BW2:てい",
         "BW3:よう",
-        "UW3:人",
-        "UW2:の",
-        "UW3:か",
-        "UW3:日",
-        "UW1:い",
-        "BW2:とこ",
+        "UW5:え",
         "UW4:私",
-        "UW3:…",
-        "UW2:に",
-        "UW3:今",
-        "BB3:087062",
-        "UB3:055",
-        "UW4:(",
-        "BB1:087071",
-        "UW1:な",
-        "BB3:063063",
-        "UW5:来",
-        "UW3:?",
-        "TW3:ている",
-        "UW4:」",
-        "UW4:前",
-        "BW1:いう",
-        "UW4:つ",
+        "UW3:・",
+        "UW4:人",
+        "UW5:く",
         "UW3:)",
-        "BW1:では",
-        "UW2:る",
-        "UW5:そ",
-        "UW4:ー",
-        "TW2:気に入",
-        "UW4:笑",
-        "UW4:ひ",
-        "TB4:087087087",
+        "UW4:京",
+        "BW2:ない",
+        "UW3:ー",
+        "BW3:とこ",
+        "UW5:は",
+        "UW4:」",
+        "UW2:一",
+        "UW4:よ",
+        "BW3:こと",
+        "UW5:ー",
+        "UW6:し",
         "UW4:け",
-        "UW2:も",
-        "BW3:ちょ",
-        "BW3:出来",
-        "TB2:062071062",
-        "UW4:『",
-        "UW3:[",
-        "UW4:2",
-        "UW5:つ",
-        "TB1:061071062",
-        "UW3:1",
-        "BW3:から",
-        "UB5:071",
-        "UW4:ま",
-        "UW3:ば",
-        "UW3:り",
-        "BW3:その",
-        "UW3:ご",
-        "UW4:わ",
-        "BW2:てお",
-        "TB2:071062062",
         "BW1:ない",
+        "BW2:です",
+        "UW4:一",
+        "UW5:帯",
+        "UW5:を",
+        "UW6:な",
+        "UW5:べ",
+        "BW3:いい",
+        "BW2:であ",
+        "BW2:ので",
+        "UW4:,",
+        "UW5:れ",
+        "UW5:ろ",
+        "UW1:そ",
+        "UW5:い",
+        "UW1:い",
+        "UW5:・",
+        "UW5:わ",
+        "UW4:1",
+        "UW5:う",
+        "UW4:大",
+        "UW3:ま",
+        "BW2:とこ",
+        "UW4:!",
+        "UW4:見",
+        "UW4:行",
+        "BW1:こと",
+        "UW1:な",
+        "UW2:さ",
+        "UW3:☆",
+        "UW4:さ",
         "UW2:よ",
-        "UB2:087",
-        "UW6:の",
+        "BW1:とか",
+        "UW4:(",
+        "BW3:でも",
+        "UW5:の",
+        "UW4:・",
+        "UW5:た",
+        "UW1:す",
+        "UW5:か",
+        "UW4:使",
+        "UW3:♪",
+        "UW4:え",
+        "UW4:今",
+        "BW2:、と",
+        "BW3:とき",
+        "UW4:ろ",
+        "UW5:つ",
+        "UW1:に",
+        "UW5:じ",
+        "UW1:で",
+        "UW4:ン",
+        "UW3:ず",
+        "BW3:して",
+        "UW4:食",
+        "UW4:気",
+        "UW4:時",
+        "UW3:日",
+        "BW1:しい",
+        "UW4:自",
+        "UW3:笑",
         "UW2:毎",
+        "TW1:という",
+        "UW4:み",
+        "UW4:…",
+        "TW2:ではな",
+        "UW6:さ",
+        "UW5:め",
+        "UW2:少",
+        "UW5:あ",
+        "UW4:2",
+        "UW3:へ",
+        "TW3:という",
+        "UW4:何",
+        "UW2:く",
         "UW2:結",
-        "TW4:の京都",
-        "UW3:さ",
-        "UW2:最",
-        "BW2:です",
-        "UW2:」",
-        "UW5:え",
-        "UW3:だ",
-        "TW4:ところ",
-        "UW4:.",
-        "UB1:062",
-        "UW6:て",
+        "BW1:うな",
+        "BW1:もう",
         "UW1:が",
-        "BW2:、と",
-        "UW3:0",
+        "UW4:じ",
+        "UW2:う",
+        "UW4:ル",
+        "UW3:」",
+        "BW1:とが",
+        "UW2:最",
+        "BW1:るの",
+        "UW3:間",
+        "UW6:た",
+        "UW3:つ",
+        "UW4:ど",
+        "UW1:と",
         "UW3:ん",
-        "UW3:中",
-        "UW4:よ",
-        "BW3:この",
-        "UW2:が",
-        "UW3:み",
-        "TW2:ではな",
-        "UW6:と",
-        "UW4:[",
-        "TW3:、ある",
-        "BW3:ころ",
-        "UW4:?",
-        "UW6:、",
-        "UW4:電",
-        "BB1:062040",
-        "UW3:後",
-        "UW5:い",
-        "UW2:、",
-        "UW5:て",
-        "BB2:062040",
-        "UW3:真",
-        "UW3:そ",
-        "UW5:さ",
-        "UB5:087",
-        "TW3:という",
-        "UW3:分",
-        "UB6:071",
-        "BW3:なっ",
-        "UW4:ろ",
-        "BB2:061061",
-        "TW3:ところ",
-        "UB1:071",
-        "UW1:、",
-        "BW1:とか",
-        "UW3:な",
-        "UW6:り",
-        "UW4:間",
-        "UW3:べ",
-        "UW5:べ",
-        "TB4:062071062",
-        "UW4:]",
+        "UW4:.",
+        "UW3:だ",
+        "UW4:わ",
+        "UW4:最",
+        "UW4:?",
+        "UW3:ろ",
+        "UW4:ば",
+        "TW3:ている",
+        "BW3:この",
+        "UW5:も",
+        "UW3:人",
+        "BW3:とい",
+        "UW4:つ",
+        "BW3:その",
+        "BW3:もう",
+        "UW2:そ",
         "BW2:には",
-        "UW5:々",
-        "BW1:。・",
-        "BW1:その",
-        "UW1:す",
-        "UW4:)",
-        "UW6:っ",
-        "TB3:063063063",
-        "TB3:062071071",
-        "UB5:063",
+        "BW3:かけ",
+        "TW4:の京都",
+        "TW4:ところ",
+        "UW3:京",
+        "UW4:携",
         "BW1:かも",
-        "UW6:る",
-        "TB4:062063063",
-        "UW3:ど",
-        "TW3:である",
-        "TW4:くらい",
+        "BW1:では",
+        "UW4:ち",
+        "UW3:分",
+        "UW4:べ",
+        "BW3:ころ",
+        "UW3:ゃ",
+        "UW2:す",
+        "BW1:。・",
+        "UW3:電",
+        "BW3:なっ",
+        "UW3:す",
         "BW1:最近",
-        "BW1:しい",
-        "BW1:とも",
-        "BW2:と同",
-        "TW1:という",
-        "UW2:さ",
-        "BW2:帯電",
-        "TB1:071062062",
+        "UW4:め",
+        "UW3:ぐ",
+        "UW2:お",
         "BW3:そし",
-        "UW2:。",
-        "UW5:か",
-        "UW5:こ",
-        "BW3:ない",
+        "BW1:かし",
+        "BW1:同じ",
+        "BW3:メー",
+        "UW5:て",
+        "UW6:り",
+        "TW4:くらい",
+        "UW3:今",
+        "UW5:そ",
+        "UW4:や",
+        "UW5:」",
+        "UW4:帯",
+        "UW6:ー",
+        "BW2:とし",
+        "TW1:ような",
+        "BW2:てお",
+        "UW4:笑",
+        "UW1:は",
+        "BW3:かか",
+        "TW4:かなり",
+        "UW4:)",
         "BW1:んな",
-        "BW2:でき",
-        "UW4:3",
-        "UW3:け",
-        "TW4:ことが",
-        "BW1:こと",
-        "UB3:087",
-        "UW3:電",
-        "UW3:よ",
-        "BW1:たと",
-        "UW5:ま",
-        "UW5:た",
+        "UW1:ち",
+        "TW2:気に入",
+        "TW1:・・・",
+        "UW6:と",
         "UW5:ち",
-        "UW2:け",
-        "UW5:だ",
+        "BW3:ため",
+        "UW4:ず",
+        "UW3:0",
+        "BW1:んで",
+        "UW3:中",
+        "UW3:々",
+        "BW2:のよ",
+        "BW2:帯電",
+        "BW2:でも",
+        "BW1:には",
+        "BW3:ちょ",
+        "UW4:せ",
         "UW3:度",
-        "BW1:たい",
-        "UW4:使",
-        "UW2:き",
-        "TW4:かなり",
-        "UB6:063",
-        "BB1:062062",
-        "UW4:込",
-        "TW3:と言っ",
-        "UW6:だ",
-        "UW5:り",
-        "UW5:よ",
-        "BW3:どう",
-        "UW4:…",
-        "UW3:や",
-        "BW1:かし",
-        "BW3:かっ",
-        "UW4:今",
-        "UW3:『",
-        "UW4:思",
-        "UB2:063",
-        "UW4:く",
-        "UW3:京",
-        "UW6:ー",
-        "UW1:ん",
-        "BW1:うな",
-        "TB2:062061061",
-        "UW1:と",
-        "TB4:062063062",
-        "TB2:061062062",
-        "BW1:この",
-        "BW2:ので",
-        "UW4:み",
-        "UW5:わ",
-        "UW6:や",
-        "BW1:れて",
-        "UW2:や",
-        "UW6:こ",
-        "UW4:な",
-        "UW5:め",
-        "BW1:もう",
-        "TB4:071062071",
-        "BW1:より",
-        "UW4:合",
-        "UW6:け",
-        "BW1:少し",
-        "BW2:でし",
-        "UW4:と",
-        "TB1:063063063",
-        "UW3:ー",
-        "BW2:くな",
-        "UW2:く",
-        "UW2:我",
-        "BW2:いも",
-        "BW3:わか",
-        "TB2:071063071",
-        "UW4:も",
-        "UW1:あ",
-        "UW4:最",
-        "BW1:るの",
-        "UW2:全",
+        "BW1:でも",
+        "BW1:が、",
+        "UW2:な",
+        "UW5:思",
         "UW6:0",
-        "UW4:放",
-        "UW4:京",
-        "BW3:かけ",
-        "UW2:少",
-        "BW3:もう",
-        "UW2:多",
-        "UW2:う",
-        "TB1:062062040",
-        "UW1:を",
-        "UW3:光",
-        "BW1:!!",
-        "UW2:ャ",
-        "BW3:すぐ",
-        "UW4:帯",
-        "UW6:し",
-        "BW3:でも",
-        "BW2:、そ",
-        "TB3:071087087",
-        "TB2:063062071",
-        "UW3:わ",
-        "UB4:063",
-        "TB4:071071071",
-        "UW5:都",
-        "UW5:ず",
-        "UW2:バ",
-        "UW2:京",
-        "UW3:ゃ",
-        "BW1:い、",
-        "BW3:よく",
-        "BW1:たら",
-        "BW2:のよ",
-        "UW2:思",
-        "BW1:うに",
-        "BW1:の間",
-        "UW6:ん",
-        "UW6:ず",
-        "BW1:った",
-        "TW3:ること",
+        "UW6:寺",
         "BW3:とて",
-        "TW1:ような",
+        "BW3:ある",
+        "BW2:もし",
+        "UW4:ッ",
+        "UW1:て",
+        "BW2:にも",
+        "BW1:れた",
+        "UW4:ひ",
+        "TW3:ること",
+        "BW1:てい",
+        "UW4:』",
+        "BW1:だけ",
+        "UW3:お",
+        "BW1:少し",
+        "TW3:、ある",
+        "UW5:!",
+        "UW6:ル",
+        "UW2:多",
+        "UW6:ご",
+        "UW6:や",
+        "UW3:後",
+        "BW2:てみ",
+        "BW1:とき",
+        "UW4:ゃ",
+        "BW1:たい",
+        "UW3:き",
+        "TW4:ことが",
+        "UW3:真",
+        "BW2:など",
         "UW6:ぱ",
-        "TB3:063071062",
-        "TW4:って、",
-        "TW4:なんて",
-        "TW2:その後",
-        "UW6:ら",
-        "TW4:ことに",
-        "UW3:>",
-        "TW3:てしま",
-        "UW3:い",
-        "TB4:071062061",
-        "UW2:ひ",
-        "UW6:め",
-        "UW6:で",
+        "BW1:った",
+        "BW1:ても",
+        "UW5:日",
+        "BW1:たと",
+        "UW4:]",
+        "UW3:ッ",
+        "TW4:メール",
+        "BW2:はな",
+        "BW3:・・",
         "BW3:なる",
-        "UW5:ご",
-        "BW2:りし",
-        "UW6:電",
-        "UW1:は",
-        "BW1:いも",
-        "BW3:すご",
-        "UW4:通",
-        "BW3:おり",
-        "BW3:かか",
-        "BW1:思い",
+        "BW1:とい",
+        "UW2:全",
+        "BW1:にも",
+        "BW1:たら",
+        "BW2:くな",
+        "UW3:「",
+        "BW1:その",
+        "UW3:観",
+        "BW1:うに",
+        "UW3:イ",
+        "BW3:もん",
+        "UW5:ず",
+        "BW3:しま",
+        "BW1:より",
+        "UW5:分",
     }
     modelValues:intvector {
-        1800,
-        271,
-        -857,
-        -417,
-        285,
-        -583,
-        388,
-        828,
-        -853,
-        -820,
-        502,
-        -708,
-        358,
-        1341,
-        -586,
-        -451,
-        257,
-        -1876,
-        2052,
-        1698,
-        -458,
-        2048,
-        1182,
-        -551,
-        980,
-        773,
-        -1453,
-        -152,
-        3201,
-        2865,
-        1203,
-        144,
-        -369,
-        -2539,
-        -613,
-        -3574,
-        -1111,
-        3110,
-        -3022,
-        2039,
+        3634,
+        4347,
+        -2581,
+        -4812,
+        2538,
+        -4206,
+        2701,
+        -1455,
+        -2403,
+        2977,
+        -2678,
+        4165,
+        -818,
+        -1011,
+        2996,
+        -904,
+        -1808,
+        2064,
+        -2164,
+        -2180,
+        -2760,
+        -2310,
+        2360,
+        -388,
+        1842,
+        1706,
+        -706,
+        -2408,
+        -1628,
+        -1005,
+        -434,
+        -1442,
+        543,
         -1091,
-        1241,
-        -560,
-        -1412,
-        625,
-        1350,
-        297,
-        -2404,
-        -595,
-        1007,
-        -1829,
-        -1662,
-        3213,
-        270,
-        -911,
-        178,
-        -727,
-        2716,
-        -484,
-        -344,
-        929,
-        -1236,
-        760,
-        -299,
-        -419,
-        -728,
-        122,
-        -704,
-        -605,
-        -1507,
-        545,
-        -68,
-        -320,
-        1498,
-        953,
-        -323,
-        -575,
-        -673,
+        1355,
+        -1056,
+        258,
+        277,
+        -2999,
+        1331,
+        -1305,
+        1242,
+        -337,
+        -1073,
+        1392,
+        -576,
+        -886,
+        -2405,
+        -386,
+        1031,
+        1470,
+        -2105,
+        -594,
+        -1461,
+        -1160,
+        964,
+        -48,
+        -2158,
+        110,
+        -1750,
+        228,
+        -603,
+        801,
+        972,
+        102,
+        -395,
+        -508,
+        1640,
+        191,
+        2468,
+        -1580,
+        -1529,
+        1148,
+        515,
+        539,
+        -774,
+        111,
+        -1275,
+        113,
+        -432,
+        1736,
+        588,
+        -413,
+        1360,
+        49,
+        2322,
+        48,
+        255,
+        -521,
+        -366,
+        529,
+        -493,
+        -557,
+        1719,
+        -476,
+        104,
+        1311,
+        1314,
+        1307,
         520,
+        666,
+        -412,
+        627,
+        1098,
+        -209,
+        163,
+        955,
+        1798,
+        -39,
+        -753,
+        -1262,
+        411,
+        1247,
+        914,
+        522,
+        348,
+        2156,
+        510,
+        -1522,
+        -243,
+        1337,
+        -378,
+        -1957,
+        834,
         -450,
-        -1767,
-        -247,
-        56,
-        231,
-        -764,
-        536,
-        794,
-        -703,
-        -566,
-        51,
-        390,
-        52,
-        -182,
-        466,
-        133,
-        354,
-        107,
-        492,
+        235,
+        87,
+        236,
+        -1615,
+        485,
+        -1445,
         488,
-        -1194,
-        1145,
-        -847,
-        812,
-        151,
+        404,
+        -333,
+        66,
+        787,
+        647,
+        -1495,
+        -756,
+        -1700,
+        279,
+        -81,
+        260,
+        162,
+        -51,
+        -851,
+        462,
+        493,
+        161,
+        396,
+        -238,
+        -1044,
+        -1685,
+        433,
+        276,
+        -695,
+        -148,
+        416,
+        1235,
+        -748,
+        257,
+        784,
+        748,
+        767,
+        -262,
+        -490,
+        -26,
+        152,
+        186,
+        544,
+        1035,
+        -711,
+        549,
         -517,
-        -314,
-        -553,
-        -783,
-        -117,
-        736,
-        -88,
-        -598,
-        569,
-        606,
-        287,
-        744,
-        1739,
-        -217,
-        -219,
-        -144,
-        234,
-        -649,
-        -757,
-        834,
-        -819,
-        869,
-        -275,
-        -267,
-        154,
-        653,
-        594,
-        255,
-        1018,
-        1124,
-        284,
-        -1624,
-        -372,
-        440,
-        -184,
-        -1936,
-        1318,
-        -1124,
-        453,
-        -92,
-        -343,
+        799,
+        -1024,
+        542,
+        -118,
+        432,
+        -56,
+        -694,
+        668,
+        249,
         175,
-        182,
-        -886,
-        930,
-        -223,
-        -57,
-        -113,
-        103,
-        -200,
-        510,
-        -2099,
-        -498,
-        385,
-        80,
-        -156,
-        360,
-        1289,
-        771,
-        -1114,
-        -399,
-        870,
-        1230,
-        79,
-        472,
-        -1596,
-        -1092,
-        -572,
-        55,
-        -151,
-        -124,
-        1316,
-        -248,
-        1280,
-        -125,
-        -284,
-        -1023,
-        862,
-        84,
-        417,
-        568,
-        -88,
-        -528,
-        910,
-        674,
-        -212,
-        894,
-        -121,
-        1108,
-        762,
-        260,
-        -197,
-        91,
-        -53,
-        1117,
-        -645,
-        -868,
-        -611,
-        220,
-        422,
-        1431,
-        -532,
-        -157,
-        -476,
-        -846,
-        -1309,
-        -1614,
-        1225,
-        302,
-        -738,
-        -260,
-        892,
-        -778,
-        -193,
-        1221,
-        -779,
-        489,
-        420,
-        -85,
-        -525,
-        -830,
-        26,
-        270,
-        439,
-        -120,
-        1263,
-        -795,
-        291,
-        -1310,
-        -23,
-        347,
-        312,
-        -107,
-        -114,
+        329,
+        305,
+        287,
+        423,
+        438,
+        934,
+        628,
+        292,
+        -536,
+        -995,
+        -814,
+        237,
+        263,
+        571,
+        -138,
+        402,
         701,
-        830,
-        1309,
-        -451,
-        260,
-        -1080,
-        536,
-        188,
-        -60,
-        643,
-        -1184,
-        31,
-        -194,
-        -51,
-        -514,
-        -442,
-        -120,
-        649,
-        410,
-        882,
-        -75,
-        -341,
-        -718,
-        -128,
-        340,
-        -1245,
-        -164,
-        -1052,
-        70,
-        -256,
+        387,
+        474,
+        -183,
+        661,
+        280,
+        767,
+        -53,
+        -793,
+        -191,
+        -401,
+        526,
+        -679,
         279,
-        786,
-        40,
-        -177,
-        97,
-        -411,
-        222,
-        -89,
-        -277,
-        -146,
-        414,
-        483,
-        21,
-        -339,
-        -406,
-        -360,
-        -450,
-        -14,
-        -36,
-        513,
-        252,
-        54,
-        -501,
-        -478,
-        450,
-        -36,
-        -644,
-        -392,
-        714,
-        643,
-        -341,
-        91,
-        -1018,
-        34,
-        -177,
-        123,
-        80,
-        -695,
-        -44,
-        -357,
-        253,
+        -407,
+        493,
+        -82,
+        365,
+        -334,
+        36,
+        284,
+        -813,
+        424,
+        -425,
+        423,
+        -796,
+        452,
+        -635,
         -389,
-        613,
-        515,
-        418,
+        404,
+        -141,
+        415,
+        -277,
+        -400,
+        502,
+        766,
+        -182,
+        -426,
+        720,
+        1005,
+        422,
         -396,
-        -553,
-        193,
-        298,
-        -334,
-        -57,
-        -315,
-        -77,
-        33,
-        88,
-        137,
-        280,
-        -448,
-        196,
-        -136,
-        -295,
-        -329,
-        -92,
-        -360,
-        -132,
-        -288,
-        -45,
-        -43,
-        174,
-        75,
-        -60,
-        330,
-        360,
-        217,
-        130,
-        473,
-        -41,
-        -23,
-        -340,
-        -530,
-        -69,
-        -71,
+        123,
+        -533,
+        -91,
+        -355,
+        333,
+        -596,
+        -333,
+        434,
+        31,
+        567,
+        -356,
+        -309,
+        251,
+        365,
+        -399,
+        411,
+        -235,
+        -526,
+        468,
+        438,
+        136,
+        103,
+        74,
+        585,
+        324,
         -115,
-        297,
-        -240,
-        229,
-        507,
-        -348,
-        171,
-        -320,
-        239,
-        16,
-        -195,
-        -277,
-        -41,
-        69,
-        280,
-        -264,
-        30,
-        249,
-        -97,
-        -163,
-        -221,
-        96,
-        83,
+        -219,
+        -217,
+        -289,
+        -88,
+        143,
+        361,
+        -558,
+        -614,
+        -56,
+        456,
+        441,
+        -566,
+        102,
+        112,
+        -466,
+        325,
+        -27,
+        128,
+        294,
+        -321,
+        -224,
+        -206,
+        252,
+        209,
+        -207,
+        -224,
+        -207,
+        109,
+        316,
+        -234,
+        222,
+        95,
+        192,
+        -40,
+        -98,
         82,
-        -218,
-        -93,
-        -53,
+        68,
+        230,
+        -28,
+        -67,
+        -149,
+        14,
+        -120,
+        95,
+        122,
+        -81,
+        -67,
+        -296,
+        122,
+        -81,
+        134,
+        -200,
+        -67,
+        14,
+        67,
+        119,
         40,
-        28,
-        285,
-        27,
-        283,
-        -211,
+        118,
         -92,
-        214,
-        -225,
-        -54,
-        53,
-        105,
-        -198,
-        -53,
-        -277,
-        198,
-        184,
-        -264,
-        -106,
-        14,
-        185,
-        -155,
-        185,
-        106,
-        -119,
+        91,
+        -105,
         53,
-        208,
-        92,
-        262,
-        106,
-        -52,
+        40,
+        -51,
+        39,
+        -64,
         105,
-        -25,
-        -79,
-        104,
-        141,
-        129,
-        -114,
+        13,
+        39,
         26,
-        64,
-        -113,
+        -52,
+        -52,
+        -52,
         26,
-        77,
-        -64,
-        13,
+        -26,
+        -39,
         13,
+        -13,
+        39,
         26,
-        89,
-        115,
-        -49,
-        89,
-        -114,
-        51,
-        64,
-        -64,
-        -51,
-        -38,
-        89,
-        13,
-        -64,
         13,
-        -48,
-        76,
-        63,
-        62,
-        13,
-        112,
-        -76,
-        -50,
+        -39,
+        -26,
+        -26,
+        -26,
         -13,
-        -49,
-        63,
-        -50,
-        13,
+        -13,
+        39,
+        26,
+        -13,
+        26,
         13,
-        -50,
-        24,
-        -12,
-        24,
-        12,
-        24,
-        12,
-        -12,
-        -24,
-        12,
-        -12,
-        -12,
-        12,
-        -12,
     }
 }
\ No newline at end of file
index 40c6745dd06666d0b0e3328f15e43a98172ab9af..7a3c8e46f0e2b8c38439a9e41230db75bbd9815d 100644 (file)
@@ -1919,9 +1919,9 @@ Bangkok)•</data>
 <data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
 #る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
 <data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
-#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
-#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
-<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6)
+#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。
+<data>•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002•</data>
 #中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
 <data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
 #しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
index ceeb4879ae5967dbc677046ca6e47a6b0a3998f3..196579d0a58c20978f36ca7f93562bfedcf40f1e 100644 (file)
@@ -24,61 +24,12 @@ public class MlBreakEngine {
 
     private static final int INVALID = '|';
     private static final String INVALID_STRING = "|";
-    private static final int MAX_FEATURE = 26;
+    private static final int MAX_FEATURE = 13;
     private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
     private UnicodeSet fClosePunctuationSet;
     private HashMap<String, Integer> fModel;
-
     private int fNegativeSum;
 
-    static class Element {
-        private int character;
-        private String ublock;
-
-        /**
-         * Default constructor.
-         */
-        public Element() {
-            character = 0;
-            ublock = null;
-        }
-
-        /**
-         * Set the character and its unicode block.
-         *
-         * @param ch  A unicode character.
-         * @param str The unicode block of the character.
-         */
-        public void setCharAndUblock(int ch, String str) {
-            Assert.assrt(str.length() <= 3);
-            this.character = ch;
-            ublock = str;
-        }
-
-        /**
-         * Get the unicode character.
-         *
-         * @return The unicode character.
-         */
-        public int getCharacter() {
-            return character;
-        }
-
-        /**
-         * Get the unicode character's unicode block.
-         *
-         * @return The unicode block.
-         */
-        public String getUblock() {
-            return ublock;
-        }
-    }
-
-    private static boolean isValid(Element element) {
-        String ublock = element.getUblock();
-        return ublock.length() != 1 || (int) ublock.charAt(0) != INVALID;
-    }
-
     /**
      * Constructor for Chinese and Japanese phrase breaking.
      *
@@ -114,12 +65,10 @@ public class MlBreakEngine {
             return 0;
         }
         ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
-        int ch;
-        String ublock;
         // The ML model groups six char to evaluate if the 4th char is a breakpoint.
         // Like a sliding window, the elementList removes the first char and appends the new char
         // from inString in each iteration so that its size always remains at six.
-        Element elementList[] = new Element[6];
+        int elementList[] = new int[6];
         initElementList(inString, elementList, numCodePts);
 
         // Add a break for the start.
@@ -130,10 +79,7 @@ public class MlBreakEngine {
                 break;
             }
             shiftLeftOne(elementList);
-
-            ch = (i + 3) < numCodePts ? next32(inString) : INVALID;
-            ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING;
-            elementList[5].setCharAndUblock(ch, ublock);
+            elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID;
         }
 
         // Add a break for the end if there is not one there already.
@@ -181,11 +127,10 @@ public class MlBreakEngine {
         return correctedNumBreaks;
     }
 
-    private void shiftLeftOne(Element[] elementList) {
+    private void shiftLeftOne(int[] elementList) {
         int length = elementList.length;
         for (int i = 1; i < length; i++) {
-            elementList[i - 1].character = elementList[i].character;
-            elementList[i - 1].ublock = elementList[i].ublock;
+            elementList[i - 1] = elementList[i];
         }
     }
 
@@ -196,14 +141,14 @@ public class MlBreakEngine {
      * @param index       The breakpoint index to be evaluated.
      * @param boundary    A list including the index of the breakpoint.
      */
-    private void evaluateBreakpoint(Element[] elementList, int index, ArrayList<Integer> boundary) {
+    private void evaluateBreakpoint(int[] elementList, int index, ArrayList<Integer> boundary) {
         String[] featureList = new String[MAX_FEATURE];
-        final int w1 = elementList[0].getCharacter();
-        final int w2 = elementList[1].getCharacter();
-        final int w3 = elementList[2].getCharacter();
-        final int w4 = elementList[3].getCharacter();
-        final int w5 = elementList[4].getCharacter();
-        final int w6 = elementList[5].getCharacter();
+        final int w1 = elementList[0];
+        final int w2 = elementList[1];
+        final int w3 = elementList[2];
+        final int w4 = elementList[3];
+        final int w5 = elementList[4];
+        final int w6 = elementList[5];
 
         StringBuilder sb = new StringBuilder();
         int idx = 0;
@@ -265,76 +210,7 @@ public class MlBreakEngine {
             featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint(
                     w5).appendCodePoint(w6).toString();
         }
-        if (isValid(elementList[0])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UB1:").append(elementList[0].getUblock()).toString();
-        }
-        if (isValid(elementList[1])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UB2:").append(elementList[1].getUblock()).toString();
-        }
-        if (isValid(elementList[2])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UB3:").append(elementList[2].getUblock()).toString();
-        }
-        if (isValid(elementList[3])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UB4:").append(elementList[3].getUblock()).toString();
-        }
-        if (isValid(elementList[4])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UB5:").append(elementList[4].getUblock()).toString();
-        }
-        if (isValid(elementList[5])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("UB6:").append(elementList[5].getUblock()).toString();
-        }
-        if (isValid(elementList[1]) && isValid(elementList[2])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("BB1:").
-                    append(elementList[1].getUblock()).
-                    append(elementList[2].getUblock()).toString();
-        }
-        if (isValid(elementList[2]) && isValid(elementList[3])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("BB2:").
-                    append(elementList[2].getUblock()).
-                    append(elementList[3].getUblock()).toString();
-        }
-        if (isValid(elementList[3]) && isValid(elementList[4])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("BB3:").
-                    append(elementList[3].getUblock()).
-                    append(elementList[4].getUblock()).toString();
-        }
-        if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TB1:").
-                    append(elementList[0].getUblock()).
-                    append(elementList[1].getUblock()).
-                    append(elementList[2].getUblock()).toString();
-        }
-        if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TB2:").
-                    append(elementList[1].getUblock()).
-                    append(elementList[2].getUblock()).
-                    append(elementList[3].getUblock()).toString();
-        }
-        if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TB3:").
-                    append(elementList[2].getUblock()).
-                    append(elementList[3].getUblock()).
-                    append(elementList[4].getUblock()).toString();
-        }
-        if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
-            sb.setLength(0);
-            featureList[idx++] = sb.append("TB4:").
-                    append(elementList[3].getUblock()).
-                    append(elementList[4].getUblock()).
-                    append(elementList[5].getUblock()).toString();
-        }
+
         int score = fNegativeSum;
         for (int j = 0; j < idx; j++) {
             if (fModel.containsKey(featureList[j])) {
@@ -350,12 +226,11 @@ public class MlBreakEngine {
      * Initialize the element list from the input string.
      *
      * @param inString    An input string to be segmented.
-     * @param elementList A list to store the first six characters and their unicode block codes.
+     * @param elementList A list to store the first six characters.
      * @param numCodePts  The number of code points of input string
      * @return The number of the code units of the first six characters in inString.
      */
-    private int initElementList(CharacterIterator inString, Element[] elementList,
-            int numCodePts) {
+    private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) {
         int index = 0;
         inString.setIndex(index);
         int w1, w2, w3, w4, w5, w6;
@@ -363,60 +238,29 @@ public class MlBreakEngine {
         if (numCodePts > 0) {
             w3 = current32(inString);
             index += Character.charCount(w3);
+            if (numCodePts > 1) {
+                w4 = next32(inString);
+                index += Character.charCount(w3);
+                if (numCodePts > 2) {
+                    w5 = next32(inString);
+                    index += Character.charCount(w5);
+                    if (numCodePts > 3) {
+                        w6 = next32(inString);
+                        index += Character.charCount(w6);
+                    }
+                }
+            }
         }
-        if (numCodePts > 1) {
-            w4 = next32(inString);
-            index += Character.charCount(w3);
-        }
-        if (numCodePts > 2) {
-            w5 = next32(inString);
-            index += Character.charCount(w5);
-        }
-        if (numCodePts > 3) {
-            w6 = next32(inString);
-            index += Character.charCount(w6);
-        }
-
-        final String b1 = INVALID_STRING;
-        final String b2 = b1;
-        final String b3 = getUnicodeBlock(w3);
-        final String b4 = getUnicodeBlock(w4);
-        final String b5 = getUnicodeBlock(w5);
-        final String b6 = getUnicodeBlock(w6);
-
-        elementList[0] = new Element();
-        elementList[0].setCharAndUblock(w1, b1);
-        elementList[1] = new Element();
-        elementList[1].setCharAndUblock(w2, b2);
-        elementList[2] = new Element();
-        elementList[2].setCharAndUblock(w3, b3);
-        elementList[3] = new Element();
-        elementList[3].setCharAndUblock(w4, b4);
-        elementList[4] = new Element();
-        elementList[4].setCharAndUblock(w5, b5);
-        elementList[5] = new Element();
-        elementList[5].setCharAndUblock(w6, b6);
+        elementList[0] = w1;
+        elementList[1] = w2;
+        elementList[2] = w3;
+        elementList[3] = w4;
+        elementList[4] = w5;
+        elementList[5] = w6;
 
         return index;
     }
 
-    /**
-     * Get the character's unicode block code defined in UBlockCode.
-     *
-     * @param ch A char.
-     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
-     * is less than 3 digits.
-     */
-    private String getUnicodeBlock(int ch) {
-        int blockId = UCharacter.UnicodeBlock.of(ch).getID();
-        if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID()
-                || blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) {
-            return INVALID_STRING;
-        } else {
-            return String.format("%03d", blockId);
-        }
-    }
-
     /**
      * Load the machine learning's model file.
      */
index 40c6745dd06666d0b0e3328f15e43a98172ab9af..7a3c8e46f0e2b8c38439a9e41230db75bbd9815d 100644 (file)
@@ -1919,9 +1919,9 @@ Bangkok)•</data>
 <data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
 #る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
 <data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
-#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
-#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
-<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6)
+#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。
+<data>•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002•</data>
 #中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
 <data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
 #しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!