U_NAMESPACE_BEGIN
+enum class ModelIndex { kUWStart = 0, kBWStart = 6, kTWStart = 9 };  // First fModel slot of the UW, BW and TW feature groups.
+
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
- const UnicodeSet &closePunctuationSet, UErrorCode &status)
+ const UnicodeSet &closePunctuationSet, UErrorCode &status)
: fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
fClosePunctuationSet(closePunctuationSet),
- fModel(status),
fNegativeSum(0) {
if (U_FAILURE(status)) {
return;
MlBreakEngine::~MlBreakEngine() {}
-namespace {
- const char16_t INVALID = u'|';
-}
-
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
- UVector32 &foundBreaks, const UnicodeString &inString,
- const LocalPointer<UVector32> &inputMap,
- UErrorCode &status) const {
+ UVector32 &foundBreaks, const UnicodeString &inString,
+ const LocalPointer<UVector32> &inputMap,
+ UErrorCode &status) const {
if (U_FAILURE(status)) {
return 0;
}
return 0;
}
int32_t numBreaks = 0;
- UnicodeString index;
- // The ML model groups six char to evaluate if the 4th char is a breakpoint.
- // Like a sliding window, the elementList removes the first char and appends the new char from
- // inString in each iteration so that its size always remains at six.
- UChar32 elementList[6];
-
- int32_t codeUts = initElementList(inString, elementList, status);
- int32_t length = inString.countChar32();
+ int32_t codePointLength = inString.countChar32();
+ // The ML algorithm groups six code points and evaluates whether the 4th one is a breakpoint.
+ // In each iteration, it evaluates the 4th one and then moves forward by one, like a sliding
+ // window. Initially, the first six values in the indexList are [-1, -1, 0, 1, 2, 3]. After
+ // moving forward, finally the last six values in the indexList are
+ // [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra "-1".
+ int32_t indexSize = codePointLength + 4;
+ int32_t *indexList = (int32_t *)uprv_malloc(indexSize * sizeof(int32_t));  // NOTE(review): the only uprv_free(indexList) is after the loop below, so the early return that follows boundary.addElement() appears to leak this buffer when status has failed - confirm and free it on that path.
+ if (indexList == nullptr) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ int32_t numCodeUnits = initIndexList(inString, indexList, status);  // Fills the first window; returns the code unit offset just past it.
// Add a break for the start.
boundary.addElement(0, status);
numBreaks++;
if (U_FAILURE(status)) return 0;
- for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
- evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
- if (i + 1 >= inString.countChar32()) break;
- // Remove the first element and append a new element
- uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32));
- elementList[5] = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
- if (elementList[5] != INVALID) {
- codeUts += U16_LENGTH(elementList[5]);
+ for (int32_t idx = 0; idx + 1 < codePointLength && U_SUCCESS(status); idx++) {
+ numBreaks =
+ evaluateBreakpoint(inString, indexList, idx, numCodeUnits, numBreaks, boundary, status);
+ if (idx + 4 < codePointLength) {  // One more code point remains: record its start offset for the next window.
+ indexList[idx + 6] = numCodeUnits;
+ numCodeUnits += U16_LENGTH(inString.char32At(indexList[idx + 6]));
}
}
+ uprv_free(indexList);
+
if (U_FAILURE(status)) return 0;
// Add a break for the end if there is not one there already.
return correctedNumBreaks;
}
-void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
- UVector32 &boundary, UErrorCode &status) const {
+int32_t MlBreakEngine::evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList,
+ int32_t startIdx, int32_t numCodeUnits, int32_t numBreaks,
+ UVector32 &boundary, UErrorCode &status) const {
if (U_FAILURE(status)) {
- return;
+ return numBreaks;
}
-
- UnicodeString feature;
+ int32_t start = 0, end = 0;
int32_t score = fNegativeSum;
- if (elementList[0] != INVALID) {
- // When the key doesn't exist, Hashtable.geti(key) returns 0 and 2 * 0 = 0.
- // So, we can skip to check whether fModel includes key featureList[j] or not.
- score += (2 * fModel.geti(feature.setTo(u"UW1:", 4).append(elementList[0])));
- }
- if (elementList[1] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"UW2:", 4).append(elementList[1])));
- }
- if (elementList[2] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"UW3:", 4).append(elementList[2])));
- }
- if (elementList[3] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"UW4:", 4).append(elementList[3])));
- }
- if (elementList[4] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"UW5:", 4).append(elementList[4])));
- }
- if (elementList[5] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"UW6:", 4).append(elementList[5])));
- }
- if (elementList[1] != INVALID && elementList[2] != INVALID) {
- score += (2 * fModel.geti(
- feature.setTo(u"BW1:", 4).append(elementList[1]).append(elementList[2])));
- }
- if (elementList[2] != INVALID && elementList[3] != INVALID) {
- score += (2 * fModel.geti(
- feature.setTo(u"BW2:", 4).append(elementList[2]).append(elementList[3])));
- }
- if (elementList[3] != INVALID && elementList[4] != INVALID) {
- score += (2 * fModel.geti(
- feature.setTo(u"BW3:", 4).append(elementList[3]).append(elementList[4])));
- }
- if (elementList[0] != INVALID && elementList[1] != INVALID && elementList[2] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"TW1:", 4)
- .append(elementList[0])
- .append(elementList[1])
- .append(elementList[2])));
- }
- if (elementList[1] != INVALID && elementList[2] != INVALID && elementList[3] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"TW2:", 4)
- .append(elementList[1])
- .append(elementList[2])
- .append(elementList[3])));
+ for (int i = 0; i < 6; i++) {
+ // UW1 ~ UW6: one lookup per window slot, keyed by the single code point starting there.
+ start = startIdx + i;
+ if (indexList[start] != -1) {
+ end = (indexList[start + 1] != -1) ? indexList[start + 1] : numCodeUnits;  // No later start offset recorded: the substring ends at numCodeUnits.
+ score += fModel[static_cast<int32_t>(ModelIndex::kUWStart) + i].geti(
+ inString.tempSubString(indexList[start], end - indexList[start]));
+ }
}
- if (elementList[2] != INVALID && elementList[3] != INVALID && elementList[4] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"TW3:", 4)
- .append(elementList[2])
- .append(elementList[3])
- .append(elementList[4])));
+ for (int i = 0; i < 3; i++) {
+ // BW1 ~ BW3: two adjacent code points, starting at window slots 1..3; skipped unless both are set.
+ start = startIdx + i + 1;
+ if (indexList[start] != -1 && indexList[start + 1] != -1) {
+ end = (indexList[start + 2] != -1) ? indexList[start + 2] : numCodeUnits;
+ score += fModel[static_cast<int32_t>(ModelIndex::kBWStart) + i].geti(
+ inString.tempSubString(indexList[start], end - indexList[start]));
+ }
}
- if (elementList[3] != INVALID && elementList[4] != INVALID && elementList[5] != INVALID) {
- score += (2 * fModel.geti(feature.setTo(u"TW4:", 4)
- .append(elementList[3])
- .append(elementList[4])
- .append(elementList[5])));
+ for (int i = 0; i < 4; i++) {
+ // TW1 ~ TW4: three adjacent code points, starting at window slots 0..3; skipped unless all three are set.
+ start = startIdx + i;
+ if (indexList[start] != -1 && indexList[start + 1] != -1 && indexList[start + 2] != -1) {
+ end = (indexList[start + 3] != -1) ? indexList[start + 3] : numCodeUnits;
+ score += fModel[static_cast<int32_t>(ModelIndex::kTWStart) + i].geti(
+ inString.tempSubString(indexList[start], end - indexList[start]));
+ }
}
+
if (score > 0) {
- boundary.addElement(index, status);
+ boundary.addElement(startIdx + 1, status);  // A positive total score records startIdx + 1 as a break position.
numBreaks++;
}
+ return numBreaks;
}
-int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList,
- UErrorCode &status) const {
+int32_t MlBreakEngine::initIndexList(const UnicodeString &inString, int32_t *indexList,
+ UErrorCode &status) const {
if (U_FAILURE(status)) {
return 0;
}
int32_t index = 0;
int32_t length = inString.countChar32();
- UChar32 w1, w2, w3, w4, w5, w6;
- w1 = w2 = w3 = w4 = w5 = w6 = INVALID;
+ // Set all (length+4) items inside indexList to -1, presuming -1 is 4 bytes of 0xff.
+ uprv_memset(indexList, 0xff, (length + 4) * sizeof(int32_t));
if (length > 0) {
- w3 = inString.char32At(0);
- index += U16_LENGTH(w3);
+ indexList[2] = 0;  // Slots 0 and 1 stay -1; the first code point starts at code unit 0.
+ index = U16_LENGTH(inString.char32At(0));
if (length > 1) {
- w4 = inString.char32At(index);
- index += U16_LENGTH(w4);
+ indexList[3] = index;
+ index += U16_LENGTH(inString.char32At(index));
if (length > 2) {
- w5 = inString.char32At(index);
- index += U16_LENGTH(w5);
+ indexList[4] = index;
+ index += U16_LENGTH(inString.char32At(index));
if (length > 3) {
- w6 = inString.char32At(index);
- index += U16_LENGTH(w6);
+ indexList[5] = index;
+ index += U16_LENGTH(inString.char32At(index));
}
}
}
}
- elementList[0] = w1;
- elementList[1] = w2;
- elementList[2] = w3;
- elementList[3] = w4;
- elementList[4] = w5;
- elementList[5] = w6;
-
return index;
}
void MlBreakEngine::loadMLModel(UErrorCode &error) {
- // BudouX's model consists of pairs of the feature and its score.
- // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
- // corresponding feature's score.
+ // BudouX's model consists of thirteen categories, each of which is made up of pairs of the
+ // feature and its score. To integrate it into jaml.txt, we define thirteen kinds of key and
+ // value to represent the feature and the corresponding score respectively.
+
+ if (U_FAILURE(error)) return;
+ UnicodeString key;  // NOTE(review): key, stackTempBundle and modelKey are not referenced again in this function as shown - confirm they can be removed.
+ StackUResourceBundle stackTempBundle;
+ ResourceDataValue modelKey;
+
+ LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
+ UResourceBundle *rb = rbp.getAlias();  // rbp keeps ownership; rb is only borrowed for the calls below.
if (U_FAILURE(error)) return;
+ int32_t index = 0;  // Walks the fModel slots in the order UW1..UW6, BW1..BW3, TW1..TW4.
+ initKeyValue(rb, "UW1Keys", "UW1Values", fModel[index++], error);
+ initKeyValue(rb, "UW2Keys", "UW2Values", fModel[index++], error);
+ initKeyValue(rb, "UW3Keys", "UW3Values", fModel[index++], error);
+ initKeyValue(rb, "UW4Keys", "UW4Values", fModel[index++], error);
+ initKeyValue(rb, "UW5Keys", "UW5Values", fModel[index++], error);
+ initKeyValue(rb, "UW6Keys", "UW6Values", fModel[index++], error);
+ initKeyValue(rb, "BW1Keys", "BW1Values", fModel[index++], error);
+ initKeyValue(rb, "BW2Keys", "BW2Values", fModel[index++], error);
+ initKeyValue(rb, "BW3Keys", "BW3Values", fModel[index++], error);
+ initKeyValue(rb, "TW1Keys", "TW1Values", fModel[index++], error);
+ initKeyValue(rb, "TW2Keys", "TW2Values", fModel[index++], error);
+ initKeyValue(rb, "TW3Keys", "TW3Values", fModel[index++], error);
+ initKeyValue(rb, "TW4Keys", "TW4Values", fModel[index++], error);
+ fNegativeSum /= 2;  // Halved once after all tables are loaded; presumably this replaces the 2x factor the removed scoring code applied to every matched value - confirm.
+}
+
+void MlBreakEngine::initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
+ Hashtable &model, UErrorCode &error) {  // Reads the tables named keyName and valueName from rb and stores the pairs in model.
int32_t keySize = 0;
int32_t valueSize = 0;
int32_t stringLength = 0;
StackUResourceBundle stackTempBundle;
ResourceDataValue modelKey;
- LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
- UResourceBundle* rb = rbp.orphan();
// get modelValues
- LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
- const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
+ LocalUResourceBundlePointer modelValue(ures_getByKey(rb, valueName, nullptr, &error));
+ const int32_t *value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
if (U_FAILURE(error)) return;
// get modelKeys
- ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
+ ures_getValueWithFallback(rb, keyName, stackTempBundle.getAlias(), modelKey, error);
ResourceArray stringArray = modelKey.getArray(error);
keySize = stringArray.getSize();
if (U_FAILURE(error)) return;
if (U_SUCCESS(error)) {
U_ASSERT(idx < valueSize);
fNegativeSum -= value[idx];
- fModel.puti(key, value[idx], error);
+ model.puti(key, value[idx], error);  // Stored in the table passed by the caller; the same value was subtracted from fNegativeSum just above.
}
}
}
#define MLBREAKENGINE_H
#include "hash.h"
+#include "unicode/resbund.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"
#include "uvectr32.h"
* @param status Information on any errors encountered.
*/
MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
- const UnicodeSet &closePunctuationSet, UErrorCode &status);
+ const UnicodeSet &closePunctuationSet, UErrorCode &status);
/**
* Virtual destructor.
void loadMLModel(UErrorCode &error);
/**
- * Initialize the element list from the input string.
+ * Load one feature category from the machine learning model file: read the features stored
+ * under keyName and their scores stored under valueName.
+ *
+ * @param rb A ResourceBundle corresponding to the model file.
+ * @param keyName The key name in the model file.
+ * @param valueName The value name in the model file.
+ * @param model A hashtable to store the pairs of the feature and its score.
+ * @param error Information on any errors encountered.
+ */
+ void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName,
+ Hashtable &model, UErrorCode &error);
+
+ /**
+ * Initialize the index list from the input string.
*
* @param inString A input string to be segmented.
- * @param elementList A list to store the first six characters.
+ * @param indexList A code unit index list of inString.
* @param status Information on any errors encountered.
- * @return The number of code units of the first six characters in inString.
+ * @return The number of code units of the first four characters in inString.
*/
- int32_t initElementList(const UnicodeString &inString, UChar32* elementList,
- UErrorCode &status) const;
+ int32_t initIndexList(const UnicodeString &inString, int32_t *indexList,
+ UErrorCode &status) const;
/**
* Evaluate whether the index is a potential breakpoint.
*
- * @param elementList A list including six elements for the breakpoint evaluation.
- * @param index The breakpoint index to be evaluated.
+ * @param inString An input string to be segmented.
+ * @param indexList A code unit index list of the inString.
+ * @param startIdx The start index of the indexList.
+ * @param numCodeUnits The current code unit boundary of the indexList.
* @param numBreaks The accumulated number of breakpoints.
* @param boundary A vector including the index of the breakpoint.
* @param status Information on any errors encountered.
+ * @return The number of breakpoints
*/
- void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
- UVector32 &boundary, UErrorCode &status) const;
+ int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx,
+ int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary,
+ UErrorCode &status) const;
+
+ void printUnicodeString(const UnicodeString &s) const;  // NOTE(review): no definition or caller is visible in this change - confirm this helper is meant to be part of the class.
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
UnicodeSet fClosePunctuationSet;
- Hashtable fModel;
+ Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
int32_t fNegativeSum;
};
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
jaml {
- modelKeys {
- "UW3:、",
- "UW3:。",
- "UW4:の",
- "UW4:、",
- "UW3:の",
- "UW4:。",
- "UW3:に",
- "UW5:。",
- "UW4:て",
- "UW3:は",
- "UW4:に",
- "UW3:を",
- "UW5:、",
- "UW2:、",
- "UW3:が",
- "UW2:。",
- "UW4:で",
- "UW3:と",
- "UW4:は",
- "UW4:が",
- "UW4:る",
- "UW4:っ",
- "UW3:も",
- "UW5:な",
- "UW3:で",
- "UW3:る",
- "UW5:で",
- "UW4:を",
- "UW4:か",
- "UW3:っ",
- "UW2:の",
- "UW4:と",
- "UW5:っ",
- "UW4:な",
- "UW3:て",
- "UW4:た",
- "UW4:こ",
- "UW6:に",
- "UW4:ら",
- "UW3:た",
- "UW2:を",
- "UW3:ら",
- "UW6:。",
- "UW4:し",
- "UW3:な",
- "UW2:に",
- "UW4:い",
- "UW4:り",
- "UW6:う",
- "UW3:う",
- "UW3:く",
- "UW4:れ",
- "UW2:は",
- "UW4:だ",
- "UW4:う",
- "UW3:い",
- "UW6:い",
- "UW4:ん",
- "UW2:か",
- "UW4:ー",
- "UW6:を",
- "UW2:も",
- "UW5:き",
- "UW3:り",
- "UW6:で",
- "UW2:る",
- "UW2:と",
- "UW3:]",
- "UW4:そ",
- "UW3:,",
- "UW4:も",
- "UW4:く",
- "UW3:か",
- "BW2:とい",
- "UW4:お",
- "UW4:ま",
- "UW6:が",
- "UW4:き",
- "UW2:し",
- "UW2:て",
- "UW3:!",
- "UW2:ま",
- "UW5:に",
- "UW3:や",
- "UW6:て",
- "BW3:もの",
- "UW6:の",
- "UW2:ん",
- "UW2:が",
- "UW5:が",
- "BW1:いう",
- "UW2:で",
- "UW5:す",
- "UW3:?",
- "UW5:と",
- "UW6:は",
- "UW3:.",
- "UW4:「",
- "UW3:ば",
- "UW5:ん",
- "BW3:いう",
- "UW4:す",
- "BW1:から",
- "UW3:ど",
- "UW5:し",
- "UW2:っ",
- "UW4:思",
- "UW3:…",
- "UW5:る",
- "BW2:てい",
- "BW3:よう",
- "UW5:え",
- "UW4:私",
- "UW3:・",
- "UW4:人",
- "UW5:く",
- "UW3:)",
- "UW4:京",
- "BW2:ない",
- "UW3:ー",
- "BW3:とこ",
- "UW5:は",
- "UW4:」",
- "UW2:一",
- "UW4:よ",
- "BW3:こと",
- "UW5:ー",
- "UW6:し",
- "UW4:け",
- "BW1:ない",
- "BW2:です",
- "UW4:一",
- "UW5:帯",
- "UW5:を",
- "UW6:な",
- "UW5:べ",
- "BW3:いい",
- "BW2:であ",
- "BW2:ので",
- "UW4:,",
- "UW5:れ",
- "UW5:ろ",
- "UW1:そ",
- "UW5:い",
- "UW1:い",
- "UW5:・",
- "UW5:わ",
- "UW4:1",
- "UW5:う",
- "UW4:大",
- "UW3:ま",
- "BW2:とこ",
- "UW4:!",
- "UW4:見",
- "UW4:行",
- "BW1:こと",
- "UW1:な",
- "UW2:さ",
- "UW3:☆",
- "UW4:さ",
- "UW2:よ",
- "BW1:とか",
- "UW4:(",
- "BW3:でも",
- "UW5:の",
- "UW4:・",
- "UW5:た",
- "UW1:す",
- "UW5:か",
- "UW4:使",
- "UW3:♪",
- "UW4:え",
- "UW4:今",
- "BW2:、と",
- "BW3:とき",
- "UW4:ろ",
- "UW5:つ",
- "UW1:に",
- "UW5:じ",
- "UW1:で",
- "UW4:ン",
- "UW3:ず",
- "BW3:して",
- "UW4:食",
- "UW4:気",
- "UW4:時",
- "UW3:日",
- "BW1:しい",
- "UW4:自",
- "UW3:笑",
- "UW2:毎",
- "TW1:という",
- "UW4:み",
- "UW4:…",
- "TW2:ではな",
- "UW6:さ",
- "UW5:め",
- "UW2:少",
- "UW5:あ",
- "UW4:2",
- "UW3:へ",
- "TW3:という",
- "UW4:何",
- "UW2:く",
- "UW2:結",
- "BW1:うな",
- "BW1:もう",
- "UW1:が",
- "UW4:じ",
- "UW2:う",
- "UW4:ル",
- "UW3:」",
- "BW1:とが",
- "UW2:最",
- "BW1:るの",
- "UW3:間",
- "UW6:た",
- "UW3:つ",
- "UW4:ど",
- "UW1:と",
- "UW3:ん",
- "UW4:.",
- "UW3:だ",
- "UW4:わ",
- "UW4:最",
- "UW4:?",
- "UW3:ろ",
- "UW4:ば",
- "TW3:ている",
- "BW3:この",
- "UW5:も",
- "UW3:人",
- "BW3:とい",
- "UW4:つ",
- "BW3:その",
- "BW3:もう",
- "UW2:そ",
- "BW2:には",
- "BW3:かけ",
- "TW4:の京都",
- "TW4:ところ",
- "UW3:京",
- "UW4:携",
- "BW1:かも",
- "BW1:では",
- "UW4:ち",
- "UW3:分",
- "UW4:べ",
- "BW3:ころ",
- "UW3:ゃ",
- "UW2:す",
- "BW1:。・",
- "UW3:電",
- "BW3:なっ",
- "UW3:す",
- "BW1:最近",
- "UW4:め",
- "UW3:ぐ",
- "UW2:お",
- "BW3:そし",
- "BW1:かし",
- "BW1:同じ",
- "BW3:メー",
- "UW5:て",
- "UW6:り",
- "TW4:くらい",
- "UW3:今",
- "UW5:そ",
- "UW4:や",
- "UW5:」",
- "UW4:帯",
- "UW6:ー",
- "BW2:とし",
- "TW1:ような",
- "BW2:てお",
- "UW4:笑",
- "UW1:は",
- "BW3:かか",
- "TW4:かなり",
- "UW4:)",
- "BW1:んな",
- "UW1:ち",
- "TW2:気に入",
- "TW1:・・・",
- "UW6:と",
- "UW5:ち",
- "BW3:ため",
- "UW4:ず",
- "UW3:0",
- "BW1:んで",
- "UW3:中",
- "UW3:々",
- "BW2:のよ",
- "BW2:帯電",
- "BW2:でも",
- "BW1:には",
- "BW3:ちょ",
- "UW4:せ",
- "UW3:度",
- "BW1:でも",
- "BW1:が、",
- "UW2:な",
- "UW5:思",
- "UW6:0",
- "UW6:寺",
- "BW3:とて",
- "BW3:ある",
- "BW2:もし",
- "UW4:ッ",
- "UW1:て",
- "BW2:にも",
- "BW1:れた",
- "UW4:ひ",
- "TW3:ること",
- "BW1:てい",
- "UW4:』",
- "BW1:だけ",
- "UW3:お",
- "BW1:少し",
- "TW3:、ある",
- "UW5:!",
- "UW6:ル",
- "UW2:多",
- "UW6:ご",
- "UW6:や",
- "UW3:後",
- "BW2:てみ",
- "BW1:とき",
- "UW4:ゃ",
- "BW1:たい",
- "UW3:き",
- "TW4:ことが",
- "UW3:真",
- "BW2:など",
- "UW6:ぱ",
- "BW1:った",
- "BW1:ても",
- "UW5:日",
- "BW1:たと",
- "UW4:]",
- "UW3:ッ",
- "TW4:メール",
- "BW2:はな",
- "BW3:・・",
- "BW3:なる",
- "BW1:とい",
- "UW2:全",
- "BW1:にも",
- "BW1:たら",
- "BW2:くな",
- "UW3:「",
- "BW1:その",
- "UW3:観",
- "BW1:うに",
- "UW3:イ",
- "BW3:もん",
- "UW5:ず",
- "BW3:しま",
- "BW1:より",
- "UW5:分",
+ BW1Keys {
+ "。・",
+ "いう",
+ "うな",
+ "うに",
+ "かし",
+ "かも",
+ "から",
+ "が、",
+ "こと",
+ "しい",
+ "その",
+ "たい",
+ "たと",
+ "たら",
+ "だけ",
+ "った",
+ "てい",
+ "ても",
+ "では",
+ "でも",
+ "とい",
+ "とか",
+ "とが",
+ "とき",
+ "ない",
+ "には",
+ "にも",
+ "もう",
+ "より",
+ "るの",
+ "れた",
+ "んで",
+ "んな",
+ "同じ",
+ "少し",
+ "最近",
}
- modelValues:intvector {
- 3634,
- 4347,
- -2581,
- -4812,
- 2538,
- -4206,
- 2701,
- -1455,
- -2403,
- 2977,
- -2678,
- 4165,
- -818,
- -1011,
- 2996,
- -904,
- -1808,
- 2064,
- -2164,
- -2180,
- -2760,
- -2310,
- 2360,
- -388,
- 1842,
- 1706,
- -706,
- -2408,
- -1628,
- -1005,
- -434,
- -1442,
- 543,
- -1091,
- 1355,
- -1056,
- 258,
- 277,
- -2999,
- 1331,
- -1305,
- 1242,
- -337,
- -1073,
- 1392,
- -576,
- -886,
- -2405,
- -386,
- 1031,
- 1470,
- -2105,
- -594,
- -1461,
- -1160,
- 964,
- -48,
- -2158,
- 110,
- -1750,
- 228,
- -603,
- 801,
- 972,
- 102,
- -395,
- -508,
- 1640,
- 191,
- 2468,
- -1580,
- -1529,
- 1148,
- 515,
- 539,
- -774,
- 111,
- -1275,
- 113,
- -432,
- 1736,
- 588,
- -413,
- 1360,
- 49,
- 2322,
- 48,
- 255,
- -521,
- -366,
+ BW1Values:intvector {
+ 567,
529,
- -493,
- -557,
- 1719,
- -476,
- 104,
- 1311,
- 1314,
- 1307,
- 520,
- 666,
- -412,
+ 280,
+ -13,
+ 468,
+ -533,
627,
- 1098,
- -209,
- 163,
- 955,
- 1798,
+ 192,
+ -695,
+ 423,
+ -26,
+ 53,
+ -52,
+ 13,
+ 122,
+ 13,
+ -67,
+ 39,
+ -91,
+ 95,
+ -13,
+ 784,
+ -679,
+ 91,
+ 485,
+ 109,
+ 26,
+ 767,
+ 26,
+ -407,
+ 95,
+ -206,
+ 102,
+ 438,
+ 134,
+ 365,
+ }
+ BW2Keys {
+ "、と",
+ "くな",
+ "てい",
+ "てお",
+ "てみ",
+ "であ",
+ "です",
+ "でも",
+ "とい",
+ "とこ",
+ "とし",
+ "ない",
+ "など",
+ "には",
+ "にも",
+ "ので",
+ "のよ",
+ "はな",
+ "もし",
+ "帯電",
+ }
+ BW2Values:intvector {
+ -517,
-39,
-753,
- -1262,
- 411,
- 1247,
- 914,
- 522,
- 348,
- 2156,
- 510,
+ -558,
+ -92,
+ -1495,
+ -1445,
+ -207,
+ 515,
+ -1044,
+ 143,
-1522,
- -243,
- 1337,
- -378,
- -1957,
- 834,
- -450,
+ -64,
+ -426,
+ -120,
+ -756,
+ -207,
+ -26,
+ -67,
+ -224,
+ }
+ BW3Keys {
+ "ある",
+ "いい",
+ "いう",
+ "かか",
+ "かけ",
+ "こと",
+ "この",
+ "ころ",
+ "して",
+ "しま",
+ "そし",
+ "その",
+ "ため",
+ "ちょ",
+ "でも",
+ "とい",
+ "とき",
+ "とこ",
+ "とて",
+ "なっ",
+ "なる",
+ "もう",
+ "もの",
+ "もん",
+ "よう",
+ "メー",
+ "・・",
+ }
+ BW3Values:intvector {
+ -28,
+ 647,
+ 666,
+ 456,
+ 720,
235,
- 87,
- 236,
- -1615,
- 485,
- -1445,
- 488,
404,
-333,
- 66,
- 787,
- 647,
- -1495,
- -756,
- -1700,
- 279,
+ 249,
+ -13,
+ -526,
+ 502,
+ 294,
+ 316,
+ 767,
+ -277,
+ 799,
+ 1337,
+ 230,
+ -309,
+ 13,
+ 766,
+ 2322,
+ 39,
+ -1262,
+ 136,
+ -39,
+ }
+ TW1Keys {
+ "という",
+ "ような",
+ "・・・",
+ }
+ TW1Values:intvector {
+ 292,
+ 361,
+ 325,
+ }
+ TW2Keys {
+ "ではな",
+ "気に入",
+ }
+ TW2Values:intvector {
+ -814,
+ -466,
+ }
+ TW3Keys {
+ "、ある",
+ "ている",
+ "という",
+ "ること",
+ }
+ TW3Values:intvector {
+ -200,
+ -389,
+ 387,
-81,
- 260,
- 162,
+ }
+ TW4Keys {
+ "かなり",
+ "くらい",
+ "ことが",
+ "ところ",
+ "の京都",
+ "メール",
+ }
+ TW4Values:intvector {
+ 441,
+ 585,
-51,
- -851,
- 462,
- 493,
- 161,
- 396,
- -238,
- -1044,
- -1685,
- 433,
- 276,
- -695,
+ 422,
+ 1005,
+ 26,
+ }
+ UW1Keys {
+ "い",
+ "が",
+ "す",
+ "そ",
+ "ち",
+ "て",
+ "で",
+ "と",
+ "な",
+ "に",
+ "は",
+ }
+ UW1Values:intvector {
+ -51,
+ -53,
+ 152,
+ 260,
+ 112,
+ 14,
+ -56,
+ 36,
-148,
+ -118,
+ -56,
+ }
+ UW2Keys {
+ "、",
+ "。",
+ "う",
+ "お",
+ "か",
+ "が",
+ "く",
+ "さ",
+ "し",
+ "す",
+ "そ",
+ "っ",
+ "て",
+ "で",
+ "と",
+ "な",
+ "に",
+ "の",
+ "は",
+ "ま",
+ "も",
+ "よ",
+ "る",
+ "を",
+ "ん",
+ "一",
+ "全",
+ "多",
+ "少",
+ "最",
+ "毎",
+ "結",
+ }
+ UW2Values:intvector {
+ -1011,
+ -904,
+ -191,
+ -235,
+ 110,
+ -521,
+ -183,
416,
- 1235,
- -748,
+ 113,
+ 31,
+ -182,
+ 163,
+ -432,
+ -493,
+ -508,
+ -40,
+ -576,
+ -434,
+ -594,
+ 588,
+ -603,
257,
- 784,
- 748,
- 767,
- -262,
- -490,
- -26,
- 152,
- 186,
- 544,
+ -395,
+ -1305,
+ 255,
+ 834,
+ 39,
+ 67,
+ 571,
+ 279,
+ 628,
+ 661,
+ }
+ UW3Keys {
+ "…",
+ "☆",
+ "♪",
+ "、",
+ "。",
+ "々",
+ "「",
+ "」",
+ "い",
+ "う",
+ "お",
+ "か",
+ "が",
+ "き",
+ "く",
+ "ぐ",
+ "す",
+ "ず",
+ "た",
+ "だ",
+ "っ",
+ "つ",
+ "て",
+ "で",
+ "と",
+ "ど",
+ "な",
+ "に",
+ "の",
+ "は",
+ "ば",
+ "へ",
+ "ま",
+ "も",
+ "ゃ",
+ "や",
+ "ら",
+ "り",
+ "る",
+ "ろ",
+ "を",
+ "ん",
+ "イ",
+ "ッ",
+ "・",
+ "ー",
+ "中",
+ "京",
+ "人",
+ "今",
+ "分",
+ "度",
+ "後",
+ "日",
+ "真",
+ "笑",
+ "観",
+ "間",
+ "電",
+ "!",
+ ")",
+ ",",
+ ".",
+ "0",
+ "?",
+ "]",
+ }
+ UW3Values:intvector {
+ 1798,
+ 1235,
1035,
- -711,
- 549,
- -517,
- 799,
- -1024,
- 542,
- -118,
- 432,
- -56,
- -694,
+ 3634,
+ 4347,
+ 209,
+ -26,
+ 526,
+ 964,
+ 1031,
+ -81,
+ 1148,
+ 2996,
+ 40,
+ 1470,
+ 411,
+ 251,
668,
- 249,
- 175,
- 329,
- 305,
+ 1331,
+ 424,
+ -1005,
+ 365,
+ 1355,
+ 1842,
+ 2064,
+ 1098,
+ 1392,
+ 2701,
+ 2538,
+ 2977,
+ 1307,
+ 701,
+ -238,
+ 2360,
+ 434,
+ 1360,
+ 1242,
+ 972,
+ 1706,
+ 452,
+ 4165,
+ 284,
+ -13,
+ -52,
+ 914,
+ -243,
+ 252,
+ -396,
+ 415,
+ 324,
+ 333,
+ 222,
+ 118,
287,
- 423,
- 438,
+ 39,
934,
- 628,
- 292,
- -536,
- -995,
- -814,
- 237,
- 263,
- 571,
- -138,
- 402,
- 701,
- 387,
- 474,
- -183,
- 661,
- 280,
- 767,
- -53,
- -793,
- -191,
- -401,
- 526,
- -679,
- 279,
- -407,
+ -26,
493,
- -82,
- 365,
+ -356,
+ 1736,
+ 2156,
+ 2468,
+ 1311,
+ -224,
+ 1719,
+ 1640,
+ }
+ UW4Keys {
+ "…",
+ "、",
+ "。",
+ "「",
+ "」",
+ "』",
+ "い",
+ "う",
+ "え",
+ "お",
+ "か",
+ "が",
+ "き",
+ "く",
+ "け",
+ "こ",
+ "さ",
+ "し",
+ "じ",
+ "す",
+ "ず",
+ "せ",
+ "そ",
+ "た",
+ "だ",
+ "ち",
+ "っ",
+ "つ",
+ "て",
+ "で",
+ "と",
+ "ど",
+ "な",
+ "に",
+ "の",
+ "は",
+ "ば",
+ "ひ",
+ "べ",
+ "ま",
+ "み",
+ "め",
+ "も",
+ "ゃ",
+ "や",
+ "よ",
+ "ら",
+ "り",
+ "る",
+ "れ",
+ "ろ",
+ "わ",
+ "を",
+ "ん",
+ "ッ",
+ "ル",
+ "ン",
+ "・",
+ "ー",
+ "一",
+ "京",
+ "人",
+ "今",
+ "何",
+ "使",
+ "大",
+ "帯",
+ "思",
+ "携",
+ "時",
+ "最",
+ "気",
+ "私",
+ "笑",
+ "自",
+ "行",
+ "見",
+ "食",
+ "!",
+ "(",
+ ")",
+ ",",
+ ".",
+ "1",
+ "2",
+ "?",
+ "]",
+ }
+ UW4Values:intvector {
+ -995,
+ -4812,
+ -4206,
+ 1314,
+ -1957,
+ -296,
+ -886,
+ -1160,
+ -711,
+ 539,
+ -1628,
+ -2180,
+ -1275,
+ -1529,
+ -1615,
+ 258,
+ -748,
+ -1073,
+ -793,
+ -412,
+ -321,
+ -234,
+ 191,
+ -1056,
+ -1461,
+ -355,
+ -2310,
+ -400,
+ -2403,
+ -1808,
+ -1442,
-334,
- 36,
- 284,
- -813,
- 424,
- -425,
- 423,
- -796,
- 452,
+ -1091,
+ -2678,
+ -2581,
+ -2164,
-635,
- -389,
- 404,
- -141,
- 415,
- -277,
- -400,
- 502,
- 766,
- -182,
- -426,
- 720,
- 1005,
- 422,
- -396,
- 123,
- -533,
- -91,
- -355,
- 333,
+ 122,
-596,
- -333,
- 434,
- 31,
- 567,
- -356,
- -309,
- 251,
- 365,
+ -774,
+ -536,
-399,
- 411,
- -235,
- -526,
- 468,
- 438,
- 136,
- 103,
- 74,
- 585,
- 324,
- -115,
+ -1580,
+ -105,
-219,
- -217,
+ -450,
+ -2999,
+ -2405,
+ -2760,
+ -2105,
+ -1024,
+ -425,
+ -2408,
+ -2158,
+ -149,
+ -401,
+ -694,
+ -490,
+ -1750,
+ 488,
+ 510,
+ 522,
+ 549,
+ 474,
+ 544,
+ 396,
-289,
- -88,
- 143,
- 361,
- -558,
+ 955,
+ 123,
+ 305,
+ 423,
+ 329,
+ 1247,
-614,
- -56,
- 456,
- 441,
+ 438,
+ 276,
+ 433,
+ 175,
+ -1685,
+ 748,
-566,
- 102,
- 112,
- -466,
- 325,
- -27,
- 128,
- 294,
- -321,
- -224,
- -206,
- 252,
- 209,
- -207,
- -224,
- -207,
- 109,
- 316,
- -234,
- 222,
- 95,
- 192,
- -40,
- -98,
- 82,
- 68,
- 230,
- -28,
- -67,
- -149,
- 14,
- -120,
- 95,
- 122,
- -81,
- -67,
- -296,
- 122,
- -81,
- 134,
- -200,
- -67,
- 14,
- 67,
- 119,
- 40,
- 118,
- -92,
- 91,
- -105,
- 53,
- 40,
- -51,
- 39,
- -64,
- 105,
- 13,
- 39,
- 26,
- -52,
- -52,
+ -1700,
+ -813,
+ 493,
+ 402,
+ -796,
-52,
+ }
+ UW5Keys {
+ "、",
+ "。",
+ "」",
+ "あ",
+ "い",
+ "う",
+ "え",
+ "か",
+ "が",
+ "き",
+ "く",
+ "し",
+ "じ",
+ "す",
+ "ず",
+ "そ",
+ "た",
+ "ち",
+ "っ",
+ "つ",
+ "て",
+ "で",
+ "と",
+ "な",
+ "に",
+ "の",
+ "は",
+ "べ",
+ "め",
+ "も",
+ "る",
+ "れ",
+ "ろ",
+ "わ",
+ "を",
+ "ん",
+ "・",
+ "ー",
+ "分",
+ "帯",
+ "思",
+ "日",
+ "!",
+ }
+ UW5Values:intvector {
+ -818,
+ -1455,
+ -217,
+ -138,
+ 162,
+ 161,
+ 411,
+ 186,
+ -366,
+ 801,
+ 348,
+ -209,
+ 432,
+ -557,
26,
+ -115,
-26,
+ 128,
+ 543,
+ 542,
+ 103,
+ -706,
+ -476,
+ -388,
+ -413,
+ -262,
+ -378,
+ 787,
+ 263,
+ -141,
-39,
+ 279,
+ -81,
+ 462,
+ -333,
+ 520,
+ -851,
+ 87,
13,
- -13,
- 39,
- 26,
- 13,
- -39,
- -26,
- -26,
- -26,
- -13,
- -13,
- 39,
- 26,
- -13,
+ 404,
+ -98,
26,
- 13,
+ -67,
+ }
+ UW6Keys {
+ "。",
+ "い",
+ "う",
+ "が",
+ "ご",
+ "さ",
+ "し",
+ "た",
+ "て",
+ "で",
+ "と",
+ "な",
+ "に",
+ "の",
+ "は",
+ "ぱ",
+ "や",
+ "り",
+ "を",
+ "ル",
+ "ー",
+ "寺",
+ "0",
+ }
+ UW6Values:intvector {
+ -337,
+ -48,
+ -386,
+ 111,
+ 119,
+ 237,
+ 236,
+ -82,
+ 49,
+ 102,
+ -27,
+ 66,
+ 277,
+ 48,
+ 104,
+ 105,
+ 40,
+ 74,
+ 228,
+ 14,
+ -88,
+ 68,
+ 82,
}
-}
\ No newline at end of file
+}
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.previous32;
-import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUData;
-import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.UResourceBundleIterator;
-import java.lang.System;
import java.text.CharacterIterator;
+import java.util.Arrays;
import java.util.ArrayList;
+import java.util.List;
import java.util.HashMap;
-public class MlBreakEngine {
+enum ModelIndex {  // Presumably the first fModel position of the UW, BW and TW groups (see the MAX_FEATURE layout) - confirm against the lookups.
+ kUWStart(0), kBWStart(6), kTWStart(9);
+ private final int value;
+
+ private ModelIndex(int value) {
+ this.value = value;
+ }
+
+ public int getValue() {  // Returns the integer passed to the constant's constructor.
+ return value;
+ }
+}
- private static final int INVALID = '|';
- private static final String INVALID_STRING = "|";
+public class MlBreakEngine {
+ // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13
private static final int MAX_FEATURE = 13;
private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
private UnicodeSet fClosePunctuationSet;
- private HashMap<String, Integer> fModel;
+ private List<HashMap<String, Integer>> fModel;
private int fNegativeSum;
/**
UnicodeSet closePunctuationSet) {
fDigitOrOpenPunctuationOrAlphabetSet = digitOrOpenPunctuationOrAlphabetSet;
fClosePunctuationSet = closePunctuationSet;
- fModel = new HashMap<String, Integer>();
+ fModel = new ArrayList<HashMap<String, Integer>>(MAX_FEATURE);
+ for (int i = 0; i < MAX_FEATURE; i++) {
+ fModel.add(new HashMap<String, Integer>());
+ }
fNegativeSum = 0;
loadMLModel();
}
/**
* Divide up a range of characters handled by this break engine.
*
- * @param inText A input text.
- * @param startPos The start index of the input text.
- * @param endPos The end index of the input text.
- * @param inString A input string normalized from inText from startPos to endPos
- * @param numCodePts The number of code points of inString
- * @param charPositions A map that transforms inString's code point index to code unit index.
- * @param foundBreaks A list to store the breakpoint.
+ * @param inText An input text.
+ * @param startPos The start index of the input text.
+ * @param endPos The end index of the input text.
+ * @param inString An input string normalized from inText from startPos to endPos
+ * @param codePointLength The number of code points of inString
+ * @param charPositions A map that transforms inString's code point index to code unit index.
+ * @param foundBreaks A list to store the breakpoint.
* @return The number of breakpoints
*/
public int divideUpRange(CharacterIterator inText, int startPos, int endPos,
- CharacterIterator inString, int numCodePts, int[] charPositions,
+ CharacterIterator inString, int codePointLength, int[] charPositions,
DictionaryBreakEngine.DequeI foundBreaks) {
if (startPos >= endPos) {
return 0;
}
- ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
- // The ML model groups six char to evaluate if the 4th char is a breakpoint.
- // Like a sliding window, the elementList removes the first char and appends the new char
- // from inString in each iteration so that its size always remains at six.
- int elementList[] = new int[6];
- initElementList(inString, elementList, numCodePts);
+ ArrayList<Integer> boundary = new ArrayList<Integer>(codePointLength);
+ String inputStr = transform(inString);
+ // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint.
+ // In each iteration, it evaluates the 4th char and then moves forward one char like
+ // a sliding window. Initially, the first six values in the indexList are
+ // [-1, -1, 0, 1, 2, 3]. After moving forward, finally the last six values in the indexList
+ // are [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra
+ // "-1".
+ int indexSize = codePointLength + 4;
+ int indexList[] = new int[indexSize];
+ int numCodeUnits = initIndexList(inString, indexList, codePointLength);
// Add a break for the start.
boundary.add(0, 0);
- for (int i = 1; i < numCodePts; i++) {
- evaluateBreakpoint(elementList, i, boundary);
- if (i + 1 > numCodePts) {
- break;
+
+ for (int idx = 0; idx + 1 < codePointLength; idx++) {
+ evaluateBreakpoint(inputStr, indexList, idx, numCodeUnits, boundary);
+ if (idx + 4 < codePointLength) {  // One more code point remains: record its start offset for the next window.
+ indexList[idx + 6] = numCodeUnits;
+ numCodeUnits += Character.charCount(next32(inString));
}
- shiftLeftOne(elementList);
- elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID;
}
// Add a break for the end if there is not one there already.
- if (boundary.get(boundary.size() - 1) != numCodePts) {
- boundary.add(numCodePts);
+ if (boundary.get(boundary.size() - 1) != codePointLength) {
+ boundary.add(codePointLength);
}
int correctedNumBreaks = 0;
return correctedNumBreaks;
}
- private void shiftLeftOne(int[] elementList) {
- int length = elementList.length;
- for (int i = 1; i < length; i++) {
- elementList[i - 1] = elementList[i];
+ /**
+ * Transform a CharacterIterator into a String.
+ */
+ private String transform(CharacterIterator inString) {
+ StringBuilder sb = new StringBuilder();
+ inString.setIndex(0);
+ for (char c = inString.first(); c != CharacterIterator.DONE; c = inString.next()) {
+ sb.append(c);
}
+ return sb.toString();
}
/**
- * Evaluate whether the index is a potential breakpoint.
+ * Evaluate whether the position startIdx + 1 is a potential breakpoint.
*
- * @param elementList A list including six elements for the breakpoint evaluation.
- * @param index The breakpoint index to be evaluated.
- * @param boundary An list including the index of the breakpoint.
+ * @param inputStr An input string to be segmented.
+ * @param indexList A code unit index list of the inputStr.
+ * @param startIdx The index in the indexList at which the six-entry window starts.
+ * @param numCodeUnits The current code unit boundary of the indexList.
+ * @param boundary A list including the index of the breakpoint.
*/
- private void evaluateBreakpoint(int[] elementList, int index, ArrayList<Integer> boundary) {
- String[] featureList = new String[MAX_FEATURE];
- final int w1 = elementList[0];
- final int w2 = elementList[1];
- final int w3 = elementList[2];
- final int w4 = elementList[3];
- final int w5 = elementList[4];
- final int w6 = elementList[5];
+ private void evaluateBreakpoint(String inputStr, int[] indexList, int startIdx,
+ int numCodeUnits, ArrayList<Integer> boundary) {
+ int start = 0, end = 0;
+ int score = fNegativeSum;
- StringBuilder sb = new StringBuilder();
- int idx = 0;
- if (w1 != INVALID) {
- featureList[idx++] = sb.append("UW1:").appendCodePoint(w1).toString();
- }
- if (w2 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UW2:").appendCodePoint(w2).toString();
- }
- if (w3 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UW3:").appendCodePoint(w3).toString();
- }
- if (w4 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UW4:").appendCodePoint(w4).toString();
- }
- if (w5 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UW5:").appendCodePoint(w5).toString();
- }
- if (w6 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UW6:").appendCodePoint(w6).toString();
- }
- if (w2 != INVALID && w3 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("BW1:").appendCodePoint(w2).appendCodePoint(
- w3).toString();
- }
- if (w3 != INVALID && w4 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("BW2:").appendCodePoint(w3).appendCodePoint(
- w4).toString();
- }
- if (w4 != INVALID && w5 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("BW3:").appendCodePoint(w4).appendCodePoint(
- w5).toString();
- }
- if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TW1:").appendCodePoint(w1).appendCodePoint(
- w2).appendCodePoint(w3).toString();
- }
- if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TW2:").appendCodePoint(w2).appendCodePoint(
- w3).appendCodePoint(w4).toString();
- }
- if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TW3:").appendCodePoint(w3).appendCodePoint(
- w4).appendCodePoint(w5).toString();
+ for (int i = 0; i < 6; i++) {
+ // UW1 ~ UW6
+ start = startIdx + i;
+ if (indexList[start] != -1) {
+ end = (indexList[start + 1] != -1) ? indexList[start + 1] : numCodeUnits;
+ score += fModel.get(ModelIndex.kUWStart.getValue() + i).getOrDefault(
+ inputStr.substring(indexList[start], end), 0);
+ }
}
- if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint(
- w5).appendCodePoint(w6).toString();
+ for (int i = 0; i < 3; i++) {
+ // BW1 ~ BW3
+ start = startIdx + i + 1;
+ if (indexList[start] != -1 && indexList[start + 1] != -1) {
+ end = (indexList[start + 2] != -1) ? indexList[start + 2] : numCodeUnits;
+ score += fModel.get(ModelIndex.kBWStart.getValue() + i).getOrDefault(
+ inputStr.substring(indexList[start], end), 0);
+ }
}
-
- int score = fNegativeSum;
- for (int j = 0; j < idx; j++) {
- if (fModel.containsKey(featureList[j])) {
- score += (2 * fModel.get(featureList[j]));
+ for (int i = 0; i < 4; i++) {
+ // TW1 ~ TW4
+ start = startIdx + i;
+ if (indexList[start] != -1
+ && indexList[start + 1] != -1
+ && indexList[start + 2] != -1) {
+ end = (indexList[start + 3] != -1) ? indexList[start + 3] : numCodeUnits;
+ score += fModel.get(ModelIndex.kTWStart.getValue() + i).getOrDefault(
+ inputStr.substring(indexList[start], end), 0);
}
}
if (score > 0) {
- boundary.add(index);
+ boundary.add(startIdx + 1);
}
}
/**
- * Initialize the element list from the input string.
+ * Initialize the index list from the input string.
*
- * @param inString A input string to be segmented.
- * @param elementList A list to store the first six characters.
- * @param numCodePts The number of code points of input string
+ * @param inString An input string to be segmented.
+ * @param indexList A code unit index list of the inString.
+ * @param codePointLength The number of code points of the input string
* @return The number of the code units of the first six characters in inString.
*/
- private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) {
+ private int initIndexList(CharacterIterator inString, int[] indexList, int codePointLength) {
int index = 0;
inString.setIndex(index);
- int w1, w2, w3, w4, w5, w6;
- w1 = w2 = w3 = w4 = w5 = w6 = INVALID;
- if (numCodePts > 0) {
- w3 = current32(inString);
- index += Character.charCount(w3);
- if (numCodePts > 1) {
- w4 = next32(inString);
- index += Character.charCount(w3);
- if (numCodePts > 2) {
- w5 = next32(inString);
- index += Character.charCount(w5);
- if (numCodePts > 3) {
- w6 = next32(inString);
- index += Character.charCount(w6);
+ Arrays.fill(indexList, -1);
+ if (codePointLength > 0) {
+ indexList[2] = 0;
+ index += Character.charCount(current32(inString));
+ if (codePointLength > 1) {
+ indexList[3] = index;
+ index += Character.charCount(next32(inString));
+ if (codePointLength > 2) {
+ indexList[4] = index;
+ index += Character.charCount(next32(inString));
+ if (codePointLength > 3) {
+ indexList[5] = index;
+ index += Character.charCount(next32(inString));
}
}
}
}
- elementList[0] = w1;
- elementList[1] = w2;
- elementList[2] = w3;
- elementList[3] = w4;
- elementList[4] = w5;
- elementList[5] = w6;
-
return index;
}
int index = 0;
UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME,
"jaml");
- UResourceBundle keyBundle = rb.get("modelKeys");
- UResourceBundle valueBundle = rb.get("modelValues");
+ initKeyValue(rb, "UW1Keys", "UW1Values", fModel.get(index++));
+ initKeyValue(rb, "UW2Keys", "UW2Values", fModel.get(index++));
+ initKeyValue(rb, "UW3Keys", "UW3Values", fModel.get(index++));
+ initKeyValue(rb, "UW4Keys", "UW4Values", fModel.get(index++));
+ initKeyValue(rb, "UW5Keys", "UW5Values", fModel.get(index++));
+ initKeyValue(rb, "UW6Keys", "UW6Values", fModel.get(index++));
+ initKeyValue(rb, "BW1Keys", "BW1Values", fModel.get(index++));
+ initKeyValue(rb, "BW2Keys", "BW2Values", fModel.get(index++));
+ initKeyValue(rb, "BW3Keys", "BW3Values", fModel.get(index++));
+ initKeyValue(rb, "TW1Keys", "TW1Values", fModel.get(index++));
+ initKeyValue(rb, "TW2Keys", "TW2Values", fModel.get(index++));
+ initKeyValue(rb, "TW3Keys", "TW3Values", fModel.get(index++));
+ initKeyValue(rb, "TW4Keys", "TW4Values", fModel.get(index++));
+ fNegativeSum /= 2;
+ }
+
+ /**
+ * Loads the features and their scores stored under the given key name and value name in
+ * the machine learning model file.
+ *
+ * @param rb A UResourceBundle corresponding to the model file.
+ * @param keyName The key name in the model file.
+ * @param valueName The value name in the model file.
+ * @param map A HashMap to store the pairs of the feature and its score.
+ */
+ private void initKeyValue(UResourceBundle rb, String keyName, String valueName,
+ HashMap<String, Integer> map) {
+ int idx = 0;
+ UResourceBundle keyBundle = rb.get(keyName);
+ UResourceBundle valueBundle = rb.get(valueName);
int[] value = valueBundle.getIntVector();
UResourceBundleIterator iterator = keyBundle.getIterator();
while (iterator.hasNext()) {
- fNegativeSum -= value[index];
- fModel.put(iterator.nextString(), value[index++]);
+ fNegativeSum -= value[idx];
+ map.put(iterator.nextString(), value[idx++]);
}
}
}