U_NAMESPACE_BEGIN
-Element::Element() : length(0) {}
-
-void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
- character = ch;
- U_ASSERT(idx.length() <= 3);
- length = idx.length();
- idx.extract(0, length, ublock);
- ublock[length] = '\0';
-}
-
-UChar32 Element::getCharacter() const {
- return character;
-}
-
-char16_t* Element::getUblock() const {
- return (char16_t*)ublock;
-}
-
-uint16_t Element::getLength() const {
- return length;
-}
-
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
const UnicodeSet &closePunctuationSet, UErrorCode &status)
: fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
namespace {
const char16_t INVALID = u'|';
- const int32_t MAX_FEATURE = 26;
- const int32_t MAX_FEATURE_LENGTH = 14;
-
- bool isValid(const Element& element) {
- return element.getLength() != 1 || element.getUblock()[0] != INVALID;
- }
+ const int32_t MAX_FEATURE = 13;
+ const int32_t MAX_FEATURE_LENGTH = 11;
void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
if (U_FAILURE(status)) {
U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
result.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
}
-
- void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
- U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
- str.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
- }
}
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
return 0;
}
int32_t numBreaks = 0;
- UChar32 ch;
UnicodeString index;
// The ML model groups six characters to evaluate whether the 4th character is a breakpoint.
// Like a sliding window, the elementList drops its first character and appends the next
// character from inString in each iteration, so that its size always remains six.
- Element elementList[6];
+ UChar32 elementList[6];
int32_t codeUts = initElementList(inString, elementList, status);
int32_t length = inString.countChar32();
evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
if (i + 1 >= inString.countChar32()) break;
// Remove the first element and append a new element
- uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
- ch = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
- index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
- elementList[5].setCharAndUblock(ch, index);
- if (ch != INVALID) {
- codeUts += U16_LENGTH(ch);
+ uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32));
+ elementList[5] = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
+ if (elementList[5] != INVALID) {
+ codeUts += U16_LENGTH(elementList[5]);
}
}
if (U_FAILURE(status)) return 0;
return correctedNumBreaks;
}
-void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
UVector32 &boundary, UErrorCode &status) const {
char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
if (U_FAILURE(status)) {
UChar32 arr[4] = {-1, -1, -1, -1};
int32_t length = 0, listLength = 0;
- const UChar32 w1 = elementList[0].getCharacter();
- const UChar32 w2 = elementList[1].getCharacter();
- const UChar32 w3 = elementList[2].getCharacter();
- const UChar32 w4 = elementList[3].getCharacter();
- const UChar32 w5 = elementList[4].getCharacter();
- const UChar32 w6 = elementList[5].getCharacter();
+ const UChar32 w1 = elementList[0];
+ const UChar32 w2 = elementList[1];
+ const UChar32 w3 = elementList[2];
+ const UChar32 w4 = elementList[3];
+ const UChar32 w5 = elementList[4];
+ const UChar32 w6 = elementList[5];
length = 1;
if (w1 != INVALID) {
arr[2] = w6;
concatChar(u"TW4:", arr, length, featureList[listLength++], status);
}
- if (isValid(elementList[0])) {
- writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
- elementList[0].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[1])) {
- writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
- elementList[1].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[2])) {
- writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
- elementList[2].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[3])) {
- writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
- elementList[3].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[4])) {
- writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
- elementList[4].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[5])) {
- writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
- elementList[5].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[1]) && isValid(elementList[2])) {
- writeString(UnicodeString(u"BB1:")
- .append(elementList[1].getUblock(), 0, elementList[1].getLength())
- .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[2]) && isValid(elementList[3])) {
- writeString(UnicodeString(u"BB2:")
- .append(elementList[2].getUblock(), 0, elementList[2].getLength())
- .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[3]) && isValid(elementList[4])) {
- writeString(UnicodeString(u"BB3:")
- .append(elementList[3].getUblock(), 0, elementList[3].getLength())
- .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
- writeString(UnicodeString(u"TB1:")
- .append(elementList[0].getUblock(), 0, elementList[0].getLength())
- .append(elementList[1].getUblock(), 0, elementList[1].getLength())
- .append(elementList[2].getUblock(), 0, elementList[2].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
- writeString(UnicodeString(u"TB2:")
- .append(elementList[1].getUblock(), 0, elementList[1].getLength())
- .append(elementList[2].getUblock(), 0, elementList[2].getLength())
- .append(elementList[3].getUblock(), 0, elementList[3].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
- writeString(UnicodeString(u"TB3:")
- .append(elementList[2].getUblock(), 0, elementList[2].getLength())
- .append(elementList[3].getUblock(), 0, elementList[3].getLength())
- .append(elementList[4].getUblock(), 0, elementList[4].getLength()),
- featureList[listLength++], status);
- }
- if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
- writeString(UnicodeString(u"TB4:")
- .append(elementList[3].getUblock(), 0, elementList[3].getLength())
- .append(elementList[4].getUblock(), 0, elementList[4].getLength())
- .append(elementList[5].getUblock(), 0, elementList[5].getLength()),
- featureList[listLength++], status);
- }
if (U_FAILURE(status)) {
return;
}
}
}
-int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
+int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList,
UErrorCode &status) const {
if (U_FAILURE(status)) {
return 0;
if (length > 0) {
w3 = inString.char32At(0);
index += U16_LENGTH(w3);
+ if (length > 1) {
+ w4 = inString.char32At(index);
+ index += U16_LENGTH(w4);
+ if (length > 2) {
+ w5 = inString.char32At(index);
+ index += U16_LENGTH(w5);
+ if (length > 3) {
+ w6 = inString.char32At(index);
+ index += U16_LENGTH(w6);
+ }
+ }
+ }
}
- if (length > 1) {
- w4 = inString.char32At(index);
- index += U16_LENGTH(w4);
- }
- if (length > 2) {
- w5 = inString.char32At(index);
- index += U16_LENGTH(w5);
- }
- if (length > 3) {
- w6 = inString.char32At(index);
- index += U16_LENGTH(w6);
- }
-
- const UnicodeString b1(INVALID);
- const UnicodeString b2(b1);
- const UnicodeString b3(getUnicodeBlock(w3, status));
- const UnicodeString b4(getUnicodeBlock(w4, status));
- const UnicodeString b5(getUnicodeBlock(w5, status));
- const UnicodeString b6(getUnicodeBlock(w6, status));
-
- elementList[0].setCharAndUblock(w1, b1);
- elementList[1].setCharAndUblock(w2, b2);
- elementList[2].setCharAndUblock(w3, b3);
- elementList[3].setCharAndUblock(w4, b4);
- elementList[4].setCharAndUblock(w5, b5);
- elementList[5].setCharAndUblock(w6, b6);
+ elementList[0] = w1;
+ elementList[1] = w2;
+ elementList[2] = w3;
+ elementList[3] = w4;
+ elementList[4] = w5;
+ elementList[5] = w6;
return index;
}
-UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
- if (U_FAILURE(status)) {
- return UnicodeString(INVALID);
- }
-
- UBlockCode block = ublock_getCode(ch);
- if (block == UBLOCK_NO_BLOCK || block == UBLOCK_INVALID_CODE) {
- return UnicodeString(INVALID);
- } else {
- UnicodeString empty;
- // Same as sprintf("%03d", block)
- return ICU_Utility::appendNumber(empty, (int32_t)block, 10, 3);
- }
-}
-
void MlBreakEngine::loadMLModel(UErrorCode &error) {
// BudouX's model consists of pairs of the feature and its score.
// As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
#if !UCONFIG_NO_BREAK_ITERATION
-/**
- * A class used to encapsulate a character and its unicode block index
- */
-class Element : public UMemory {
- public:
- /**
- * Default constructor.
- */
- Element();
-
- /**
- * Set the character and its unicode block.
- *
- * @param ch A unicode character.
- * @param ublock The unicode block of the character.
- */
- void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
-
- /**
- * Get the unicode character.
- *
- * @return The unicode character.
- */
- UChar32 getCharacter() const;
-
- /**
- * Get the unicode character's unicode block.
- *
- * @return The unicode block.
- */
- char16_t* getUblock() const;
-
- /**
- * Get the length of the unicode block.
- *
- * @return The unicode block length.
- */
- uint16_t getLength() const;
-
- private:
- UChar32 character;
- char16_t ublock[4];
- uint16_t length;
-};
-
/**
* A machine learning break engine for the phrase breaking in Japanese.
*/
*/
void loadMLModel(UErrorCode &error);
- /**
- * Get the character's unicode block code defined in UBlockCode.
- *
- * @param ch A character.
- * @param error Information on any errors encountered.
- * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
- * is less than 3 digits.
- *
- */
- UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
-
/**
* Initialize the element list from the input string.
*
* @param inString An input string to be segmented.
- * @param elementList A list to store the first six characters and their unicode block codes.
+ * @param elementList A list to store the first six characters.
* @param status Information on any errors encountered.
* @return The number of code units of the first six characters in inString.
*/
- int32_t initElementList(const UnicodeString &inString, Element* elementList,
+ int32_t initElementList(const UnicodeString &inString, UChar32* elementList,
UErrorCode &status) const;
/**
* Evaluate whether the index is a potential breakpoint.
*
- * @param elementList A list including 6 elements for the breakpoint evaluation.
+ * @param elementList A list including six elements for the breakpoint evaluation.
* @param index The breakpoint index to be evaluated.
* @param numBreaks The accumulated number of breakpoints.
* @param boundary A vector including the index of the breakpoint.
* @param status Information on any errors encountered.
*/
- void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
+ void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
UVector32 &boundary, UErrorCode &status) const;
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
// License & terms of use: http://www.unicode.org/copyright.html
jaml {
modelKeys {
- "BB2:062071",
- "UB3:061",
- "UB3:071",
- "TB2:062062062",
- "TB4:062062062",
- "UB3:063",
- "UB4:071",
- "BB3:062062",
- "UB4:062",
- "BB1:062071",
- "BB1:062061",
- "UB4:061",
- "TB1:071071062",
- "TB3:062063063",
- "UB2:061",
- "TB1:062071062",
- "TB3:062062062",
- "BB2:063063",
- "UW3:は",
- "UW3:に",
- "TB3:062071062",
- "UW3:が",
- "UW4:こ",
- "UB5:061",
- "UW3:と",
- "TB4:063063063",
- "UW4:て",
- "TB2:062062061",
+ "UW3:、",
"UW3:。",
- "UW4:お",
- "UW3:の",
- "BB3:071071",
- "BB3:062071",
- "UW3:お",
- "UW3:し",
- "UW4:、",
"UW4:の",
- "UW3:を",
+ "UW4:、",
+ "UW3:の",
"UW4:。",
- "UW3:、",
- "UW5:で",
- "UW4:あ",
- "BB2:062062",
+ "UW3:に",
+ "UW5:。",
+ "UW4:て",
+ "UW3:は",
+ "UW4:に",
+ "UW3:を",
+ "UW5:、",
+ "UW2:、",
+ "UW3:が",
+ "UW2:。",
+ "UW4:で",
+ "UW3:と",
+ "UW4:は",
+ "UW4:が",
+ "UW4:る",
"UW4:っ",
- "UW5:っ",
"UW3:も",
- "UW5:う",
- "UW3:「",
"UW5:な",
- "UW4:そ",
- "UW4:る",
- "UW3:っ",
- "UW4:「",
- "UW4:い",
- "BB2:087087",
- "UB4:087",
- "UW5:に",
- "BW3:もの",
- "UW5:し",
- "UW6:う",
- "BW2:とい",
- "UW4:に",
+ "UW3:で",
"UW3:る",
- "TB2:071062071",
- "UW4:で",
- "UW5:が",
- "BB1:071071",
- "UW5:は",
- "UW4:は",
- "UW4:れ",
- "UW5:き",
- "BB2:071062",
- "BB2:071071",
- "UW3:・",
- "BB2:071087",
- "BB2:061062",
- "TB1:062061062",
- "UW3:れ",
- "BB2:087062",
- "TB2:087087087",
+ "UW5:で",
+ "UW4:を",
+ "UW4:か",
+ "UW3:っ",
+ "UW2:の",
+ "UW4:と",
+ "UW5:っ",
+ "UW4:な",
+ "UW3:て",
+ "UW4:た",
+ "UW4:こ",
+ "UW6:に",
"UW4:ら",
- "TB1:071071071",
- "UB2:071",
- "TB1:062062087",
- "UW5:す",
- "UW5:ん",
- "UW3:で",
- "UW4:が",
- "UW3:こ",
- "TB4:071062062",
+ "UW3:た",
+ "UW2:を",
"UW3:ら",
- "UW6:に",
"UW6:。",
- "UW3:た",
- "TB1:061071071",
- "UW5:く",
- "UB1:063",
- "UW1:そ",
+ "UW4:し",
+ "UW3:な",
+ "UW2:に",
+ "UW4:い",
+ "UW4:り",
+ "UW6:う",
"UW3:う",
- "BW3:とい",
- "BW3:とこ",
- "UW3:ま",
- "BW3:こと",
- "UW2:っ",
- "UW5:・",
- "TB3:062062061",
- "UW3:き",
- "UW4:ん",
- "UB3:062",
"UW3:く",
- "UW3:」",
- "UW5:あ",
- "BB2:062087",
- "BW3:いう",
- "UW5:れ",
- "UW2:一",
- "UW3:,",
- "UW1:に",
- "UW2:と",
- "TB2:071071062",
- "TB2:071071071",
- "UW5:を",
- "UW4:り",
- "BW1:から",
- "UW3:ち",
- "BW3:いい",
+ "UW4:れ",
"UW2:は",
- "UW6:た",
- "TB1:063063062",
- "UW4:1",
- "UW4:や",
- "UW2:ん",
- "UW3:]",
- "UW4:ほ",
- "TB3:062087087",
- "BW2:であ",
"UW4:だ",
- "BB3:071062",
- "TB1:087087087",
- "BW3:・・",
- "BW3:とき",
- "UW4:を",
- "UW3:て",
- "UW4:か",
- "UW2:そ",
- "TB4:071071062",
- "TB2:062061071",
- "UW2:を",
- "UW4:ご",
- "UW2:で",
- "TB3:071071071",
- "BB1:087087",
+ "UW4:う",
+ "UW3:い",
+ "UW6:い",
+ "UW4:ん",
+ "UW2:か",
+ "UW4:ー",
+ "UW6:を",
+ "UW2:も",
+ "UW5:き",
+ "UW3:り",
+ "UW6:で",
+ "UW2:る",
+ "UW2:と",
+ "UW3:]",
+ "UW4:そ",
+ "UW3:,",
+ "UW4:も",
+ "UW4:く",
+ "UW3:か",
+ "BW2:とい",
+ "UW4:お",
+ "UW4:ま",
+ "UW6:が",
+ "UW4:き",
"UW2:し",
- "UW4:出",
+ "UW2:て",
+ "UW3:!",
"UW2:ま",
- "UW4:,",
+ "UW5:に",
+ "UW3:や",
+ "UW6:て",
+ "BW3:もの",
+ "UW6:の",
+ "UW2:ん",
+ "UW2:が",
+ "UW5:が",
+ "BW1:いう",
+ "UW2:で",
+ "UW5:す",
+ "UW3:?",
"UW5:と",
- "UW4:ど",
- "BW3:して",
- "UW1:で",
- "BB2:061071",
- "BW3:ため",
- "BW2:とし",
- "BW2:ない",
- "BW2:てい",
- "UW3:間",
- "UW3:!",
- "UW5:ー",
- "UW4:す",
- "UW4:!",
- "BW1:とが",
- "UW5:の",
- "TB4:062062071",
- "TB2:061071071",
- "UW6:・",
+ "UW6:は",
"UW3:.",
- "UW2:て",
- "UW3:笑",
- "UW2:こ",
- "UW5:も",
+ "UW4:「",
+ "UW3:ば",
+ "UW5:ん",
+ "BW3:いう",
+ "UW4:す",
+ "BW1:から",
+ "UW3:ど",
+ "UW5:し",
+ "UW2:っ",
+ "UW4:思",
+ "UW3:…",
+ "UW5:る",
+ "BW2:てい",
"BW3:よう",
- "UW3:人",
- "UW2:の",
- "UW3:か",
- "UW3:日",
- "UW1:い",
- "BW2:とこ",
+ "UW5:え",
"UW4:私",
- "UW3:…",
- "UW2:に",
- "UW3:今",
- "BB3:087062",
- "UB3:055",
- "UW4:(",
- "BB1:087071",
- "UW1:な",
- "BB3:063063",
- "UW5:来",
- "UW3:?",
- "TW3:ている",
- "UW4:」",
- "UW4:前",
- "BW1:いう",
- "UW4:つ",
+ "UW3:・",
+ "UW4:人",
+ "UW5:く",
"UW3:)",
- "BW1:では",
- "UW2:る",
- "UW5:そ",
- "UW4:ー",
- "TW2:気に入",
- "UW4:笑",
- "UW4:ひ",
- "TB4:087087087",
+ "UW4:京",
+ "BW2:ない",
+ "UW3:ー",
+ "BW3:とこ",
+ "UW5:は",
+ "UW4:」",
+ "UW2:一",
+ "UW4:よ",
+ "BW3:こと",
+ "UW5:ー",
+ "UW6:し",
"UW4:け",
- "UW2:も",
- "BW3:ちょ",
- "BW3:出来",
- "TB2:062071062",
- "UW4:『",
- "UW3:[",
- "UW4:2",
- "UW5:つ",
- "TB1:061071062",
- "UW3:1",
- "BW3:から",
- "UB5:071",
- "UW4:ま",
- "UW3:ば",
- "UW3:り",
- "BW3:その",
- "UW3:ご",
- "UW4:わ",
- "BW2:てお",
- "TB2:071062062",
"BW1:ない",
+ "BW2:です",
+ "UW4:一",
+ "UW5:帯",
+ "UW5:を",
+ "UW6:な",
+ "UW5:べ",
+ "BW3:いい",
+ "BW2:であ",
+ "BW2:ので",
+ "UW4:,",
+ "UW5:れ",
+ "UW5:ろ",
+ "UW1:そ",
+ "UW5:い",
+ "UW1:い",
+ "UW5:・",
+ "UW5:わ",
+ "UW4:1",
+ "UW5:う",
+ "UW4:大",
+ "UW3:ま",
+ "BW2:とこ",
+ "UW4:!",
+ "UW4:見",
+ "UW4:行",
+ "BW1:こと",
+ "UW1:な",
+ "UW2:さ",
+ "UW3:☆",
+ "UW4:さ",
"UW2:よ",
- "UB2:087",
- "UW6:の",
+ "BW1:とか",
+ "UW4:(",
+ "BW3:でも",
+ "UW5:の",
+ "UW4:・",
+ "UW5:た",
+ "UW1:す",
+ "UW5:か",
+ "UW4:使",
+ "UW3:♪",
+ "UW4:え",
+ "UW4:今",
+ "BW2:、と",
+ "BW3:とき",
+ "UW4:ろ",
+ "UW5:つ",
+ "UW1:に",
+ "UW5:じ",
+ "UW1:で",
+ "UW4:ン",
+ "UW3:ず",
+ "BW3:して",
+ "UW4:食",
+ "UW4:気",
+ "UW4:時",
+ "UW3:日",
+ "BW1:しい",
+ "UW4:自",
+ "UW3:笑",
"UW2:毎",
+ "TW1:という",
+ "UW4:み",
+ "UW4:…",
+ "TW2:ではな",
+ "UW6:さ",
+ "UW5:め",
+ "UW2:少",
+ "UW5:あ",
+ "UW4:2",
+ "UW3:へ",
+ "TW3:という",
+ "UW4:何",
+ "UW2:く",
"UW2:結",
- "TW4:の京都",
- "UW3:さ",
- "UW2:最",
- "BW2:です",
- "UW2:」",
- "UW5:え",
- "UW3:だ",
- "TW4:ところ",
- "UW4:.",
- "UB1:062",
- "UW6:て",
+ "BW1:うな",
+ "BW1:もう",
"UW1:が",
- "BW2:、と",
- "UW3:0",
+ "UW4:じ",
+ "UW2:う",
+ "UW4:ル",
+ "UW3:」",
+ "BW1:とが",
+ "UW2:最",
+ "BW1:るの",
+ "UW3:間",
+ "UW6:た",
+ "UW3:つ",
+ "UW4:ど",
+ "UW1:と",
"UW3:ん",
- "UW3:中",
- "UW4:よ",
- "BW3:この",
- "UW2:が",
- "UW3:み",
- "TW2:ではな",
- "UW6:と",
- "UW4:[",
- "TW3:、ある",
- "BW3:ころ",
- "UW4:?",
- "UW6:、",
- "UW4:電",
- "BB1:062040",
- "UW3:後",
- "UW5:い",
- "UW2:、",
- "UW5:て",
- "BB2:062040",
- "UW3:真",
- "UW3:そ",
- "UW5:さ",
- "UB5:087",
- "TW3:という",
- "UW3:分",
- "UB6:071",
- "BW3:なっ",
- "UW4:ろ",
- "BB2:061061",
- "TW3:ところ",
- "UB1:071",
- "UW1:、",
- "BW1:とか",
- "UW3:な",
- "UW6:り",
- "UW4:間",
- "UW3:べ",
- "UW5:べ",
- "TB4:062071062",
- "UW4:]",
+ "UW4:.",
+ "UW3:だ",
+ "UW4:わ",
+ "UW4:最",
+ "UW4:?",
+ "UW3:ろ",
+ "UW4:ば",
+ "TW3:ている",
+ "BW3:この",
+ "UW5:も",
+ "UW3:人",
+ "BW3:とい",
+ "UW4:つ",
+ "BW3:その",
+ "BW3:もう",
+ "UW2:そ",
"BW2:には",
- "UW5:々",
- "BW1:。・",
- "BW1:その",
- "UW1:す",
- "UW4:)",
- "UW6:っ",
- "TB3:063063063",
- "TB3:062071071",
- "UB5:063",
+ "BW3:かけ",
+ "TW4:の京都",
+ "TW4:ところ",
+ "UW3:京",
+ "UW4:携",
"BW1:かも",
- "UW6:る",
- "TB4:062063063",
- "UW3:ど",
- "TW3:である",
- "TW4:くらい",
+ "BW1:では",
+ "UW4:ち",
+ "UW3:分",
+ "UW4:べ",
+ "BW3:ころ",
+ "UW3:ゃ",
+ "UW2:す",
+ "BW1:。・",
+ "UW3:電",
+ "BW3:なっ",
+ "UW3:す",
"BW1:最近",
- "BW1:しい",
- "BW1:とも",
- "BW2:と同",
- "TW1:という",
- "UW2:さ",
- "BW2:帯電",
- "TB1:071062062",
+ "UW4:め",
+ "UW3:ぐ",
+ "UW2:お",
"BW3:そし",
- "UW2:。",
- "UW5:か",
- "UW5:こ",
- "BW3:ない",
+ "BW1:かし",
+ "BW1:同じ",
+ "BW3:メー",
+ "UW5:て",
+ "UW6:り",
+ "TW4:くらい",
+ "UW3:今",
+ "UW5:そ",
+ "UW4:や",
+ "UW5:」",
+ "UW4:帯",
+ "UW6:ー",
+ "BW2:とし",
+ "TW1:ような",
+ "BW2:てお",
+ "UW4:笑",
+ "UW1:は",
+ "BW3:かか",
+ "TW4:かなり",
+ "UW4:)",
"BW1:んな",
- "BW2:でき",
- "UW4:3",
- "UW3:け",
- "TW4:ことが",
- "BW1:こと",
- "UB3:087",
- "UW3:電",
- "UW3:よ",
- "BW1:たと",
- "UW5:ま",
- "UW5:た",
+ "UW1:ち",
+ "TW2:気に入",
+ "TW1:・・・",
+ "UW6:と",
"UW5:ち",
- "UW2:け",
- "UW5:だ",
+ "BW3:ため",
+ "UW4:ず",
+ "UW3:0",
+ "BW1:んで",
+ "UW3:中",
+ "UW3:々",
+ "BW2:のよ",
+ "BW2:帯電",
+ "BW2:でも",
+ "BW1:には",
+ "BW3:ちょ",
+ "UW4:せ",
"UW3:度",
- "BW1:たい",
- "UW4:使",
- "UW2:き",
- "TW4:かなり",
- "UB6:063",
- "BB1:062062",
- "UW4:込",
- "TW3:と言っ",
- "UW6:だ",
- "UW5:り",
- "UW5:よ",
- "BW3:どう",
- "UW4:…",
- "UW3:や",
- "BW1:かし",
- "BW3:かっ",
- "UW4:今",
- "UW3:『",
- "UW4:思",
- "UB2:063",
- "UW4:く",
- "UW3:京",
- "UW6:ー",
- "UW1:ん",
- "BW1:うな",
- "TB2:062061061",
- "UW1:と",
- "TB4:062063062",
- "TB2:061062062",
- "BW1:この",
- "BW2:ので",
- "UW4:み",
- "UW5:わ",
- "UW6:や",
- "BW1:れて",
- "UW2:や",
- "UW6:こ",
- "UW4:な",
- "UW5:め",
- "BW1:もう",
- "TB4:071062071",
- "BW1:より",
- "UW4:合",
- "UW6:け",
- "BW1:少し",
- "BW2:でし",
- "UW4:と",
- "TB1:063063063",
- "UW3:ー",
- "BW2:くな",
- "UW2:く",
- "UW2:我",
- "BW2:いも",
- "BW3:わか",
- "TB2:071063071",
- "UW4:も",
- "UW1:あ",
- "UW4:最",
- "BW1:るの",
- "UW2:全",
+ "BW1:でも",
+ "BW1:が、",
+ "UW2:な",
+ "UW5:思",
"UW6:0",
- "UW4:放",
- "UW4:京",
- "BW3:かけ",
- "UW2:少",
- "BW3:もう",
- "UW2:多",
- "UW2:う",
- "TB1:062062040",
- "UW1:を",
- "UW3:光",
- "BW1:!!",
- "UW2:ャ",
- "BW3:すぐ",
- "UW4:帯",
- "UW6:し",
- "BW3:でも",
- "BW2:、そ",
- "TB3:071087087",
- "TB2:063062071",
- "UW3:わ",
- "UB4:063",
- "TB4:071071071",
- "UW5:都",
- "UW5:ず",
- "UW2:バ",
- "UW2:京",
- "UW3:ゃ",
- "BW1:い、",
- "BW3:よく",
- "BW1:たら",
- "BW2:のよ",
- "UW2:思",
- "BW1:うに",
- "BW1:の間",
- "UW6:ん",
- "UW6:ず",
- "BW1:った",
- "TW3:ること",
+ "UW6:寺",
"BW3:とて",
- "TW1:ような",
+ "BW3:ある",
+ "BW2:もし",
+ "UW4:ッ",
+ "UW1:て",
+ "BW2:にも",
+ "BW1:れた",
+ "UW4:ひ",
+ "TW3:ること",
+ "BW1:てい",
+ "UW4:』",
+ "BW1:だけ",
+ "UW3:お",
+ "BW1:少し",
+ "TW3:、ある",
+ "UW5:!",
+ "UW6:ル",
+ "UW2:多",
+ "UW6:ご",
+ "UW6:や",
+ "UW3:後",
+ "BW2:てみ",
+ "BW1:とき",
+ "UW4:ゃ",
+ "BW1:たい",
+ "UW3:き",
+ "TW4:ことが",
+ "UW3:真",
+ "BW2:など",
"UW6:ぱ",
- "TB3:063071062",
- "TW4:って、",
- "TW4:なんて",
- "TW2:その後",
- "UW6:ら",
- "TW4:ことに",
- "UW3:>",
- "TW3:てしま",
- "UW3:い",
- "TB4:071062061",
- "UW2:ひ",
- "UW6:め",
- "UW6:で",
+ "BW1:った",
+ "BW1:ても",
+ "UW5:日",
+ "BW1:たと",
+ "UW4:]",
+ "UW3:ッ",
+ "TW4:メール",
+ "BW2:はな",
+ "BW3:・・",
"BW3:なる",
- "UW5:ご",
- "BW2:りし",
- "UW6:電",
- "UW1:は",
- "BW1:いも",
- "BW3:すご",
- "UW4:通",
- "BW3:おり",
- "BW3:かか",
- "BW1:思い",
+ "BW1:とい",
+ "UW2:全",
+ "BW1:にも",
+ "BW1:たら",
+ "BW2:くな",
+ "UW3:「",
+ "BW1:その",
+ "UW3:観",
+ "BW1:うに",
+ "UW3:イ",
+ "BW3:もん",
+ "UW5:ず",
+ "BW3:しま",
+ "BW1:より",
+ "UW5:分",
}
modelValues:intvector {
- 1800,
- 271,
- -857,
- -417,
- 285,
- -583,
- 388,
- 828,
- -853,
- -820,
- 502,
- -708,
- 358,
- 1341,
- -586,
- -451,
- 257,
- -1876,
- 2052,
- 1698,
- -458,
- 2048,
- 1182,
- -551,
- 980,
- 773,
- -1453,
- -152,
- 3201,
- 2865,
- 1203,
- 144,
- -369,
- -2539,
- -613,
- -3574,
- -1111,
- 3110,
- -3022,
- 2039,
+ 3634,
+ 4347,
+ -2581,
+ -4812,
+ 2538,
+ -4206,
+ 2701,
+ -1455,
+ -2403,
+ 2977,
+ -2678,
+ 4165,
+ -818,
+ -1011,
+ 2996,
+ -904,
+ -1808,
+ 2064,
+ -2164,
+ -2180,
+ -2760,
+ -2310,
+ 2360,
+ -388,
+ 1842,
+ 1706,
+ -706,
+ -2408,
+ -1628,
+ -1005,
+ -434,
+ -1442,
+ 543,
-1091,
- 1241,
- -560,
- -1412,
- 625,
- 1350,
- 297,
- -2404,
- -595,
- 1007,
- -1829,
- -1662,
- 3213,
- 270,
- -911,
- 178,
- -727,
- 2716,
- -484,
- -344,
- 929,
- -1236,
- 760,
- -299,
- -419,
- -728,
- 122,
- -704,
- -605,
- -1507,
- 545,
- -68,
- -320,
- 1498,
- 953,
- -323,
- -575,
- -673,
+ 1355,
+ -1056,
+ 258,
+ 277,
+ -2999,
+ 1331,
+ -1305,
+ 1242,
+ -337,
+ -1073,
+ 1392,
+ -576,
+ -886,
+ -2405,
+ -386,
+ 1031,
+ 1470,
+ -2105,
+ -594,
+ -1461,
+ -1160,
+ 964,
+ -48,
+ -2158,
+ 110,
+ -1750,
+ 228,
+ -603,
+ 801,
+ 972,
+ 102,
+ -395,
+ -508,
+ 1640,
+ 191,
+ 2468,
+ -1580,
+ -1529,
+ 1148,
+ 515,
+ 539,
+ -774,
+ 111,
+ -1275,
+ 113,
+ -432,
+ 1736,
+ 588,
+ -413,
+ 1360,
+ 49,
+ 2322,
+ 48,
+ 255,
+ -521,
+ -366,
+ 529,
+ -493,
+ -557,
+ 1719,
+ -476,
+ 104,
+ 1311,
+ 1314,
+ 1307,
520,
+ 666,
+ -412,
+ 627,
+ 1098,
+ -209,
+ 163,
+ 955,
+ 1798,
+ -39,
+ -753,
+ -1262,
+ 411,
+ 1247,
+ 914,
+ 522,
+ 348,
+ 2156,
+ 510,
+ -1522,
+ -243,
+ 1337,
+ -378,
+ -1957,
+ 834,
-450,
- -1767,
- -247,
- 56,
- 231,
- -764,
- 536,
- 794,
- -703,
- -566,
- 51,
- 390,
- 52,
- -182,
- 466,
- 133,
- 354,
- 107,
- 492,
+ 235,
+ 87,
+ 236,
+ -1615,
+ 485,
+ -1445,
488,
- -1194,
- 1145,
- -847,
- 812,
- 151,
+ 404,
+ -333,
+ 66,
+ 787,
+ 647,
+ -1495,
+ -756,
+ -1700,
+ 279,
+ -81,
+ 260,
+ 162,
+ -51,
+ -851,
+ 462,
+ 493,
+ 161,
+ 396,
+ -238,
+ -1044,
+ -1685,
+ 433,
+ 276,
+ -695,
+ -148,
+ 416,
+ 1235,
+ -748,
+ 257,
+ 784,
+ 748,
+ 767,
+ -262,
+ -490,
+ -26,
+ 152,
+ 186,
+ 544,
+ 1035,
+ -711,
+ 549,
-517,
- -314,
- -553,
- -783,
- -117,
- 736,
- -88,
- -598,
- 569,
- 606,
- 287,
- 744,
- 1739,
- -217,
- -219,
- -144,
- 234,
- -649,
- -757,
- 834,
- -819,
- 869,
- -275,
- -267,
- 154,
- 653,
- 594,
- 255,
- 1018,
- 1124,
- 284,
- -1624,
- -372,
- 440,
- -184,
- -1936,
- 1318,
- -1124,
- 453,
- -92,
- -343,
+ 799,
+ -1024,
+ 542,
+ -118,
+ 432,
+ -56,
+ -694,
+ 668,
+ 249,
175,
- 182,
- -886,
- 930,
- -223,
- -57,
- -113,
- 103,
- -200,
- 510,
- -2099,
- -498,
- 385,
- 80,
- -156,
- 360,
- 1289,
- 771,
- -1114,
- -399,
- 870,
- 1230,
- 79,
- 472,
- -1596,
- -1092,
- -572,
- 55,
- -151,
- -124,
- 1316,
- -248,
- 1280,
- -125,
- -284,
- -1023,
- 862,
- 84,
- 417,
- 568,
- -88,
- -528,
- 910,
- 674,
- -212,
- 894,
- -121,
- 1108,
- 762,
- 260,
- -197,
- 91,
- -53,
- 1117,
- -645,
- -868,
- -611,
- 220,
- 422,
- 1431,
- -532,
- -157,
- -476,
- -846,
- -1309,
- -1614,
- 1225,
- 302,
- -738,
- -260,
- 892,
- -778,
- -193,
- 1221,
- -779,
- 489,
- 420,
- -85,
- -525,
- -830,
- 26,
- 270,
- 439,
- -120,
- 1263,
- -795,
- 291,
- -1310,
- -23,
- 347,
- 312,
- -107,
- -114,
+ 329,
+ 305,
+ 287,
+ 423,
+ 438,
+ 934,
+ 628,
+ 292,
+ -536,
+ -995,
+ -814,
+ 237,
+ 263,
+ 571,
+ -138,
+ 402,
701,
- 830,
- 1309,
- -451,
- 260,
- -1080,
- 536,
- 188,
- -60,
- 643,
- -1184,
- 31,
- -194,
- -51,
- -514,
- -442,
- -120,
- 649,
- 410,
- 882,
- -75,
- -341,
- -718,
- -128,
- 340,
- -1245,
- -164,
- -1052,
- 70,
- -256,
+ 387,
+ 474,
+ -183,
+ 661,
+ 280,
+ 767,
+ -53,
+ -793,
+ -191,
+ -401,
+ 526,
+ -679,
279,
- 786,
- 40,
- -177,
- 97,
- -411,
- 222,
- -89,
- -277,
- -146,
- 414,
- 483,
- 21,
- -339,
- -406,
- -360,
- -450,
- -14,
- -36,
- 513,
- 252,
- 54,
- -501,
- -478,
- 450,
- -36,
- -644,
- -392,
- 714,
- 643,
- -341,
- 91,
- -1018,
- 34,
- -177,
- 123,
- 80,
- -695,
- -44,
- -357,
- 253,
+ -407,
+ 493,
+ -82,
+ 365,
+ -334,
+ 36,
+ 284,
+ -813,
+ 424,
+ -425,
+ 423,
+ -796,
+ 452,
+ -635,
-389,
- 613,
- 515,
- 418,
+ 404,
+ -141,
+ 415,
+ -277,
+ -400,
+ 502,
+ 766,
+ -182,
+ -426,
+ 720,
+ 1005,
+ 422,
-396,
- -553,
- 193,
- 298,
- -334,
- -57,
- -315,
- -77,
- 33,
- 88,
- 137,
- 280,
- -448,
- 196,
- -136,
- -295,
- -329,
- -92,
- -360,
- -132,
- -288,
- -45,
- -43,
- 174,
- 75,
- -60,
- 330,
- 360,
- 217,
- 130,
- 473,
- -41,
- -23,
- -340,
- -530,
- -69,
- -71,
+ 123,
+ -533,
+ -91,
+ -355,
+ 333,
+ -596,
+ -333,
+ 434,
+ 31,
+ 567,
+ -356,
+ -309,
+ 251,
+ 365,
+ -399,
+ 411,
+ -235,
+ -526,
+ 468,
+ 438,
+ 136,
+ 103,
+ 74,
+ 585,
+ 324,
-115,
- 297,
- -240,
- 229,
- 507,
- -348,
- 171,
- -320,
- 239,
- 16,
- -195,
- -277,
- -41,
- 69,
- 280,
- -264,
- 30,
- 249,
- -97,
- -163,
- -221,
- 96,
- 83,
+ -219,
+ -217,
+ -289,
+ -88,
+ 143,
+ 361,
+ -558,
+ -614,
+ -56,
+ 456,
+ 441,
+ -566,
+ 102,
+ 112,
+ -466,
+ 325,
+ -27,
+ 128,
+ 294,
+ -321,
+ -224,
+ -206,
+ 252,
+ 209,
+ -207,
+ -224,
+ -207,
+ 109,
+ 316,
+ -234,
+ 222,
+ 95,
+ 192,
+ -40,
+ -98,
82,
- -218,
- -93,
- -53,
+ 68,
+ 230,
+ -28,
+ -67,
+ -149,
+ 14,
+ -120,
+ 95,
+ 122,
+ -81,
+ -67,
+ -296,
+ 122,
+ -81,
+ 134,
+ -200,
+ -67,
+ 14,
+ 67,
+ 119,
40,
- 28,
- 285,
- 27,
- 283,
- -211,
+ 118,
-92,
- 214,
- -225,
- -54,
- 53,
- 105,
- -198,
- -53,
- -277,
- 198,
- 184,
- -264,
- -106,
- 14,
- 185,
- -155,
- 185,
- 106,
- -119,
+ 91,
+ -105,
53,
- 208,
- 92,
- 262,
- 106,
- -52,
+ 40,
+ -51,
+ 39,
+ -64,
105,
- -25,
- -79,
- 104,
- 141,
- 129,
- -114,
+ 13,
+ 39,
26,
- 64,
- -113,
+ -52,
+ -52,
+ -52,
26,
- 77,
- -64,
- 13,
+ -26,
+ -39,
13,
+ -13,
+ 39,
26,
- 89,
- 115,
- -49,
- 89,
- -114,
- 51,
- 64,
- -64,
- -51,
- -38,
- 89,
- 13,
- -64,
13,
- -48,
- 76,
- 63,
- 62,
- 13,
- 112,
- -76,
- -50,
+ -39,
+ -26,
+ -26,
+ -26,
-13,
- -49,
- 63,
- -50,
- 13,
+ -13,
+ 39,
+ 26,
+ -13,
+ 26,
13,
- -50,
- 24,
- -12,
- 24,
- 12,
- 24,
- 12,
- -12,
- -24,
- 12,
- -12,
- -12,
- 12,
- -12,
}
}
\ No newline at end of file
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
-#Kana supplement: ð\9b\81\88(U+1B048) -> \uD82C\uDC48, ð\9b\80¸(U+1B038) -> \uD82C\uDC38, ð\9b\80\99(U+1B019)-> \uD82C\uDC19</data>
-#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
-<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6)
+#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。
+<data>•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002•</data>
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
private static final int INVALID = '|';
private static final String INVALID_STRING = "|";
- private static final int MAX_FEATURE = 26;
+ private static final int MAX_FEATURE = 13;
private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
private UnicodeSet fClosePunctuationSet;
private HashMap<String, Integer> fModel;
-
private int fNegativeSum;
- static class Element {
- private int character;
- private String ublock;
-
- /**
- * Default constructor.
- */
- public Element() {
- character = 0;
- ublock = null;
- }
-
- /**
- * Set the character and its unicode block.
- *
- * @param ch A unicode character.
- * @param str The unicode block of the character.
- */
- public void setCharAndUblock(int ch, String str) {
- Assert.assrt(str.length() <= 3);
- this.character = ch;
- ublock = str;
- }
-
- /**
- * Get the unicode character.
- *
- * @return The unicode character.
- */
- public int getCharacter() {
- return character;
- }
-
- /**
- * Get the unicode character's unicode block.
- *
- * @return The unicode block.
- */
- public String getUblock() {
- return ublock;
- }
- }
-
- private static boolean isValid(Element element) {
- String ublock = element.getUblock();
- return ublock.length() != 1 || (int) ublock.charAt(0) != INVALID;
- }
-
/**
* Constructor for Chinese and Japanese phrase breaking.
*
return 0;
}
ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
- int ch;
- String ublock;
// The ML model groups six characters to evaluate whether the 4th character is a breakpoint.
// Like a sliding window, the elementList drops its first character and appends the next
// character from inString in each iteration, so that its size always remains six.
- Element elementList[] = new Element[6];
+ int elementList[] = new int[6];
initElementList(inString, elementList, numCodePts);
// Add a break for the start.
break;
}
shiftLeftOne(elementList);
-
- ch = (i + 3) < numCodePts ? next32(inString) : INVALID;
- ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING;
- elementList[5].setCharAndUblock(ch, ublock);
+ elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID;
}
// Add a break for the end if there is not one there already.
return correctedNumBreaks;
}
- private void shiftLeftOne(Element[] elementList) {
+ private void shiftLeftOne(int[] elementList) {
int length = elementList.length;
for (int i = 1; i < length; i++) {
- elementList[i - 1].character = elementList[i].character;
- elementList[i - 1].ublock = elementList[i].ublock;
+ elementList[i - 1] = elementList[i];
}
}
* @param index The breakpoint index to be evaluated.
* @param boundary A list including the index of the breakpoint.
*/
- private void evaluateBreakpoint(Element[] elementList, int index, ArrayList<Integer> boundary) {
+ private void evaluateBreakpoint(int[] elementList, int index, ArrayList<Integer> boundary) {
String[] featureList = new String[MAX_FEATURE];
- final int w1 = elementList[0].getCharacter();
- final int w2 = elementList[1].getCharacter();
- final int w3 = elementList[2].getCharacter();
- final int w4 = elementList[3].getCharacter();
- final int w5 = elementList[4].getCharacter();
- final int w6 = elementList[5].getCharacter();
+ final int w1 = elementList[0];
+ final int w2 = elementList[1];
+ final int w3 = elementList[2];
+ final int w4 = elementList[3];
+ final int w5 = elementList[4];
+ final int w6 = elementList[5];
StringBuilder sb = new StringBuilder();
int idx = 0;
featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint(
w5).appendCodePoint(w6).toString();
}
- if (isValid(elementList[0])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UB1:").append(elementList[0].getUblock()).toString();
- }
- if (isValid(elementList[1])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UB2:").append(elementList[1].getUblock()).toString();
- }
- if (isValid(elementList[2])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UB3:").append(elementList[2].getUblock()).toString();
- }
- if (isValid(elementList[3])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UB4:").append(elementList[3].getUblock()).toString();
- }
- if (isValid(elementList[4])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UB5:").append(elementList[4].getUblock()).toString();
- }
- if (isValid(elementList[5])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("UB6:").append(elementList[5].getUblock()).toString();
- }
- if (isValid(elementList[1]) && isValid(elementList[2])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("BB1:").
- append(elementList[1].getUblock()).
- append(elementList[2].getUblock()).toString();
- }
- if (isValid(elementList[2]) && isValid(elementList[3])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("BB2:").
- append(elementList[2].getUblock()).
- append(elementList[3].getUblock()).toString();
- }
- if (isValid(elementList[3]) && isValid(elementList[4])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("BB3:").
- append(elementList[3].getUblock()).
- append(elementList[4].getUblock()).toString();
- }
- if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TB1:").
- append(elementList[0].getUblock()).
- append(elementList[1].getUblock()).
- append(elementList[2].getUblock()).toString();
- }
- if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TB2:").
- append(elementList[1].getUblock()).
- append(elementList[2].getUblock()).
- append(elementList[3].getUblock()).toString();
- }
- if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TB3:").
- append(elementList[2].getUblock()).
- append(elementList[3].getUblock()).
- append(elementList[4].getUblock()).toString();
- }
- if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
- sb.setLength(0);
- featureList[idx++] = sb.append("TB4:").
- append(elementList[3].getUblock()).
- append(elementList[4].getUblock()).
- append(elementList[5].getUblock()).toString();
- }
+
int score = fNegativeSum;
for (int j = 0; j < idx; j++) {
if (fModel.containsKey(featureList[j])) {
* Initialize the element list from the input string.
*
* @param inString A input string to be segmented.
- * @param elementList A list to store the first six characters and their unicode block codes.
+ * @param elementList A list to store the first six characters.
* @param numCodePts The number of code points of input string
* @return The number of the code units of the first six characters in inString.
*/
- private int initElementList(CharacterIterator inString, Element[] elementList,
- int numCodePts) {
+ private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) {
int index = 0;
inString.setIndex(index);
int w1, w2, w3, w4, w5, w6;
if (numCodePts > 0) {
w3 = current32(inString);
index += Character.charCount(w3);
+ if (numCodePts > 1) {
+ w4 = next32(inString);
+ // Count w4's own code units: w3 and w4 differ in width when only one is supplementary.
+ index += Character.charCount(w4);
+ if (numCodePts > 2) {
+ w5 = next32(inString);
+ index += Character.charCount(w5);
+ if (numCodePts > 3) {
+ w6 = next32(inString);
+ index += Character.charCount(w6);
+ }
+ }
+ }
}
- if (numCodePts > 1) {
- w4 = next32(inString);
- index += Character.charCount(w3);
- }
- if (numCodePts > 2) {
- w5 = next32(inString);
- index += Character.charCount(w5);
- }
- if (numCodePts > 3) {
- w6 = next32(inString);
- index += Character.charCount(w6);
- }
-
- final String b1 = INVALID_STRING;
- final String b2 = b1;
- final String b3 = getUnicodeBlock(w3);
- final String b4 = getUnicodeBlock(w4);
- final String b5 = getUnicodeBlock(w5);
- final String b6 = getUnicodeBlock(w6);
-
- elementList[0] = new Element();
- elementList[0].setCharAndUblock(w1, b1);
- elementList[1] = new Element();
- elementList[1].setCharAndUblock(w2, b2);
- elementList[2] = new Element();
- elementList[2].setCharAndUblock(w3, b3);
- elementList[3] = new Element();
- elementList[3].setCharAndUblock(w4, b4);
- elementList[4] = new Element();
- elementList[4].setCharAndUblock(w5, b5);
- elementList[5] = new Element();
- elementList[5].setCharAndUblock(w6, b6);
+ elementList[0] = w1;
+ elementList[1] = w2;
+ elementList[2] = w3;
+ elementList[3] = w4;
+ elementList[4] = w5;
+ elementList[5] = w6;
return index;
}
- /**
- * Get the character's unicode block code defined in UBlockCode.
- *
- * @param ch A char.
- * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
- * is less than 3 digits.
- */
- private String getUnicodeBlock(int ch) {
- int blockId = UCharacter.UnicodeBlock.of(ch).getID();
- if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID()
- || blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) {
- return INVALID_STRING;
- } else {
- return String.format("%03d", blockId);
- }
- }
-
/**
* Load the machine learning's model file.
*/
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
-#Kana supplement: ð\9b\81\88(U+1B048) -> \uD82C\uDC48, ð\9b\80¸(U+1B038) -> \uD82C\uDC38, ð\9b\80\99(U+1B019)-> \uD82C\uDC19</data>
-#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
-<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
+#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6)
+#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。
+<data>•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002•</data>
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!