import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
-import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
static class TestParams {
BreakIterator bi;
- StringBuffer dataToBreak = new StringBuffer();
+ StringBuilder dataToBreak = new StringBuilder();
int[] expectedBreaks = new int[1000];
int[] srcLine = new int[1000];
int[] srcCol = new int[1000];
//
// Open and read the test data file.
//
- StringBuffer testFileBuf = new StringBuffer();
+ StringBuilder testFileBuf = new StringBuilder();
InputStream is = null;
try {
is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
continue;
}
- UTF16.append(testFileBuf, c);
+ testFileBuf.appendCodePoint(c);
}
} finally {
isr.close();
final int PARSE_TAG = 2;
final int PARSE_DATA = 3;
final int PARSE_NUM = 4;
+ final int PARSE_RULES = 5;
int parseState = PARSE_TAG;
int savedState = PARSE_TAG;
- final char CH_LF = 0x0a;
- final char CH_CR = 0x0d;
- final char CH_HASH = 0x23;
- /*static const UChar CH_PERIOD = 0x2e;*/
- final char CH_LT = 0x3c;
- final char CH_GT = 0x3e;
- final char CH_BACKSLASH = 0x5c;
- final char CH_BULLET = 0x2022;
-
int lineNum = 1;
int colStart = 0;
int column = 0;
int i;
int tagValue = 0; // The numeric value of a <nnn> tag.
+
+ StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
+ int rulesFirstLine = 0; // Line number of the start of current <rules> block
+
int len = testString.length();
for (charIdx = 0; charIdx < len; ) {
- int c = UTF16.charAt(testString, charIdx);
+ int c = testString.codePointAt(charIdx);
charIdx++;
- if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
+ if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
// treat CRLF as a unit
- c = CH_LF;
+ c = '\n';
charIdx++;
}
- if (c == CH_LF || c == CH_CR) {
+ if (c == '\n' || c == '\r') {
lineNum++;
colStart = charIdx;
}
case PARSE_TAG:
{
- if (c == CH_HASH) {
+ if (c == '#') {
parseState = PARSE_COMMENT;
savedState = PARSE_TAG;
break;
charIdx += 6;
break;
}
+ if (testString.startsWith("<rules>", charIdx-1) ||
+ testString.startsWith("<badrules>", charIdx-1)) {
+ charIdx = testString.indexOf('>', charIdx) + 1;
+ parseState = PARSE_RULES;
+ rules.setLength(0);
+ rulesFirstLine = lineNum;
+ break;
+ }
+
if (testString.startsWith("<locale ", charIdx-1)) {
int closeIndex = testString.indexOf(">", charIdx);
if (closeIndex < 0) {
//savedState = PARSE_DATA;
}
+ case PARSE_RULES:
+ if (testString.startsWith("</rules>", charIdx-1)) {
+ charIdx += 7;
+ parseState = PARSE_TAG;
+ try {
+ tp.bi = new RuleBasedBreakIterator(rules.toString());
+ } catch (IllegalArgumentException e) {
+ errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e));
+ }
+ } else if (testString.startsWith("</badrules>", charIdx-1)) {
+ charIdx += 10;
+ parseState = PARSE_TAG;
+ boolean goodRules = true;
+ try {
+ new RuleBasedBreakIterator(rules.toString());
+ } catch (IllegalArgumentException e) {
+ goodRules = false;
+ }
+ if (goodRules) {
+ errln(String.format(
+ "rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.",
+ lineNum));
+ }
+ } else {
+ rules.appendCodePoint(c);
+ }
+ break;
+
case PARSE_DATA:
- if (c == CH_BULLET) {
+ if (c == '•') {
int breakIdx = tp.dataToBreak.length();
tp.expectedBreaks[breakIdx] = -1;
tp.srcLine[breakIdx] = lineNum;
} else {
// Named code point was recognized. Insert it
// into the test data.
- UTF16.append(tp.dataToBreak, c);
+ tp.dataToBreak.appendCodePoint(c);
for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
tp.srcLine[i] = lineNum;
tp.srcCol[i] = column;
break;
}
- if (c == CH_LT) {
+ if (c == '<') {
tagValue = 0;
parseState = PARSE_NUM;
break;
}
- if (c == CH_HASH && column==3) { // TODO: why is column off so far?
+ if (c == '#' && column==3) { // TODO: why is column off so far?
parseState = PARSE_COMMENT;
savedState = PARSE_DATA;
break;
}
- if (c == CH_BACKSLASH) {
+ if (c == '\\') {
// Check for \ at end of line, a line continuation.
// Advance over (discard) the newline
- int cp = UTF16.charAt(testString, charIdx);
- if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
+ int cp = testString.codePointAt(charIdx);
+ if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
// We have a CR LF
// Need an extra increment of the input ptr to move over both of them
charIdx++;
}
- if (cp == CH_LF || cp == CH_CR) {
+ if (cp == '\n' || cp == '\r') {
lineNum++;
column = 0;
charIdx++;
// Escape sequence was recognized. Insert the char
// into the test data.
charIdx = charIdxAr[0];
- UTF16.append(tp.dataToBreak, cp);
+ tp.dataToBreak.appendCodePoint(cp);
for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
tp.srcLine[i] = lineNum;
tp.srcCol[i] = column;
// Not a recognized backslash escape sequence.
// Take the next char as a literal.
// TODO: Should this be an error?
- c = UTF16.charAt(testString,charIdx);
- charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
+ c = testString.codePointAt(charIdx);
+ charIdx = testString.offsetByCodePoints(charIdx, 1);
}
// Normal, non-escaped data char.
- UTF16.append(tp.dataToBreak, c);
+ tp.dataToBreak.appendCodePoint(c);
// Save the mapping from offset in the data to line/column numbers in
// the original input file. Will be used for better error messages only.
break;
}
- if (c == CH_GT) {
+ if (c == '>') {
// Finished the number. Add the info to the expected break data,
// and switch parse state back to doing plain data.
parseState = PARSE_DATA;
break;
}
- errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
+ errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
return;
-
- // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
- // break;
}
+ }
+ // Reached end of test file. Raise an error if parseState indicates that we are
+ // within a block that should have been terminated.
-
+ if (parseState == PARSE_RULES) {
+ errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
+ lineNum, rulesFirstLine));
+ }
+ if (parseState == PARSE_DATA) {
+ errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
}
}
# <sent> any following data is for sentence break testing
# <line> any following data is for line break testing
# <char> any following data is for char break testing
-# <locale local_name> Switch to the named locale at the next occurence of <word>, <sent>, etc.
+# <rules> rules ... </rules> following data is tested against these rules.
+# Applies until a following occurrence of <word>, <sent>, etc. or another <rules>
+# <locale locale_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
# <data> ... </data> test data. May span multiple lines.
# <> Break position, status == 0
# • Break position, status == 0 (Bullet, \u2022)
# Temp debugging tests
<locale en>
<word>
-<data><0>ク<400>ライアン<400>ト<400>サーバー<400></data>
-# <data><0>ク<400>ライアン<400>トサーバー<400></data>
+<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
+コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
+よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
+何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
+んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
+すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
+が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
+の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
+。<0></data>
+
+#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
## FILTERED BREAK TESTS
<data>•\U0001F468\u200D\u2695\uFE0F•\U0001F468\u200D\u2695•\U0001F468\U0001F3FD\u200D\u2695\uFE0F•\U0001F468\U0001F3FD\u200D\u2695\u0020•</data>
# woman astronaut, woman astronaut / fitz4
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
+
+
+####################################################################################
+#
+# Test rule status values
+#
+####################################################################################
+<rules> $Letters = [:L:];
+ $Numbers = [:N:];
+ $Letters+{1};
+ $Numbers+{2};
+ Help\ me\!{4};
+ [^$Letters $Numbers];
+ !.*;
+</rules>
+<data>•abc<1>123<2>.•.•abc<1> •Help<1> •me<1> •Help me!<4></data>
+
+# Test option to prohibit unquoted literals.
+
+<rules>
+!!forward;
+ Hello\ World;
+!!reverse;
+ .*;
+</rules>
+<data>•Hello World•</data>
+
+<badrules>
+!!quoted_literals_only;
+!!forward;
+ Hello\ World;
+!!reverse;
+ .*;
+</badrules>
+
+<rules>
+#TODO: uncomment this line when quoted_literals_only is implemented.
+#!!quoted_literals_only;
+!!forward;
+ 'Hello World';
+!!reverse;
+ .*;
+</rules>
+<data>•Hello World•</data>
+