RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
UParseError *parseErr,
UErrorCode &status)
- : fRules(rules)
+ : fRules(rules), fStrippedRules(rules)
{
fStatus = &status; // status is checked below
fParseError = parseErr;
return NULL;
}
- // Remove comments and whitespace from the rules to make it smaller.
- UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
+ // Remove whitespace from the rules to make it smaller.
+ // The rule parser has already removed comments.
+ fStrippedRules = fScanner->stripRules(fStrippedRules);
// Calculate the size of each section in the data.
// Sizes here are padded up to a multiple of 8 for better memory alignment.
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
- int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
+ int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
(void)safeFwdTableSize;
data->fStatusTable = data->fTrie + trieSize;
data->fStatusTableLen= statusTableSize;
data->fRuleSource = data->fStatusTable + statusTableSize;
- data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
+ data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
}
- strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
+ fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
return data;
}
UErrorCode *fStatus; // Error reporting. Keeping status
UParseError *fParseError; // here avoids passing it everywhere.
const UnicodeString &fRules; // The rule string that we are compiling
+ UnicodeString fStrippedRules; // The rule string, with comments stripped.
RBBIRuleScanner *fScanner; // The scanner.
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
//------------------------------------------------------------------------------
//
-// stripRules Return a rules string without unnecessary
-// characters.
+// stripRules Return a rules string without extra spaces.
+// (Comments are removed separately, during rule parsing.)
//
//------------------------------------------------------------------------------
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
UnicodeString strippedRules;
- int rulesLength = rules.length();
- for (int idx = 0; idx < rulesLength; ) {
- UChar ch = rules[idx++];
- if (ch == chPound) {
- while (idx < rulesLength
- && ch != chCR && ch != chLF && ch != chNEL)
- {
- ch = rules[idx++];
- }
- }
- if (!u_isISOControl(ch)) {
- strippedRules.append(ch);
+ int32_t rulesLength = rules.length();
+ bool skippingSpaces = false;
+
+ for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
+ UChar32 cp = rules.char32At(idx);
+ bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
+ if (skippingSpaces && whiteSpace) {
+ continue;
}
+ strippedRules.append(cp);
+ skippingSpaces = whiteSpace;
}
- // strippedRules = strippedRules.unescape();
return strippedRules;
}
// It will be treated as white-space, and serves to break up anything
// that might otherwise incorrectly clump together with a comment in
// the middle (a variable name, for example.)
+ int32_t commentStart = fScanIndex;
for (;;) {
c.fChar = nextCharLL();
if (c.fChar == (UChar32)-1 || // EOF
c.fChar == chNEL ||
c.fChar == chLS) {break;}
}
+ for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
+ fRB->fStrippedRules.setCharAt(i, u' ');
+ }
}
if (c.fChar == (UChar32)-1) {
return;
builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
- RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
+ LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
if (U_FAILURE(status)) {
errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
__FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
return;
}
- delete brkItr;
}
void RBBIAPITest::TestRoundtripRules() {
TESTCASE_AUTO(TestBug12932);
TESTCASE_AUTO(TestEmoji);
TESTCASE_AUTO(TestBug12519);
+ TESTCASE_AUTO(TestBug12677);
TESTCASE_AUTO_END;
}
assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
}
+void RBBITest::TestBug12677() {
+ // Check that stripping of comments from rules for getRules() is not confused by
+ // the presence of '#' characters in the rules that do not introduce comments.
+ UnicodeString rules(u"!!forward; \n"
+ "$x = [ab#]; # a set with a # literal. \n"
+ " # .; # a comment that looks sort of like a rule. \n"
+ " '#' '?'; # a rule with a quoted # \n"
+ );
+
+ UErrorCode status = U_ZERO_ERROR;
+ UParseError pe;
+ RuleBasedBreakIterator bi (rules, pe, status);
+ assertSuccess(WHERE, status);
+ UnicodeString rtRules = bi.getRules();
+ assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
+}
+
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked
void TestBug12932();
void TestEmoji();
void TestBug12519();
+ void TestBug12677();
void TestDebug();
void TestProperties();
String fDebugEnv; // controls debug trace output
String fRules; // The rule string that we are compiling
+ StringBuilder fStrippedRules; // The rule string, with comments stripped.
RBBIRuleScanner fScanner; // The scanner.
fDebugEnv = ICUDebug.enabled("rbbi") ?
ICUDebug.value("rbbi") : null;
fRules = rules;
+ fStrippedRules = new StringBuilder(rules);
fUSetNodes = new ArrayList<RBBINode>();
fRuleStatusVals = new ArrayList<Integer>();
fScanner = new RBBIRuleScanner(this);
DataOutputStream dos = new DataOutputStream(os);
int i;
- // Remove comments and whitespace from the rules to make it smaller.
- String strippedRules = RBBIRuleScanner.stripRules(fRules);
+ // Remove whitespace from the rules to make it smaller.
+ // The rule parser has already removed comments.
+ String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());
// Calculate the size of each section in the data in bytes.
// Sizes here are padded up to a multiple of 8 for better memory alignment.
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
/**
* This class is part of the Rule Based Break Iterator rule compiler.
static String stripRules(String rules) {
StringBuilder strippedRules = new StringBuilder();
int rulesLength = rules.length();
- for (int idx = 0; idx < rulesLength;) {
- char ch = rules.charAt(idx++);
- if (ch == '#') {
- while (idx < rulesLength
- && ch != '\r' && ch != '\n' && ch != chNEL) {
- ch = rules.charAt(idx++);
- }
- }
- if (!UCharacter.isISOControl(ch)) {
- strippedRules.append(ch);
+ boolean skippingSpaces = false;
+
+ for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
+ int cp = rules.codePointAt(idx);
+ boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
+ if (skippingSpaces && whiteSpace) {
+ continue;
}
+ strippedRules.appendCodePoint(cp);
+ skippingSpaces = whiteSpace;
}
return strippedRules.toString();
}
// It will be treated as white-space, and serves to break up anything
// that might otherwise incorrectly clump together with a comment in
// the middle (a variable name, for example.)
+ int commentStart = fScanIndex;
for (;;) {
c.fChar = nextCharLL();
if (c.fChar == -1 || // EOF
break;
}
}
+ for (int i=commentStart; i<fNextIndex-1; ++i) {
+ fRB.fStrippedRules.setCharAt(i, ' ');
+ }
}
if (c.fChar == -1) {
return;
assertEquals("", t1.fExpectedBoundaries, t1.fBoundaries);
assertEquals("", t2.fExpectedBoundaries, t2.fBoundaries);
}
+
+ @Test
+ public void TestBug12677() {
+ // Check that stripping of comments from rules for getRules() is not confused by
+ // the presence of '#' characters in the rules that do not introduce comments.
+ String rules = "!!forward; \n"
+ + "$x = [ab#]; # a set with a # literal. \n"
+ + " # .; # a comment that looks sort of like a rule. \n"
+ + " '#' '?'; # a rule with a quoted # \n";
+
+ RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
+ String rtRules = bi.toString(); // getRules() in C++
+ assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
+ }
}