import java.text.CharacterIterator;
import java.util.HashSet;
+import com.ibm.icu.impl.ICUResourceBundle.OpenType;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.FilteredBreakIteratorBuilder;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.util.CharsTrieBuilder;
import com.ibm.icu.util.StringTrieBuilder;
import com.ibm.icu.util.ULocale;
-import com.ibm.icu.util.UResourceBundle;
/**
* @author tomzhang
this.backwardsTrie = backwardsTrie;
}
- @Override
- public int next() {
- int n = delegate.next();
+
+ /**
+  * Re-synchronizes the filter's working iterator with the delegate:
+  * {@code text} is replaced by a new {@link UCharacterIterator} built over a
+  * clone of the delegate's current text.
+  */
+ private final void resetState() {
+     final CharacterIterator delegateText = (CharacterIterator) delegate.getText().clone();
+     text = UCharacterIterator.getInstance(delegateText);
+ }
+
+ /**
+  * Checks whether the text leading up to position {@code n} matches a
+  * suppression ("exception") entry, i.e. whether a break reported at
+  * {@code n} should be filtered out.
+  *
+  * <p>The text is walked backwards from {@code n} through {@code backwardsTrie},
+  * remembering the last position at which the trie reported a value. A
+  * {@code Builder.MATCH} value is an exception outright. A
+  * {@code Builder.PARTIAL} value is only an exception if the text, read
+  * forwards from the remembered position, fully matches an entry of
+  * {@code forwardsPartialTrie}.
+  *
+  * <p>Repositions the shared {@code text} iterator and resets the tries it
+  * walks. Dereferences {@code backwardsTrie} and {@code text} unconditionally;
+  * the callers in this class check the trie for null and call
+  * {@code resetState()} first.
+  *
+  * @param n candidate break position (index into {@code text})
+  * @return {@code true} if an exception applies at {@code n} (no break there),
+  *         {@code false} otherwise
+  */
+ private final boolean breakExceptionAt(int n) {
+     // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
+
+     int bestPosn = -1;
+     int bestValue = -1;
+
+     // Start a fresh backwards walk from the candidate break position.
+     text.setIndex(n);
+     backwardsTrie.reset();
+
+     // Assume a space is following the '.' (so we handle the case: "Mr. /Brown"):
+     // step back over it. If the preceding code point is not a space, step
+     // forward again.
+     // TODO: skip a class of chars here??
+     // TODO only do this the 1st time?
+     // NOTE(review): if previousCodePoint() returned DONE (n at the start of
+     // the text) the iterator is still advanced here - confirm this is intended.
+     if (text.previousCodePoint() != ' ') {
+         text.nextCodePoint();
+     }
+
+     int uch;
+     BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
+
+     while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
+             ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) { // more in the trie
+         if (r.hasValue()) { // remember the best match so far
+             bestPosn = text.getIndex();
+             bestValue = backwardsTrie.getValue();
+         }
+     }
+
+     if (r.matches()) { // exact match?
+         bestValue = backwardsTrie.getValue();
+         bestPosn = text.getIndex();
+     }
+
+     if (bestPosn < 0) {
+         return false; // Nothing matched in the backwards trie: no exception here.
+     }
+
+     if (bestValue == Builder.MATCH) { // exact match!
+         return true; // Exception here.
+     }
+
+     if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
+         // make sure there's a forward trie
+         // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
+         // to see if it matches something going forward.
+         forwardsPartialTrie.reset();
+
+         BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
+         text.setIndex(bestPosn); // hope that's close ..
+         // 'text' is a UCharacterIterator, so compare against its own DONE
+         // sentinel, exactly as the backwards loop above does.
+         while ((uch = text.nextCodePoint()) != UCharacterIterator.DONE
+                 && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
+             // keep feeding code points to the forwards trie
+         }
+         return rfwd.matches(); // full forward match == exception here
+     }
+
+     return false; // No exception here.
+ }
+
+ /**
+ * Given that the delegate has already given its "initial" answer,
+ * find the NEXT actual (non-excepted) break.
+ * @param n initial position from delegate
+ * @return new break position or BreakIterator.DONE
+ */
+ private final int internalNext(int n) {
if (n == BreakIterator.DONE || // at end or
backwardsTrie == null) { // .. no backwards table loaded == no exceptions
return n;
}
- // UCharacterIterator text;
- text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
- do { // outer loop runs once per underlying break (from fDelegate).
- // loops while 'n' points to an exception.
- text.setIndex(n);
- backwardsTrie.reset();
- int uch;
-
- // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
- if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
- // TODO only do this the 1st time?
- } else {
- uch = text.nextCodePoint();
- }
+ resetState();
- BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
+ final int textLen = text.getLength();
- int bestPosn = -1;
- int bestValue = -1;
+ while (n != BreakIterator.DONE && n != textLen) {
+ // outer loop runs once per underlying break (from fDelegate).
+ // loops while 'n' points to an exception.
- while ((uch = text.previousCodePoint()) != BreakIterator.DONE && // more to consume backwards and..
- ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
- if (r.hasValue()) { // remember the best match so far
- bestPosn = text.getIndex();
- bestValue = backwardsTrie.getValue();
- }
+ if (breakExceptionAt(n)) {
+ // n points to a break exception
+ n = delegate.next();
+ } else {
+ // no exception at this spot
+ return n;
}
+ }
+ return n; //hit underlying DONE or break at end of text
+ }
- if (r.matches()) { // exact match?
- bestValue = backwardsTrie.getValue();
- bestPosn = text.getIndex();
- }
+ /**
+ * Given that the delegate has already given its "initial" answer,
+ * find the PREVIOUS actual (non-excepted) break.
+ * @param n initial position from delegate
+ * @return new break position or BreakIterator.DONE
+ */
+ private final int internalPrev(int n) {
+ if (n == 0 || n == BreakIterator.DONE || // at start, at end, or
+ backwardsTrie == null) { // .. no backwards table loaded == no exceptions
+ return n;
+ }
+ resetState();
- if (bestPosn >= 0) {
- if (bestValue == Builder.MATCH) { // exact match!
- n = delegate.next(); // skip this one. Find the next lowerlevel break.
- if (n == BreakIterator.DONE) {
- break;
- }
- continue; // See if the next is another exception.
- } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
- // make sure there's a forward trie
- // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
- // to see if it matches something going forward.
- forwardsPartialTrie.reset();
-
- BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
- text.setIndex(bestPosn); // hope that's close ..
- while ((uch = text.nextCodePoint()) != BreakIterator.DONE
- && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
- }
- if (rfwd.matches()) {
- // only full matches here, nothing to check
- // skip the next:
- n = delegate.next();
- if (n == BreakIterator.DONE) {
- break;
- }
- continue;
- } else {
- // no match (no exception) -return the 'underlying' break
- break;
- }
- } else {
- break; // internal error and/or no forwards trie
- }
+ while (n != BreakIterator.DONE && n != 0) {
+ // outer loop runs once per underlying break (from fDelegate).
+ // loops while 'n' points to an exception.
+
+ if (breakExceptionAt(n)) {
+ // n points to a break exception
+ n = delegate.previous();
} else {
- break; // No match - so exit. Not an exception.
+ // no exception at this spot
+ return n;
}
- } while (n != BreakIterator.DONE);
- return n;
+ }
+ return n; // hit underlying DONE or break at start of text
}
@Override
return other;
}
+
@Override
public int first() {
- return delegate.first();
+ return internalNext(delegate.first());
}
@Override
- public int last() {
- return delegate.last();
+ public int preceding(int offset) {
+ return internalPrev(delegate.preceding(offset));
}
@Override
- public int next(int n) {
- // TODO
- throw new UnsupportedOperationException("next(int) is not yet implemented");
+ public int previous() {
+ return internalPrev(delegate.previous());
}
@Override
- public int previous() {
- // TODO
- throw new UnsupportedOperationException("previous() is not yet implemented");
+ public int current() {
+ return delegate.current();
}
@Override
- public int following(int offset) {
- // TODO
- throw new UnsupportedOperationException("following(int) is not yet implemented");
+ public boolean isBoundary(int offset) {
+     // A position the delegate rejects is never a boundary here; a position
+     // it accepts is a boundary unless the exception table suppresses it.
+     // NOTE(review): unlike internalNext(), which returns a break at the end
+     // of the text without consulting the exception table, this check is also
+     // applied when offset is the text length - confirm that is intended.
+     if (delegate.isBoundary(offset)) {
+         if (backwardsTrie != null) {
+             resetState();
+             return !breakExceptionAt(offset); // if there's an exception: no break.
+         }
+         return true; // no data
+     }
+     return false; // No underlying break to suppress
}
@Override
- public int current() {
- return delegate.current();
+ public int next() {
+ return internalNext(delegate.next());
}
@Override
- public int preceding(int offset) {
- // TODO
- throw new UnsupportedOperationException("preceding(int) is not yet implemented");
+ public int next(int n) {
+ return internalNext(delegate.next(n));
+ }
+
+ @Override
+ public int following(int offset) {
+ return internalNext(delegate.following(offset));
+ }
+
+ @Override
+ public int last() {
+ // Don't suppress a break opportunity at the end of text.
+ return delegate.last();
}
@Override
* @param loc the locale to get filtered iterators
*/
public Builder(ULocale loc) {
- ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
- ICUData.ICU_BRKITR_BASE_NAME, loc);
- ICUResourceBundle exceptions = rb.findWithFallback("exceptions");
- if (exceptions != null) {
- ICUResourceBundle breaks = exceptions.findWithFallback("SentenceBreak");
-
- if (breaks != null) {
- for (int index = 0, size = breaks.getSize(); index < size; ++index) {
- ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
- String br = b.getString();
- filterSet.add(br);
- }
+ ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
+ ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
+
+ ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
+
+ if (breaks != null) {
+ for (int index = 0, size = breaks.getSize(); index < size; ++index) {
+ ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
+ String br = b.getString();
+ filterSet.add(br);
}
- } // else - no exceptions.
+ }
}
/**
// Short circuit - nothing to except.
return adoptBreakIterator;
}
-
+
CharsTrieBuilder builder = new CharsTrieBuilder();
CharsTrieBuilder builder2 = new CharsTrieBuilder();
{
}
-
+
@Before
public void init(){
characterBreak = BreakIterator.getCharacterInstance();
for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
if (k == 2) {
errln("Break between CR and LF in string U+" + Integer.toHexString(
- (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
- (int)(work.charAt(3))));
+ (work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
+ (work.charAt(3))));
errorCount++;
if (errorCount >= 75)
return;
tb.setText(work.toString());
for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
if (k == 2) {
- errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
- + " and U+" + Integer.toHexString((int)(work.charAt(2))));
+ errln("Break between U+" + Integer.toHexString((work.charAt(1)))
+ + " and U+" + Integer.toHexString((work.charAt(2))));
errorCount++;
if (errorCount >= 75)
return;
out.append(c);
else {
out.append("\\u");
- temp = Integer.toHexString((int)c);
+ temp = Integer.toHexString(c);
out.append(zeros.substring(0, 4 - temp.length()));
out.append(temp);
}
generalIteratorTest(lineBreak, lineSelectionData);
}
-
+
/**
* @bug 4117554
int begin = 3;
int end = str.length() - 3;
// not used boolean gotException = false;
-
+
iter.setText(new StringCharacterIterator(str, begin, end, begin));
for (int index = -1; index < begin + 1; ++index) {
if (locList.length == 0)
errln("getAvailableLocales() returned an empty list!");
// I have no idea how to test this function...
-
+
com.ibm.icu.util.ULocale[] ulocList = BreakIterator.getAvailableULocales();
if (ulocList.length == 0) {
- errln("getAvailableULocales() returned an empty list!");
+ errln("getAvailableULocales() returned an empty list!");
} else {
logln("getAvailableULocales() returned " + ulocList.length + " locales");
}
}
-
+
/**
* @bug 4068137
*/
}
}
-
+
/**
* Bug 4450804
*/
assertEquals("Next point", 5, brk.next());
assertEquals("Last point", BreakIterator.DONE, brk.next());
}
-
+
/*
* Test case for Ticket#10721. BreakIterator factory method should throw NPE
* when specified locale is null.
errln("getWordInstance((ULocale)null) did not throw NPE.");
} catch (NullPointerException e) { /* OK */ }
}
-
+
/**
* Test FilteredBreakIteratorBuilder newly introduced
*/
logln("Building new BI\n");
filteredBI = builder.build(baseBI);
- logln("Testing:");
- filteredBI.setText(text);
- assertEquals("1st next", 20, filteredBI.next());
- assertEquals("1st next", 84, filteredBI.next());
- assertEquals("1st next", 90, filteredBI.next());
- assertEquals("1st next", 181, filteredBI.next());
- assertEquals("1st next", 278, filteredBI.next());
- filteredBI.first();
+ assertDefaultBreakBehavior(filteredBI, text);
}
{
assertEquals("2nd next", 278, filteredBI.next());
filteredBI.first();
}
-
+
{
logln("Constructing empty builder\n");
filteredBI = builder.build(baseBI);
if(filteredBI != null) {
- logln("Testing:");
- filteredBI.setText(text);
-
- assertEquals("5th next", 84, filteredBI.next());
- assertEquals("5th next", 278, filteredBI.next());
- filteredBI.first();
+ assertEnglishBreakBehavior(filteredBI, text);
}
}
+ {
+ logln("Constructing English @ss=standard\n");
+ filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("en-US-u-ss-standard"));
+
+ if(filteredBI != null) {
+ assertEnglishBreakBehavior(filteredBI, text);
+ }
+ }
+
+ {
+ logln("Constructing Afrikaans @ss=standard - should be == default\n");
+ filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("af-u-ss-standard"));
+
+ assertDefaultBreakBehavior(filteredBI, text);
+ }
+
+ {
+ logln("Constructing Japanese @ss=standard - should be == default\n");
+ filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("ja-u-ss-standard"));
+
+ assertDefaultBreakBehavior(filteredBI, text);
+ }
+ {
+ logln("Constructing tfg @ss=standard - should be == default\n");
+ filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("tfg-u-ss-standard"));
+
+ assertDefaultBreakBehavior(filteredBI, text);
+ }
+
{
logln("Constructing French builder");
builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH);
filteredBI = builder.build(baseBI);
if(filteredBI != null) {
- logln("Testing:");
- filteredBI.setText(text);
- assertEquals("6th next", 20, filteredBI.next());
- assertEquals("6th next", 84, filteredBI.next());
- filteredBI.first();
+ assertFrenchBreakBehavior(filteredBI, text);
}
}
}
+
+ /**
+  * Verifies the first breaks expected from the iterator built with the
+  * French filter: sets {@code text} on the iterator, asserts that the first
+  * two {@code next()} calls return 20 and 84, then rewinds with
+  * {@code first()}.
+  *
+  * @param filteredBI iterator under test
+  * @param text text to iterate over
+  */
+ private void assertFrenchBreakBehavior(BreakIterator filteredBI, String text) {
+     logln("Testing French behavior:");
+     filteredBI.setText(text);
+     final int[] expectedBreaks = { 20, 84 };
+     for (int expected : expectedBreaks) {
+         assertEquals("6th next", expected, filteredBI.next());
+     }
+     filteredBI.first();
+ }
+
+ /**
+  * Verifies the breaks expected when English sentence-break exceptions are
+  * applied: sets {@code text} on the iterator, asserts that the first two
+  * {@code next()} calls return 84 and 278, then rewinds with {@code first()}.
+  *
+  * @param filteredBI iterator under test
+  * @param text text to iterate over
+  */
+ private void assertEnglishBreakBehavior(BreakIterator filteredBI, String text) {
+     logln("Testing English filtered behavior:");
+     filteredBI.setText(text);
+
+     final int[] expectedBreaks = { 84, 278 };
+     for (int expected : expectedBreaks) {
+         assertEquals("5th next", expected, filteredBI.next());
+     }
+     filteredBI.first();
+ }
+
+ /**
+  * Verifies the unfiltered (default) sentence breaks: sets {@code text} on
+  * the iterator, asserts that five successive {@code next()} calls return
+  * 20, 84, 90, 181 and 278, then rewinds with {@code first()}.
+  *
+  * @param filteredBI iterator under test
+  * @param text text to iterate over
+  */
+ private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text) {
+     logln("Testing Default Behavior:");
+     filteredBI.setText(text);
+     final int[] expectedBreaks = { 20, 84, 90, 181, 278 };
+     for (int expected : expectedBreaks) {
+         assertEquals("1st next", expected, filteredBI.next());
+     }
+     filteredBI.first();
+ }
}
-# Copyright (c) 2001-2016 International Business Machines
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2001-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
# are merged back into ICU4C's copy of the file, lest they get overwritten later.
# TODO: figure out how to have a single copy of the file for use by both C and Java.
+
# Temp debugging tests
<locale en>
<word>
## FILTERED BREAK TESTS
# (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
-#<locale en>
-#<sent>
-#<data>\
-#•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
-#
-#<locale en@ss=standard>
-#<sent>
-#<data>\
-#•In the meantime Mr. Weston arrived with his small ship, which he had now recovered. •Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
-#
+<locale en>
+<sent>
+<data>\
+•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
+
+<locale en@ss=standard>
+<sent>
+<data>\
+•In the meantime Mr. Weston arrived with his small ship, which he had now recovered. •Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
+
+# This hits the case where "D." would match the end of "Ph.D.".
+<locale en@ss=standard>
+<sent>
+<data>\
+•Doctor with a D. •As in, Ph.D., you know.•</data>
+
+# same as root (unless some exceptions are added!)
+<locale tfg@ss=standard>
+<sent>
+<data>\
+•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
+
+# same as root (unless some exceptions are added!)
+<locale ja@ss=standard>
+<sent>
+<data>\
+•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
+
## END FILTERED BREAK TESTS
+
########################################################################################
#
#
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400></data>
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400>さ<400>れ<400>た<400></data>
-
# Ticket #11999
# Unhandled Break Engine was consuming all characters, not just unhandled.
# \U00011700 is AHOM LETTER KA. There is no dictionary for AHOM, triggering the unhandled engine,