From d7dabc7c069a60342345b0b5fa012091909e188e Mon Sep 17 00:00:00 2001 From: Yoshito Umaoka Date: Mon, 8 Sep 2014 22:30:08 +0000 Subject: [PATCH] ICU-11029 Merging ICU4J implementation of filtered break iterator (ULI break) as technology preview from the work branch. X-SVN-Rev: 36398 --- .gitattributes | 2 + .../text/FilteredBreakIteratorBuilder.java | 108 ++++++ .../SimpleFilteredBreakIteratorBuilder.java | 314 ++++++++++++++++++ .../icu/dev/test/rbbi/BreakIteratorTest.java | 144 ++++++++ 4 files changed, 568 insertions(+) create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/text/SimpleFilteredBreakIteratorBuilder.java diff --git a/.gitattributes b/.gitattributes index 5a84e9944fb..7964079079e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -427,6 +427,8 @@ icu4j/main/classes/core/.settings/org.eclipse.jdt.core.prefs -text icu4j/main/classes/core/manifest.stub -text icu4j/main/classes/core/src/com/ibm/icu/impl/TZDBTimeZoneNames.java -text icu4j/main/classes/core/src/com/ibm/icu/impl/locale/KeyTypeData.java -text +icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java -text +icu4j/main/classes/core/src/com/ibm/icu/text/SimpleFilteredBreakIteratorBuilder.java -text icu4j/main/classes/currdata/.externalToolBuilders/copy-data-currdata.launch -text icu4j/main/classes/currdata/.settings/org.eclipse.core.resources.prefs -text icu4j/main/classes/currdata/.settings/org.eclipse.jdt.core.prefs -text diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java new file mode 100644 index 00000000000..378f1d60af4 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/FilteredBreakIteratorBuilder.java @@ -0,0 +1,108 @@ +/* + ******************************************************************************* + * Copyright (C) 2014, 
International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package com.ibm.icu.text; + +import com.ibm.icu.util.ULocale; + +/** + * The FilteredBreakIteratorBuilder is used to modify the behavior of a BreakIterator + * by constructing a new BreakIterator which suppresses certain segment boundaries. + * See http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions . + * For example, a typical English Sentence Break Iterator would break on the space + * in the string "Mr. Smith" (resulting in two segments), + * but with "Mr." as an exception, a filtered break iterator + * would consider the string "Mr. Smith" to be a single segment. + * + * @author tomzhang + * + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ +@Deprecated +public abstract class FilteredBreakIteratorBuilder { + + /** + * Construct a FilteredBreakIteratorBuilder based on rules in a locale. + * The rules are taken from CLDR exception data for the locale, + * see http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions . + * This is the equivalent of calling createInstance() + * and then repeatedly calling suppressBreakAfter(...) with the contents + * of the CLDR exception data. + * @param where the locale. + * @return the new builder + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ + @Deprecated + public static FilteredBreakIteratorBuilder createInstance(ULocale where) { + FilteredBreakIteratorBuilder ret = new SimpleFilteredBreakIteratorBuilder(where); + return ret; + } + + /** + * Construct an empty FilteredBreakIteratorBuilder. + * In this state, it will not suppress any segment boundaries. 
+ * @return the new builder + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ + @Deprecated + public static FilteredBreakIteratorBuilder createInstance() { + FilteredBreakIteratorBuilder ret = new SimpleFilteredBreakIteratorBuilder(); + return ret; + } + + /** + * Suppress a certain string from being the end of a segment. + * For example, after suppressing "Mr.", segments ending in "Mr." will not be returned + * by the iterator. + * @param str the string to suppress, such as "Mr." + * @return true if the string was not present and now added, + * false if the call was a no-op because the string was already being suppressed. + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ + @Deprecated + public abstract boolean suppressBreakAfter(String str); + + /** + * Stop suppressing a certain string from being the end of the segment. + * This function does not create any new segment boundaries, but only serves to un-do + * the effect of earlier calls to suppressBreakAfter, or to un-do the effect of + * locale data which may be suppressing certain strings. + * @param str the string to unsuppress, such as "Mr." + * @return true if the string was present and now removed, + * false if the call was a no-op because the string was not being suppressed. + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ + @Deprecated + public abstract boolean unsuppressBreakAfter(String str); + + /** + * Wrap (adopt) an existing break iterator in a new filtered instance. + * The resulting BreakIterator is owned by the caller. + * The FilteredBreakIteratorBuilder may be discarded before the BreakIterator is discarded. + * Note that the adoptBreakIterator is adopted by the new BreakIterator + * and should no longer be used by the caller. + * The FilteredBreakIteratorBuilder may be reused. 
+ * @param adoptBreakIterator the break iterator to adopt + * @return the new BreakIterator, owned by the caller. + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ + @Deprecated + public abstract BreakIterator build(BreakIterator adoptBreakIterator); + + /** + * For subclass use + * @internal ICU 54 technology preview + * @deprecated This API might change or be removed in a future release. + */ + @Deprecated + protected FilteredBreakIteratorBuilder() {} +} \ No newline at end of file diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/SimpleFilteredBreakIteratorBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/SimpleFilteredBreakIteratorBuilder.java new file mode 100644 index 00000000000..a83c2f46f93 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/SimpleFilteredBreakIteratorBuilder.java @@ -0,0 +1,314 @@ +/* + ******************************************************************************* + * Copyright (C) 2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.text.CharacterIterator; +import java.util.HashSet; + +import com.ibm.icu.impl.ICUResourceBundle; +import com.ibm.icu.util.BytesTrie; +import com.ibm.icu.util.CharsTrie; +import com.ibm.icu.util.CharsTrieBuilder; +import com.ibm.icu.util.StringTrieBuilder; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.UResourceBundle; + +/** + * @author tomzhang + */ +class SimpleFilteredSentenceBreakIterator extends BreakIterator { + + private BreakIterator delegate; + private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ? + private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs. + private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M." 
+ + /** + * @param adoptBreakIterator + * break iterator to adopt + * @param forwardsPartialTrie + * forward & partial char trie to adopt + * @param backwardsTrie + * backward trie to adopt + */ + public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, + CharsTrie backwardsTrie) { + this.delegate = adoptBreakIterator; + this.forwardsPartialTrie = forwardsPartialTrie; + this.backwardsTrie = backwardsTrie; + } + + @Override + public int next() { + int n = delegate.next(); + if (n == BreakIterator.DONE || // at end or + backwardsTrie == null) { // .. no backwards table loaded == no exceptions + return n; + } + // UCharacterIterator text; + text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone()); + do { // outer loop runs once per underlying break (from fDelegate). + // loops while 'n' points to an exception. + text.setIndex(n); + backwardsTrie.reset(); + int uch; + + // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") + if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? + // TODO only do this the 1st time? + } else { + uch = text.nextCodePoint(); + } + + BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE; + + int bestPosn = -1; + int bestValue = -1; + + while ((uch = text.previousCodePoint()) != BreakIterator.DONE && // more to consume backwards and.. + ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie + if (r.hasValue()) { // remember the best match so far + bestPosn = text.getIndex(); + bestValue = backwardsTrie.getValue(); + } + } + + if (r.matches()) { // exact match? + bestValue = backwardsTrie.getValue(); + bestPosn = text.getIndex(); + } + + if (bestPosn >= 0) { + if (bestValue == SimpleFilteredBreakIteratorBuilder.MATCH) { // exact match! + n = delegate.next(); // skip this one. Find the next lowerlevel break. 
+ if (n == BreakIterator.DONE) + return n; + continue; // See if the next is another exception. + } else if (bestValue == SimpleFilteredBreakIteratorBuilder.PARTIAL && forwardsPartialTrie != null) { + // make sure there's a forward trie + // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie + // to see if it matches something going forward. + forwardsPartialTrie.reset(); + + BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE; + text.setIndex(bestPosn); // hope that's close .. + while ((uch = text.nextCodePoint()) != BreakIterator.DONE + && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) { + } + if (rfwd.matches()) { + // only full matches here, nothing to check + // skip the next: + n = delegate.next(); + if (n == BreakIterator.DONE) + return n; + continue; + } else { + // no match (no exception) -return the 'underlying' break + return n; + } + } else { + return n; // internal error and/or no forwards trie + } + } else { + return n; // No match - so exit. Not an exception. 
+ } + } while (n != BreakIterator.DONE); + return n; + } + + @Override + public boolean equals(Object obj) { + if (obj == null) + return false; + if (this == obj) + return true; + if (getClass() != obj.getClass()) + return false; + SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj; + return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie) + && forwardsPartialTrie.equals(other.forwardsPartialTrie); + } + + @Override + public Object clone() { + SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone(); + return other; + } + + @Override + public int first() { + return delegate.first(); + } + + @Override + public int last() { + return delegate.last(); + } + + @Override + public int next(int n) { + return delegate.next(n); + } + + @Override + public int previous() { + return delegate.previous(); + } + + @Override + public int following(int offset) { + return delegate.following(offset); + } + + @Override + public int current() { + return delegate.current(); + } + + @Override + public CharacterIterator getText() { + return delegate.getText(); + } + + @Override + public void setText(CharacterIterator newText) { + delegate.setText(newText); + } +} + +public class SimpleFilteredBreakIteratorBuilder extends FilteredBreakIteratorBuilder { + /** + * filter set to store all exceptions + */ + private HashSet filterSet; + + static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie + static final int MATCH = (1 << 1); // < exact match - skip this one. 
+ static final int SuppressInReverse = (1 << 0); + static final int AddToForward = (1 << 1); + + /** + * Create SimpleFilteredBreakIteratorBuilder using given locale + * @param loc the locale to get filtered iterators + */ + public SimpleFilteredBreakIteratorBuilder(ULocale loc) { + ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( + ICUResourceBundle.ICU_BRKITR_BASE_NAME, loc); + ICUResourceBundle exceptions = rb.findWithFallback("exceptions"); + ICUResourceBundle breaks = exceptions.findWithFallback("SentenceBreak"); + + filterSet = new HashSet(); + if (breaks != null) { + for (int index = 0, size = breaks.getSize(); index < size; ++index) { + ICUResourceBundle b = (ICUResourceBundle) breaks.get(index); + String br = b.getString(); + filterSet.add(br); + } + } + } + + /** + * Create SimpleFilteredBreakIteratorBuilder with no exception + */ + public SimpleFilteredBreakIteratorBuilder() { + filterSet = new HashSet(); + } + + @Override + public boolean suppressBreakAfter(String str) { + if (filterSet == null) { + filterSet = new HashSet(); + } + return filterSet.add(str); + } + + @Override + public boolean unsuppressBreakAfter(String str) { + if (filterSet == null) { + return false; + } else { + return filterSet.remove(str); + } + } + + @Override + public BreakIterator build(BreakIterator adoptBreakIterator) { + CharsTrieBuilder builder = new CharsTrieBuilder(); + CharsTrieBuilder builder2 = new CharsTrieBuilder(); + + int revCount = 0; + int fwdCount = 0; + + int subCount = filterSet.size(); + String[] ustrs = new String[subCount]; + int[] partials = new int[subCount]; + + CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs. + CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M." + + int i = 0; + for (String s : filterSet) { + ustrs[i] = s; // copy by value? + partials[i] = 0; // default: no partial + i++; + } + + for (i = 0; i < subCount; i++) { + int nn = ustrs[i].indexOf('.'); // TODO: non-'.' 
abbreviations + if (nn > -1 && (nn + 1) != ustrs[i].length()) { + // is partial. + // is it unique? + int sameAs = -1; + for (int j = 0; j < subCount; j++) { + if (j == i) + continue; + if (ustrs[i].regionMatches(0, ustrs[j], 0, nn + 1)) { + if (partials[j] == 0) { // hasn't been processed yet + partials[j] = SuppressInReverse | AddToForward; + } else if ((partials[j] & SuppressInReverse) != 0) { + sameAs = j; // the other entry is already in the reverse table. + } + } + } + + if ((sameAs == -1) && (partials[i] == 0)) { + StringBuilder prefix = new StringBuilder(ustrs[i].substring(0, nn + 1)); + // first one - add the prefix to the reverse table. + prefix.reverse(); + builder.add(prefix, PARTIAL); + revCount++; + partials[i] = SuppressInReverse | AddToForward; + } + } + } + + for (i = 0; i < subCount; i++) { + if (partials[i] == 0) { + StringBuilder reversed = new StringBuilder(ustrs[i]).reverse(); + builder.add(reversed, MATCH); + revCount++; + } else { + // an optimization would be to only add the portion after the '.' + // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the + // forward, + // instead of "Ph.D." since we already know the "Ph." part is a match. + // would need the trie to be able to hold 0-length strings, though. 
+ builder2.add(ustrs[i], MATCH); // forward + fwdCount++; + } + } + + if (revCount > 0) { + backwardsTrie = builder.build(StringTrieBuilder.Option.FAST); + } + + if (fwdCount > 0) { + forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST); + } + return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie); + } +} diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java index 902aa1d961b..a3bda1e99c7 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java @@ -13,6 +13,7 @@ import java.util.Locale; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.FilteredBreakIteratorBuilder; import com.ibm.icu.util.ULocale; public class BreakIteratorTest extends TestFmwk @@ -906,4 +907,147 @@ public class BreakIteratorTest extends TestFmwk errln("getWordInstance((ULocale)null) did not throw NPE."); } catch (NullPointerException e) { /* OK */ } } + + /** + * Test FilteredBreakIteratorBuilder newly introduced + */ + public void TestFilteredBreakIteratorBuilder() { + FilteredBreakIteratorBuilder builder; + BreakIterator baseBI; + BreakIterator filteredBI; + + String text = "In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."; // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited. 
+ String ABBR_MR = "Mr."; + String ABBR_CAPT = "Capt."; + + { + logln("Constructing empty builder\n"); + builder = FilteredBreakIteratorBuilder.createInstance(); + + logln("Constructing base BI\n"); + baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); + + logln("Building new BI\n"); + filteredBI = builder.build(baseBI); + + logln("Testing:"); + filteredBI.setText(text); + assertEquals("1st next", 20, filteredBI.next()); + assertEquals("1st next", 84, filteredBI.next()); + assertEquals("1st next", 90, filteredBI.next()); + assertEquals("1st next", 181, filteredBI.next()); + assertEquals("1st next", 278, filteredBI.next()); + filteredBI.first(); + } + + { + logln("Constructing empty builder\n"); + builder = FilteredBreakIteratorBuilder.createInstance(); + + logln("Adding Mr. as an exception\n"); + + assertEquals("2.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); + assertEquals("2.2 suppressBreakAfter", false, builder.suppressBreakAfter(ABBR_MR)); + assertEquals("2.3 unsuppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_MR)); + assertEquals("2.4 unsuppressBreakAfter", false, builder.unsuppressBreakAfter(ABBR_MR)); + assertEquals("2.5 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); + + logln("Constructing base BI\n"); + baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); + + logln("Building new BI\n"); + filteredBI = builder.build(baseBI); + + logln("Testing:"); + filteredBI.setText(text); + assertEquals("2nd next", 84, filteredBI.next()); + assertEquals("2nd next", 90, filteredBI.next()); + assertEquals("2nd next", 278, filteredBI.next()); + filteredBI.first(); + } + + + { + logln("Constructing empty builder\n"); + builder = FilteredBreakIteratorBuilder.createInstance(); + + logln("Adding Mr. 
and Capt as an exception\n"); + assertEquals("3.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR)); + assertEquals("3.2 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_CAPT)); + + logln("Constructing base BI\n"); + baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); + + logln("Building new BI\n"); + filteredBI = builder.build(baseBI); + + logln("Testing:"); + filteredBI.setText(text); + assertEquals("3rd next", 84, filteredBI.next()); + assertEquals("3rd next", 278, filteredBI.next()); + filteredBI.first(); + } + + { + logln("Constructing English builder\n"); + builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH); + + logln("Constructing base BI\n"); + baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); + + logln("unsuppressing 'Capt'"); + assertEquals("1st suppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_CAPT)); + + logln("Building new BI\n"); + filteredBI = builder.build(baseBI); + + if(filteredBI != null) { + logln("Testing:"); + filteredBI.setText(text); + assertEquals("4th next", 84, filteredBI.next()); + assertEquals("4th next", 90, filteredBI.next()); + assertEquals("4th next", 278, filteredBI.next()); + filteredBI.first(); + } + } + + { + logln("Constructing English builder\n"); + builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH); + + logln("Constructing base BI\n"); + baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH); + + logln("Building new BI\n"); + filteredBI = builder.build(baseBI); + + if(filteredBI != null) { + logln("Testing:"); + filteredBI.setText(text); + + assertEquals("5th next", 84, filteredBI.next()); + assertEquals("5th next", 278, filteredBI.next()); + filteredBI.first(); + } + } + + { + logln("Constructing French builder"); + builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH); + + logln("Constructing base BI\n"); + baseBI = BreakIterator.getSentenceInstance(Locale.FRENCH); + + logln("Building new BI\n"); + filteredBI = 
builder.build(baseBI); + + if(filteredBI != null) { + logln("Testing:"); + filteredBI.setText(text); + assertEquals("6th next", 20, filteredBI.next()); + assertEquals("6th next", 84, filteredBI.next()); + filteredBI.first(); + } + } + } } -- 2.40.0