From 152867f7ab2b87a5765d403160b914bae83659ec Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 5 Apr 2021 15:57:33 -0700 Subject: [PATCH] ICU-21459 properly guard BytesTrie.Result.getValue() and clone/copy objects so that objects shared among instances are not modified and use an atomic int for the C++ refcount --- icu4c/source/common/filteredbrk.cpp | 93 ++++++++++++------- icu4c/source/test/testdata/rbbitst.txt | 5 + .../SimpleFilteredSentenceBreakIterator.java | 46 ++++++--- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 7 +- 4 files changed, 103 insertions(+), 48 deletions(-) diff --git a/icu4c/source/common/filteredbrk.cpp b/icu4c/source/common/filteredbrk.cpp index c07128cbce1..25080f9d33a 100644 --- a/icu4c/source/common/filteredbrk.cpp +++ b/icu4c/source/common/filteredbrk.cpp @@ -20,6 +20,7 @@ #include "ubrkimpl.h" // U_ICUDATA_BRKITR #include "uvector.h" #include "cmemory.h" +#include "umutex.h" U_NAMESPACE_BEGIN @@ -139,13 +140,30 @@ class SimpleFilteredSentenceBreakData : public UMemory { public: SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } - SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } - SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } - virtual ~SimpleFilteredSentenceBreakData(); + SimpleFilteredSentenceBreakData *incr() { + umtx_atomic_inc(&refcount); + return this; + } + SimpleFilteredSentenceBreakData *decr() { + if(umtx_atomic_dec(&refcount) <= 0) { + delete this; + } + return 0; + } + virtual ~SimpleFilteredSentenceBreakData(); + + bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); } + bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); } - LocalPointer fForwardsPartialTrie; // Has ".a" for "a.M." - LocalPointer fBackwardsTrie; // i.e. ".srM" for Mrs. - int32_t refcount; + const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; } + const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; } + +private: + // These tries own their data arrays. + // They are shared and must therefore not be modified. + LocalPointer fForwardsPartialTrie; // Has ".a" for "a.M." + LocalPointer fBackwardsTrie; // i.e. ".srM" for Mrs. + u_atomic_int32_t refcount; }; SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} @@ -244,7 +262,13 @@ SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIt fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), fDelegate(adopt) { - // all set.. + if (fData == nullptr) { + delete forwards; + delete backwards; + if (U_SUCCESS(status)) { + status = U_MEMORY_ALLOCATION_ERROR; + } + } } SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { @@ -261,59 +285,62 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { int32_t bestValue = -1; // loops while 'n' points to an exception. utext_setNativeIndex(fText.getAlias(), n); // from n.. - fData->fBackwardsTrie->reset(); - UChar32 uch; //if(debug2) u_printf(" n@ %d\n", n); // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") - if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? + if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here?? // TODO only do this the 1st time? //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); } else { //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); - uch = utext_next32(fText.getAlias()); + utext_next32(fText.getAlias()); //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); } - UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; - - while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. - USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie - if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far - bestPosn = utext_getNativeIndex(fText.getAlias()); - bestValue = fData->fBackwardsTrie->getValue(); - } - //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); + { + // Do not modify the shared trie! + UCharsTrie iter(fData->getBackwardsTrie()); + UChar32 uch; + while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards + UStringTrieResult r = iter.nextForCodePoint(uch); + if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far + bestPosn = utext_getNativeIndex(fText.getAlias()); + bestValue = iter.getValue(); + } + if(!USTRINGTRIE_HAS_NEXT(r)) { + break; + } + //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); + } } - if(USTRINGTRIE_MATCHES(r)) { // exact match? - //if(debug2) u_printf("revfBackwardsTrie->getValue(); - bestPosn = utext_getNativeIndex(fText.getAlias()); - //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); - } + //if(bestValue >= 0) { + //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); + //} if(bestPosn>=0) { //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? - //int32_t bestValue = fBackwardsTrie->getValue(); + //int32_t bestValue = iter.getValue(); ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); if(bestValue == kMATCH) { // exact match! //if(debug2) u_printf(" exact backward match\n"); return kExceptionHere; // See if the next is another exception. } else if(bestValue == kPARTIAL - && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie + && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie //if(debug2) u_printf(" partial backward match\n"); // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie // to see if it matches something going forward. - fData->fForwardsPartialTrie->reset(); UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. //if(debug2) u_printf("Retrying at %d\n", bestPosn); + // Do not modify the shared trie! + UCharsTrie iter(fData->getForwardsPartialTrie()); + UChar32 uch; while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && - USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { + USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) { //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); } if(USTRINGTRIE_MATCHES(rfwd)) { @@ -339,7 +366,7 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { int32_t SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { if(n == UBRK_DONE || // at end or - fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions + !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions return n; } // OK, do we need to break here? @@ -369,7 +396,7 @@ SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { int32_t SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { if(n == 0 || n == UBRK_DONE || // at end or - fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions + !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions return n; } // OK, do we need to break here? @@ -420,7 +447,7 @@ SimpleFilteredSentenceBreakIterator::previous(void) { UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { if (!fDelegate->isBoundary(offset)) return false; // no break to suppress - if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions + if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions UErrorCode status = U_ZERO_ERROR; resetState(status); diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 3fae72cba95..face041429c 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -62,6 +62,11 @@ \ •Doctor with a D. •As in, Ph.D., you know.• +# ICU-21459 logic error. + + +•on. •But after a day in the arena sun, the metal feels hot enough to blister my hands.• + # same as root (unless some exceptions are added!) diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java index bef89a2d0a1..e9655a83ae0 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/SimpleFilteredSentenceBreakIterator.java @@ -19,6 +19,7 @@ import com.ibm.icu.text.UCharacterIterator; import com.ibm.icu.util.BytesTrie; import com.ibm.icu.util.CharsTrie; import com.ibm.icu.util.CharsTrieBuilder; +import com.ibm.icu.util.ICUCloneNotSupportedException; import com.ibm.icu.util.StringTrieBuilder; import com.ibm.icu.util.ULocale; @@ -72,8 +73,6 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { backwardsTrie.reset(); int uch; - - // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? // TODO only do this the 1st time? @@ -81,20 +80,17 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { uch = text.nextCodePoint(); } - BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE; - - while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and.. - ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie + while ((uch = text.previousCodePoint()) >= 0) { // more to consume backwards + BytesTrie.Result r = backwardsTrie.nextForCodePoint(uch); if (r.hasValue()) { // remember the best match so far bestPosn = text.getIndex(); bestValue = backwardsTrie.getValue(); } + if (!r.hasNext()) { + break; + } } - - if (r.matches()) { // exact match? - bestValue = backwardsTrie.getValue(); - bestPosn = text.getIndex(); - } + backwardsTrie.reset(); // for equals() & hashCode() if (bestPosn >= 0) { if (bestValue == Builder.MATCH) { // exact match! @@ -110,6 +106,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { while ((uch = text.nextCodePoint()) != BreakIterator.DONE && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) { } + forwardsPartialTrie.reset(); // for equals() & hashCode() if (rfwd.matches()) { // Exception here return true; @@ -186,18 +183,39 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { if (getClass() != obj.getClass()) return false; SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj; - return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie) + // TODO(ICU-21575): CharsTrie.equals() is not defined. + // Should compare the underlying data, and can then stop resetting after iteration. + return delegate.equals(other.delegate) && text.equals(other.text) + && backwardsTrie.equals(other.backwardsTrie) && forwardsPartialTrie.equals(other.forwardsPartialTrie); } @Override public int hashCode() { - return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode(); + // TODO(ICU-21575): CharsTrie.hashCode() is not defined. + return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + + delegate.hashCode(); } @Override public Object clone() { SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone(); + try { + if (delegate != null) { + other.delegate = (BreakIterator) delegate.clone(); + } + if (text != null) { + other.text = (UCharacterIterator) text.clone(); + } + if (backwardsTrie != null) { + other.backwardsTrie = backwardsTrie.clone(); + } + if (forwardsPartialTrie != null) { + other.forwardsPartialTrie = forwardsPartialTrie.clone(); + } + } catch (CloneNotSupportedException e) { + throw new ICUCloneNotSupportedException(e); + } return other; } @@ -273,7 +291,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator { /** * filter set to store all exceptions */ - private HashSet filterSet = new HashSet(); + private HashSet filterSet = new HashSet<>(); static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie static final int MATCH = (1 << 1); // < exact match - skip this one. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 98cf6883d72..face041429c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -32,7 +32,7 @@ # [ICU4C] source/test/testdata/rbbitst.txt # [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt # -# ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sure they +# ICU4C's copy is the primary one. If any changes are made to ICU4J's copy, make sure they # are merged back into ICU4C's copy of the file, lest they get overwritten later. # TODO: figure out how to have a single copy of the file for use by both C and Java. @@ -62,6 +62,11 @@ \ •Doctor with a D. •As in, Ph.D., you know.• +# ICU-21459 logic error. + + +•on. •But after a day in the arena sun, the metal feels hot enough to blister my hands.• + # same as root (unless some exceptions are added!) -- 2.40.0