From: Peter Edberg Date: Wed, 9 Sep 2015 04:05:01 +0000 (+0000) Subject: ICU-11770 If locale has ss-standard, sentence break iterator uses suppressions data... X-Git-Tag: milestone-59-0-1~906 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cd4634345e6810faa5ffdccdbcbcbfc14fdd0e71;p=icu ICU-11770 If locale has ss-standard, sentence break iterator uses suppressions data via FilteredBreakIterator X-SVN-Rev: 37914 --- diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index fe231e1b019..d466fb80d25 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -27,6 +27,7 @@ #include "unicode/udata.h" #include "unicode/ures.h" #include "unicode/ustring.h" +#include "unicode/filteredbrk.h" #include "ucln_cmn.h" #include "cstring.h" #include "umutex.h" @@ -383,7 +384,7 @@ BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& statu } // ------------------------------------- -enum { kLBTypeLenMax = 32 }; +enum { kKeyValueLenMax = 32 }; BreakIterator* BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) @@ -392,7 +393,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) if (U_FAILURE(status)) { return NULL; } - char lbType[kLBTypeLenMax]; + char lbType[kKeyValueLenMax]; BreakIterator *result = NULL; switch (kind) { @@ -405,9 +406,9 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_LINE: uprv_strcpy(lbType, "line"); { - char lbKeyValue[kLBTypeLenMax] = {0}; + char lbKeyValue[kKeyValueLenMax] = {0}; UErrorCode kvStatus = U_ZERO_ERROR; - int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kLBTypeLenMax, kvStatus); + int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { uprv_strcat(lbType, "_"); 
uprv_strcat(lbType, lbKeyValue); @@ -417,6 +418,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) break; case UBRK_SENTENCE: result = BreakIterator::buildInstance(loc, "sentence", kind, status); + { + char ssKeyValue[kKeyValueLenMax] = {0}; + UErrorCode kvStatus = U_ZERO_ERROR; + int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus); + if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) { + FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus); + if (U_SUCCESS(kvStatus)) { + result = fbiBuilder->build(result, status); + delete fbiBuilder; + } + } + } break; case UBRK_TITLE: result = BreakIterator::buildInstance(loc, "title", kind, status); diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp index a0002c21cea..161c0ac4d32 100644 --- a/icu4c/source/common/ubrk.cpp +++ b/icu4c/source/common/ubrk.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************** -* Copyright (C) 1996-2013, International Business Machines +* Copyright (C) 1996-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************** */ @@ -164,10 +164,9 @@ ubrk_setText(UBreakIterator* bi, int32_t textLength, UErrorCode* status) { - BreakIterator *brit = (BreakIterator *)bi; UText ut = UTEXT_INITIALIZER; utext_openUChars(&ut, text, textLength, status); - brit->setText(&ut, *status); + ((BreakIterator*)bi)->setText(&ut, *status); // A stack allocated UText wrapping a UChar * string // can be dumped without explicitly closing it. 
} @@ -179,8 +178,7 @@ ubrk_setUText(UBreakIterator *bi, UText *text, UErrorCode *status) { - RuleBasedBreakIterator *brit = (RuleBasedBreakIterator *)bi; - brit->RuleBasedBreakIterator::setText(text, *status); + ((BreakIterator*)bi)->setText(text, *status); } @@ -191,35 +189,35 @@ U_CAPI int32_t U_EXPORT2 ubrk_current(const UBreakIterator *bi) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::current(); + return ((BreakIterator*)bi)->current(); } U_CAPI int32_t U_EXPORT2 ubrk_next(UBreakIterator *bi) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::next(); + return ((BreakIterator*)bi)->next(); } U_CAPI int32_t U_EXPORT2 ubrk_previous(UBreakIterator *bi) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::previous(); + return ((BreakIterator*)bi)->previous(); } U_CAPI int32_t U_EXPORT2 ubrk_first(UBreakIterator *bi) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::first(); + return ((BreakIterator*)bi)->first(); } U_CAPI int32_t U_EXPORT2 ubrk_last(UBreakIterator *bi) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::last(); + return ((BreakIterator*)bi)->last(); } U_CAPI int32_t U_EXPORT2 @@ -227,7 +225,7 @@ ubrk_preceding(UBreakIterator *bi, int32_t offset) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::preceding(offset); + return ((BreakIterator*)bi)->preceding(offset); } U_CAPI int32_t U_EXPORT2 @@ -235,7 +233,7 @@ ubrk_following(UBreakIterator *bi, int32_t offset) { - return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::following(offset); + return ((BreakIterator*)bi)->following(offset); } U_CAPI const char* U_EXPORT2 @@ -256,20 +254,20 @@ ubrk_countAvailable() U_CAPI UBool U_EXPORT2 ubrk_isBoundary(UBreakIterator *bi, int32_t offset) { - return ((RuleBasedBreakIterator *)bi)->RuleBasedBreakIterator::isBoundary(offset); + return ((BreakIterator*)bi)->isBoundary(offset); } U_CAPI int32_t U_EXPORT2 ubrk_getRuleStatus(UBreakIterator *bi) { - return 
((RuleBasedBreakIterator *)bi)->RuleBasedBreakIterator::getRuleStatus(); + return ((BreakIterator*)bi)->getRuleStatus(); } U_CAPI int32_t U_EXPORT2 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status) { - return ((RuleBasedBreakIterator *)bi)->RuleBasedBreakIterator::getRuleStatusVec(fillInVec, capacity, *status); + return ((BreakIterator*)bi)->getRuleStatusVec(fillInVec, capacity, *status); } diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h index 9bdb4d5d9de..3c26663ec96 100644 --- a/icu4c/source/common/unicode/ubrk.h +++ b/icu4c/source/common/unicode/ubrk.h @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 1996-2014, International Business Machines Corporation and others. +* Copyright (C) 1996-2015, International Business Machines Corporation and others. * All Rights Reserved. ****************************************************************************** */ @@ -45,10 +45,20 @@ * when line-wrapping. The mechanism correctly handles punctuation and * hyphenated words. *

+ * Note: The locale keyword "lb" can be used to modify line break + * behavior according to the CSS level 3 line-break options, see + * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example: + * "ja@lb=strict", "zh@lb=loose". + *

* Sentence boundary analysis allows selection with correct * interpretation of periods within numbers and abbreviations, and * trailing punctuation marks such as quotation marks and parentheses. *

+ * Note: The locale keyword "ss" can be used to enable use of + * segmentation suppression data (preventing breaks in English after + * abbreviations such as "Mr." or "Est.", for example), as follows: + * "en@ss=standard". + *

* Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct @@ -202,7 +212,9 @@ typedef enum USentenceBreakTag { * and sentence breaks in text. * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, * UBRK_LINE, UBRK_SENTENCE - * @param locale The locale specifying the text-breaking conventions. + * @param locale The locale specifying the text-breaking conventions. Note that + * locale keys such as "lb" and "ss" may be used to modify text break behavior, + * see general discussion of BreakIterator C API. * @param text The text to be iterated over. * @param textLength The number of characters in text, or -1 if null-terminated. * @param status A UErrorCode to receive any errors. diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c index a148636b988..9a0f5d9924b 100644 --- a/icu4c/source/test/cintltst/cbiapts.c +++ b/icu4c/source/test/cintltst/cbiapts.c @@ -49,6 +49,7 @@ static void TestBreakIteratorUText(void); static void TestBreakIteratorTailoring(void); static void TestBreakIteratorRefresh(void); static void TestBug11665(void); +static void TestBreakIteratorSuppressions(void); void addBrkIterAPITest(TestNode** root); @@ -65,6 +66,7 @@ void addBrkIterAPITest(TestNode** root) addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring"); addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh"); addTest(root, &TestBug11665, "tstxtbd/cbiapts/TestBug11665"); + addTest(root, &TestBreakIteratorSuppressions, "tstxtbd/cbiapts/TestBreakIteratorSuppressions"); } #define CLONETEST_ITERATOR_COUNT 2 @@ -934,5 +936,117 @@ static void TestBug11665(void) { ubrk_close(bi); } +static const char testSentenceSuppressionsEn[] = "Mr. Jones comes home. Dr. Smith Ph.D. is out. In the U.S.A. 
it is hot."; +static const int32_t testSentSuppFwdOffsetsEn[] = { 22, 26, 46, 70, -1 }; /* With suppressions, currently not handling Dr. */ +static const int32_t testSentFwdOffsetsEn[] = { 4, 22, 26, 46, 70, -1 }; /* Without suppressions */ +static const int32_t testSentSuppRevOffsetsEn[] = { 46, 26, 22, 0, -1 }; /* With suppressions, currently not handling Dr. */ +static const int32_t testSentRevOffsetsEn[] = { 46, 26, 22, 4, 0, -1 }; /* Without suppressions */ + +static const char testSentenceSuppressionsDe[] = "Wenn ich schon h\\u00F6re zu Guttenberg kommt evtl. zur\\u00FCck."; +static const int32_t testSentSuppFwdOffsetsDe[] = { 53, -1 }; /* With suppressions */ +static const int32_t testSentFwdOffsetsDe[] = { 53, -1 }; /* Without suppressions; no break in evtl. zur due to casing */ +static const int32_t testSentSuppRevOffsetsDe[] = { 0, -1 }; /* With suppressions */ +static const int32_t testSentRevOffsetsDe[] = { 0, -1 }; /* Without suppressions */ + +static const char testSentenceSuppressionsEs[] = "Te esperamos todos los miercoles en Bravo 416, Col. 
El Pueblo a las 7 PM."; +static const int32_t testSentSuppFwdOffsetsEs[] = { 73, -1 }; /* With suppressions */ +static const int32_t testSentFwdOffsetsEs[] = { 52, 73, -1 }; /* Without suppressions */ +static const int32_t testSentSuppRevOffsetsEs[] = { 0, -1 }; /* With suppressions */ +static const int32_t testSentRevOffsetsEs[] = { 52, 0, -1 }; /* Without suppressions */ + +enum { kTextULenMax = 128 }; + +typedef struct { + const char * locale; + const char * text; + const int32_t * expFwdOffsets; + const int32_t * expRevOffsets; +} TestBISuppressionsItem; + +static const TestBISuppressionsItem testBISuppressionsItems[] = { + { "en@ss=standard", testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn }, + { "en", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn }, + { "fr@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn }, + { "af@ss=standard", testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn }, /* no brkiter data => en suppressions? 
*/ + { "zh@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn }, /* brkiter data, no suppressions data => no suppressions */ + { "zh_Hant@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn }, /* brkiter data, no suppressions data => no suppressions */ + { "fi@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn }, /* brkiter data, no suppressions data => no suppressions */ + { "ja@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn }, /* brkiter data, no suppressions data => no suppressions */ + { "de@ss=standard", testSentenceSuppressionsDe, testSentSuppFwdOffsetsDe, testSentSuppRevOffsetsDe }, + { "de", testSentenceSuppressionsDe, testSentFwdOffsetsDe, testSentRevOffsetsDe }, + { "es@ss=standard", testSentenceSuppressionsEs, testSentSuppFwdOffsetsEs, testSentSuppRevOffsetsEs }, + { "es", testSentenceSuppressionsEs, testSentFwdOffsetsEs, testSentRevOffsetsEs }, + { NULL, NULL, NULL } +}; + +static void TestBreakIteratorSuppressions(void) { + const TestBISuppressionsItem * itemPtr; + + for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) { + UChar textU[kTextULenMax]; + int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax); + UErrorCode status = U_ZERO_ERROR; + UBreakIterator *bi = ubrk_open(UBRK_SENTENCE, itemPtr->locale, textU, textULen, &status); + if (U_SUCCESS(status)) { + int32_t offset, start; + const int32_t * expOffsetPtr; + + expOffsetPtr = itemPtr->expFwdOffsets; + ubrk_first(bi); + for (; (offset = ubrk_next(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { + if (offset != *expOffsetPtr) { + log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset); + } + } + if (offset != UBRK_DONE || *expOffsetPtr >= 0) { + log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr); 
+ } + + expOffsetPtr = itemPtr->expFwdOffsets; + start = ubrk_first(bi) + 1; + for (; (offset = ubrk_following(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { + if (offset != *expOffsetPtr) { + log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset); + } + start = *expOffsetPtr + 1; + } + if (offset != UBRK_DONE || *expOffsetPtr >= 0) { + log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr); + } + + expOffsetPtr = itemPtr->expRevOffsets; + ubrk_last(bi); + for (; (offset = ubrk_previous(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { + if (offset != *expOffsetPtr) { + log_err("FAIL: ubrk_previous loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset); + } + } + if (offset == UBRK_DONE && expOffsetPtr == itemPtr->expRevOffsets && + log_knownIssue("11786", "Filtered break iterator issues at beginning/end of text")) { + // skip this test for problem cases until the fix for #11786 is complete + } else + if (offset != UBRK_DONE || *expOffsetPtr >= 0) { + log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr); + } + + expOffsetPtr = itemPtr->expRevOffsets; + start = ubrk_last(bi) - 1; + for (; (offset = ubrk_preceding(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) { + if (offset != *expOffsetPtr) { + log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset); + } + start = *expOffsetPtr - 1; + } + if (start >=0 && (offset != UBRK_DONE || *expOffsetPtr >= 0)) { + log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr); + } + + ubrk_close(bi); + } else { + log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) 
status %s (Are you missing data?)\n", itemPtr->locale, u_errorName(status)); + } + } +} + #endif /* #if !UCONFIG_NO_BREAK_ITERATION */