ICU-11770 If locale has ss-standard, sentence break iterator uses suppressions data...

author Peter Edberg <pedberg@unicode.org>

Wed, 9 Sep 2015 04:05:01 +0000 (04:05 +0000)

committer Peter Edberg <pedberg@unicode.org>

Wed, 9 Sep 2015 04:05:01 +0000 (04:05 +0000)
author Peter Edberg <pedberg@unicode.org>
Wed, 9 Sep 2015 04:05:01 +0000 (04:05 +0000)
committer Peter Edberg <pedberg@unicode.org>
Wed, 9 Sep 2015 04:05:01 +0000 (04:05 +0000)
diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp

index fe231e1b01953e07f6684d8c1a477a3d5128e61f..d466fb80d25e952c0ccb9600732fba2d6dc45faa 100644 (file)
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -27,6 +27,7 @@
  #include "unicode/udata.h"
  #include "unicode/ures.h"
  #include "unicode/ustring.h"
+#include "unicode/filteredbrk.h"
  #include "ucln_cmn.h"
  #include "cstring.h"
  #include "umutex.h"
@@ -383,7 +384,7 @@ BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& statu
  }
  
  // -------------------------------------
-enum { kLBTypeLenMax = 32 };
+enum { kKeyValueLenMax = 32 };
  
  BreakIterator*
  BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
@@ -392,7 +393,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
      if (U_FAILURE(status)) {
          return NULL;
      }
-    char lbType[kLBTypeLenMax];
+    char lbType[kKeyValueLenMax];
  
      BreakIterator *result = NULL;
      switch (kind) {
@@ -405,9 +406,9 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
      case UBRK_LINE:
          uprv_strcpy(lbType, "line");
          {
-            char lbKeyValue[kLBTypeLenMax] = {0};
+            char lbKeyValue[kKeyValueLenMax] = {0};
              UErrorCode kvStatus = U_ZERO_ERROR;
-            int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kLBTypeLenMax, kvStatus);
+            int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
              if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
                  uprv_strcat(lbType, "_");
                  uprv_strcat(lbType, lbKeyValue);
@@ -417,6 +418,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
          break;
      case UBRK_SENTENCE:
          result = BreakIterator::buildInstance(loc, "sentence", kind, status);
+        {
+            char ssKeyValue[kKeyValueLenMax] = {0};
+            UErrorCode kvStatus = U_ZERO_ERROR;
+            int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
+            if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
+                FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
+                if (U_SUCCESS(kvStatus)) {
+                    result = fbiBuilder->build(result, status);
+                    delete fbiBuilder;
+                }
+            }
+        }
          break;
      case UBRK_TITLE:
          result = BreakIterator::buildInstance(loc, "title", kind, status);
diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp

index a0002c21cea90effce913adf5652fcc6e48951ab..161c0ac4d32aad7eee40906398aaa24f46e63a14 100644 (file)
--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@@ -1,6 +1,6 @@
  /*
  ********************************************************************************
-*   Copyright (C) 1996-2013, International Business Machines
+*   Copyright (C) 1996-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  ********************************************************************************
  */
@@ -164,10 +164,9 @@ ubrk_setText(UBreakIterator* bi,
               int32_t         textLength,
               UErrorCode*     status)
  {
-    BreakIterator *brit = (BreakIterator *)bi;
      UText  ut = UTEXT_INITIALIZER;
      utext_openUChars(&ut, text, textLength, status);
-    brit->setText(&ut, *status);
+    ((BreakIterator*)bi)->setText(&ut, *status);
      // A stack allocated UText wrapping a UChar * string
      //   can be dumped without explicitly closing it.
  }
@@ -179,8 +178,7 @@ ubrk_setUText(UBreakIterator *bi,
               UText          *text,
               UErrorCode     *status)
  {
-    RuleBasedBreakIterator *brit = (RuleBasedBreakIterator *)bi;
-    brit->RuleBasedBreakIterator::setText(text, *status);
+  ((BreakIterator*)bi)->setText(text, *status);
  }
  
  
@@ -191,35 +189,35 @@ U_CAPI int32_t U_EXPORT2
  ubrk_current(const UBreakIterator *bi)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::current();
+  return ((BreakIterator*)bi)->current();
  }
  
  U_CAPI int32_t U_EXPORT2
  ubrk_next(UBreakIterator *bi)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::next();
+  return ((BreakIterator*)bi)->next();
  }
  
  U_CAPI int32_t U_EXPORT2
  ubrk_previous(UBreakIterator *bi)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::previous();
+  return ((BreakIterator*)bi)->previous();
  }
  
  U_CAPI int32_t U_EXPORT2
  ubrk_first(UBreakIterator *bi)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::first();
+  return ((BreakIterator*)bi)->first();
  }
  
  U_CAPI int32_t U_EXPORT2
  ubrk_last(UBreakIterator *bi)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::last();
+  return ((BreakIterator*)bi)->last();
  }
  
  U_CAPI int32_t U_EXPORT2
@@ -227,7 +225,7 @@ ubrk_preceding(UBreakIterator *bi,
             int32_t offset)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::preceding(offset);
+  return ((BreakIterator*)bi)->preceding(offset);
  }
  
  U_CAPI int32_t U_EXPORT2
@@ -235,7 +233,7 @@ ubrk_following(UBreakIterator *bi,
             int32_t offset)
  {
  
-  return ((RuleBasedBreakIterator*)bi)->RuleBasedBreakIterator::following(offset);
+  return ((BreakIterator*)bi)->following(offset);
  }
  
  U_CAPI const char* U_EXPORT2
@@ -256,20 +254,20 @@ ubrk_countAvailable()
  U_CAPI  UBool U_EXPORT2
  ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
  {
-    return ((RuleBasedBreakIterator *)bi)->RuleBasedBreakIterator::isBoundary(offset);
+    return ((BreakIterator*)bi)->isBoundary(offset);
  }
  
  
  U_CAPI  int32_t U_EXPORT2
  ubrk_getRuleStatus(UBreakIterator *bi)
  {
-    return ((RuleBasedBreakIterator *)bi)->RuleBasedBreakIterator::getRuleStatus();
+    return ((BreakIterator*)bi)->getRuleStatus();
  }
  
  U_CAPI  int32_t U_EXPORT2
  ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status)
  {
-    return ((RuleBasedBreakIterator *)bi)->RuleBasedBreakIterator::getRuleStatusVec(fillInVec, capacity, *status);
+    return ((BreakIterator*)bi)->getRuleStatusVec(fillInVec, capacity, *status);
  }
  
  
diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h

index 9bdb4d5d9de3188e3fed72e39efeb752b5da5b04..3c26663ec961d983488bced551656bdaad85e705 100644 (file)
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@@ -1,6 +1,6 @@
  /*
  ******************************************************************************
-* Copyright (C) 1996-2014, International Business Machines Corporation and others.
+* Copyright (C) 1996-2015, International Business Machines Corporation and others.
  * All Rights Reserved.
  ******************************************************************************
  */
@@ -45,10 +45,20 @@
   * when line-wrapping. The mechanism correctly handles punctuation and
   * hyphenated words.
   * <p>
+ * Note: The locale keyword "lb" can be used to modify line break
+ * behavior according to the CSS level 3 line-break options ("strict",
+ * "normal" or "loose"); see <http://dev.w3.org/csswg/css-text/#line-breaking>.
+ * For example: "ja@lb=strict", "zh@lb=loose".
+ * <p>
   * Sentence boundary analysis allows selection with correct
   * interpretation of periods within numbers and abbreviations, and
   * trailing punctuation marks such as quotation marks and parentheses.
   * <p>
+ * Note: The locale keyword "ss" can be used to enable use of
+ * segmentation suppression data (preventing breaks in English after
+ * abbreviations such as "Mr." or "Est.", for example), as follows:
+ * "en@ss=standard".
+ * <p>
   * Word boundary analysis is used by search and replace functions, as
   * well as within text editing applications that allow the user to
   * select words with a double click. Word selection provides correct
@@ -202,7 +212,9 @@ typedef enum USentenceBreakTag {
   * and sentence breaks in text.
   * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
   * UBRK_LINE, UBRK_SENTENCE
- * @param locale The locale specifying the text-breaking conventions.
+ * @param locale The locale specifying the text-breaking conventions. Note that
+ * locale keywords such as "lb" and "ss" may be used to modify text break behavior;
+ * see the general discussion of the BreakIterator C API.
   * @param text The text to be iterated over.
   * @param textLength The number of characters in text, or -1 if null-terminated.
   * @param status A UErrorCode to receive any errors.
diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c

index a148636b988ba5130599ba0d851b03aa74cb878c..9a0f5d9924bc6bbee4c95602e0910e867794b6ff 100644 (file)
--- a/icu4c/source/test/cintltst/cbiapts.c
+++ b/icu4c/source/test/cintltst/cbiapts.c
@@ -49,6 +49,7 @@ static void TestBreakIteratorUText(void);
  static void TestBreakIteratorTailoring(void);
  static void TestBreakIteratorRefresh(void);
  static void TestBug11665(void);
+static void TestBreakIteratorSuppressions(void);
  
  void addBrkIterAPITest(TestNode** root);
  
@@ -65,6 +66,7 @@ void addBrkIterAPITest(TestNode** root)
      addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
      addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
      addTest(root, &TestBug11665, "tstxtbd/cbiapts/TestBug11665");
+    addTest(root, &TestBreakIteratorSuppressions, "tstxtbd/cbiapts/TestBreakIteratorSuppressions");
  }
  
  #define CLONETEST_ITERATOR_COUNT 2
@@ -934,5 +936,117 @@ static void TestBug11665(void) {
      ubrk_close(bi);
  }
  
+static const char testSentenceSuppressionsEn[]  = "Mr. Jones comes home. Dr. Smith Ph.D. is out. In the U.S.A. it is hot.";
+static const int32_t testSentSuppFwdOffsetsEn[] = { 22, 26, 46, 70, -1 };     /* With suppressions, currently not handling Dr. */
+static const int32_t testSentFwdOffsetsEn[]     = {  4, 22, 26, 46, 70, -1 }; /* Without suppressions */
+static const int32_t testSentSuppRevOffsetsEn[] = { 46, 26, 22,  0, -1 };     /* With suppressions, currently not handling Dr.  */
+static const int32_t testSentRevOffsetsEn[]     = { 46, 26, 22,  4,  0, -1 }; /* Without suppressions */
+
+static const char testSentenceSuppressionsDe[]  = "Wenn ich schon h\\u00F6re zu Guttenberg kommt evtl. zur\\u00FCck.";
+static const int32_t testSentSuppFwdOffsetsDe[] = { 53, -1 };       /* With suppressions */
+static const int32_t testSentFwdOffsetsDe[]     = { 53, -1 };       /* Without suppressions; no break in evtl. zur due to casing */
+static const int32_t testSentSuppRevOffsetsDe[] = {  0, -1 };       /* With suppressions */
+static const int32_t testSentRevOffsetsDe[]     = {  0, -1 };       /* Without suppressions */
+
+static const char testSentenceSuppressionsEs[]  = "Te esperamos todos los miercoles en Bravo 416, Col. El Pueblo a las 7 PM.";
+static const int32_t testSentSuppFwdOffsetsEs[] = { 73, -1 };       /* With suppressions */
+static const int32_t testSentFwdOffsetsEs[]     = { 52, 73, -1 };   /* Without suppressions */
+static const int32_t testSentSuppRevOffsetsEs[] = {  0, -1 };       /* With suppressions */
+static const int32_t testSentRevOffsetsEs[]     = { 52,  0, -1 };   /* Without suppressions */
+
+enum { kTextULenMax = 128 };
+
+typedef struct {
+    const char * locale;
+    const char * text;
+    const int32_t * expFwdOffsets;
+    const int32_t * expRevOffsets;
+} TestBISuppressionsItem;
+
+static const TestBISuppressionsItem testBISuppressionsItems[] = {
+    { "en@ss=standard", testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn },
+    { "en",             testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
+    { "fr@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     },
+    { "af@ss=standard", testSentenceSuppressionsEn, testSentSuppFwdOffsetsEn, testSentSuppRevOffsetsEn }, /* no brkiter data => en suppressions? */
+    { "zh@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
+    { "zh_Hant@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn, testSentRevOffsetsEn    }, /* brkiter data, no suppressions data => no suppressions */
+    { "fi@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
+    { "ja@ss=standard", testSentenceSuppressionsEn, testSentFwdOffsetsEn,     testSentRevOffsetsEn     }, /* brkiter data, no suppressions data => no suppressions */
+    { "de@ss=standard", testSentenceSuppressionsDe, testSentSuppFwdOffsetsDe, testSentSuppRevOffsetsDe },
+    { "de",             testSentenceSuppressionsDe, testSentFwdOffsetsDe,     testSentRevOffsetsDe     },
+    { "es@ss=standard", testSentenceSuppressionsEs, testSentSuppFwdOffsetsEs, testSentSuppRevOffsetsEs },
+    { "es",             testSentenceSuppressionsEs, testSentFwdOffsetsEs,     testSentRevOffsetsEs     },
+    { NULL, NULL, NULL }
+};
+
+static void TestBreakIteratorSuppressions(void) {
+    const TestBISuppressionsItem * itemPtr;
+    
+    for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
+        UChar textU[kTextULenMax];
+        int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);
+        UErrorCode status = U_ZERO_ERROR;
+        UBreakIterator *bi = ubrk_open(UBRK_SENTENCE, itemPtr->locale, textU, textULen, &status);
+        if (U_SUCCESS(status)) {
+            int32_t offset, start;
+            const int32_t * expOffsetPtr;
+
+            expOffsetPtr = itemPtr->expFwdOffsets;
+            ubrk_first(bi);
+            for (; (offset = ubrk_next(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
+                if (offset != *expOffsetPtr) {
+                    log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset);
+                }
+            }
+            if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
+                log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr);
+            }
+
+            expOffsetPtr = itemPtr->expFwdOffsets;
+            start = ubrk_first(bi) + 1;
+            for (; (offset = ubrk_following(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
+                if (offset != *expOffsetPtr) {
+                    log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset);
+                }
+                start = *expOffsetPtr + 1;
+            }
+            if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
+                log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr);
+            }
+
+            expOffsetPtr = itemPtr->expRevOffsets;
+            ubrk_last(bi);
+            for (; (offset = ubrk_previous(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
+                if (offset != *expOffsetPtr) {
+                    log_err("FAIL: ubrk_previous loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset);
+                }
+            }
+            if (offset == UBRK_DONE && expOffsetPtr == itemPtr->expRevOffsets &&
+                    log_knownIssue("11786", "Filtered break iterator issues at beginning/end of text")) {
+                // skip this test for problem cases until the fix for #11786 is complete
+            } else
+            if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
+                log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr);
+            }
+
+            expOffsetPtr = itemPtr->expRevOffsets;
+            start = ubrk_last(bi) - 1;
+            for (; (offset = ubrk_preceding(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
+                if (offset != *expOffsetPtr) {
+                    log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset);
+                }
+                start = *expOffsetPtr - 1;
+            }
+            if (start >=0 && (offset != UBRK_DONE || *expOffsetPtr >= 0)) {
+                log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr);
+            }
+
+            ubrk_close(bi);
+        } else {
+            log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) status %s (Are you missing data?)\n", itemPtr->locale, u_errorName(status));
+        }
+    }
+}
+
  
  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
author	Peter Edberg <pedberg@unicode.org>
	Wed, 9 Sep 2015 04:05:01 +0000 (04:05 +0000)
committer	Peter Edberg <pedberg@unicode.org>
	Wed, 9 Sep 2015 04:05:01 +0000 (04:05 +0000)
icu4c/source/common/brkiter.cpp		patch \| blob \| history
icu4c/source/common/ubrk.cpp		patch \| blob \| history
icu4c/source/common/unicode/ubrk.h		patch \| blob \| history
icu4c/source/test/cintltst/cbiapts.c		patch \| blob \| history