ICU-12914 Add ubrk_openBinaryRules, ubrk_getBinaryRules, and simple test

author Peter Edberg <pedberg@unicode.org>

Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)

committer Peter Edberg <pedberg@unicode.org>

Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
author Peter Edberg <pedberg@unicode.org>
Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
committer Peter Edberg <pedberg@unicode.org>
Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp

index b02c966b107d7f3eb20a163dec9e82aa5e6644b1..925c1e90a91ba6ec51de934a7901e9f6c5dbccdc 100644 (file)
--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@@ -20,6 +20,7 @@
  #include "unicode/rbbi.h"
  #include "rbbirb.h"
  #include "uassert.h"
+#include "cmemory.h"
  
  U_NAMESPACE_USE
  
@@ -119,7 +120,24 @@ ubrk_openRules(  const UChar        *rules,
  }
  
  
-
+U_CAPI UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
+                     const UChar *  text, int32_t textLength,
+                     UErrorCode *   status)
+{
+    if (U_FAILURE(*status)) { // do nothing if the caller already has an error recorded
+        return NULL;
+    }
+    LocalPointer<RuleBasedBreakIterator> lpRBBI(new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status);
+    if (U_FAILURE(*status)) { // construction failed; lpRBBI still holds the object (if any) on this return path
+        return NULL;
+    }
+    UBreakIterator *uBI = reinterpret_cast<UBreakIterator *>(lpRBBI.orphan()); // orphan(): the returned handle now carries the iterator
+    if (text != NULL) { // text is optional; when NULL the caller sets it later with ubrk_setText()
+        ubrk_setText(uBI, text, textLength, status);
+    }
+    return uBI; // NOTE: returned non-NULL even if ubrk_setText() just set *status to a failure
+}
  
  
  U_CAPI UBreakIterator * U_EXPORT2
@@ -288,7 +306,8 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
  }
  
  
-void ubrk_refreshUText(UBreakIterator *bi,
+U_CAPI void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
                         UText          *text,
                         UErrorCode     *status)
  {
@@ -296,6 +315,34 @@ void ubrk_refreshUText(UBreakIterator *bi,
      bii->refreshInputText(text, *status);
  }
  
+U_CAPI uint32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+                    uint8_t *       binaryRules, uint32_t rulesCapacity,
+                    UErrorCode *    status)
+{
+    if (U_FAILURE(*status)) { // do nothing if the caller already has an error recorded
+        return 0;
+    }
+    if (binaryRules == NULL && rulesCapacity > 0) { // a nonzero capacity requires an actual buffer
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    RuleBasedBreakIterator* rbbi;
+    if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) { // bi must really be a RuleBasedBreakIterator; a NULL bi fails here too
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    uint32_t rulesLength; // no initializer: read below, so getBinaryRules() is relied on to set it
+    const uint8_t * returnedRules = rbbi->getBinaryRules(rulesLength);
+    if (binaryRules != NULL) { // not preflighting: the caller supplied a buffer to fill
+        if (rulesLength > rulesCapacity) {
+            *status = U_BUFFER_OVERFLOW_ERROR; // buffer too small: nothing is copied
+        } else {
+            uprv_memcpy(binaryRules, returnedRules, rulesLength);
+        }
+    }
+    return rulesLength; // the full length is returned in the preflight, overflow and success cases alike
+}
  
  
  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h

index f43943ed1ab12c1798945dbdea02f9db163d1ab9..1c8f62a17c87f41aaf10049ba468e9672367418a 100644 (file)
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@@ -267,6 +267,34 @@ ubrk_openRules(const UChar     *rules,
                 UParseError     *parseErr,
                 UErrorCode      *status);
  
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
+ * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
+ * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
+ * compatible across different major versions of ICU, nor across platforms of different
+ * endianness or different base character set family (ASCII vs EBCDIC).
+ * @param binaryRules A set of compiled binary rules specifying the text breaking
+ *                    conventions. Ownership of the storage containing the compiled
+ *                    rules remains with the caller of this function. The compiled
+ *                    rules must not be modified or deleted during the life of the
+ *                    break iterator.
+ * @param rulesLength The length of binaryRules in bytes.
+ * @param text        The text to be iterated over.  May be null, in which case
+ *                    ubrk_setText() is used to specify the text to be iterated.
+ * @param textLength  The number of characters in text, or -1 if null-terminated.
+ * @param status      Pointer to UErrorCode to receive any errors.
+ * @return            UBreakIterator for the specified rules.
+ * @see ubrk_getBinaryRules
+ * @draft ICU 59
+ */
+U_DRAFT UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
+                     const UChar *  text, int32_t textLength,
+                     UErrorCode *   status);
+
+#endif  /* U_HIDE_DRAFT_API */
+
  /**
   * Thread safe cloning operation
   * @param bi iterator to be cloned
@@ -566,6 +594,35 @@ ubrk_refreshUText(UBreakIterator *bi,
                         UText          *text,
                         UErrorCode     *status);
  
+
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
+ * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
+ * more quickly than using ubrk_openRules. The compiled rules are not compatible across
+ * different major versions of ICU, nor across platforms of different endianness or
+ * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
+ * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
+ * the binaryRules buffer.
+ * @param bi            The break iterator to use.
+ * @param binaryRules   Buffer to receive the compiled binary rules; set to NULL for
+ *                      preflighting.
+ * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
+ *                      preflighting.
+ * @param status        Pointer to UErrorCode to receive any errors.
+ * @return              The actual byte length of the binary rules. If not preflighting
+ *                      and this is larger than rulesCapacity, *status will be set to
+ *                      U_BUFFER_OVERFLOW_ERROR and nothing is copied.
+ * @see ubrk_openBinaryRules
+ * @draft ICU 59
+ */
+U_DRAFT uint32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+                    uint8_t *       binaryRules, uint32_t rulesCapacity,
+                    UErrorCode *    status);
+
+#endif  /* U_HIDE_DRAFT_API */
+
  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  
  #endif
diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c

index 398db3c7fe3c3a3a6eaea936cf9da9ccfb7405ef..7ad1924ca8913ab436f4ca8c7e2fce0b97ebbc38 100644 (file)
--- a/icu4c/source/test/cintltst/cbiapts.c
+++ b/icu4c/source/test/cintltst/cbiapts.c
@@ -10,7 +10,7 @@
  * File CBIAPTS.C
  *
  * Modification History:
-*        Name                     Description            
+*        Name                     Description
  *     Madhu Katragadda              Creation
  *********************************************************************************/
  /*C API TEST FOR BREAKITERATOR */
@@ -128,7 +128,7 @@ static UChar* toUChar(const char *src, void **freeHook) {
      if (dest == NULL) {
          return NULL;
      }
-    
+
      dest->link = (StringStruct*)(*freeHook);
      *freeHook = dest;
      return dest->str;
@@ -164,7 +164,7 @@ static void TestBreakIteratorCAPI()
  
  /*test ubrk_open()*/
      log_verbose("\nTesting BreakIterator open functions\n");
-                                            
+
      /* Use french for fun */
      word         = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
      if(status == U_FILE_ACCESS_ERROR) {
@@ -176,7 +176,7 @@ static void TestBreakIteratorCAPI()
      else{
          log_verbose("PASS: Successfully opened  word breakiterator\n");
      }
-    
+
      sentence     = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
      if(U_FAILURE(status)){
          log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
@@ -185,7 +185,7 @@ static void TestBreakIteratorCAPI()
      else{
          log_verbose("PASS: Successfully opened  sentence breakiterator\n");
      }
-    
+
      line         = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
      if(U_FAILURE(status)){
          log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
@@ -194,7 +194,7 @@ static void TestBreakIteratorCAPI()
      else{
          log_verbose("PASS: Successfully opened  line breakiterator\n");
      }
-    
+
      character     = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
      if(U_FAILURE(status)){
          log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
@@ -232,10 +232,10 @@ static void TestBreakIteratorCAPI()
      }
      for(i=0;i<count;i++)
      {
-        log_verbose("%s\n", ubrk_getAvailable(i)); 
+        log_verbose("%s\n", ubrk_getAvailable(i));
          if (ubrk_getAvailable(i) == 0)
              log_err("No locale for which breakiterator is applicable\n");
-        else 
+        else
              log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
      }
  
@@ -258,10 +258,10 @@ static void TestBreakIteratorCAPI()
      if(end!=49)
          log_err("error ubrk_last(word) did not return 49\n");
      log_verbose("last (word = %d\n", (int32_t)end);
-    
+
      pos=ubrk_previous(word);
      log_verbose("%d   %d\n", end, pos);
-     
+
      pos=ubrk_previous(word);
      log_verbose("%d \n", pos);
  
@@ -277,7 +277,7 @@ static void TestBreakIteratorCAPI()
      }
  
  
-    
+
      log_verbose("\nTesting the functions for character\n");
      ubrk_first(character);
      pos = ubrk_following(character, 5);
@@ -292,7 +292,7 @@ static void TestBreakIteratorCAPI()
      if(pos!=21)
         log_err("error ubrk_preceding(character,22) did not return 21\n");
      log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
-    
+
  
      log_verbose("\nTesting the functions for line\n");
      pos=ubrk_first(line);
@@ -304,7 +304,7 @@ static void TestBreakIteratorCAPI()
          log_err("error ubrk_following(line) did not return 22\n");
      log_verbose("following (line) = %d\n", (int32_t)pos);
  
-    
+
      log_verbose("\nTesting the functions for sentence\n");
      ubrk_first(sentence);
      pos = ubrk_current(sentence);
@@ -321,8 +321,8 @@ static void TestBreakIteratorCAPI()
      if (ubrk_first(sentence)!=ubrk_current(sentence)) {
          log_err("error in ubrk_first() or ubrk_current()\n");
      }
-    
- 
+
+
      /*---- */
      /*Testing ubrk_open and ubrk_close()*/
     log_verbose("\nTesting open and close for us locale\n");
@@ -368,7 +368,7 @@ static void TestBreakIteratorCAPI()
  static void TestBreakIteratorSafeClone(void)
  {
      UChar text[51];     /* Keep this odd to test for 64-bit memory alignment */
-                        /*  NOTE:  This doesn't reliably force mis-alignment of following items. */ 
+                        /*  NOTE:  This doesn't reliably force mis-alignment of following items. */
      uint8_t buffer [CLONETEST_ITERATOR_COUNT] [U_BRK_SAFECLONE_BUFFERSIZE];
      int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
  
@@ -526,7 +526,7 @@ static UBreakIterator * testOpenRules(char *rules) {
      bi = ubrk_openRules(ruleSourceU,  -1,     /*  The rules  */
                          NULL,  -1,            /*  The text to be iterated over. */
                          &parseErr, &status);
-    
+
      if (U_FAILURE(status)) {
          log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status));
          bi = 0;
@@ -586,6 +586,41 @@ static void TestBreakIteratorRules() {
          }
      }
  
+    /* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
+    /* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
+    status = U_ZERO_ERROR;
+    uint32_t rulesLength = ubrk_getBinaryRules(bi, NULL, 0, &status); /* preflight */
+    if (U_FAILURE(status)) {
+        log_err("FAIL: ubrk_getBinaryRules preflight err: %s", u_errorName(status));
+    } else {
+        uint8_t* binaryRules = (uint8_t*)uprv_malloc(rulesLength);
+        if (binaryRules == NULL) {
+            log_err("FAIL: unable to malloc rules buffer, size %u", rulesLength);
+        } else {
+            rulesLength = ubrk_getBinaryRules(bi, binaryRules, rulesLength, &status);
+            if (U_FAILURE(status)) {
+                log_err("FAIL: ubrk_getBinaryRules err: %s", u_errorName(status));
+            } else {
+                UBreakIterator* bi2 = ubrk_openBinaryRules(binaryRules, rulesLength, uData, -1, &status);
+                if (U_FAILURE(status)) {
+                    log_err("FAIL: ubrk_openBinaryRules err: %s", u_errorName(status));
+                } else {
+                    int32_t pos2 = ubrk_first(bi2);
+                    pos = ubrk_first(bi);
+                    for (i=0; i<sizeof(breaks); i++) {
+                        if (pos2 != pos) {
+                            log_err("FAIL: interator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d", pos2, pos);
+                        }
+                        pos2 = ubrk_next(bi2);
+                        pos = ubrk_next(bi);
+                    }
+                    ubrk_close(bi2);
+                }
+            }
+            uprv_free(binaryRules);
+        }
+    }
+
      freeToUCharStrings(&freeHook);
      ubrk_close(bi);
  }
@@ -809,7 +844,7 @@ static void TestBreakIteratorTailoring(void) {
              }
              if (!foundError && offsindx < testPtr->numOffsets) {
                  log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
-                       testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
+                        testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
              }
  
              foundError = FALSE;
@@ -826,7 +861,7 @@ static void TestBreakIteratorTailoring(void) {
              }
              if (!foundError && offsindx < testPtr->numOffsets) {
                  log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
-                       testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
+                        testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
              }
  
              ubrk_close(ubrkiter);
@@ -851,7 +886,7 @@ static void TestBreakIteratorRefresh(void) {
      UBreakIterator *bi;
      UText ut1 = UTEXT_INITIALIZER;
      UText ut2 = UTEXT_INITIALIZER;
-    
+
      bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
      TEST_ASSERT_SUCCESS(status);
      if (U_FAILURE(status)) {
@@ -875,7 +910,7 @@ static void TestBreakIteratorRefresh(void) {
          TEST_ASSERT_SUCCESS(status);
          ubrk_refreshUText(bi, &ut2, &status);
          TEST_ASSERT_SUCCESS(status);
-    
+
          /* Find the following matches, now working in the moved string. */
          TEST_ASSERT(5 == ubrk_next(bi));
          TEST_ASSERT(7 == ubrk_next(bi));
@@ -994,7 +1029,7 @@ static const TestBISuppressionsItem testBISuppressionsItems[] = {
  
  static void TestBreakIteratorSuppressions(void) {
      const TestBISuppressionsItem * itemPtr;
-    
+
      for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
          UChar textU[kTextULenMax];
          int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);
author	Peter Edberg <pedberg@unicode.org>
	Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
committer	Peter Edberg <pedberg@unicode.org>
	Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
icu4c/source/common/ubrk.cpp		patch \| blob \| history
icu4c/source/common/unicode/ubrk.h		patch \| blob \| history
icu4c/source/test/cintltst/cbiapts.c		patch \| blob \| history