]> granicus.if.org Git - icu/commitdiff
ICU-12914 Add ubrk_openBinaryRules, ubrk_getBinaryRules, and simple test
authorPeter Edberg <pedberg@unicode.org>
Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
committerPeter Edberg <pedberg@unicode.org>
Thu, 19 Jan 2017 23:10:23 +0000 (23:10 +0000)
X-SVN-Rev: 39582

icu4c/source/common/ubrk.cpp
icu4c/source/common/unicode/ubrk.h
icu4c/source/test/cintltst/cbiapts.c

index b02c966b107d7f3eb20a163dec9e82aa5e6644b1..925c1e90a91ba6ec51de934a7901e9f6c5dbccdc 100644 (file)
@@ -20,6 +20,7 @@
 #include "unicode/rbbi.h"
 #include "rbbirb.h"
 #include "uassert.h"
+#include "cmemory.h"
 
 U_NAMESPACE_USE
 
@@ -119,7 +120,24 @@ ubrk_openRules(  const UChar        *rules,
 }
 
 
-
+U_CAPI UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
+                     const UChar *  text, int32_t textLength,
+                     UErrorCode *   status)
+{
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    LocalPointer<RuleBasedBreakIterator> lpRBBI(new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status);
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    UBreakIterator *uBI = reinterpret_cast<UBreakIterator *>(lpRBBI.orphan());
+    if (text != NULL) {
+        ubrk_setText(uBI, text, textLength, status);
+    }
+    return uBI;
+}
 
 
 U_CAPI UBreakIterator * U_EXPORT2
@@ -288,7 +306,8 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
 }
 
 
-void ubrk_refreshUText(UBreakIterator *bi,
+U_CAPI void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
                        UText          *text,
                        UErrorCode     *status)
 {
@@ -296,6 +315,34 @@ void ubrk_refreshUText(UBreakIterator *bi,
     bii->refreshInputText(text, *status);
 }
 
+U_CAPI uint32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+                    uint8_t *       binaryRules, uint32_t rulesCapacity,
+                    UErrorCode *    status)
+{
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+    if (binaryRules == NULL && rulesCapacity > 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    RuleBasedBreakIterator* rbbi;
+    if ((rbbi = dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi))) == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    uint32_t rulesLength;
+    const uint8_t * returnedRules = rbbi->getBinaryRules(rulesLength);
+    if (binaryRules != NULL) { // if not preflighting
+        if (rulesLength > rulesCapacity) {
+            *status = U_BUFFER_OVERFLOW_ERROR;
+        } else {
+            uprv_memcpy(binaryRules, returnedRules, rulesLength);
+        }
+    }
+    return rulesLength;
+}
 
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
index f43943ed1ab12c1798945dbdea02f9db163d1ab9..1c8f62a17c87f41aaf10049ba468e9672367418a 100644 (file)
@@ -267,6 +267,34 @@ ubrk_openRules(const UChar     *rules,
                UParseError     *parseErr,
                UErrorCode      *status);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
+ * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
+ * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
+ * compatible across different major versions of ICU, nor across platforms of different
+ * endianness or different base character set family (ASCII vs EBCDIC).
+ * @param binaryRules A set of compiled binary rules specifying the text breaking
+ *                    conventions. Ownership of the storage containing the compiled
+ *                    rules remains with the caller of this function. The compiled
+ *                    rules must not be modified or deleted during the life of the
+ *                    break iterator.
+ * @param rulesLength The length of binaryRules in bytes.
+ * @param text        The text to be iterated over.  May be null, in which case
+ *                    ubrk_setText() is used to specify the text to be iterated.
+ * @param textLength  The number of characters in text, or -1 if null-terminated.
+ * @param status      Pointer to UErrorCode to receive any errors.
+ * @return            UBreakIterator for the specified rules.
+ * @see ubrk_getBinaryRules
+ * @draft ICU 59
+ */
+U_DRAFT UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
+                     const UChar *  text, int32_t textLength,
+                     UErrorCode *   status);
+
+#endif  /* U_HIDE_DRAFT_API */
+
 /**
  * Thread safe cloning operation
  * @param bi iterator to be cloned
@@ -566,6 +594,35 @@ ubrk_refreshUText(UBreakIterator *bi,
                        UText          *text,
                        UErrorCode     *status);
 
+
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
+ * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
+ * more quickly than using ubrk_openRules. The compiled rules are not compatible across
+ * different major versions of ICU, nor across platforms of different endianness or
+ * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
+ * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
+ * the binaryRules buffer.
+ * @param bi            The break iterator to use.
+ * @param binaryRules   Buffer to receive the compiled binary rules; set to NULL for
+ *                      preflighting.
+ * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
+ *                      preflighting.
+ * @param status        Pointer to UErrorCode to receive any errors.
+ * @return              The actual byte length of the binary rules. If not preflighting
+ *                      and this is larger than rulesCapacity, *status will be set to
+ *                      U_BUFFER_OVERFLOW_ERROR and nothing is copied.
+ * @see ubrk_openBinaryRules
+ * @draft ICU 59
+ */
+U_DRAFT uint32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+                    uint8_t *       binaryRules, uint32_t rulesCapacity,
+                    UErrorCode *    status);
+
+#endif  /* U_HIDE_DRAFT_API */
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 
 #endif
index 398db3c7fe3c3a3a6eaea936cf9da9ccfb7405ef..7ad1924ca8913ab436f4ca8c7e2fce0b97ebbc38 100644 (file)
@@ -10,7 +10,7 @@
 * File CBIAPTS.C
 *
 * Modification History:
-*        Name                     Description            
+*        Name                     Description
 *     Madhu Katragadda              Creation
 *********************************************************************************/
 /*C API TEST FOR BREAKITERATOR */
@@ -128,7 +128,7 @@ static UChar* toUChar(const char *src, void **freeHook) {
     if (dest == NULL) {
         return NULL;
     }
-    
+
     dest->link = (StringStruct*)(*freeHook);
     *freeHook = dest;
     return dest->str;
@@ -164,7 +164,7 @@ static void TestBreakIteratorCAPI()
 
 /*test ubrk_open()*/
     log_verbose("\nTesting BreakIterator open functions\n");
-                                            
+
     /* Use french for fun */
     word         = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
     if(status == U_FILE_ACCESS_ERROR) {
@@ -176,7 +176,7 @@ static void TestBreakIteratorCAPI()
     else{
         log_verbose("PASS: Successfully opened  word breakiterator\n");
     }
-    
+
     sentence     = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
     if(U_FAILURE(status)){
         log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
@@ -185,7 +185,7 @@ static void TestBreakIteratorCAPI()
     else{
         log_verbose("PASS: Successfully opened  sentence breakiterator\n");
     }
-    
+
     line         = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
     if(U_FAILURE(status)){
         log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
@@ -194,7 +194,7 @@ static void TestBreakIteratorCAPI()
     else{
         log_verbose("PASS: Successfully opened  line breakiterator\n");
     }
-    
+
     character     = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
     if(U_FAILURE(status)){
         log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
@@ -232,10 +232,10 @@ static void TestBreakIteratorCAPI()
     }
     for(i=0;i<count;i++)
     {
-        log_verbose("%s\n", ubrk_getAvailable(i)); 
+        log_verbose("%s\n", ubrk_getAvailable(i));
         if (ubrk_getAvailable(i) == 0)
             log_err("No locale for which breakiterator is applicable\n");
-        else 
+        else
             log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
     }
 
@@ -258,10 +258,10 @@ static void TestBreakIteratorCAPI()
     if(end!=49)
         log_err("error ubrk_last(word) did not return 49\n");
     log_verbose("last (word = %d\n", (int32_t)end);
-    
+
     pos=ubrk_previous(word);
     log_verbose("%d   %d\n", end, pos);
-     
+
     pos=ubrk_previous(word);
     log_verbose("%d \n", pos);
 
@@ -277,7 +277,7 @@ static void TestBreakIteratorCAPI()
     }
 
 
-    
+
     log_verbose("\nTesting the functions for character\n");
     ubrk_first(character);
     pos = ubrk_following(character, 5);
@@ -292,7 +292,7 @@ static void TestBreakIteratorCAPI()
     if(pos!=21)
        log_err("error ubrk_preceding(character,22) did not return 21\n");
     log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
-    
+
 
     log_verbose("\nTesting the functions for line\n");
     pos=ubrk_first(line);
@@ -304,7 +304,7 @@ static void TestBreakIteratorCAPI()
         log_err("error ubrk_following(line) did not return 22\n");
     log_verbose("following (line) = %d\n", (int32_t)pos);
 
-    
+
     log_verbose("\nTesting the functions for sentence\n");
     ubrk_first(sentence);
     pos = ubrk_current(sentence);
@@ -321,8 +321,8 @@ static void TestBreakIteratorCAPI()
     if (ubrk_first(sentence)!=ubrk_current(sentence)) {
         log_err("error in ubrk_first() or ubrk_current()\n");
     }
-    
+
+
     /*---- */
     /*Testing ubrk_open and ubrk_close()*/
    log_verbose("\nTesting open and close for us locale\n");
@@ -368,7 +368,7 @@ static void TestBreakIteratorCAPI()
 static void TestBreakIteratorSafeClone(void)
 {
     UChar text[51];     /* Keep this odd to test for 64-bit memory alignment */
-                        /*  NOTE:  This doesn't reliably force mis-alignment of following items. */ 
+                        /*  NOTE:  This doesn't reliably force mis-alignment of following items. */
     uint8_t buffer [CLONETEST_ITERATOR_COUNT] [U_BRK_SAFECLONE_BUFFERSIZE];
     int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
 
@@ -526,7 +526,7 @@ static UBreakIterator * testOpenRules(char *rules) {
     bi = ubrk_openRules(ruleSourceU,  -1,     /*  The rules  */
                         NULL,  -1,            /*  The text to be iterated over. */
                         &parseErr, &status);
-    
+
     if (U_FAILURE(status)) {
         log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status));
         bi = 0;
@@ -586,6 +586,41 @@ static void TestBreakIteratorRules() {
         }
     }
 
+    /* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
+    /* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
+    status = U_ZERO_ERROR;
+    uint32_t rulesLength = ubrk_getBinaryRules(bi, NULL, 0, &status); /* preflight */
+    if (U_FAILURE(status)) {
+        log_err("FAIL: ubrk_getBinaryRules preflight err: %s", u_errorName(status));
+    } else {
+        uint8_t* binaryRules = (uint8_t*)uprv_malloc(rulesLength);
+        if (binaryRules == NULL) {
+            log_err("FAIL: unable to malloc rules buffer, size %u", rulesLength);
+        } else {
+            rulesLength = ubrk_getBinaryRules(bi, binaryRules, rulesLength, &status);
+            if (U_FAILURE(status)) {
+                log_err("FAIL: ubrk_getBinaryRules err: %s", u_errorName(status));
+            } else {
+                UBreakIterator* bi2 = ubrk_openBinaryRules(binaryRules, rulesLength, uData, -1, &status);
+                if (U_FAILURE(status)) {
+                    log_err("FAIL: ubrk_openBinaryRules err: %s", u_errorName(status));
+                } else {
+                    int32_t pos2 = ubrk_first(bi2);
+                    pos = ubrk_first(bi);
+                    for (i=0; i<sizeof(breaks); i++) {
+                        if (pos2 != pos) {
+                            log_err("FAIL: interator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d", pos2, pos);
+                        }
+                        pos2 = ubrk_next(bi2);
+                        pos = ubrk_next(bi);
+                    }
+                    ubrk_close(bi2);
+                }
+            }
+            uprv_free(binaryRules);
+        }
+    }
+
     freeToUCharStrings(&freeHook);
     ubrk_close(bi);
 }
@@ -809,7 +844,7 @@ static void TestBreakIteratorTailoring(void) {
             }
             if (!foundError && offsindx < testPtr->numOffsets) {
                 log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
-                       testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
+                        testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
             }
 
             foundError = FALSE;
@@ -826,7 +861,7 @@ static void TestBreakIteratorTailoring(void) {
             }
             if (!foundError && offsindx < testPtr->numOffsets) {
                 log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
-                       testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
+                        testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
             }
 
             ubrk_close(ubrkiter);
@@ -851,7 +886,7 @@ static void TestBreakIteratorRefresh(void) {
     UBreakIterator *bi;
     UText ut1 = UTEXT_INITIALIZER;
     UText ut2 = UTEXT_INITIALIZER;
-    
+
     bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
     TEST_ASSERT_SUCCESS(status);
     if (U_FAILURE(status)) {
@@ -875,7 +910,7 @@ static void TestBreakIteratorRefresh(void) {
         TEST_ASSERT_SUCCESS(status);
         ubrk_refreshUText(bi, &ut2, &status);
         TEST_ASSERT_SUCCESS(status);
-    
+
         /* Find the following matches, now working in the moved string. */
         TEST_ASSERT(5 == ubrk_next(bi));
         TEST_ASSERT(7 == ubrk_next(bi));
@@ -994,7 +1029,7 @@ static const TestBISuppressionsItem testBISuppressionsItems[] = {
 
 static void TestBreakIteratorSuppressions(void) {
     const TestBISuppressionsItem * itemPtr;
-    
+
     for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
         UChar textU[kTextULenMax];
         int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);