#include "unicode/rbbi.h"
#include "rbbirb.h"
#include "uassert.h"
+#include "cmemory.h"
U_NAMESPACE_USE
}
-
+/*
+ * Open a break iterator from precompiled binary rules and, when text is
+ * supplied, attach that text to it.
+ *
+ * binaryRules / rulesLength  compiled rules and their length, passed unchanged
+ *                            to the RuleBasedBreakIterator constructor.
+ * text / textLength          optional text; when text is NULL no text is set.
+ * status                     in/out error code; nothing is done if it already
+ *                            indicates failure on entry.
+ *
+ * Returns NULL when status indicates failure on entry or after constructing
+ * the iterator; otherwise returns the new iterator as a UBreakIterator*.
+ */
+U_CAPI UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
+                     const UChar * text, int32_t textLength,
+                     UErrorCode * status)
+{
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    // The LocalPointer holds the new iterator until construction is known to have succeeded.
+    LocalPointer<RuleBasedBreakIterator> iterOwner(
+            new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status);
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    // Take the raw pointer out of the LocalPointer and expose it through the C handle type.
+    UBreakIterator *handle = reinterpret_cast<UBreakIterator *>(iterOwner.orphan());
+    if (text == NULL) {
+        return handle;
+    }
+    // Any error from ubrk_setText is reported through status; the handle is still returned.
+    ubrk_setText(handle, text, textLength, status);
+    return handle;
+}
U_CAPI UBreakIterator * U_EXPORT2
}
-void ubrk_refreshUText(UBreakIterator *bi,
+U_CAPI void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
UText *text,
UErrorCode *status)
{
bii->refreshInputText(text, *status);
}
+/*
+ * Copy the compiled binary rules of a rule-based break iterator into a
+ * caller-supplied buffer, or just report their length when preflighting.
+ *
+ * bi             break iterator to query; must be a RuleBasedBreakIterator,
+ *                otherwise *status is set to U_ILLEGAL_ARGUMENT_ERROR.
+ * binaryRules    destination buffer, or NULL to preflight.
+ * rulesCapacity  capacity of binaryRules in bytes; must be 0 when binaryRules
+ *                is NULL, otherwise *status is set to U_ILLEGAL_ARGUMENT_ERROR.
+ * status         in/out error code; nothing is done if it already indicates
+ *                failure on entry.
+ *
+ * Returns the byte length of the binary rules, or 0 on the early error exits.
+ * When a buffer is supplied but is too small, *status is set to
+ * U_BUFFER_OVERFLOW_ERROR, nothing is copied, and the length is still returned.
+ */
+U_CAPI uint32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+                    uint8_t * binaryRules, uint32_t rulesCapacity,
+                    UErrorCode * status)
+{
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+    const bool preflighting = (binaryRules == NULL);
+    // A non-zero capacity with no buffer to write into is a caller error.
+    if (preflighting && rulesCapacity != 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    // Only rule-based iterators are accepted here; anything else fails the cast.
+    RuleBasedBreakIterator *ruleBased =
+            dynamic_cast<RuleBasedBreakIterator*>(reinterpret_cast<BreakIterator*>(bi));
+    if (ruleBased == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    uint32_t neededLength;
+    const uint8_t *compiledRules = ruleBased->getBinaryRules(neededLength);
+    if (!preflighting) {
+        if (neededLength <= rulesCapacity) {
+            uprv_memcpy(binaryRules, compiledRules, neededLength);
+        } else {
+            *status = U_BUFFER_OVERFLOW_ERROR;
+        }
+    }
+    return neededLength;
+}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
UParseError *parseErr,
UErrorCode *status);
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
+ * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
+ * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
+ * compatible across different major versions of ICU, nor across platforms of different
+ * endianness or different base character set family (ASCII vs EBCDIC).
+ * @param binaryRules A set of compiled binary rules specifying the text breaking
+ * conventions. Ownership of the storage containing the compiled
+ * rules remains with the caller of this function. The compiled
+ * rules must not be modified or deleted during the life of the
+ * break iterator.
+ * @param rulesLength The length of binaryRules in bytes.
+ * @param text The text to be iterated over. May be null, in which case
+ * ubrk_setText() is used to specify the text to be iterated.
+ * @param textLength The number of characters in text, or -1 if null-terminated.
+ * @param status Pointer to UErrorCode to receive any errors.
+ * @return UBreakIterator for the specified rules.
+ * @see ubrk_getBinaryRules
+ * @draft ICU 59
+ */
+U_DRAFT UBreakIterator* U_EXPORT2
+ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
+ const UChar * text, int32_t textLength,
+ UErrorCode * status);
+
+#endif /* U_HIDE_DRAFT_API */
+
/**
* Thread safe cloning operation
* @param bi iterator to be cloned
UText *text,
UErrorCode *status);
+
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
+ * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
+ * more quickly than using ubrk_openRules. The compiled rules are not compatible across
+ * different major versions of ICU, nor across platforms of different endianness or
+ * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
+ * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
+ * the binaryRules buffer.
+ * @param bi The break iterator to use.
+ * @param binaryRules Buffer to receive the compiled binary rules; set to NULL for
+ * preflighting.
+ * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
+ * preflighting.
+ * @param status Pointer to UErrorCode to receive any errors.
+ * @return The actual byte length of the binary rules. If not preflighting
+ * and this is larger than rulesCapacity, *status will be set to
+ * U_BUFFER_OVERFLOW_ERROR and no rules are copied.
+ * @see ubrk_openBinaryRules
+ * @draft ICU 59
+ */
+U_DRAFT uint32_t U_EXPORT2
+ubrk_getBinaryRules(UBreakIterator *bi,
+ uint8_t * binaryRules, uint32_t rulesCapacity,
+ UErrorCode * status);
+
+#endif /* U_HIDE_DRAFT_API */
+
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif
* File CBIAPTS.C
*
* Modification History:
-* Name Description
+* Name Description
* Madhu Katragadda Creation
*********************************************************************************/
/*C API TEST FOR BREAKITERATOR */
if (dest == NULL) {
return NULL;
}
-
+
dest->link = (StringStruct*)(*freeHook);
*freeHook = dest;
return dest->str;
/*test ubrk_open()*/
log_verbose("\nTesting BreakIterator open functions\n");
-
+
/* Use french for fun */
word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
if(status == U_FILE_ACCESS_ERROR) {
else{
log_verbose("PASS: Successfully opened word breakiterator\n");
}
-
+
sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
if(U_FAILURE(status)){
log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
else{
log_verbose("PASS: Successfully opened sentence breakiterator\n");
}
-
+
line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
if(U_FAILURE(status)){
log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
else{
log_verbose("PASS: Successfully opened line breakiterator\n");
}
-
+
character = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
if(U_FAILURE(status)){
log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
}
for(i=0;i<count;i++)
{
- log_verbose("%s\n", ubrk_getAvailable(i));
+ log_verbose("%s\n", ubrk_getAvailable(i));
if (ubrk_getAvailable(i) == 0)
log_err("No locale for which breakiterator is applicable\n");
- else
+ else
log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
}
if(end!=49)
log_err("error ubrk_last(word) did not return 49\n");
log_verbose("last (word = %d\n", (int32_t)end);
-
+
pos=ubrk_previous(word);
log_verbose("%d %d\n", end, pos);
-
+
pos=ubrk_previous(word);
log_verbose("%d \n", pos);
}
-
+
log_verbose("\nTesting the functions for character\n");
ubrk_first(character);
pos = ubrk_following(character, 5);
if(pos!=21)
log_err("error ubrk_preceding(character,22) did not return 21\n");
log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
-
+
log_verbose("\nTesting the functions for line\n");
pos=ubrk_first(line);
log_err("error ubrk_following(line) did not return 22\n");
log_verbose("following (line) = %d\n", (int32_t)pos);
-
+
log_verbose("\nTesting the functions for sentence\n");
ubrk_first(sentence);
pos = ubrk_current(sentence);
if (ubrk_first(sentence)!=ubrk_current(sentence)) {
log_err("error in ubrk_first() or ubrk_current()\n");
}
-
-
+
+
/*---- */
/*Testing ubrk_open and ubrk_close()*/
log_verbose("\nTesting open and close for us locale\n");
static void TestBreakIteratorSafeClone(void)
{
UChar text[51]; /* Keep this odd to test for 64-bit memory alignment */
- /* NOTE: This doesn't reliably force mis-alignment of following items. */
+ /* NOTE: This doesn't reliably force mis-alignment of following items. */
uint8_t buffer [CLONETEST_ITERATOR_COUNT] [U_BRK_SAFECLONE_BUFFERSIZE];
int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
bi = ubrk_openRules(ruleSourceU, -1, /* The rules */
NULL, -1, /* The text to be iterated over. */
&parseErr, &status);
-
+
if (U_FAILURE(status)) {
log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status));
bi = 0;
}
}
+    /* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
+    /* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
+    /* Round trip: preflight the rules length, fetch the binary rules, reopen an */
+    /* iterator from them, and check it reports the same boundaries as bi. */
+    status = U_ZERO_ERROR;
+    {
+        /* Nested block so the declaration below sits at the start of a scope, */
+        /* keeping this C file free of declarations after statements. */
+        uint32_t rulesLength = ubrk_getBinaryRules(bi, NULL, 0, &status); /* preflight */
+        if (U_FAILURE(status)) {
+            log_err("FAIL: ubrk_getBinaryRules preflight err: %s\n", u_errorName(status));
+        } else {
+            uint8_t* binaryRules = (uint8_t*)uprv_malloc(rulesLength);
+            if (binaryRules == NULL) {
+                log_err("FAIL: unable to malloc rules buffer, size %u\n", rulesLength);
+            } else {
+                rulesLength = ubrk_getBinaryRules(bi, binaryRules, rulesLength, &status);
+                if (U_FAILURE(status)) {
+                    log_err("FAIL: ubrk_getBinaryRules err: %s\n", u_errorName(status));
+                } else {
+                    /* The rules buffer is only freed after bi2 has been closed below. */
+                    UBreakIterator* bi2 = ubrk_openBinaryRules(binaryRules, rulesLength, uData, -1, &status);
+                    if (U_FAILURE(status)) {
+                        log_err("FAIL: ubrk_openBinaryRules err: %s\n", u_errorName(status));
+                    } else {
+                        int32_t pos2 = ubrk_first(bi2);
+                        pos = ubrk_first(bi);
+                        /* NOTE(review): the loop bound assumes breaks is a char array, so that */
+                        /* sizeof(breaks) is its element count - confirm against the declaration. */
+                        for (i=0; i<sizeof(breaks); i++) {
+                            if (pos2 != pos) {
+                                log_err("FAIL: iterator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d\n", pos2, pos);
+                            }
+                            pos2 = ubrk_next(bi2);
+                            pos = ubrk_next(bi);
+                        }
+                        ubrk_close(bi2);
+                    }
+                }
+                uprv_free(binaryRules);
+            }
+        }
+    }
+
freeToUCharStrings(&freeHook);
ubrk_close(bi);
}
}
if (!foundError && offsindx < testPtr->numOffsets) {
log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
- testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
+ testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
}
foundError = FALSE;
}
if (!foundError && offsindx < testPtr->numOffsets) {
log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
- testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
+ testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
}
ubrk_close(ubrkiter);
UBreakIterator *bi;
UText ut1 = UTEXT_INITIALIZER;
UText ut2 = UTEXT_INITIALIZER;
-
+
bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
TEST_ASSERT_SUCCESS(status);
ubrk_refreshUText(bi, &ut2, &status);
TEST_ASSERT_SUCCESS(status);
-
+
/* Find the following matches, now working in the moved string. */
TEST_ASSERT(5 == ubrk_next(bi));
TEST_ASSERT(7 == ubrk_next(bi));
static void TestBreakIteratorSuppressions(void) {
const TestBISuppressionsItem * itemPtr;
-
+
for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
UChar textU[kTextULenMax];
int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);