From: Yoshito Umaoka Date: Tue, 17 Sep 2013 06:57:53 +0000 (+0000) Subject: ICU-10012 Disable EBCDIC Arabic/Hebrew detectors by default. Added APIs to enable... X-Git-Tag: milestone-59-0-1~2507 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=920dadff8d99c7d1bbe62fad8bf377849eb1bb70;p=icu ICU-10012 Disable EBCDIC Arabic/Hebrew detectors by default. Added APIs to enable/disable each charset and get currently active charsets. X-SVN-Rev: 34350 --- diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp index c7f82b5525a..5115684d5f5 100644 --- a/icu4c/source/i18n/csdetect.cpp +++ b/icu4c/source/i18n/csdetect.cpp @@ -32,7 +32,21 @@ #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) #define DELETE_ARRAY(array) uprv_free((void *) (array)) -static icu::CharsetRecognizer **fCSRecognizers = NULL; +U_NAMESPACE_BEGIN + +struct CSRecognizerInfo : public UMemory { + CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) + : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; + + ~CSRecognizerInfo() {delete recognizer;}; + + CharsetRecognizer *recognizer; + UBool isDefaultEnabled; +}; + +U_NAMESPACE_END + +static icu::CSRecognizerInfo **fCSRecognizers = NULL; static icu::UInitOnce gCSRecognizersInitOnce; static int32_t fCSRecognizers_size = 0; @@ -70,47 +84,48 @@ charsetMatchComparator(const void * /*context*/, const void *left, const void *r static void U_CALLCONV initRecognizers(UErrorCode &status) { U_NAMESPACE_USE ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); - CharsetRecognizer *tempArray[] = { - new CharsetRecog_UTF8(), - - new CharsetRecog_UTF_16_BE(), - new CharsetRecog_UTF_16_LE(), - new CharsetRecog_UTF_32_BE(), - new CharsetRecog_UTF_32_LE(), - - new CharsetRecog_8859_1(), - new CharsetRecog_8859_2(), - new CharsetRecog_8859_5_ru(), - new CharsetRecog_8859_6_ar(), - new CharsetRecog_8859_7_el(), - new CharsetRecog_8859_8_I_he(), - new CharsetRecog_8859_8_he(), - new 
CharsetRecog_windows_1251(), - new CharsetRecog_windows_1256(), - new CharsetRecog_KOI8_R(), - new CharsetRecog_8859_9_tr(), - new CharsetRecog_sjis(), - new CharsetRecog_gb_18030(), - new CharsetRecog_euc_jp(), - new CharsetRecog_euc_kr(), - new CharsetRecog_big5(), - - new CharsetRecog_2022JP(), - new CharsetRecog_2022KR(), - new CharsetRecog_2022CN(), - - new CharsetRecog_IBM424_he_rtl(), - new CharsetRecog_IBM424_he_ltr(), - new CharsetRecog_IBM420_ar_rtl(), - new CharsetRecog_IBM420_ar_ltr() + CSRecognizerInfo *tempArray[] = { + new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), + new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), + new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), + new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), + new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), + new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), + new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), + new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), + new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), + new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), + new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), + + new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), + new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), + new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), + + new CSRecognizerInfo(new 
CharsetRecog_IBM424_he_rtl(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), + new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) }; int32_t rCount = ARRAY_SIZE(tempArray); - fCSRecognizers = NEW_ARRAY(CharsetRecognizer *, rCount); + fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); if (fCSRecognizers == NULL) { status = U_MEMORY_ALLOCATION_ERROR; - } else { + } + else { fCSRecognizers_size = rCount; for (int32_t r = 0; r < rCount; r += 1) { fCSRecognizers[r] = tempArray[r]; @@ -132,7 +147,8 @@ void CharsetDetector::setRecognizers(UErrorCode &status) CharsetDetector::CharsetDetector(UErrorCode &status) : textIn(new InputText(status)), resultArray(NULL), - resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) + resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), + fEnabledRecognizers(NULL) { if (U_FAILURE(status)) { return; @@ -170,6 +186,10 @@ CharsetDetector::~CharsetDetector() } uprv_free(resultArray); + + if (fEnabledRecognizers) { + uprv_free(fEnabledRecognizers); + } } void CharsetDetector::setText(const char *in, int32_t len) @@ -234,7 +254,7 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, // give a match quality > 0. 
resultCount = 0; for (i = 0; i < fCSRecognizers_size; i += 1) { - csr = fCSRecognizers[i]; + csr = fCSRecognizers[i]->recognizer; if (csr->match(textIn, resultArray[resultCount])) { resultCount++; } @@ -251,6 +271,46 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, return resultArray; } +void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + + int32_t modIdx = -1; + UBool isDefaultVal = FALSE; + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + CSRecognizerInfo *csrinfo = fCSRecognizers[i]; + if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { + modIdx = i; + isDefaultVal = (csrinfo->isDefaultEnabled == enabled); + break; + } + } + if (modIdx < 0) { + // No matching encoding found + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if (fEnabledRecognizers == NULL && !isDefaultVal) { + // Create an array storing the non default setting + fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); + if (fEnabledRecognizers == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Initialize the array with default info + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; + } + } + + if (fEnabledRecognizers != NULL) { + fEnabledRecognizers[modIdx] = enabled; + } +} + /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const { if( index > fCSRecognizers_size-1 || index < 0) { @@ -267,6 +327,8 @@ U_NAMESPACE_END U_CDECL_BEGIN typedef struct { int32_t currIndex; + UBool all; + UBool *enabledRecognizers; } Context; @@ -281,27 +343,73 @@ enumClose(UEnumeration *en) { } static int32_t U_CALLCONV -enumCount(UEnumeration *, UErrorCode *) { - return fCSRecognizers_size; +enumCount(UEnumeration *en, UErrorCode *) { + if (((Context *)en->context)->all) { + // ucsdet_getAllDetectableCharsets, all charset detector names + return 
fCSRecognizers_size; + } + + // Otherwise, ucsdet_getDetectableCharsets - only enabled ones + int32_t count = 0; + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; + if (enabledArray != NULL) { + // custom set + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + if (enabledArray[i]) { + count++; + } + } + } else { + // default set + for (int32_t i = 0; i < fCSRecognizers_size; i++) { + if (fCSRecognizers[i]->isDefaultEnabled) { + count++; + } + } + } + return count; } static const char* U_CALLCONV enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { - if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { - if(resultLength != NULL) { - *resultLength = 0; + const char *currName = NULL; + + if (((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (((Context *)en->context)->all) { + // ucsdet_getAllDetectableCharsets, all charset detector names + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + ((Context *)en->context)->currIndex++; + } else { + // ucsdet_getDetectableCharsets + UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; + if (enabledArray != NULL) { + // custome set + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (enabledArray[((Context *)en->context)->currIndex]) { + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + } + ((Context *)en->context)->currIndex++; + } + } else { + // default set + while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { + if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { + currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); + } + ((Context *)en->context)->currIndex++; + } + } } - return NULL; } - const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); + if(resultLength != NULL) { - *resultLength = 
(int32_t)uprv_strlen(currName); + *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); } - ((Context *)en->context)->currIndex++; return currName; } + static void U_CALLCONV enumReset(UEnumeration *en, UErrorCode *) { ((Context *)en->context)->currIndex = 0; @@ -317,25 +425,61 @@ static const UEnumeration gCSDetEnumeration = { enumReset }; -U_CAPI UEnumeration * U_EXPORT2 -ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) +U_CDECL_END + +U_NAMESPACE_BEGIN + +UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) { - U_NAMESPACE_USE - if(U_FAILURE(*status)) { + /* Initialize recognized charsets. */ + setRecognizers(status); + + if(U_FAILURE(status)) { return 0; } - /* Initialize recognized charsets. */ - CharsetDetector::getDetectableCount(); + UEnumeration *en = NEW_ARRAY(UEnumeration, 1); + if (en == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); + en->context = (void*)NEW_ARRAY(Context, 1); + if (en->context == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + DELETE_ARRAY(en); + return 0; + } + uprv_memset(en->context, 0, sizeof(Context)); + ((Context*)en->context)->all = TRUE; + return en; +} + +UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const +{ + if(U_FAILURE(status)) { + return 0; + } UEnumeration *en = NEW_ARRAY(UEnumeration, 1); + if (en == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); en->context = (void*)NEW_ARRAY(Context, 1); + if (en->context == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + DELETE_ARRAY(en); + return 0; + } uprv_memset(en->context, 0, sizeof(Context)); + ((Context*)en->context)->all = FALSE; + ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; return en; } -U_CDECL_END -#endif +U_NAMESPACE_END +#endif \ No newline at end of file diff --git a/icu4c/source/i18n/csdetect.h 
b/icu4c/source/i18n/csdetect.h index 405e1f55800..15910453e84 100644 --- a/icu4c/source/i18n/csdetect.h +++ b/icu4c/source/i18n/csdetect.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2006, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -28,6 +28,10 @@ private: UBool fFreshTextSet; static void setRecognizers(UErrorCode &status); + UBool *fEnabledRecognizers; // If not null, active set of charset recognizers had + // been changed from the default. The array index is + // corresponding to fCSRecognizers. See setDetectableCharset(). + public: CharsetDetector(UErrorCode &status); @@ -47,7 +51,12 @@ public: // const char *getCharsetName(int32_t index, UErrorCode& status) const; - static int32_t getDetectableCount(); + static int32_t getDetectableCount(); + + + static UEnumeration * getAllDetectableCharsets(UErrorCode &status); + UEnumeration * getDetectableCharsets(UErrorCode &status) const; + void setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status); }; U_NAMESPACE_END diff --git a/icu4c/source/i18n/ucsdet.cpp b/icu4c/source/i18n/ucsdet.cpp index d06dd336428..d702bdfffbc 100644 --- a/icu4c/source/i18n/ucsdet.cpp +++ b/icu4c/source/i18n/ucsdet.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************** - * Copyright (C) 2005-2007, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************** */ @@ -11,6 +11,11 @@ #include "unicode/ucsdet.h" #include "csdetect.h" #include "csmatch.h" +#include "csrsbcs.h" +#include "csrmbcs.h" +#include "csrutf8.h" +#include "csrucode.h" +#include "csr2022.h" #include "cmemory.h" @@ -175,6 +180,26 @@ ucsdet_getUChars(const UCharsetMatch *ucsm, return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status); } + +U_CAPI void U_EXPORT2 +ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status) +{ + ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status); +} + +U_CAPI UEnumeration * U_EXPORT2 +ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) +{ + return CharsetDetector::getAllDetectableCharsets(*status); +} + +U_DRAFT UEnumeration * U_EXPORT2 +ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) +{ + return ((CharsetDetector *)ucsd)->getDetectableCharsets(*status); +} + U_CDECL_END + #endif diff --git a/icu4c/source/i18n/unicode/ucsdet.h b/icu4c/source/i18n/unicode/ucsdet.h index dc492ee229e..d9b86a38955 100644 --- a/icu4c/source/i18n/unicode/ucsdet.h +++ b/icu4c/source/i18n/unicode/ucsdet.h @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2010, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucsdet.h @@ -321,12 +321,21 @@ ucsdet_getUChars(const UCharsetMatch *ucsm, * The returned UEnumeration provides access to the names of * the charsets. * + *

 <p> * The state of the Charset detector that is passed in does not * affect the result of this function, but requiring a valid, open * charset detector as a parameter insures that the charset detection * service has been safely initialized and that the required detection * data is available. * + *
 <p>
+ * Note: Multiple different charset encodings in the same family may use + * a single shared name in this implementation. For example, this method returns + * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" + * (Windows Latin 1). However, the actual detection result could be "windows-1252" + * when the input data matches Latin 1 code points plus code points available only + * in "windows-1252". + * * @param ucsd a Charset detector. * @param status Any error conditions are reported back in this variable. * @return an iterator providing access to the detectable charset names. @@ -335,7 +344,6 @@ ucsdet_getUChars(const UCharsetMatch *ucsm, U_STABLE UEnumeration * U_EXPORT2 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); - /** * Test whether input filtering is enabled for this charset detector. * Input filtering removes text that appears to be HTML or xml @@ -346,6 +354,7 @@ ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *statu * @return TRUE if filtering is enabled. * @stable ICU 3.6 */ + U_STABLE UBool U_EXPORT2 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); @@ -364,6 +373,39 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); U_STABLE UBool U_EXPORT2 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); + +/** + * Get an iterator over the set of detectable charsets - + * over the charsets that are enabled by the specified charset detector. + * + * The returned UEnumeration provides access to the names of + * the charsets. + * + * @param ucsd a Charset detector. + * @param status Any error conditions are reported back in this variable. + * @return an iterator providing access to the detectable charset names by + * the specified charset detector. + * @internal + */ +U_DRAFT UEnumeration * U_EXPORT2 +ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); + +/** + * Enable or disable an individual charset encoding. 
+ * The name of the charset encoding must be one of the names returned by + * ucsdet_getAllDetectableCharsets(). + * + * @param ucsd a Charset detector. + * @param encoding the name of the charset encoding. + * @param enabled TRUE to enable, or FALSE to disable the + * charset encoding. + * @param status receives the return status. When the name of the charset encoding + * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. + * @internal + */ +U_DRAFT void U_EXPORT2 +ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); + #endif #endif /* __UCSDET_H */ diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp index cdb39cefeae..62b4eacb2b8 100644 --- a/icu4c/source/test/intltest/csdetest.cpp +++ b/icu4c/source/test/intltest/csdetest.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** */ @@ -275,6 +275,45 @@ void CharsetDetectionTest::ConstructionTest() printf("%s\n", name); #endif } + + const char* defDisabled[] = { + "IBM420_rtl", "IBM420_ltr", + "IBM424_rtl", "IBM424_ltr", + 0 + }; + + LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status)); + const char *activeName = NULL; + + while (activeName = uenum_next(eActive.getAlias(), NULL, status)) { + // the charset must be included in all list + UBool found = FALSE; + + const char *name = NULL; + uenum_reset(e.getAlias(), status); + while (name = uenum_next(e.getAlias(), NULL, status)) { + if (strcmp(activeName, name) == 0) { + found = TRUE; + break; + } + } + + if (!found) { + errln(UnicodeString(activeName) + " is not included in the all charset list."); + } + + // some charsets are disabled by default + found = FALSE; + for (int32_t i = 0; defDisabled[i] != 0; i++) { + if (strcmp(activeName, defDisabled[i]) == 0) { + found = TRUE; + break; + } + } + if (found) { + errln(UnicodeString(activeName) + " should not be included in the default charset list."); + } + } } void CharsetDetectionTest::UTF8Test() @@ -597,6 +636,10 @@ void CharsetDetectionTest::IBM424Test() char *bytes_r = extractBytes(s2, "IBM424", brLength); UCharsetDetector *csd = ucsdet_open(&status); + ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); if (U_FAILURE(status)) { errln("Error opening charset detector. - %s", u_errorName(status)); } @@ -684,6 +727,10 @@ void CharsetDetectionTest::IBM420Test() if (U_FAILURE(status)) { errln("Error opening charset detector. 
- %s", u_errorName(status)); } + ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); + ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); const UCharsetMatch *match; const char *name;