#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
-static icu::CharsetRecognizer **fCSRecognizers = NULL;
+U_NAMESPACE_BEGIN
+
+/**
+ * Pairs a charset recognizer with a flag telling whether that recognizer
+ * is enabled by default. The destructor deletes the recognizer, so an
+ * instance must never be copied (a copy would delete the same object twice).
+ */
+struct CSRecognizerInfo : public UMemory {
+    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
+        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
+
+    ~CSRecognizerInfo() { delete recognizer; }
+
+    CharsetRecognizer *recognizer;   // deleted by the destructor
+    UBool isDefaultEnabled;
+
+private:
+    // Declared but intentionally not defined: forbids copying, which would
+    // otherwise lead to a double delete of recognizer.
+    CSRecognizerInfo(const CSRecognizerInfo &other);
+    CSRecognizerInfo &operator=(const CSRecognizerInfo &other);
+};
+
+U_NAMESPACE_END
+
+static icu::CSRecognizerInfo **fCSRecognizers = NULL;
static icu::UInitOnce gCSRecognizersInitOnce;
static int32_t fCSRecognizers_size = 0;
static void U_CALLCONV initRecognizers(UErrorCode &status) {
U_NAMESPACE_USE
ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
- CharsetRecognizer *tempArray[] = {
- new CharsetRecog_UTF8(),
-
- new CharsetRecog_UTF_16_BE(),
- new CharsetRecog_UTF_16_LE(),
- new CharsetRecog_UTF_32_BE(),
- new CharsetRecog_UTF_32_LE(),
-
- new CharsetRecog_8859_1(),
- new CharsetRecog_8859_2(),
- new CharsetRecog_8859_5_ru(),
- new CharsetRecog_8859_6_ar(),
- new CharsetRecog_8859_7_el(),
- new CharsetRecog_8859_8_I_he(),
- new CharsetRecog_8859_8_he(),
- new CharsetRecog_windows_1251(),
- new CharsetRecog_windows_1256(),
- new CharsetRecog_KOI8_R(),
- new CharsetRecog_8859_9_tr(),
- new CharsetRecog_sjis(),
- new CharsetRecog_gb_18030(),
- new CharsetRecog_euc_jp(),
- new CharsetRecog_euc_kr(),
- new CharsetRecog_big5(),
-
- new CharsetRecog_2022JP(),
- new CharsetRecog_2022KR(),
- new CharsetRecog_2022CN(),
-
- new CharsetRecog_IBM424_he_rtl(),
- new CharsetRecog_IBM424_he_ltr(),
- new CharsetRecog_IBM420_ar_rtl(),
- new CharsetRecog_IBM420_ar_ltr()
+ CSRecognizerInfo *tempArray[] = {
+ new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
+
+ new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
+
+ new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
+
+ new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
+ new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
+
+ new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
+ new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
+ new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
+ new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
};
int32_t rCount = ARRAY_SIZE(tempArray);
- fCSRecognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
+ fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
if (fCSRecognizers == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
- } else {
+ }
+ else {
fCSRecognizers_size = rCount;
for (int32_t r = 0; r < rCount; r += 1) {
fCSRecognizers[r] = tempArray[r];
CharsetDetector::CharsetDetector(UErrorCode &status)
: textIn(new InputText(status)), resultArray(NULL),
- resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
+ resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
+ fEnabledRecognizers(NULL)
{
if (U_FAILURE(status)) {
return;
}
uprv_free(resultArray);
+
+ if (fEnabledRecognizers) {
+ uprv_free(fEnabledRecognizers);
+ }
}
void CharsetDetector::setText(const char *in, int32_t len)
// give a match quality > 0.
resultCount = 0;
for (i = 0; i < fCSRecognizers_size; i += 1) {
- csr = fCSRecognizers[i];
+ csr = fCSRecognizers[i]->recognizer;
if (csr->match(textIn, resultArray[resultCount])) {
resultCount++;
}
return resultArray;
}
+/**
+ * Enable or disable a single charset recognizer for this detector.
+ *
+ * The per-detector override array (fEnabledRecognizers) is only allocated
+ * the first time a setting differs from the recognizer's default; until
+ * then the defaults stored in fCSRecognizers apply.
+ *
+ * @param encoding name of the charset, compared with each recognizer's getName().
+ * @param enabled  TRUE to enable the charset, FALSE to disable it.
+ * @param status   set to U_ILLEGAL_ARGUMENT_ERROR when encoding is NULL or
+ *                 matches no recognizer, and to U_MEMORY_ALLOCATION_ERROR when
+ *                 the override array cannot be allocated. Nothing is done if
+ *                 status already indicates a failure on entry.
+ */
+void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (encoding == NULL) {
+        // A NULL name must not reach uprv_strcmp below.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    int32_t modIdx = -1;
+    UBool isDefaultVal = FALSE;
+    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
+        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
+        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
+            modIdx = i;
+            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
+            break;
+        }
+    }
+    if (modIdx < 0) {
+        // No matching encoding found
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if (fEnabledRecognizers == NULL && !isDefaultVal) {
+        // Create an array storing the non default setting
+        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
+        if (fEnabledRecognizers == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        // Initialize the array with default info
+        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
+            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
+        }
+    }
+
+    // Still NULL here only when the request equals the default and no
+    // override array exists yet; in that case there is nothing to record.
+    if (fEnabledRecognizers != NULL) {
+        fEnabledRecognizers[modIdx] = enabled;
+    }
+}
+
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
if( index > fCSRecognizers_size-1 || index < 0) {
U_CDECL_BEGIN
typedef struct {
int32_t currIndex;
+ UBool all; // TRUE: enumerate every recognizer name; FALSE: only the enabled ones
+ UBool *enabledRecognizers; // enabled flags indexed like fCSRecognizers, or NULL to use each recognizer's isDefaultEnabled
} Context;
}
static int32_t U_CALLCONV
-enumCount(UEnumeration *, UErrorCode *) {
- return fCSRecognizers_size;
+enumCount(UEnumeration *en, UErrorCode *) {
+    // Reports how many charset names this enumeration yields for its context.
+    const Context *ctx = (const Context *)en->context;
+    if (ctx->all) {
+        // ucsdet_getAllDetectableCharsets: every recognizer name is listed
+        return fCSRecognizers_size;
+    }
+
+    // ucsdet_getDetectableCharsets: count only the enabled recognizers,
+    // reading the custom flag array when there is one and the per-recognizer
+    // default otherwise.
+    const UBool *customFlags = ctx->enabledRecognizers;
+    int32_t enabledTotal = 0;
+    for (int32_t idx = 0; idx < fCSRecognizers_size; idx++) {
+        const UBool isOn = (customFlags != NULL) ? customFlags[idx]
+                                                 : fCSRecognizers[idx]->isDefaultEnabled;
+        if (isOn) {
+            ++enabledTotal;
+        }
+    }
+    return enabledTotal;
}
static const char* U_CALLCONV
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
- if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
- if(resultLength != NULL) {
- *resultLength = 0;
+ const char *currName = NULL; // stays NULL when no further name is available
+
+ if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
+ if (((Context *)en->context)->all) {
+ // ucsdet_getAllDetectableCharsets: return every recognizer name in order
+ currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
+ ((Context *)en->context)->currIndex++;
+ } else {
+ // ucsdet_getDetectableCharsets: return only the names of enabled recognizers
+ UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
+ if (enabledArray != NULL) {
+ // custom set: skip entries whose flag in the array is off
+ while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
+ if (enabledArray[((Context *)en->context)->currIndex]) {
+ currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
+ }
+ ((Context *)en->context)->currIndex++; // advance even on a hit so the next call resumes after it
+ }
+ } else {
+ // default set: skip entries that are not enabled by default
+ while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
+ if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
+ currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
+ }
+ ((Context *)en->context)->currIndex++; // advance even on a hit so the next call resumes after it
+ }
+ }
+ }
- return NULL;
}
- const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
+
if(resultLength != NULL) {
- *resultLength = (int32_t)uprv_strlen(currName);
+ *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); // a NULL result is reported with length 0
}
- ((Context *)en->context)->currIndex++;
return currName;
}
+
static void U_CALLCONV
enumReset(UEnumeration *en, UErrorCode *) {
((Context *)en->context)->currIndex = 0;
enumReset
};
-U_CAPI UEnumeration * U_EXPORT2
-ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
+U_CDECL_END
+
+U_NAMESPACE_BEGIN
+
+UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
{
- U_NAMESPACE_USE
- if(U_FAILURE(*status)) {
+ /* Initialize recognized charsets; any failure left in status is caught by the check below. */
+ setRecognizers(status);
+
+ if(U_FAILURE(status)) {
return 0;
}
- /* Initialize recognized charsets. */
- CharsetDetector::getDetectableCount();
+ UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
+ if (en == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); // start from a copy of gCSDetEnumeration
+ en->context = (void*)NEW_ARRAY(Context, 1);
+ if (en->context == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ DELETE_ARRAY(en); // do not leak the enumeration when its context cannot be allocated
+ return 0;
+ }
+ uprv_memset(en->context, 0, sizeof(Context)); // zeroes currIndex and enabledRecognizers
+ ((Context*)en->context)->all = TRUE; // enumerate every recognizer name, enabled or not
+ return en;
+}
+
+UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
+{ // Builds an enumeration over the names of the charsets enabled for this detector.
+ if(U_FAILURE(status)) {
+ return 0;
+ }
UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
+ if (en == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
en->context = (void*)NEW_ARRAY(Context, 1);
+ if (en->context == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ DELETE_ARRAY(en); // do not leak the enumeration when its context cannot be allocated
+ return 0;
+ }
uprv_memset(en->context, 0, sizeof(Context));
+ ((Context*)en->context)->all = FALSE; // restrict the enumeration to enabled recognizers
+ ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; // pointer stored as-is (no copy); NULL selects the default set. NOTE(review): the enumeration presumably must not outlive this detector - confirm
return en;
}
-U_CDECL_END
-#endif
+U_NAMESPACE_END
+#endif
\ No newline at end of file
/*
**********************************************************************
- * Copyright (C) 2005-2010, International Business Machines
+ * Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucsdet.h
* The returned UEnumeration provides access to the names of
* the charsets.
*
+ * <p>
* The state of the Charset detector that is passed in does not
* affect the result of this function, but requiring a valid, open
* charset detector as a parameter insures that the charset detection
* service has been safely initialized and that the required detection
* data is available.
*
+ * <p>
+ * <b>Note:</b> Multiple different charset encodings in the same family may use
+ * a single shared name in this implementation. For example, this function returns
+ * an enumeration that includes "ISO-8859-1" (ISO Latin 1), but not "windows-1252"
+ * (Windows Latin 1). However, the actual detection result could be "windows-1252"
+ * when the input data matches Latin 1 code points and also contains code points
+ * that are only available in "windows-1252".
+ *
* @param ucsd a Charset detector.
* @param status Any error conditions are reported back in this variable.
* @return an iterator providing access to the detectable charset names.
U_STABLE UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
-
/**
* Test whether input filtering is enabled for this charset detector.
* Input filtering removes text that appears to be HTML or xml
* @return TRUE if filtering is enabled.
* @stable ICU 3.6
*/
+
U_STABLE UBool U_EXPORT2
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
U_STABLE UBool U_EXPORT2
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
+
+/**
+ * Get an iterator over the set of detectable charsets -
+ * over the charsets that are enabled by the specified charset detector.
+ *
+ * The returned UEnumeration provides access to the names of
+ * the charsets.
+ *
+ * @param ucsd a Charset detector.
+ * @param status Any error conditions are reported back in this variable.
+ * @return an iterator providing access to the detectable charset names by
+ * the specified charset detector.
+ * @internal
+ */
+U_DRAFT UEnumeration * U_EXPORT2
+ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
+
+/**
+ * Enable or disable individual charset encoding.
+ * A name of charset encoding must be included in the names returned by
+ * ucsdet_getAllDetectableCharsets().
+ *
+ * @param ucsd a Charset detector.
+ * @param encoding the name of the charset encoding.
+ * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
+ * charset encoding.
+ * @param status receives the return status. When the name of charset encoding
+ * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
+ * @internal
+ */
+U_DRAFT void U_EXPORT2
+ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
+
#endif
#endif /* __UCSDET_H */
/*
**********************************************************************
- * Copyright (C) 2005-2012, International Business Machines
+ * Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
printf("%s\n", name);
#endif
}
+
+ const char* defDisabled[] = {
+ "IBM420_rtl", "IBM420_ltr",
+ "IBM424_rtl", "IBM424_ltr",
+ 0
+ };
+
+ LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
+ const char *activeName = NULL;
+
+ while (activeName = uenum_next(eActive.getAlias(), NULL, status)) {
+ // the charset must be included in all list
+ UBool found = FALSE;
+
+ const char *name = NULL;
+ uenum_reset(e.getAlias(), status);
+ while (name = uenum_next(e.getAlias(), NULL, status)) {
+ if (strcmp(activeName, name) == 0) {
+ found = TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ errln(UnicodeString(activeName) + " is not included in the all charset list.");
+ }
+
+ // some charsets are disabled by default
+ found = FALSE;
+ for (int32_t i = 0; defDisabled[i] != 0; i++) {
+ if (strcmp(activeName, defDisabled[i]) == 0) {
+ found = TRUE;
+ break;
+ }
+ }
+ if (found) {
+ errln(UnicodeString(activeName) + " should not be included in the default charset list.");
+ }
+ }
}
void CharsetDetectionTest::UTF8Test()
char *bytes_r = extractBytes(s2, "IBM424", brLength);
UCharsetDetector *csd = ucsdet_open(&status);
+ ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
if (U_FAILURE(status)) {
errln("Error opening charset detector. - %s", u_errorName(status));
}
if (U_FAILURE(status)) {
errln("Error opening charset detector. - %s", u_errorName(status));
}
+ ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
+ ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
const UCharsetMatch *match;
const char *name;