From b1269c91211deab8f94c3cf85accb51e5a71ca5c Mon Sep 17 00:00:00 2001 From: shaobero Date: Thu, 4 Nov 2021 17:57:12 +0000 Subject: [PATCH] ICU-21823 Adding changes to fix charset detection in case of no match --- icu4c/source/i18n/csdetect.cpp | 5 +++++ icu4c/source/test/cintltst/ucsdetst.c | 1 + icu4c/source/test/intltest/csdetest.cpp | 23 +++++++++++++++++++++++ icu4c/source/test/intltest/csdetest.h | 1 + 4 files changed, 30 insertions(+) diff --git a/icu4c/source/i18n/csdetect.cpp b/icu4c/source/i18n/csdetect.cpp index 84f0776542d..d866eb66286 100644 --- a/icu4c/source/i18n/csdetect.cpp +++ b/icu4c/source/i18n/csdetect.cpp @@ -270,6 +270,11 @@ const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, maxMatchesFound = resultCount; + if (maxMatchesFound == 0) { + status = U_INVALID_CHAR_FOUND; + return NULL; + } + return resultArray; } diff --git a/icu4c/source/test/cintltst/ucsdetst.c b/icu4c/source/test/cintltst/ucsdetst.c index f65f8d79aa4..b8d3b5f20b8 100644 --- a/icu4c/source/test/cintltst/ucsdetst.c +++ b/icu4c/source/test/cintltst/ucsdetst.c @@ -402,6 +402,7 @@ static void TestBufferOverflow(void) { } for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) { + status = U_ZERO_ERROR; ucsdet_setText(csd, testStrings[idx], -1, &status); match = ucsdet_detect(csd, &status); diff --git a/icu4c/source/test/intltest/csdetest.cpp b/icu4c/source/test/intltest/csdetest.cpp index 66f9340b83a..95f19d43d1a 100644 --- a/icu4c/source/test/intltest/csdetest.cpp +++ b/icu4c/source/test/intltest/csdetest.cpp @@ -109,6 +109,10 @@ void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char if (exec) Ticket6954Test(); break; + case 10: name = "Ticket21823Test"; + if (exec) Ticket21823Test(); + break; + default: name = ""; break; //needed to end loop } @@ -839,3 +843,22 @@ void CharsetDetectionTest::Ticket6954Test() { TEST_ASSERT(strcmp(name1, "windows-1252")==0); #endif } + + +// Ticket 21823 - Issue with Charset Detector for 
ill-formed input strings. +// Its fix involves returning a failure-based error code +// (U_INVALID_CHAR_FOUND) in case no charsets appear to match the input data. +void CharsetDetectionTest::Ticket21823Test() { + UErrorCode status = U_ZERO_ERROR; + std::string str = "\x80"; + UCharsetDetector* csd = ucsdet_open(&status); + + ucsdet_setText(csd, str.data(), str.length(), &status); + const UCharsetMatch* match = ucsdet_detect(csd, &status); + + if (match == NULL) { + TEST_ASSERT(U_FAILURE(status)); + } + + ucsdet_close(csd); +} diff --git a/icu4c/source/test/intltest/csdetest.h b/icu4c/source/test/intltest/csdetest.h index 72d1ca92aea..8893b88e591 100644 --- a/icu4c/source/test/intltest/csdetest.h +++ b/icu4c/source/test/intltest/csdetest.h @@ -33,6 +33,7 @@ public: virtual void IBM420Test(); virtual void Ticket6394Test(); virtual void Ticket6954Test(); + virtual void Ticket21823Test(); private: void checkEncoding(const UnicodeString &testString, -- 2.50.1