From c9fae4bda44f4976772e207f054f00b18d61debd Mon Sep 17 00:00:00 2001
From: allensu05 <52812914+allensu05@users.noreply.github.com>
Date: Wed, 19 Jan 2022 17:25:45 +0000
Subject: [PATCH] ICU-21699 Concatenate Katakana chars
See #1962
---
icu4c/source/common/dictbe.cpp | 7 +++++--
icu4c/source/test/testdata/rbbitst.txt | 4 +++-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
index 64b4fbf6391..c0af19ef171 100644
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -1366,8 +1366,11 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
// Calculate the length by using the code unit.
length = inString.moveIndex32(0, prevIdx) - codeUnitIdx;
prevIdx = i;
- // Skip the breakpoint if it belongs to the particle or Hiragana.
- if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))) {
+ // Keep the breakpoint only if the substring at codeUnitIdx is not in fSkipSet and the
+ // characters at codeUnitIdx - 1 and codeUnitIdx are not both Katakana.
+ if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
+ && (!isKatakana(inString.char32At(codeUnitIdx -1))
+ || !isKatakana(inString.char32At(codeUnitIdx)))) {
t_boundary.addElement(i, status);
numBreaks++;
}
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 54c612da22c..1be45e9f3c5 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1892,10 +1892,12 @@ Bangkok)•
•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•
-#乗車率９０％程度だろうか -> 乗車•率•９０％•程度だ•ろうか•
+#乗車率９０％程度だろうか。 -> 乗車•率•９０％•程度だ•ろうか。•
•\u4e57\u8eca•\u7387•\uff19\uff10\uff05•\u7a0b\u5ea6\u3060•\u308d\u3046\u304b\u3002•
#［携帯電話］正しい選択 -> ［携帯•電話］•正しい•選択•
•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•
+#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
+•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•
####################################################################################
--
2.50.1