ICU-22119 Add lw=phrase for Korean using line_*_phrase_cj

author Jungshik Shin <jshin@chromium.org>

Thu, 25 Aug 2022 08:04:33 +0000 (01:04 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Mon, 7 Nov 2022 22:30:49 +0000 (22:30 +0000)
author Jungshik Shin <jshin@chromium.org>
Thu, 25 Aug 2022 08:04:33 +0000 (01:04 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Mon, 7 Nov 2022 22:30:49 +0000 (22:30 +0000)
diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp

index e01cf20470b2a14cdb18fd7294dfae831a57d69e..f61f24e50bac3cab712fd55ba2fc5e6b45dd1393 100644 (file)
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -439,8 +439,8 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
                  uprv_strcat(lb_lw, "_");
                  uprv_strcat(lb_lw, value.data());
              }
-            // lw=phrase is only supported in Japanese.
-            if (uprv_strcmp(loc.getLanguage(), "ja") == 0) {
+            // lw=phrase is only supported in Japanese and Korean
+            if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
                  value.clear();
                  loc.getKeywordValue("lw", valueSink, kvStatus);
                  if (U_SUCCESS(kvStatus) && value == "phrase") {
diff --git a/icu4c/source/data/brkitr/ko.txt b/icu4c/source/data/brkitr/ko.txt

new file mode 100644 (file)

index 0000000..2152e74
--- /dev/null
+++ b/icu4c/source/data/brkitr/ko.txt
@@ -0,0 +1,20 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+ko{
+    boundaries{
+        line:process(dependency){"line_normal.brk"}
+        line_loose:process(dependency){"line_loose_cj.brk"}
+        line_loose_phrase:process(dependency){"line_loose_phrase_cj.brk"}
+        line_normal:process(dependency){"line_normal_cj.brk"}
+        line_normal_phrase:process(dependency){"line_normal_phrase_cj.brk"}
+        line_phrase:process(dependency){"line_phrase_cj.brk"}
+        line_strict:process(dependency){"line_cj.brk"}
+        line_strict_phrase:process(dependency){"line_phrase_cj.brk"}
+    }
+// Korean particles should be added here and dictbe.cpp needs to be adjusted
+// once a Korean dictionary is added. Even without a Korean dictionary,
+// Korean particle list can be used when a Korean word written in CJK ideographs
+// or a Korean word in Hangul and a closing punctuation mark is followed by a Korean
+// particle. Examples include: "大韓民國은  民主共和國이다", "『님의 침묵』이" .
+}
diff --git a/icu4c/source/data/xml/brkitr/ko.xml b/icu4c/source/data/xml/brkitr/ko.xml

new file mode 100644 (file)

index 0000000..5ba204b
--- /dev/null
+++ b/icu4c/source/data/xml/brkitr/ko.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Copyright (C) 2016 and later: Unicode, Inc. and others.
+ License & terms of use: http://www.unicode.org/copyright.html
+ Copyright (c) 2006-2015 International Business Machines Corporation and others. All rights reserved.
+-->
+<!DOCTYPE ldml SYSTEM "../../dtd/cldr/common/dtd/ldml.dtd"
+[
+   <!ENTITY % icu SYSTEM "../../dtd/cldr/common/dtd/ldmlICU.dtd">
+   %icu;
+]
+>
+<ldml>
+    <identity>
+        <version number="$Revision$"/>
+        <language type="ko"/>
+    </identity>
+    <special xmlns:icu="http://www.icu-project.org/">
+        <icu:breakIteratorData>
+            <icu:boundaries>
+                <icu:line icu:dependency="line_normal.brk"/>
+                <icu:line alt="loose"  icu:dependency="line_loose_cj.brk"/>
+                <icu:line alt="normal" icu:dependency="line_normal_cj.brk"/>
+                <icu:line alt="strict" icu:dependency="line_cj.brk"/>
+                <icu:line alt="loose_phrase"  icu:dependency="line_loose_phrase_cj.brk"/>
+                <icu:line alt="normal_phrase" icu:dependency="line_normal_phrase_cj.brk"/>
+                <icu:line alt="strict_phrase" icu:dependency="line_phrase_cj.brk"/>
+                <icu:line alt="phrase" icu:dependency="line_phrase_cj.brk"/>
+            </icu:boundaries>
+            <!--
+            <icu:extensions>
+              Korean particles should be added here and dictbe.cpp needs to be adjusted
+              once a Korean dictionary is added. Even without a Korean dictionary,
+              Korean particle list can be used when a Korean word written in CJK ideographs
+              or a Korean word in Hangul and a closing punctuation mark is followed by a Korean
+              particle. Examples include: "大韓民國은  民主共和國이다", "『님의 침묵』이" .
+            </icu:extensions>
+            -->
+        </icu:breakIteratorData>
+    </special>
+</ldml>
+
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt

index 6f7555d2f6fec91c6c0d3c544eb351117d2e9cef..72bd15803d6854839c16517cde04725ff244e8c2 100644 (file)
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1981,6 +1981,58 @@ Bangkok)•</data>
  #これらの連絡先はデバイスをロック解除しなくても表示され -> これらの▁連絡先は▁デバイスを▁ロック▁解除しなくても▁表示され
  <data>•\u3053\u308C\u3089\u306E•\u9023\u7D61\u5148\u306F•\u30C7\u30D0\u30A4\u30B9\u3092•\u30ED\u30C3\u30AF•\u89E3\u9664\u3057\u306A\u304F\u3066\u3082•\u8868\u793A\u3055\u308C•</data>
  
+# Test the differences in ko with or without lw=phrase.
+<locale ko@lw=phrase>
+<line>
+#1948년 7월 12일에 제정되고 8차에 국민투표에 의하여 개정한다.
+<data>•1948년 •7월 •12일에 •제정되고 •8차에 •국민투표에 •의하여 •개정한다.•</data>
+#대한민국은 민주공화국이다.
+<data>•대한민국은 •민주공화국이다.•</data>
+#서울에서 부산까지 London까지
+<data>•서울에서 •부산까지 •London까지•</data>
+#LTE가 안 되면 WiFi를
+<data>•LTE가 •안 •되면 •WiFi를•</data>
+#<님의 침묵>을 읽고 느낀 점은?
+<data>•\u003c님의 •침묵\u003e을 •읽고 •느낀 •점은?•</data>
+# The following entry passes in ICU4C but fails in ICU4J for an unknown reason.
+#"님의 침묵"을 읽고
+#<data>•"님의 •침묵"을 •읽고•</data>
+# The following 3 lines are not handled properly, yet.
+#“님의 침묵”을 읽고
+#<data>•“님의 •침묵”을 •읽고•</data>
+#『님의 침묵』을 읽고
+#<data>•『님의 •침묵』을 •읽고•</data>
+#大韓民國은 民主共和國이다
+#<data>•大韓民國은 •民主•共和國이다•</data>
+# All the tests for ja@lw=phrase should also work in Korean.
+#[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
+<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+
+<locale ko>
+<line>
+#1948년 7월 12일에 제정되고 8차에 국민투표에 의하여 개정한다.
+<data>•1948•년 •7•월 •12•일•에 •제•정•되•고 •8•차•에 •국•민•투•표•에 •의•하•여•개•정•한•다.•</data>
+#대한민국은 민주공화국이다.
+<data>•대•한•민•국•은 •민•주•공•화•국•이•다.•</data>
+#서울에서 부산까지 London까지
+<data>•서•울•에•서 •부•산•까•지 •London•까•지•</data>
+#LTE가 안 되면 WiFi를
+<data>•LTE•가 •안 •되•면 •WiFi•를•</data>
+#<님의 침묵>을 읽고 느낀 점은?
+<data>•\u003c•님•의 •침•묵•\u003e•을 •읽•고 •느•낀 •점•은?•</data>
+#"님의 침묵"을 읽고
+<data>•"님•의 •침•묵"을 •읽•고•</data>
+#“님의 침묵”을 읽고
+<data>•“님•의 •침•묵”을 •읽•고•</data>
+#『님의 침묵』을 읽고
+<data>•『님•의 •침•묵』•을 •읽•고•</data>
+#『foo bar』load
+<data>•『foo •bar』•load•</data>
+#《님의 침묵》을 읽고
+<data>•《님•의 •침•묵》•을 •읽•고•</data>
+
  ####################################################################################
  #
  #  Test rule status values
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java

index 3de520597aa7d0e796ccbcc2cf3282670139801e..c78f36ed638b371c7acee0b60be693ec8b04c51d 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
@@ -136,7 +136,8 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
                  typeKeyExt = "_" + keyValue;
              }
              String language = locale.getLanguage();
-            if (language != null && language.equals("ja")) {
+            // lw=phrase is only supported in Japanese and Korean
+            if (language != null && (language.equals("ja") || language.equals("ko"))) {
                  keyValue = locale.getKeywordValue("lw");
                  if (keyValue != null && keyValue.equals("phrase")) {
                      typeKeyExt += "_" + keyValue;
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar

index 906ece8061650f42b92218f426cba84dcc566b9e..995bf905d04ae3130f293001491f396e99b309af 100644 (file)
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:06dfeeb5d37e1335a30a60f86cedffc88a3bfac1fa88a3a01cde60b1c2cb7e03
-size 14175154
+oid sha256:a751c6fca25c606a7748811c108cbb07f594322ca2430f3445d307920ef99cc0
+size 14175551
diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar

index b97c7b0775c5ea189e329e3288bbf63af466c0a5..9aa3fce963480f68d73cd547cf6d7705dbfcfb77 100644 (file)
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
-oid sha256:048c14ed0e27b1e9a48c20f3d24b8ab10aa9d54b7aedf9eab1753f2ed3191ad0
-size 831605
+oid sha256:5b24e24cc139203a0f7887abdbf5f6df37a61bfa727d4be8ae3b916367a39137
+size 831604
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt

index 6f7555d2f6fec91c6c0d3c544eb351117d2e9cef..72bd15803d6854839c16517cde04725ff244e8c2 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -1981,6 +1981,58 @@ Bangkok)•</data>
  #これらの連絡先はデバイスをロック解除しなくても表示され -> これらの▁連絡先は▁デバイスを▁ロック▁解除しなくても▁表示され
  <data>•\u3053\u308C\u3089\u306E•\u9023\u7D61\u5148\u306F•\u30C7\u30D0\u30A4\u30B9\u3092•\u30ED\u30C3\u30AF•\u89E3\u9664\u3057\u306A\u304F\u3066\u3082•\u8868\u793A\u3055\u308C•</data>
  
+# Test the differences in ko with or without lw=phrase.
+<locale ko@lw=phrase>
+<line>
+#1948년 7월 12일에 제정되고 8차에 국민투표에 의하여 개정한다.
+<data>•1948년 •7월 •12일에 •제정되고 •8차에 •국민투표에 •의하여 •개정한다.•</data>
+#대한민국은 민주공화국이다.
+<data>•대한민국은 •민주공화국이다.•</data>
+#서울에서 부산까지 London까지
+<data>•서울에서 •부산까지 •London까지•</data>
+#LTE가 안 되면 WiFi를
+<data>•LTE가 •안 •되면 •WiFi를•</data>
+#<님의 침묵>을 읽고 느낀 점은?
+<data>•\u003c님의 •침묵\u003e을 •읽고 •느낀 •점은?•</data>
+# The following entry passes in ICU4C but fails in ICU4J for an unknown reason.
+#"님의 침묵"을 읽고
+#<data>•"님의 •침묵"을 •읽고•</data>
+# The following 3 lines are not handled properly, yet.
+#“님의 침묵”을 읽고
+#<data>•“님의 •침묵”을 •읽고•</data>
+#『님의 침묵』을 읽고
+#<data>•『님의 •침묵』을 •읽고•</data>
+#大韓民國은 民主共和國이다
+#<data>•大韓民國은 •民主•共和國이다•</data>
+# All the tests for ja@lw=phrase should also work in Korean.
+#[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
+<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
+#９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
+<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
+
+<locale ko>
+<line>
+#1948년 7월 12일에 제정되고 8차에 국민투표에 의하여 개정한다.
+<data>•1948•년 •7•월 •12•일•에 •제•정•되•고 •8•차•에 •국•민•투•표•에 •의•하•여•개•정•한•다.•</data>
+#대한민국은 민주공화국이다.
+<data>•대•한•민•국•은 •민•주•공•화•국•이•다.•</data>
+#서울에서 부산까지 London까지
+<data>•서•울•에•서 •부•산•까•지 •London•까•지•</data>
+#LTE가 안 되면 WiFi를
+<data>•LTE•가 •안 •되•면 •WiFi•를•</data>
+#<님의 침묵>을 읽고 느낀 점은?
+<data>•\u003c•님•의 •침•묵•\u003e•을 •읽•고 •느•낀 •점•은?•</data>
+#"님의 침묵"을 읽고
+<data>•"님•의 •침•묵"을 •읽•고•</data>
+#“님의 침묵”을 읽고
+<data>•“님•의 •침•묵”을 •읽•고•</data>
+#『님의 침묵』을 읽고
+<data>•『님•의 •침•묵』•을 •읽•고•</data>
+#『foo bar』load
+<data>•『foo •bar』•load•</data>
+#《님의 침묵》을 읽고
+<data>•《님•의 •침•묵》•을 •읽•고•</data>
+
  ####################################################################################
  #
  #  Test rule status values
diff --git a/tools/cldr/cldr-to-icu/build-icu-data.xml b/tools/cldr/cldr-to-icu/build-icu-data.xml

index a9488cc78f035229c6859cf45a77e2b5bd835930..e60290d42acaa66f26f8cc2f831e8790951b9d6f 100644 (file)
--- a/tools/cldr/cldr-to-icu/build-icu-data.xml
+++ b/tools/cldr/cldr-to-icu/build-icu-data.xml
@@ -356,7 +356,7 @@
              <directory dir="brkitr" inheritLanguageSubtag="zh_Hant">
                  <localeIds>
                      root,
-                    de, el, en, en_US_POSIX, en_US, es, fi, fr, it, ja, pt, ru, sv, zh_Hant, zh
+                    de, el, en, en_US_POSIX, en_US, es, fi, fr, it, ja, ko, pt, ru, sv, zh_Hant, zh
                  </localeIds>
              </directory>
author	Jungshik Shin <jshin@chromium.org>
	Thu, 25 Aug 2022 08:04:33 +0000 (01:04 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Mon, 7 Nov 2022 22:30:49 +0000 (22:30 +0000)
icu4c/source/common/brkiter.cpp		patch \| blob \| history
icu4c/source/data/brkitr/ko.txt	[new file with mode: 0644]	patch \| blob
icu4c/source/data/xml/brkitr/ko.xml	[new file with mode: 0644]	patch \| blob
icu4c/source/test/testdata/rbbitst.txt		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java		patch \| blob \| history
icu4j/main/shared/data/icudata.jar		patch \| blob \| history
icu4j/main/shared/data/testdata.jar		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt		patch \| blob \| history
tools/cldr/cldr-to-icu/build-icu-data.xml		patch \| blob \| history