--- /dev/null
+// © 2018 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+package com.ibm.icu.dev.test.translit;
+
+import java.util.Map.Entry;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.dev.util.UnicodeMap;
+import com.ibm.icu.text.CanonicalIterator;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.Transliterator;
+import com.ibm.icu.text.UnicodeSet;
+
+import junitparams.JUnitParamsRunner;
+import junitparams.Parameters;
+
+/**
+ * Compares the source and target sets reported by a {@link Transliterator} with the sets
+ * observed empirically by transforming a corpus of "disordered marks" strings, i.e.
+ * canonically equivalent reorderings of decompositions and combining-mark sequences.
+ *
+ * <p>The corpus is built once per class by {@link #disorderedMarksAddAll()}, frozen, and then
+ * shared by every parameterized run of {@link #TestSourceTargetSet2(String)}. This test was
+ * split out of {@code TransliteratorTest}, whose static helpers it still uses.
+ *
+ * @test
+ * @summary Disordered marks test of Transliterator
+ */
+@RunWith(JUnitParamsRunner.class)
+public class TransliteratorDisorderedMarksTest extends TestFmwk {
+    /** Shared test corpus; created in the {@code @BeforeClass} hook, released in {@code @AfterClass}. */
+    private static UnicodeSet disorderedMarks;
+
+    /** Drops the reference to the shared corpus once all parameterized runs have finished. */
+    @AfterClass
+    public static void disorderedMarksNull() {
+        disorderedMarks = null;
+    }
+
+    /**
+     * Builds the shared corpus in {@link #disorderedMarks}. It is filled, in order, with:
+     * <ol>
+     *   <li>every canonical equivalent of each code point's NFD decomposition;</li>
+     *   <li>for each decomposition whose trailing part contains a non-starter, every canonical
+     *       equivalent of (source code point + one trail code point) that does not end with
+     *       that trail;</li>
+     *   <li>for each non-starter, U+0345 followed by it, and it followed by U+0323;</li>
+     *   <li>every single code point U+0000..U+10FFFF.</li>
+     * </ol>
+     * The resulting set is frozen before use.
+     */
+    @BeforeClass
+    public static void disorderedMarksAddAll() {
+        Normalizer2 nfc = Normalizer2.getNFCInstance();
+        Normalizer2 nfd = Normalizer2.getNFDInstance();
+
+        // Keyed by the first code point of an NFD decomposition.
+        UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap<UnicodeSet>();
+        UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap<UnicodeSet>();
+        UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
+        CanonicalIterator can = new CanonicalIterator("");
+
+        disorderedMarks = new UnicodeSet();
+
+        for (int i = 0; i <= 0x10FFFF; ++i) {
+            String s = nfd.getDecomposition(i);
+            if (s == null) {
+                continue;
+            }
+
+            can.setSource(s);
+            for (String t = can.next(); t != null; t = can.next()) {
+                disorderedMarks.add(t);
+            }
+
+            // if s has two code points, (or more), add the lead/trail information
+            int first = s.codePointAt(0);
+            int firstCount = Character.charCount(first);
+            if (s.length() == firstCount) {
+                continue;
+            }
+            String trailString = s.substring(firstCount);
+
+            // only decompositions with at least one non-starter after the lead are of interest
+            if (!nonStarters.containsSome(trailString)) {
+                continue;
+            }
+
+            // The (still empty) set is stored first and filled afterwards, as in the original code.
+            UnicodeSet trailSet = leadToTrail.get(first);
+            if (trailSet == null) {
+                trailSet = new UnicodeSet();
+                leadToTrail.put(first, trailSet);
+            }
+            trailSet.addAll(trailString); // add remaining trails
+
+            // add the sources
+            UnicodeSet sourcesSet = leadToSources.get(first);
+            if (sourcesSet == null) {
+                sourcesSet = new UnicodeSet();
+                leadToSources.put(first, sourcesSet);
+            }
+            sourcesSet.add(i);
+        }
+
+        // Combine every composed source with every trail seen for the same lead, and keep the
+        // canonical equivalents in which that trail is not the last element.
+        for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
+            String lead = x.getKey();
+            UnicodeSet sources = x.getValue();
+            UnicodeSet trailSet = leadToTrail.get(lead);
+            for (String source : sources) {
+                for (String trail : trailSet) {
+                    can.setSource(source + trail);
+                    for (String t = can.next(); t != null; t = can.next()) {
+                        if (t.endsWith(trail)) {
+                            continue;
+                        }
+                        disorderedMarks.add(t);
+                    }
+                }
+            }
+        }
+
+        for (String s : nonStarters) {
+            disorderedMarks.add("\u0345" + s);
+            disorderedMarks.add(s + "\u0323");
+            // Diagnostic only: log when NFC does not leave U+01EC at the front of the string.
+            String xx = nfc.normalize("\u01EC" + s);
+            if (!xx.startsWith("\u01EC")) {
+                logln("??");
+            }
+        }
+
+        // The logged size counts only the generated strings, before all code points are added.
+        logln("Test cases: " + disorderedMarks.size());
+        disorderedMarks.addAll(0, 0x10FFFF).freeze();
+        logln("isInert \u0104 " + nfc.isInert('\u0104'));
+    }
+
+    /**
+     * Builds a transliterator from {@code rule}, transforms every string of the shared corpus,
+     * and checks the empirically collected source/target characters against
+     * {@link Transliterator#getSourceSet()} and {@link Transliterator#getTargetSet()} using
+     * {@code TransliteratorTest.SetAssert.MISSING_OK}.
+     *
+     * <p>Strings that the transliterator leaves unchanged, and changed strings for which
+     * {@code TransliteratorTest.isAtomic} returns false, do not contribute to the empirical sets.
+     *
+     * <p>NOTE(review): JUnitParams is documented to split a single-string parameter on ',' and
+     * '|'; none of the rules below contain either character. Confirm before adding one that does.
+     *
+     * @param rule transliterator rule text, passed to {@link Transliterator#createFromRules}
+     */
+    @Test
+    @Parameters({
+        ":: [:sc=COMMON:] any-name;",
+
+        ":: [:Greek:] hex-any/C;",
+        ":: [:Greek:] any-hex/C;",
+
+        ":: [[:Mn:][:Me:]] remove;",
+        ":: [[:Mn:][:Me:]] null;",
+
+
+        ":: lower;",
+        ":: upper;",
+        ":: title;",
+        ":: CaseFold;",
+
+        ":: NFD;",
+        ":: NFC;",
+        ":: NFKD;",
+        ":: NFKC;",
+
+        ":: [[:Mn:][:Me:]] NFKD;",
+        ":: Latin-Greek;",
+        ":: [:Latin:] NFKD;",
+        // repeated from above; kept exactly as in the original rule list
+        ":: NFKD;",
+        ":: NFKD;\n" +
+        ":: [[:Mn:][:Me:]] remove;\n" +
+        ":: NFC;",
+    })
+    public void TestSourceTargetSet2(String rule) {
+        Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
+        UnicodeSet actualSource = trans.getSourceSet();
+        UnicodeSet actualTarget = trans.getTargetSet();
+        UnicodeSet empiricalSource = new UnicodeSet();
+        UnicodeSet empiricalTarget = new UnicodeSet();
+        // multi-line rules are flattened so they fit in a one-line assertion message
+        String ruleDisplay = rule.replace("\n", "\t\t");
+
+        // Diagnostic probe: the NFD form of U+0104 is logged when it is reached in the corpus.
+        String test = Normalizer2.getNFDInstance().normalize("\u0104");
+
+        for (String s : disorderedMarks) {
+            if (s.equals(test)) {
+                logln(test);
+            }
+            String t = trans.transform(s);
+            if (s.equals(t)) {
+                continue; // untouched by the transliterator, nothing to record
+            }
+            if (!TransliteratorTest.isAtomic(s, t, trans)) {
+                continue;
+            }
+            TransliteratorTest.addSourceTarget(s, empiricalSource, t, empiricalTarget);
+        }
+        if (rule.contains("title")) {
+            // See the comment in TestCasing() about the iota subscript
+            // (presumably in TransliteratorTest, where this test originated).
+            empiricalSource.remove(0x345);
+        }
+        TransliteratorTest.assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, TransliteratorTest.SetAssert.MISSING_OK);
+        TransliteratorTest.assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, TransliteratorTest.SetAssert.MISSING_OK);
+    }
+}
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
-import java.util.Map.Entry;
import org.junit.Test;
import org.junit.runner.RunWith;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
-import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UtilityExtensions;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
-import com.ibm.icu.text.CanonicalIterator;
-import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.StringTransform;
}
}
- @Test
- public void TestSourceTargetSet2() {
-
-
- Normalizer2 nfc = Normalizer2.getNFCInstance();
- Normalizer2 nfd = Normalizer2.getNFDInstance();
-
- // Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
- // UnicodeSet nfkdSource = new UnicodeSet();
- // UnicodeSet nfkdTarget = new UnicodeSet();
- // for (int i = 0; i <= 0x10FFFF; ++i) {
- // if (nfkd.isInert(i)) {
- // continue;
- // }
- // nfkdSource.add(i);
- // String t = nfkd.getDecomposition(i);
- // if (t != null) {
- // nfkdTarget.addAll(t);
- // } else {
- // nfkdTarget.add(i);
- // }
- // }
- // nfkdSource.freeze();
- // nfkdTarget.freeze();
- // logln("NFKD Source: " + nfkdSource.toPattern(false));
- // logln("NFKD Target: " + nfkdTarget.toPattern(false));
-
- UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
- UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
- UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
- CanonicalIterator can = new CanonicalIterator("");
-
- UnicodeSet disorderedMarks = new UnicodeSet();
-
- for (int i = 0; i <= 0x10FFFF; ++i) {
- String s = nfd.getDecomposition(i);
- if (s == null) {
- continue;
- }
-
- can.setSource(s);
- for (String t = can.next(); t != null; t = can.next()) {
- disorderedMarks.add(t);
- }
-
- // if s has two code points, (or more), add the lead/trail information
- int first = s.codePointAt(0);
- int firstCount = Character.charCount(first);
- if (s.length() == firstCount) continue;
- String trailString = s.substring(firstCount);
-
- // add all the trail characters
- if (!nonStarters.containsSome(trailString)) {
- continue;
- }
- UnicodeSet trailSet = leadToTrail.get(first);
- if (trailSet == null) {
- leadToTrail.put(first, trailSet = new UnicodeSet());
- }
- trailSet.addAll(trailString); // add remaining trails
-
- // add the sources
- UnicodeSet sourcesSet = leadToSources.get(first);
- if (sourcesSet == null) {
- leadToSources.put(first, sourcesSet = new UnicodeSet());
- }
- sourcesSet.add(i);
- }
-
-
- for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
- String lead = x.getKey();
- UnicodeSet sources = x.getValue();
- UnicodeSet trailSet = leadToTrail.get(lead);
- for (String source : sources) {
- for (String trail : trailSet) {
- can.setSource(source + trail);
- for (String t = can.next(); t != null; t = can.next()) {
- if (t.endsWith(trail)) continue;
- disorderedMarks.add(t);
- }
- }
- }
- }
-
-
- for (String s : nonStarters) {
- disorderedMarks.add("\u0345" + s);
- disorderedMarks.add(s+"\u0323");
- String xx = nfc.normalize("\u01EC" + s);
- if (!xx.startsWith("\u01EC")) {
- logln("??");
- }
- }
-
- // for (int i = 0; i <= 0x10FFFF; ++i) {
- // String s = nfkd.getDecomposition(i);
- // if (s != null) {
- // disorderedMarks.add(s);
- // disorderedMarks.add(nfc.normalize(s));
- // addDerivedStrings(nfc, disorderedMarks, s);
- // }
- // s = nfd.getDecomposition(i);
- // if (s != null) {
- // disorderedMarks.add(s);
- // }
- // if (!nfc.isInert(i)) {
- // if (i == 0x00C0) {
- // logln("\u00C0");
- // }
- // can.setSource(s+"\u0334");
- // for (String t = can.next(); t != null; t = can.next()) {
- // addDerivedStrings(nfc, disorderedMarks, t);
- // }
- // can.setSource(s+"\u0345");
- // for (String t = can.next(); t != null; t = can.next()) {
- // addDerivedStrings(nfc, disorderedMarks, t);
- // }
- // can.setSource(s+"\u0323");
- // for (String t = can.next(); t != null; t = can.next()) {
- // addDerivedStrings(nfc, disorderedMarks, t);
- // }
- // }
- // }
- logln("Test cases: " + disorderedMarks.size());
- disorderedMarks.addAll(0,0x10FFFF).freeze();
- logln("isInert \u0104 " + nfc.isInert('\u0104'));
-
- Object[][] rules = {
- {":: [:sc=COMMON:] any-name;", null},
-
- {":: [:Greek:] hex-any/C;", null},
- {":: [:Greek:] any-hex/C;", null},
-
- {":: [[:Mn:][:Me:]] remove;", null},
- {":: [[:Mn:][:Me:]] null;", null},
-
-
- {":: lower;", null},
- {":: upper;", null},
- {":: title;", null},
- {":: CaseFold;", null},
-
- {":: NFD;", null},
- {":: NFC;", null},
- {":: NFKD;", null},
- {":: NFKC;", null},
-
- {":: [[:Mn:][:Me:]] NFKD;", null},
- {":: Latin-Greek;", null},
- {":: [:Latin:] NFKD;", null},
- {":: NFKD;", null},
- {":: NFKD;\n" +
- ":: [[:Mn:][:Me:]] remove;\n" +
- ":: NFC;", null},
- };
- for (Object[] rulex : rules) {
- String rule = (String) rulex[0];
- Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
- UnicodeSet actualSource = trans.getSourceSet();
- UnicodeSet actualTarget = trans.getTargetSet();
- UnicodeSet empiricalSource = new UnicodeSet();
- UnicodeSet empiricalTarget = new UnicodeSet();
- String ruleDisplay = rule.replace("\n", "\t\t");
- UnicodeSet toTest = disorderedMarks;
- // if (rulex[1] != null) {
- // toTest = new UnicodeSet(disorderedMarks);
- // toTest.addAll((UnicodeSet) rulex[1]);
- // }
-
- String test = nfd.normalize("\u0104");
- boolean DEBUG = true;
- @SuppressWarnings("unused")
- int count = 0; // for debugging
- for (String s : toTest) {
- if (s.equals(test)) {
- logln(test);
- }
- String t = trans.transform(s);
- if (!s.equals(t)) {
- if (!isAtomic(s, t, trans)) {
- isAtomic(s, t, trans);
- continue;
- }
-
- // only keep the part that changed; so skip the front and end.
- // int start = findSharedStartLength(s,t);
- // int end = findSharedEndLength(s,t);
- // if (start != 0 || end != 0) {
- // s = s.substring(start, s.length() - end);
- // t = t.substring(start, t.length() - end);
- // }
- if (DEBUG) {
- if (!actualSource.containsAll(s)) {
- count++;
- }
- if (!actualTarget.containsAll(t)) {
- count++;
- }
- }
- addSourceTarget(s, empiricalSource, t, empiricalTarget);
- }
- }
- if (rule.contains("title")) {
- // See the comment in TestCasing() about the iota subscript.
- empiricalSource.remove(0x345);
- }
- assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
- assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
- }
- }
-
@Test
public void TestSourceTargetSetFilter() {
String[][] tests = {
}
}
- private boolean isAtomic(String s, String t, Transliterator trans) {
+ static boolean isAtomic(String s, String t, Transliterator trans) {
for (int i = 1; i < s.length(); ++i) {
if (!CharSequences.onCharacterBoundary(s, i)) {
continue;
// }
}
- private void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) {
+ static void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) {
expectedSource.addAll(s);
if (t.length() > 0) {
expectedTarget.addAll(t);
enum SetAssert {EQUALS, MISSING_OK, EXTRA_OK}
- void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) {
+ static void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) {
boolean haveError = false;
if (!actual.containsAll(empirical)) {
UnicodeSet missing = new UnicodeSet(empirical).removeAll(actual);
}
}
- private String toPattern(UnicodeSet missing) {
+ private static String toPattern(UnicodeSet missing) {
String result = missing.toPattern(false);
if (result.length() < 200) {
return result;