From: Fredrik Roubert Date: Thu, 26 Jul 2018 18:15:45 +0000 (+0200) Subject: ICU-20006 Parameterize the TestSourceTargetSet2 test case. X-Git-Tag: release-63-rc~95 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d2b2d2409a5ae2c63628467b456a2601da2d3809;p=icu ICU-20006 Parameterize the TestSourceTargetSet2 test case. By extracting this test case into a separate class and separating the complex and time-consuming building of the UnicodeSet disorderedMarks data structure into an @BeforeClass shared setup method, it becomes possible to test the 18 different transliteration rules as a parameterized test case. This will lower the running time per test case and also aid in debugging as it will make it immediately obvious which transliteration rules cause test failures (and which don't). --- diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorDisorderedMarksTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorDisorderedMarksTest.java new file mode 100644 index 00000000000..ae7fe296e8a --- /dev/null +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorDisorderedMarksTest.java @@ -0,0 +1,242 @@ +// © 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License + +package com.ibm.icu.dev.test.translit; + +import java.util.Map.Entry; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; + +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UnicodeSet; + +import junitparams.JUnitParamsRunner; +import junitparams.Parameters; + +/** + * @test + * @summary Disordered marks test of Transliterator + */ +@RunWith(JUnitParamsRunner.class) +public class TransliteratorDisorderedMarksTest extends TestFmwk { + private static UnicodeSet disorderedMarks; + + @AfterClass + public static void disorderedMarksNull() { + disorderedMarks = null; + } + + @BeforeClass + public static void disorderedMarksAddAll() { + Normalizer2 nfc = Normalizer2.getNFCInstance(); + Normalizer2 nfd = Normalizer2.getNFDInstance(); + + // Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE); + // UnicodeSet nfkdSource = new UnicodeSet(); + // UnicodeSet nfkdTarget = new UnicodeSet(); + // for (int i = 0; i <= 0x10FFFF; ++i) { + // if (nfkd.isInert(i)) { + // continue; + // } + // nfkdSource.add(i); + // String t = nfkd.getDecomposition(i); + // if (t != null) { + // nfkdTarget.addAll(t); + // } else { + // nfkdTarget.add(i); + // } + // } + // nfkdSource.freeze(); + // nfkdTarget.freeze(); + // logln("NFKD Source: " + nfkdSource.toPattern(false)); + // logln("NFKD Target: " + nfkdTarget.toPattern(false)); + + UnicodeMap leadToTrail = new UnicodeMap(); + UnicodeMap leadToSources = new UnicodeMap(); + UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze(); + CanonicalIterator can = new CanonicalIterator(""); + + disorderedMarks = new UnicodeSet(); + + for (int i = 0; i <= 0x10FFFF; ++i) { + String s = nfd.getDecomposition(i); + 
if (s == null) { + continue; + } + + can.setSource(s); + for (String t = can.next(); t != null; t = can.next()) { + disorderedMarks.add(t); + } + + // if s has two code points, (or more), add the lead/trail information + int first = s.codePointAt(0); + int firstCount = Character.charCount(first); + if (s.length() == firstCount) continue; + String trailString = s.substring(firstCount); + + // add all the trail characters + if (!nonStarters.containsSome(trailString)) { + continue; + } + UnicodeSet trailSet = leadToTrail.get(first); + if (trailSet == null) { + leadToTrail.put(first, trailSet = new UnicodeSet()); + } + trailSet.addAll(trailString); // add remaining trails + + // add the sources + UnicodeSet sourcesSet = leadToSources.get(first); + if (sourcesSet == null) { + leadToSources.put(first, sourcesSet = new UnicodeSet()); + } + sourcesSet.add(i); + } + + + for (Entry x : leadToSources.entrySet()) { + String lead = x.getKey(); + UnicodeSet sources = x.getValue(); + UnicodeSet trailSet = leadToTrail.get(lead); + for (String source : sources) { + for (String trail : trailSet) { + can.setSource(source + trail); + for (String t = can.next(); t != null; t = can.next()) { + if (t.endsWith(trail)) continue; + disorderedMarks.add(t); + } + } + } + } + + + for (String s : nonStarters) { + disorderedMarks.add("\u0345" + s); + disorderedMarks.add(s+"\u0323"); + String xx = nfc.normalize("\u01EC" + s); + if (!xx.startsWith("\u01EC")) { + logln("??"); + } + } + + // for (int i = 0; i <= 0x10FFFF; ++i) { + // String s = nfkd.getDecomposition(i); + // if (s != null) { + // disorderedMarks.add(s); + // disorderedMarks.add(nfc.normalize(s)); + // addDerivedStrings(nfc, disorderedMarks, s); + // } + // s = nfd.getDecomposition(i); + // if (s != null) { + // disorderedMarks.add(s); + // } + // if (!nfc.isInert(i)) { + // if (i == 0x00C0) { + // logln("\u00C0"); + // } + // can.setSource(s+"\u0334"); + // for (String t = can.next(); t != null; t = can.next()) { + // 
addDerivedStrings(nfc, disorderedMarks, t); + // } + // can.setSource(s+"\u0345"); + // for (String t = can.next(); t != null; t = can.next()) { + // addDerivedStrings(nfc, disorderedMarks, t); + // } + // can.setSource(s+"\u0323"); + // for (String t = can.next(); t != null; t = can.next()) { + // addDerivedStrings(nfc, disorderedMarks, t); + // } + // } + // } + logln("Test cases: " + disorderedMarks.size()); + disorderedMarks.addAll(0,0x10FFFF).freeze(); + logln("isInert \u0104 " + nfc.isInert('\u0104')); + } + + @Test + @Parameters({ + ":: [:sc=COMMON:] any-name;", + + ":: [:Greek:] hex-any/C;", + ":: [:Greek:] any-hex/C;", + + ":: [[:Mn:][:Me:]] remove;", + ":: [[:Mn:][:Me:]] null;", + + + ":: lower;", + ":: upper;", + ":: title;", + ":: CaseFold;", + + ":: NFD;", + ":: NFC;", + ":: NFKD;", + ":: NFKC;", + + ":: [[:Mn:][:Me:]] NFKD;", + ":: Latin-Greek;", + ":: [:Latin:] NFKD;", + ":: NFKD;", + ":: NFKD;\n" + + ":: [[:Mn:][:Me:]] remove;\n" + + ":: NFC;", + }) + public void TestSourceTargetSet2(String rule) { + Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD); + UnicodeSet actualSource = trans.getSourceSet(); + UnicodeSet actualTarget = trans.getTargetSet(); + UnicodeSet empiricalSource = new UnicodeSet(); + UnicodeSet empiricalTarget = new UnicodeSet(); + String ruleDisplay = rule.replace("\n", "\t\t"); + UnicodeSet toTest = disorderedMarks; + Normalizer2 nfd = Normalizer2.getNFDInstance(); + + String test = nfd.normalize("\u0104"); + boolean DEBUG = true; + @SuppressWarnings("unused") + int count = 0; // for debugging + for (String s : toTest) { + if (s.equals(test)) { + logln(test); + } + String t = trans.transform(s); + if (!s.equals(t)) { + if (!TransliteratorTest.isAtomic(s, t, trans)) { + TransliteratorTest.isAtomic(s, t, trans); + continue; + } + + // only keep the part that changed; so skip the front and end. 
+ // int start = findSharedStartLength(s,t); + // int end = findSharedEndLength(s,t); + // if (start != 0 || end != 0) { + // s = s.substring(start, s.length() - end); + // t = t.substring(start, t.length() - end); + // } + if (DEBUG) { + if (!actualSource.containsAll(s)) { + count++; + } + if (!actualTarget.containsAll(t)) { + count++; + } + } + TransliteratorTest.addSourceTarget(s, empiricalSource, t, empiricalTarget); + } + } + if (rule.contains("title")) { + // See the comment in TestCasing() about the iota subscript. + empiricalSource.remove(0x345); + } + TransliteratorTest.assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, TransliteratorTest.SetAssert.MISSING_OK); + TransliteratorTest.assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, TransliteratorTest.SetAssert.MISSING_OK); + } +} diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java index b71629448ba..db20f6ea409 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java @@ -15,7 +15,6 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; -import java.util.Map.Entry; import org.junit.Test; import org.junit.runner.RunWith; @@ -23,14 +22,11 @@ import org.junit.runners.JUnit4; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; -import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Utility; import com.ibm.icu.impl.UtilityExtensions; import com.ibm.icu.lang.CharSequences; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.CanonicalIterator; -import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Replaceable; import com.ibm.icu.text.ReplaceableString; import 
com.ibm.icu.text.StringTransform; @@ -2989,218 +2985,6 @@ public class TransliteratorTest extends TestFmwk { } } - @Test - public void TestSourceTargetSet2() { - - - Normalizer2 nfc = Normalizer2.getNFCInstance(); - Normalizer2 nfd = Normalizer2.getNFDInstance(); - - // Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE); - // UnicodeSet nfkdSource = new UnicodeSet(); - // UnicodeSet nfkdTarget = new UnicodeSet(); - // for (int i = 0; i <= 0x10FFFF; ++i) { - // if (nfkd.isInert(i)) { - // continue; - // } - // nfkdSource.add(i); - // String t = nfkd.getDecomposition(i); - // if (t != null) { - // nfkdTarget.addAll(t); - // } else { - // nfkdTarget.add(i); - // } - // } - // nfkdSource.freeze(); - // nfkdTarget.freeze(); - // logln("NFKD Source: " + nfkdSource.toPattern(false)); - // logln("NFKD Target: " + nfkdTarget.toPattern(false)); - - UnicodeMap leadToTrail = new UnicodeMap(); - UnicodeMap leadToSources = new UnicodeMap(); - UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze(); - CanonicalIterator can = new CanonicalIterator(""); - - UnicodeSet disorderedMarks = new UnicodeSet(); - - for (int i = 0; i <= 0x10FFFF; ++i) { - String s = nfd.getDecomposition(i); - if (s == null) { - continue; - } - - can.setSource(s); - for (String t = can.next(); t != null; t = can.next()) { - disorderedMarks.add(t); - } - - // if s has two code points, (or more), add the lead/trail information - int first = s.codePointAt(0); - int firstCount = Character.charCount(first); - if (s.length() == firstCount) continue; - String trailString = s.substring(firstCount); - - // add all the trail characters - if (!nonStarters.containsSome(trailString)) { - continue; - } - UnicodeSet trailSet = leadToTrail.get(first); - if (trailSet == null) { - leadToTrail.put(first, trailSet = new UnicodeSet()); - } - trailSet.addAll(trailString); // add remaining trails - - // add the sources - UnicodeSet sourcesSet = leadToSources.get(first); - if (sourcesSet == null) { - 
leadToSources.put(first, sourcesSet = new UnicodeSet()); - } - sourcesSet.add(i); - } - - - for (Entry x : leadToSources.entrySet()) { - String lead = x.getKey(); - UnicodeSet sources = x.getValue(); - UnicodeSet trailSet = leadToTrail.get(lead); - for (String source : sources) { - for (String trail : trailSet) { - can.setSource(source + trail); - for (String t = can.next(); t != null; t = can.next()) { - if (t.endsWith(trail)) continue; - disorderedMarks.add(t); - } - } - } - } - - - for (String s : nonStarters) { - disorderedMarks.add("\u0345" + s); - disorderedMarks.add(s+"\u0323"); - String xx = nfc.normalize("\u01EC" + s); - if (!xx.startsWith("\u01EC")) { - logln("??"); - } - } - - // for (int i = 0; i <= 0x10FFFF; ++i) { - // String s = nfkd.getDecomposition(i); - // if (s != null) { - // disorderedMarks.add(s); - // disorderedMarks.add(nfc.normalize(s)); - // addDerivedStrings(nfc, disorderedMarks, s); - // } - // s = nfd.getDecomposition(i); - // if (s != null) { - // disorderedMarks.add(s); - // } - // if (!nfc.isInert(i)) { - // if (i == 0x00C0) { - // logln("\u00C0"); - // } - // can.setSource(s+"\u0334"); - // for (String t = can.next(); t != null; t = can.next()) { - // addDerivedStrings(nfc, disorderedMarks, t); - // } - // can.setSource(s+"\u0345"); - // for (String t = can.next(); t != null; t = can.next()) { - // addDerivedStrings(nfc, disorderedMarks, t); - // } - // can.setSource(s+"\u0323"); - // for (String t = can.next(); t != null; t = can.next()) { - // addDerivedStrings(nfc, disorderedMarks, t); - // } - // } - // } - logln("Test cases: " + disorderedMarks.size()); - disorderedMarks.addAll(0,0x10FFFF).freeze(); - logln("isInert \u0104 " + nfc.isInert('\u0104')); - - Object[][] rules = { - {":: [:sc=COMMON:] any-name;", null}, - - {":: [:Greek:] hex-any/C;", null}, - {":: [:Greek:] any-hex/C;", null}, - - {":: [[:Mn:][:Me:]] remove;", null}, - {":: [[:Mn:][:Me:]] null;", null}, - - - {":: lower;", null}, - {":: upper;", null}, - {":: 
title;", null}, - {":: CaseFold;", null}, - - {":: NFD;", null}, - {":: NFC;", null}, - {":: NFKD;", null}, - {":: NFKC;", null}, - - {":: [[:Mn:][:Me:]] NFKD;", null}, - {":: Latin-Greek;", null}, - {":: [:Latin:] NFKD;", null}, - {":: NFKD;", null}, - {":: NFKD;\n" + - ":: [[:Mn:][:Me:]] remove;\n" + - ":: NFC;", null}, - }; - for (Object[] rulex : rules) { - String rule = (String) rulex[0]; - Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD); - UnicodeSet actualSource = trans.getSourceSet(); - UnicodeSet actualTarget = trans.getTargetSet(); - UnicodeSet empiricalSource = new UnicodeSet(); - UnicodeSet empiricalTarget = new UnicodeSet(); - String ruleDisplay = rule.replace("\n", "\t\t"); - UnicodeSet toTest = disorderedMarks; - // if (rulex[1] != null) { - // toTest = new UnicodeSet(disorderedMarks); - // toTest.addAll((UnicodeSet) rulex[1]); - // } - - String test = nfd.normalize("\u0104"); - boolean DEBUG = true; - @SuppressWarnings("unused") - int count = 0; // for debugging - for (String s : toTest) { - if (s.equals(test)) { - logln(test); - } - String t = trans.transform(s); - if (!s.equals(t)) { - if (!isAtomic(s, t, trans)) { - isAtomic(s, t, trans); - continue; - } - - // only keep the part that changed; so skip the front and end. - // int start = findSharedStartLength(s,t); - // int end = findSharedEndLength(s,t); - // if (start != 0 || end != 0) { - // s = s.substring(start, s.length() - end); - // t = t.substring(start, t.length() - end); - // } - if (DEBUG) { - if (!actualSource.containsAll(s)) { - count++; - } - if (!actualTarget.containsAll(t)) { - count++; - } - } - addSourceTarget(s, empiricalSource, t, empiricalTarget); - } - } - if (rule.contains("title")) { - // See the comment in TestCasing() about the iota subscript. 
- empiricalSource.remove(0x345); - } - assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK); - assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK); - } - } - @Test public void TestSourceTargetSetFilter() { String[][] tests = { @@ -3256,7 +3040,7 @@ public class TransliteratorTest extends TestFmwk { } } - private boolean isAtomic(String s, String t, Transliterator trans) { + static boolean isAtomic(String s, String t, Transliterator trans) { for (int i = 1; i < s.length(); ++i) { if (!CharSequences.onCharacterBoundary(s, i)) { continue; @@ -3285,7 +3069,7 @@ public class TransliteratorTest extends TestFmwk { // } } - private void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) { + static void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) { expectedSource.addAll(s); if (t.length() > 0) { expectedTarget.addAll(t); @@ -3378,7 +3162,7 @@ public class TransliteratorTest extends TestFmwk { enum SetAssert {EQUALS, MISSING_OK, EXTRA_OK} - void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) { + static void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) { boolean haveError = false; if (!actual.containsAll(empirical)) { UnicodeSet missing = new UnicodeSet(empirical).removeAll(actual); @@ -3395,7 +3179,7 @@ public class TransliteratorTest extends TestFmwk { } } - private String toPattern(UnicodeSet missing) { + private static String toPattern(UnicodeSet missing) { String result = missing.toPattern(false); if (result.length() < 200) { return result;