From: Markus Scherer Date: Fri, 1 Jul 2011 22:17:53 +0000 (+0000) Subject: ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46 X-Git-Tag: milestone-59-0-1~4691 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3044b396156f5dfeafad01ebe7b5d2d237d4e35d;p=icu ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46 X-SVN-Rev: 30267 --- diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java index e2413a9e9e2..acb645874e6 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java @@ -116,21 +116,19 @@ public final class UCharacterProperty /** * Gets the unicode additional properties. - * C version getUnicodeProperties. + * Java version of C u_getUnicodeProperties(). * @param codepoint codepoint whose additional properties is to be * retrieved * @param column The column index. * @return unicode properties */ - public int getAdditional(int codepoint, int column) { - if (column == -1) { - return getProperty(codepoint); + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; } - if (column < 0 || column >= m_additionalColumnsCount_) { - return 0; - } - return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; - } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } static final int MY_MASK = UCharacterProperty.TYPE_MASK & ((1<=0xb7) { + checkLabelContextO(labelString, labelStart, labelLength, info); + } if(toASCII) { if(wasPunycode) { // Leave a Punycode label unchanged if it has no severe errors. 
@@ -734,6 +738,96 @@ public final class UTS46 extends IDNA { return true; } + private void + checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) { + int labelEnd=labelStart+labelLength-1; // inclusive + int arabicDigits=0; // -1 for 066x, +1 for 06Fx + for(int i=labelStart; i<=labelEnd; ++i) { + int c=label.charAt(i); + if(c<0xb7) { + // ASCII fastpath + } else if(c<=0x6f9) { + if(c==0xb7) { + // Appendix A.3. MIDDLE DOT (U+00B7) + // Rule Set: + // False; + // If Before(cp) .eq. U+006C And + // After(cp) .eq. U+006C Then True; + if(!(labelStart0) { + addLabelError(info, Error.CONTEXTO_DIGITS); + } + arabicDigits=-1; + } else if(0x6f0<=c) { + if(arabicDigits<0) { + addLabelError(info, Error.CONTEXTO_DIGITS); + } + arabicDigits=1; + } + } + } else if(c==0x30fb) { + // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) + // Rule Set: + // False; + // For All Characters: + // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; + // End For; + for(int j=labelStart;; j+=Character.charCount(c)) { + if(j>labelEnd) { + addLabelError(info, Error.CONTEXTO_PUNCTUATION); + break; + } + c=Character.codePointAt(label, j); + int script=UScript.getScript(c); + if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { + break; + } + } + } + } + } + // TODO: make public(?) -- in C, these are public in uchar.h private static int U_MASK(int x) { return 1<This option is ignored by the UTS46 implementation. * (UTS #46 disallows unassigned code points.) * @stable ICU 2.8 */ @@ -68,12 +70,14 @@ public abstract class IDNA { * Option to check whether the input conforms to the STD3 ASCII rules, * for example the restriction of labels to LDH characters * (ASCII Letters, Digits and Hyphen-Minus). + * For use in static worker and factory methods. * @stable ICU 2.8 */ public static final int USE_STD3_RULES = 2; /** * IDNA option to check for whether the input conforms to the BiDi rules. - * This option is ignored by the IDNA2003 implementation. 
+ * For use in static worker and factory methods. + * <p>
This option is ignored by the IDNA2003 implementation. * (IDNA2003 always performs a BiDi check.) * @draft ICU 4.6 * @provisional This API might change or be removed in a future release. @@ -81,7 +85,8 @@ public abstract class IDNA { public static final int CHECK_BIDI = 4; /** * IDNA option to check for whether the input conforms to the CONTEXTJ rules. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + * <p>
This option is ignored by the IDNA2003 implementation. * (The CONTEXTJ check is new in IDNA2008.) * @draft ICU 4.6 * @provisional This API might change or be removed in a future release. @@ -89,8 +94,9 @@ public abstract class IDNA { public static final int CHECK_CONTEXTJ = 8; /** * IDNA option for nontransitional processing in ToASCII(). - * By default, ToASCII() uses transitional processing. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + * <p>
By default, ToASCII() uses transitional processing. + * <p>
This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @draft ICU 4.6 * @provisional This API might change or be removed in a future release. @@ -98,13 +104,25 @@ public abstract class IDNA { public static final int NONTRANSITIONAL_TO_ASCII = 0x10; /** * IDNA option for nontransitional processing in ToUnicode(). - * By default, ToUnicode() uses transitional processing. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + * <p>
By default, ToUnicode() uses transitional processing. + * <p>
This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @draft ICU 4.6 * @provisional This API might change or be removed in a future release. */ public static final int NONTRANSITIONAL_TO_UNICODE = 0x20; + /** + * IDNA option to check for whether the input conforms to the CONTEXTO rules. + * For use in static worker and factory methods. + * <p>
This option is ignored by the IDNA2003 implementation. + * (The CONTEXTO check is new in IDNA2008.) + * <p>
This is for use by registries for IDNA2008 conformance. + * UTS #46 does not require the CONTEXTO check. + * @draft ICU 49 + * @provisional This API might change or be removed in a future release. + */ + public static final int CHECK_CONTEXTO = 0x40; /** * Returns an IDNA instance which implements UTS #46. @@ -440,7 +458,22 @@ public abstract class IDNA { * @draft ICU 4.6 * @provisional This API might change or be removed in a future release. */ - CONTEXTJ + CONTEXTJ, + /** + * A label does not meet the IDNA CONTEXTO requirements for punctuation characters. + * Some punctuation characters "Would otherwise have been DISALLOWED" + * but are allowed in certain contexts. (RFC 5892) + * @draft ICU 49 + * @provisional This API might change or be removed in a future release. + */ + CONTEXTO_PUNCTUATION, + /** + * A label does not meet the IDNA CONTEXTO requirements for digits. + * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx). + * @draft ICU 49 + * @provisional This API might change or be removed in a future release. + */ + CONTEXTO_DIGITS } /** diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java index 4ba9a4b9100..c9a93c660c0 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2010, International Business Machines +* Copyright (C) 2010-2011, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* */ @@ -26,8 +26,11 @@ public class UTS46Test extends TestFmwk { new UTS46Test().run(args); } public UTS46Test() { - trans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ); - nontrans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ| + int commonOptions= + IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI| + IDNA.CHECK_CONTEXTJ|IDNA.CHECK_CONTEXTO; + trans=IDNA.getUTS46Instance(commonOptions); + nontrans=IDNA.getUTS46Instance(commonOptions| IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE); } @@ -107,6 +110,8 @@ public class UTS46Test extends TestFmwk { errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL); errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI); errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ); + errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_PUNCTUATION", IDNA.Error.CONTEXTO_PUNCTUATION); + errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_DIGITS", IDNA.Error.CONTEXTO_DIGITS); } private static final class TestCase { @@ -424,6 +429,29 @@ public class UTS46Test extends TestFmwk { "\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" }, { "\u0644\u200C", "N", // D ZWNJ "\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" }, + { "\u0660\u0661", "B", // Arabic-Indic Digits alone + "\u0660\u0661", "UIDNA_ERROR_BIDI" }, + { "\u06F0\u06F1", "B", // Extended Arabic-Indic Digits alone + "\u06F0\u06F1", "" }, + { "\u0660\u06F1", "B", // Mixed Arabic-Indic Digits + "\u0660\u06F1", "UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI" }, + // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters + // in their correct contexts, + // then each in incorrect context. 
+ { "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "B", + "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "UIDNA_ERROR_BIDI" }, + { "l\u00B7", "B", + "l\u00B7", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, + { "\u00B7l", "B", + "\u00B7l", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, + { "\u0375", "B", + "\u0375", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, + { "\u03B1\u05F3", "B", + "\u03B1\u05F3", "UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI" }, + { "\u05F4", "B", + "\u05F4", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, + { "l\u30FB", "B", + "l\u30FB", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" }, // { "", "B", // "", "" }, };