From: Markus Scherer Date: Fri, 1 Jul 2011 22:19:14 +0000 (+0000) Subject: ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46 X-Git-Tag: milestone-59-0-1~4690 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b6036a94f9923c8a6c99b304da914e8b0065fc28;p=icu ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46 X-SVN-Rev: 30268 --- diff --git a/icu4c/source/common/uchar.c b/icu4c/source/common/uchar.c index 04dd60b8e7f..d56ab121d06 100644 --- a/icu4c/source/common/uchar.c +++ b/icu4c/source/common/uchar.c @@ -1,6 +1,6 @@ /* ******************************************************************************** -* Copyright (C) 1996-2010, International Business Machines +* Copyright (C) 1996-2011, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************** * @@ -23,6 +23,7 @@ #include "unicode/uchar.h" #include "unicode/uscript.h" #include "unicode/udata.h" +#include "uassert.h" #include "umutex.h" #include "cmemory.h" #include "ucln_cmn.h" @@ -475,7 +476,7 @@ u_forDigit(int32_t digit, int8_t radix) { } } -/* miscellaneous, and support for uprops.c ---------------------------------- */ +/* miscellaneous, and support for uprops.cpp -------------------------------- */ U_CAPI void U_EXPORT2 u_getUnicodeVersion(UVersionInfo versionArray) { @@ -485,19 +486,19 @@ u_getUnicodeVersion(UVersionInfo versionArray) { } U_CFUNC uint32_t -u_getUnicodeProperties(UChar32 c, int32_t column) { - uint16_t vecIndex; +u_getMainProperties(UChar32 c) { + uint32_t props; + GET_PROPS(c, props); + return props; +} - if(column==-1) { - uint32_t props; - GET_PROPS(c, props); - return props; - } else if( - column<0 || column>=propsVectorsColumns - ) { +U_CFUNC uint32_t +u_getUnicodeProperties(UChar32 c, int32_t column) { + U_ASSERT(column>=0); + if(column>=propsVectorsColumns) { return 0; } else { - vecIndex=UTRIE2_GET16(&propsVectorsTrie, c); + uint16_t 
vecIndex=UTRIE2_GET16(&propsVectorsTrie, c); return propsVectors[vecIndex+column]; } } diff --git a/icu4c/source/common/unicode/uidna.h b/icu4c/source/common/unicode/uidna.h index 04b439d970f..bf49a35bd97 100644 --- a/icu4c/source/common/unicode/uidna.h +++ b/icu4c/source/common/unicode/uidna.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * - * Copyright (C) 2003-2010, International Business Machines + * Copyright (C) 2003-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -42,12 +42,14 @@ enum { /** * Default options value: None of the other options are set. + * For use in static worker and factory methods. * @stable ICU 2.6 */ UIDNA_DEFAULT=0, /** * Option to allow unassigned code points in domain names and labels. - * This option is ignored by the UTS46 implementation. + * For use in static worker and factory methods. + *

<p>This option is ignored by the UTS46 implementation. * (UTS #46 disallows unassigned code points.) * @stable ICU 2.6 */ @@ -56,39 +58,54 @@ enum { * Option to check whether the input conforms to the STD3 ASCII rules, * for example the restriction of labels to LDH characters * (ASCII Letters, Digits and Hyphen-Minus). + * For use in static worker and factory methods. * @stable ICU 2.6 */ UIDNA_USE_STD3_RULES=2, /** * IDNA option to check for whether the input conforms to the BiDi rules. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + *

<p>This option is ignored by the IDNA2003 implementation. * (IDNA2003 always performs a BiDi check.) * @draft ICU 4.6 */ UIDNA_CHECK_BIDI=4, /** * IDNA option to check for whether the input conforms to the CONTEXTJ rules. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + *

<p>This option is ignored by the IDNA2003 implementation. * (The CONTEXTJ check is new in IDNA2008.) * @draft ICU 4.6 */ UIDNA_CHECK_CONTEXTJ=8, /** * IDNA option for nontransitional processing in ToASCII(). - * By default, ToASCII() uses transitional processing. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + *

<p>By default, ToASCII() uses transitional processing. + *

<p>This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @draft ICU 4.6 */ UIDNA_NONTRANSITIONAL_TO_ASCII=0x10, /** * IDNA option for nontransitional processing in ToUnicode(). - * By default, ToUnicode() uses transitional processing. - * This option is ignored by the IDNA2003 implementation. + * For use in static worker and factory methods. + *

<p>By default, ToUnicode() uses transitional processing. + *

<p>This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @draft ICU 4.6 */ - UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20 + UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20, + /** + * IDNA option to check for whether the input conforms to the CONTEXTO rules. + * For use in static worker and factory methods. + *

<p>This option is ignored by the IDNA2003 implementation. + * (The CONTEXTO check is new in IDNA2008.) + *

<p>This is for use by registries for IDNA2008 conformance. + * UTS #46 does not require the CONTEXTO check. + * @draft ICU 49 + */ + UIDNA_CHECK_CONTEXTO=0x40 }; /** @@ -471,7 +488,20 @@ enum { * A label does not meet the IDNA CONTEXTJ requirements. * @draft ICU 4.6 */ - UIDNA_ERROR_CONTEXTJ=0x1000 + UIDNA_ERROR_CONTEXTJ=0x1000, + /** + * A label does not meet the IDNA CONTEXTO requirements for punctuation characters. + * Some punctuation characters "Would otherwise have been DISALLOWED" + * but are allowed in certain contexts. (RFC 5892) + * @draft ICU 49 + */ + UIDNA_ERROR_CONTEXTO_PUNCTUATION=0x2000, + /** + * A label does not meet the IDNA CONTEXTO requirements for digits. + * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx). + * @draft ICU 49 + */ + UIDNA_ERROR_CONTEXTO_DIGITS=0x4000 }; /* IDNA2003 API ------------------------------------------------------------- */ diff --git a/icu4c/source/common/uprops.cpp b/icu4c/source/common/uprops.cpp index 3d57a861a5e..d3872fe6210 100644 --- a/icu4c/source/common/uprops.cpp +++ b/icu4c/source/common/uprops.cpp @@ -214,7 +214,7 @@ static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={ * Must be in order of corresponding UProperty, * and there must be exactly one entry per binary UProperty. * - * Properties with mask==0 and contains==NULL are handled in code. + * Properties with mask==0 are handled in code. * For them, column is the UPropertySource value. 
*/ { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains }, @@ -345,7 +345,7 @@ static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty } static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { - int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getUnicodeProperties(c, -1)); + int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)); return UPROPS_NTV_GET_TYPE(ntv); } @@ -421,7 +421,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={ * Must be in order of corresponding UProperty, * and there must be exactly one entry per int UProperty. * - * Properties with mask==0 and getValue==NULL are handled in code. + * Properties with mask==0 are handled in code. * For them, column is the UPropertySource value. */ { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue }, diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h index 69d3d3456f1..2d7febf5f1c 100644 --- a/icu4c/source/common/uprops.h +++ b/icu4c/source/common/uprops.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2010, International Business Machines +* Copyright (C) 2002-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -192,10 +192,16 @@ enum { #define UPROPS_DT_MASK 0x0000001f +/** + * Gets the main properties value for a code point. + * Implemented in uchar.c for uprops.cpp. + */ +U_CFUNC uint32_t +u_getMainProperties(UChar32 c); + /** * Get a properties vector word for a code point. - * Implemented in uchar.c for uprops.c. - * column==-1 gets the 32-bit main properties word instead. + * Implemented in uchar.c for uprops.cpp. 
* @return 0 if no data or illegal argument */ U_CFUNC uint32_t diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp index 682f9ed4bdb..c81c467eec7 100644 --- a/icu4c/source/common/uts46.cpp +++ b/icu4c/source/common/uts46.cpp @@ -18,6 +18,7 @@ #include "unicode/idna.h" #include "unicode/normalizer2.h" +#include "unicode/uscript.h" #include "unicode/ustring.h" #include "cmemory.h" #include "cstring.h" @@ -188,6 +189,9 @@ private: UBool isLabelOkContextJ(const UChar *label, int32_t labelLength) const; + void + checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const; + const Normalizer2 &uts46Norm2; // uts46.nrm uint32_t options; }; @@ -822,6 +826,9 @@ UTS46::processLabel(UnicodeString &dest, ) { info.labelErrors|=UIDNA_ERROR_CONTEXTJ; } + if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { + checkLabelContextO(label, labelLength, info); + } if(toASCII) { if(wasPunycode) { // Leave a Punycode label unchanged if it has no severe errors. @@ -1171,6 +1178,109 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { return TRUE; } +void +UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const { + int32_t labelEnd=labelLength-1; // inclusive + int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx + for(int32_t i=0; i<=labelEnd; ++i) { + UChar32 c=label[i]; + if(c<0xb7) { + // ASCII fastpath + } else if(c<=0x6f9) { + if(c==0xb7) { + // Appendix A.3. MIDDLE DOT (U+00B7) + // Rule Set: + // False; + // If Before(cp) .eq. U+006C And + // After(cp) .eq. U+006C Then True; + if(!(00) { + info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; + } + arabicDigits=-1; + } else if(0x6f0<=c) { + if(arabicDigits<0) { + info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; + } + arabicDigits=1; + } + } + } else if(c==0x30fb) { + // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) + // Rule Set: + // False; + // For All Characters: + // If Script(cp) .in. 
{Hiragana, Katakana, Han} Then True; + // End For; + UErrorCode errorCode=U_ZERO_ERROR; + for(int j=0;;) { + if(j>labelEnd) { + info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; + break; + } + U16_NEXT(label, j, labelLength, c); + UScriptCode script=uscript_getScript(c, &errorCode); + if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { + break; + } + } + } + } +} + U_NAMESPACE_END // C API ------------------------------------------------------------------- *** diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp index f3480281dce..df699dd08e1 100644 --- a/icu4c/source/test/intltest/uts46test.cpp +++ b/icu4c/source/test/intltest/uts46test.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2010, International Business Machines +* Copyright (C) 2010-2011, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: uts46test.cpp @@ -55,11 +55,12 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha logln("TestSuite UTS46Test: "); if(trans==NULL) { IcuTestErrorCode errorCode(*this, "init/createUTS46Instance()"); - trans=IDNA::createUTS46Instance( - UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ, - errorCode); + uint32_t commonOptions= + UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI| + UIDNA_CHECK_CONTEXTJ|UIDNA_CHECK_CONTEXTO; + trans=IDNA::createUTS46Instance(commonOptions, errorCode); nontrans=IDNA::createUTS46Instance( - UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ| + commonOptions| UIDNA_NONTRANSITIONAL_TO_ASCII|UIDNA_NONTRANSITIONAL_TO_UNICODE, errorCode); if(errorCode.logDataIfFailureAndReset("createUTS46Instance()")) { @@ -534,6 +535,29 @@ static const TestCase testCases[]={ "\\u06EF\\u200C\\u06EF", UIDNA_ERROR_CONTEXTJ }, { "\\u0644\\u200C", "N", // D ZWNJ 
"\\u0644\\u200C", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ }, + { "\\u0660\\u0661", "B", // Arabic-Indic Digits alone + "\\u0660\\u0661", UIDNA_ERROR_BIDI }, + { "\\u06F0\\u06F1", "B", // Extended Arabic-Indic Digits alone + "\\u06F0\\u06F1", 0 }, + { "\\u0660\\u06F1", "B", // Mixed Arabic-Indic Digits + "\\u0660\\u06F1", UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI }, + // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters + // in their correct contexts, + // then each in incorrect context. + { "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", "B", + "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", UIDNA_ERROR_BIDI }, + { "l\\u00B7", "B", + "l\\u00B7", UIDNA_ERROR_CONTEXTO_PUNCTUATION }, + { "\\u00B7l", "B", + "\\u00B7l", UIDNA_ERROR_CONTEXTO_PUNCTUATION }, + { "\\u0375", "B", + "\\u0375", UIDNA_ERROR_CONTEXTO_PUNCTUATION }, + { "\\u03B1\\u05F3", "B", + "\\u03B1\\u05F3", UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI }, + { "\\u05F4", "B", + "\\u05F4", UIDNA_ERROR_CONTEXTO_PUNCTUATION }, + { "l\\u30FB", "B", + "l\\u30FB", UIDNA_ERROR_CONTEXTO_PUNCTUATION }, // Ticket #8137: UTS #46 toUnicode() fails with non-ASCII labels that turn // into 15 characters (UChars). // The bug was in u_strFromPunycode() which did not write the last character