ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46

author Markus Scherer <markus.icu@gmail.com>

Fri, 1 Jul 2011 22:19:14 +0000 (22:19 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 1 Jul 2011 22:19:14 +0000 (22:19 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 1 Jul 2011 22:19:14 +0000 (22:19 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 1 Jul 2011 22:19:14 +0000 (22:19 +0000)
diff --git a/icu4c/source/common/uchar.c b/icu4c/source/common/uchar.c

index 04dd60b8e7fc93538db2d2de3b2c91a459d970ad..d56ab121d0683a63d187ae3e919f156e1816d631 100644 (file)
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@@ -1,6 +1,6 @@
  /*
  ********************************************************************************
-*   Copyright (C) 1996-2010, International Business Machines
+*   Copyright (C) 1996-2011, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  ********************************************************************************
  *
@@ -23,6 +23,7 @@
  #include "unicode/uchar.h"
  #include "unicode/uscript.h"
  #include "unicode/udata.h"
+#include "uassert.h"
  #include "umutex.h"
  #include "cmemory.h"
  #include "ucln_cmn.h"
@@ -475,7 +476,7 @@ u_forDigit(int32_t digit, int8_t radix) {
      }
  }
  
-/* miscellaneous, and support for uprops.c ---------------------------------- */
+/* miscellaneous, and support for uprops.cpp -------------------------------- */
  
  U_CAPI void U_EXPORT2
  u_getUnicodeVersion(UVersionInfo versionArray) {
@@ -485,19 +486,19 @@ u_getUnicodeVersion(UVersionInfo versionArray) {
  }
  
  U_CFUNC uint32_t
-u_getUnicodeProperties(UChar32 c, int32_t column) {
-    uint16_t vecIndex;
+u_getMainProperties(UChar32 c) {
+    uint32_t props;
+    GET_PROPS(c, props);
+    return props;
+}
  
-    if(column==-1) {
-        uint32_t props;
-        GET_PROPS(c, props);
-        return props;
-    } else if(
-               column<0 || column>=propsVectorsColumns
-    ) {
+U_CFUNC uint32_t
+u_getUnicodeProperties(UChar32 c, int32_t column) {
+    U_ASSERT(column>=0);
+    if(column>=propsVectorsColumns) {
          return 0;
      } else {
-        vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
+        uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
          return propsVectors[vecIndex+column];
      }
  }
diff --git a/icu4c/source/common/unicode/uidna.h b/icu4c/source/common/unicode/uidna.h

index 04b439d970f45824ab9fe8c7ef477106334ead1d..bf49a35bd977c174c1d1f5819f062e0353975948 100644 (file)
--- a/icu4c/source/common/unicode/uidna.h
+++ b/icu4c/source/common/unicode/uidna.h
@@ -1,7 +1,7 @@
  /*
   *******************************************************************************
   *
- *   Copyright (C) 2003-2010, International Business Machines
+ *   Copyright (C) 2003-2011, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   *
   *******************************************************************************
@@ -42,12 +42,14 @@
  enum {
      /**
       * Default options value: None of the other options are set.
+     * For use in static worker and factory methods.
       * @stable ICU 2.6
       */
      UIDNA_DEFAULT=0,
      /**
       * Option to allow unassigned code points in domain names and labels.
-     * This option is ignored by the UTS46 implementation.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the UTS46 implementation.
       * (UTS #46 disallows unassigned code points.)
       * @stable ICU 2.6
       */
@@ -56,39 +58,54 @@ enum {
       * Option to check whether the input conforms to the STD3 ASCII rules,
       * for example the restriction of labels to LDH characters
       * (ASCII Letters, Digits and Hyphen-Minus).
+     * For use in static worker and factory methods.
       * @stable ICU 2.6
       */
      UIDNA_USE_STD3_RULES=2,
      /**
       * IDNA option to check for whether the input conforms to the BiDi rules.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (IDNA2003 always performs a BiDi check.)
       * @draft ICU 4.6
       */
      UIDNA_CHECK_BIDI=4,
      /**
       * IDNA option to check for whether the input conforms to the CONTEXTJ rules.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (The CONTEXTJ check is new in IDNA2008.)
       * @draft ICU 4.6
       */
      UIDNA_CHECK_CONTEXTJ=8,
      /**
       * IDNA option for nontransitional processing in ToASCII().
-     * By default, ToASCII() uses transitional processing.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>By default, ToASCII() uses transitional processing.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
       * @draft ICU 4.6
       */
      UIDNA_NONTRANSITIONAL_TO_ASCII=0x10,
      /**
       * IDNA option for nontransitional processing in ToUnicode().
-     * By default, ToUnicode() uses transitional processing.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>By default, ToUnicode() uses transitional processing.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
       * @draft ICU 4.6
       */
-    UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20
+    UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20,
+    /**
+     * IDNA option to check for whether the input conforms to the CONTEXTO rules.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the IDNA2003 implementation.
+     * (The CONTEXTO check is new in IDNA2008.)
+     * <p>This is for use by registries for IDNA2008 conformance.
+     * UTS #46 does not require the CONTEXTO check.
+     * @draft ICU 49
+     */
+    UIDNA_CHECK_CONTEXTO=0x40
  };
  
  /**
@@ -471,7 +488,20 @@ enum {
       * A label does not meet the IDNA CONTEXTJ requirements.
       * @draft ICU 4.6
       */
-    UIDNA_ERROR_CONTEXTJ=0x1000
+    UIDNA_ERROR_CONTEXTJ=0x1000,
+    /**
+     * A label does not meet the IDNA CONTEXTO requirements for punctuation characters.
+     * Some punctuation characters "Would otherwise have been DISALLOWED"
+     * but are allowed in certain contexts. (RFC 5892)
+     * @draft ICU 49
+     */
+    UIDNA_ERROR_CONTEXTO_PUNCTUATION=0x2000,
+    /**
+     * A label does not meet the IDNA CONTEXTO requirements for digits.
+     * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx).
+     * @draft ICU 49
+     */
+    UIDNA_ERROR_CONTEXTO_DIGITS=0x4000
  };
  
  /* IDNA2003 API ------------------------------------------------------------- */
diff --git a/icu4c/source/common/uprops.cpp b/icu4c/source/common/uprops.cpp

index 3d57a861a5ee711f246bb694b47aeb32307ad1c4..d3872fe6210d02651f65a39a30910c86f10fffe2 100644 (file)
--- a/icu4c/source/common/uprops.cpp
+++ b/icu4c/source/common/uprops.cpp
@@ -214,7 +214,7 @@ static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
       * Must be in order of corresponding UProperty,
       * and there must be exactly one entry per binary UProperty.
       *
-     * Properties with mask==0 and contains==NULL are handled in code.
+     * Properties with mask==0 are handled in code.
       * For them, column is the UPropertySource value.
       */
      { 1,                U_MASK(UPROPS_ALPHABETIC), defaultContains },
@@ -345,7 +345,7 @@ static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty
  }
  
  static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
-    int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getUnicodeProperties(c, -1));
+    int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c));
      return UPROPS_NTV_GET_TYPE(ntv);
  }
  
@@ -421,7 +421,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
       * Must be in order of corresponding UProperty,
       * and there must be exactly one entry per int UProperty.
       *
-     * Properties with mask==0 and getValue==NULL are handled in code.
+     * Properties with mask==0 are handled in code.
       * For them, column is the UPropertySource value.
       */
      { UPROPS_SRC_BIDI,  0, 0,                               getBiDiClass, biDiGetMaxValue },
diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h

index 69d3d3456f1d9c67e4dcece60cd26c95fe5c3eec..2d7febf5f1cbed0c45ada5bfc5c5f585dfdcb85a 100644 (file)
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2002-2010, International Business Machines
+*   Copyright (C) 2002-2011, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -192,10 +192,16 @@ enum {
  
  #define UPROPS_DT_MASK          0x0000001f
  
+/**
+ * Gets the main properties value for a code point.
+ * Implemented in uchar.c for uprops.cpp.
+ */
+U_CFUNC uint32_t
+u_getMainProperties(UChar32 c);
+
  /**
   * Get a properties vector word for a code point.
- * Implemented in uchar.c for uprops.c.
- * column==-1 gets the 32-bit main properties word instead.
+ * Implemented in uchar.c for uprops.cpp.
   * @return 0 if no data or illegal argument
   */
  U_CFUNC uint32_t
diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp

index 682f9ed4bdb399a928639c71d6a5920c61b76710..c81c467eec727bbc43f8cd9da80086f85a2b0b12 100644 (file)
--- a/icu4c/source/common/uts46.cpp
+++ b/icu4c/source/common/uts46.cpp
@@ -18,6 +18,7 @@
  
  #include "unicode/idna.h"
  #include "unicode/normalizer2.h"
+#include "unicode/uscript.h"
  #include "unicode/ustring.h"
  #include "cmemory.h"
  #include "cstring.h"
@@ -188,6 +189,9 @@ private:
      UBool
      isLabelOkContextJ(const UChar *label, int32_t labelLength) const;
  
+    void
+    checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
+
      const Normalizer2 &uts46Norm2;  // uts46.nrm
      uint32_t options;
  };
@@ -822,6 +826,9 @@ UTS46::processLabel(UnicodeString &dest,
          ) {
              info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
          }
+        if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
+            checkLabelContextO(label, labelLength, info);
+        }
          if(toASCII) {
              if(wasPunycode) {
                  // Leave a Punycode label unchanged if it has no severe errors.
@@ -1171,6 +1178,109 @@ UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
      return TRUE;
  }
  
+void  // Checks one label against the IDNA2008 CONTEXTO rules; errors are ORed into info.labelErrors.
+UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
+    int32_t labelEnd=labelLength-1;  // inclusive index of the last UChar
+    int32_t arabicDigits=0;  // 0: none seen yet; -1: last seen was 066x; +1: last seen was 06Fx
+    for(int32_t i=0; i<=labelEnd; ++i) {
+        UChar32 c=label[i];  // one UTF-16 code unit; every code point tested below is in the BMP
+        if(c<0xb7) {
+            // Fast path: nothing is checked for code units below U+00B7 (ASCII and the start of Latin-1).
+        } else if(c<=0x6f9) {
+            if(c==0xb7) {
+                // Appendix A.3. MIDDLE DOT (U+00B7)
+                // Rule Set:
+                //  False;
+                //  If Before(cp) .eq.  U+006C And
+                //     After(cp) .eq.  U+006C Then True;
+                if(!(0<i && label[i-1]==0x6c &&  // U+006C is 'l'; the index checks guard both neighbors
+                     i<labelEnd && label[i+1]==0x6c)) {
+                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
+                }
+            } else if(c==0x375) {
+                // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
+                // Rule Set:
+                //  False;
+                //  If Script(After(cp)) .eq.  Greek Then True;
+                UScriptCode script=USCRIPT_INVALID_CODE;  // stays invalid (=> error) if nothing follows
+                if(i<labelEnd) {
+                    UErrorCode errorCode=U_ZERO_ERROR;  // not inspected afterwards
+                    int32_t j=i+1;
+                    U16_NEXT(label, j, labelLength, c);  // code point after U+0375; overwrites c, not read again this iteration
+                    script=uscript_getScript(c, &errorCode);
+                }
+                if(script!=USCRIPT_GREEK) {
+                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
+                }
+            } else if(c==0x5f3 || c==0x5f4) {
+                // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
+                // Rule Set:
+                //  False;
+                //  If Script(Before(cp)) .eq.  Hebrew Then True;
+                //
+                // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
+                // Rule Set:
+                //  False;
+                //  If Script(Before(cp)) .eq.  Hebrew Then True;
+                UScriptCode script=USCRIPT_INVALID_CODE;  // stays invalid (=> error) if nothing precedes
+                if(0<i) {
+                    UErrorCode errorCode=U_ZERO_ERROR;  // not inspected afterwards
+                    int32_t j=i;
+                    U16_PREV(label, 0, j, c);  // code point before the Geresh/Gershayim; overwrites c
+                    script=uscript_getScript(c, &errorCode);
+                }
+                if(script!=USCRIPT_HEBREW) {
+                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
+                }
+            } else if(0x660<=c /* && c<=0x6f9 */) {
+                // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
+                // Rule Set:
+                //  True;
+                //  For All Characters:
+                //    If cp .in. 06F0..06F9 Then False;
+                //  End For;
+                //
+                // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
+                // Rule Set:
+                //  True;
+                //  For All Characters:
+                //    If cp .in. 0660..0669 Then False;
+                //  End For;
+                if(c<=0x669) {
+                    if(arabicDigits>0) {  // a 06Fx digit was seen earlier in this label
+                        info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
+                    }
+                    arabicDigits=-1;
+                } else if(0x6f0<=c) {  // 066A..06EF fall through: neither digit range, nothing to do
+                    if(arabicDigits<0) {  // a 066x digit was seen earlier in this label
+                        info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
+                    }
+                    arabicDigits=1;
+                }
+            }
+        } else if(c==0x30fb) {
+            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
+            // Rule Set:
+            //  False;
+            //  For All Characters:
+            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
+            //  End For;
+            UErrorCode errorCode=U_ZERO_ERROR;  // not inspected afterwards
+            for(int j=0;;) {  // rescan the whole label from its start, one code point at a time
+                if(j>labelEnd) {  // reached the end without finding a Hiragana, Katakana or Han code point
+                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
+                    break;
+                }
+                U16_NEXT(label, j, labelLength, c);  // overwrites c; the outer loop reloads it from label[i]
+                UScriptCode script=uscript_getScript(c, &errorCode);
+                if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
+                    break;  // one qualifying code point anywhere in the label is enough
+                }
+            }
+        }
+    }
+}
+
  U_NAMESPACE_END
  
  // C API ------------------------------------------------------------------- ***
diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp

index f3480281dce5a3ce901275e2892c3639eb3a82eb..df699dd08e19c2fbacf289cb82cd66fe2e4ccffa 100644 (file)
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@@ -1,6 +1,6 @@
  /*
  *******************************************************************************
-*   Copyright (C) 2010, International Business Machines
+*   Copyright (C) 2010-2011, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *******************************************************************************
  *   file name:  uts46test.cpp
@@ -55,11 +55,12 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
          logln("TestSuite UTS46Test: ");
          if(trans==NULL) {
              IcuTestErrorCode errorCode(*this, "init/createUTS46Instance()");
-            trans=IDNA::createUTS46Instance(
-                UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ,
-                errorCode);
+            uint32_t commonOptions=
+                UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|
+                UIDNA_CHECK_CONTEXTJ|UIDNA_CHECK_CONTEXTO;
+            trans=IDNA::createUTS46Instance(commonOptions, errorCode);
              nontrans=IDNA::createUTS46Instance(
-                UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ|
+                commonOptions|
                  UIDNA_NONTRANSITIONAL_TO_ASCII|UIDNA_NONTRANSITIONAL_TO_UNICODE,
                  errorCode);
              if(errorCode.logDataIfFailureAndReset("createUTS46Instance()")) {
@@ -534,6 +535,29 @@ static const TestCase testCases[]={
        "\\u06EF\\u200C\\u06EF", UIDNA_ERROR_CONTEXTJ },
      { "\\u0644\\u200C", "N",  // D ZWNJ
        "\\u0644\\u200C", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ },
+    { "\\u0660\\u0661", "B",  // Arabic-Indic Digits alone
+      "\\u0660\\u0661", UIDNA_ERROR_BIDI },
+    { "\\u06F0\\u06F1", "B",  // Extended Arabic-Indic Digits alone
+      "\\u06F0\\u06F1", 0 },
+    { "\\u0660\\u06F1", "B",  // Mixed Arabic-Indic Digits
+      "\\u0660\\u06F1", UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI },
+    // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
+    // in their correct contexts,
+    // then each in incorrect context.
+    { "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", "B",
+      "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", UIDNA_ERROR_BIDI },
+    { "l\\u00B7", "B",
+      "l\\u00B7", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
+    { "\\u00B7l", "B",
+      "\\u00B7l", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
+    { "\\u0375", "B",
+      "\\u0375", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
+    { "\\u03B1\\u05F3", "B",
+      "\\u03B1\\u05F3", UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI },
+    { "\\u05F4", "B",
+      "\\u05F4", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
+    { "l\\u30FB", "B",
+      "l\\u30FB", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
      // Ticket #8137: UTS #46 toUnicode() fails with non-ASCII labels that turn
      // into 15 characters (UChars).
      // The bug was in u_strFromPunycode() which did not write the last character
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 1 Jul 2011 22:19:14 +0000 (22:19 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 1 Jul 2011 22:19:14 +0000 (22:19 +0000)
icu4c/source/common/uchar.c		patch \| blob \| history
icu4c/source/common/unicode/uidna.h		patch \| blob \| history
icu4c/source/common/uprops.cpp		patch \| blob \| history
icu4c/source/common/uprops.h		patch \| blob \| history
icu4c/source/common/uts46.cpp		patch \| blob \| history
icu4c/source/test/intltest/uts46test.cpp		patch \| blob \| history