ICU-8615 implement optional IDNA2008 CONTEXTO check in UTS46

author Markus Scherer <markus.icu@gmail.com>

Fri, 1 Jul 2011 22:17:53 +0000 (22:17 +0000)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 1 Jul 2011 22:17:53 +0000 (22:17 +0000)
author Markus Scherer <markus.icu@gmail.com>
Fri, 1 Jul 2011 22:17:53 +0000 (22:17 +0000)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 1 Jul 2011 22:17:53 +0000 (22:17 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java

index e2413a9e9e2108c170dbb4249ce8d87437c14549..acb645874e6e1b22ab23e4af4668bcb92bc75507 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java
@@ -116,21 +116,19 @@ public final class UCharacterProperty
  
      /**
       * Gets the unicode additional properties.
-     * C version getUnicodeProperties.
+     * Java version of C u_getUnicodeProperties().
       * @param codepoint codepoint whose additional properties are to be
       *                  retrieved
       * @param column The column index.
       * @return unicode properties
       */
-       public int getAdditional(int codepoint, int column) {
-        if (column == -1) {
-            return getProperty(codepoint);
+    public int getAdditional(int codepoint, int column) {
+        assert column >= 0;
+        if (column >= m_additionalColumnsCount_) {
+            return 0;
          }
-           if (column < 0 || column >= m_additionalColumnsCount_) {
-           return 0;
-       }
-       return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
-       }
+        return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
+    }
  
      static final int MY_MASK = UCharacterProperty.TYPE_MASK
          & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java

index a9550653ad85adebcb6e292aa80643cccf504665..610995518760a448ac4c85a18e9fd44df49b3c95 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
@@ -1,6 +1,6 @@
  /*
  *******************************************************************************
-* Copyright (C) 2011, International Business Machines
+* Copyright (C) 2010-2011, International Business Machines
  * Corporation and others.  All Rights Reserved.
  *******************************************************************************
  */
@@ -11,6 +11,7 @@ import java.util.EnumSet;
  import com.ibm.icu.lang.UCharacter;
  import com.ibm.icu.lang.UCharacterCategory;
  import com.ibm.icu.lang.UCharacterDirection;
+import com.ibm.icu.lang.UScript;
  import com.ibm.icu.text.IDNA;
  import com.ibm.icu.text.Normalizer2;
  import com.ibm.icu.text.StringPrepParseException;
@@ -437,6 +438,9 @@ public final class UTS46 extends IDNA {
              ) {
                  addLabelError(info, Error.CONTEXTJ);
              }
+            if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
+                checkLabelContextO(labelString, labelStart, labelLength, info);
+            }
              if(toASCII) {
                  if(wasPunycode) {
                      // Leave a Punycode label unchanged if it has no severe errors.
@@ -734,6 +738,96 @@ public final class UTS46 extends IDNA {
          return true;
      }
  
+    private void
+    checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {  // Checks one label against the CONTEXTO rules quoted below; violations are reported via addLabelError(info, ...).
+        int labelEnd=labelStart+labelLength-1;  // index of the last char of the label (inclusive)
+        int arabicDigits=0;  // 0 until an Arabic digit is seen, then -1 after 066x, +1 after 06Fx
+        for(int i=labelStart; i<=labelEnd; ++i) {
+            int c=label.charAt(i);  // UTF-16 code unit; every code point tested below is in the BMP
+            if(c<0xb7) {
+                // Fast path: ASCII and everything else below U+00B7 needs no check.
+            } else if(c<=0x6f9) {
+                if(c==0xb7) {
+                    // Appendix A.3. MIDDLE DOT (U+00B7)
+                    // Rule Set:
+                    //  False;
+                    //  If Before(cp) .eq.  U+006C And
+                    //     After(cp) .eq.  U+006C Then True;
+                    if(!(labelStart<i && label.charAt(i-1)=='l' &&
+                         i<labelEnd && label.charAt(i+1)=='l')) {
+                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+                    }
+                } else if(c==0x375) {
+                    // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
+                    // Rule Set:
+                    //  False;
+                    //  If Script(After(cp)) .eq.  Greek Then True;
+                    if(!(i<labelEnd &&
+                         UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
+                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+                    }
+                } else if(c==0x5f3 || c==0x5f4) {
+                    // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
+                    // Rule Set:
+                    //  False;
+                    //  If Script(Before(cp)) .eq.  Hebrew Then True;
+                    //
+                    // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
+                    // Rule Set:
+                    //  False;
+                    //  If Script(Before(cp)) .eq.  Hebrew Then True;
+                    if(!(labelStart<i &&
+                         UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
+                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+                    }
+                } else if(0x660<=c /* && c<=0x6f9 is already implied by the enclosing branch */) {
+                    // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
+                    // Rule Set:
+                    //  True;
+                    //  For All Characters:
+                    //    If cp .in. 06F0..06F9 Then False;
+                    //  End For;
+                    //
+                    // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
+                    // Rule Set:
+                    //  True;
+                    //  For All Characters:
+                    //    If cp .in. 0660..0669 Then False;
+                    //  End For;
+                    if(c<=0x669) {
+                        if(arabicDigits>0) {  // an 06Fx digit was seen earlier in this label
+                            addLabelError(info, Error.CONTEXTO_DIGITS);
+                        }
+                        arabicDigits=-1;
+                    } else if(0x6f0<=c) {
+                        if(arabicDigits<0) {  // an 066x digit was seen earlier in this label
+                            addLabelError(info, Error.CONTEXTO_DIGITS);
+                        }
+                        arabicDigits=1;
+                    }
+                }
+            } else if(c==0x30fb) {
+                // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
+                // Rule Set:
+                //  False;
+                //  For All Characters:
+                //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
+                //  End For;
+                for(int j=labelStart;; j+=Character.charCount(c)) {  // reuses c; the outer loop reloads it before reading it again
+                    if(j>labelEnd) {  // scanned the whole label without finding one of those scripts
+                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+                        break;
+                    }
+                    c=Character.codePointAt(label, j);
+                    int script=UScript.getScript(c);
+                    if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
      // TODO: make public(?) -- in C, these are public in uchar.h
      private static int U_MASK(int x) {
          return 1<<x;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/IDNA.java b/icu4j/main/classes/core/src/com/ibm/icu/text/IDNA.java

index be5c69ba7524a2c8b799561861ba775cf5a2fc9f..9cbd07241a218b7e329167b9f1b559a716110acf 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/IDNA.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/IDNA.java
@@ -1,6 +1,6 @@
  /*
   *******************************************************************************
- * Copyright (C) 2003-2010, International Business Machines Corporation and    *
+ * Copyright (C) 2003-2011, International Business Machines Corporation and    *
   * others. All Rights Reserved.                                                *
   *******************************************************************************
   */
@@ -54,12 +54,14 @@ import com.ibm.icu.impl.UTS46;
  public abstract class IDNA {
      /** 
       * Default options value: None of the other options are set.
+     * For use in static worker and factory methods.
       * @stable ICU 2.8
       */
      public static final int DEFAULT = 0;
      /** 
       * Option to allow unassigned code points in domain names and labels.
-     * This option is ignored by the UTS46 implementation.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the UTS46 implementation.
       * (UTS #46 disallows unassigned code points.)
       * @stable ICU 2.8
       */
@@ -68,12 +70,14 @@ public abstract class IDNA {
       * Option to check whether the input conforms to the STD3 ASCII rules,
       * for example the restriction of labels to LDH characters
       * (ASCII Letters, Digits and Hyphen-Minus).
+     * For use in static worker and factory methods.
       * @stable ICU 2.8
       */
      public static final int USE_STD3_RULES = 2;
      /**
       * IDNA option to check whether the input conforms to the BiDi rules.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (IDNA2003 always performs a BiDi check.)
       * @draft ICU 4.6
       * @provisional This API might change or be removed in a future release.
@@ -81,7 +85,8 @@ public abstract class IDNA {
      public static final int CHECK_BIDI = 4;
      /**
       * IDNA option to check whether the input conforms to the CONTEXTJ rules.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (The CONTEXTJ check is new in IDNA2008.)
       * @draft ICU 4.6
       * @provisional This API might change or be removed in a future release.
@@ -89,8 +94,9 @@ public abstract class IDNA {
      public static final int CHECK_CONTEXTJ = 8;
      /**
       * IDNA option for nontransitional processing in ToASCII().
-     * By default, ToASCII() uses transitional processing.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>By default, ToASCII() uses transitional processing.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
       * @draft ICU 4.6
       * @provisional This API might change or be removed in a future release.
@@ -98,13 +104,25 @@ public abstract class IDNA {
      public static final int NONTRANSITIONAL_TO_ASCII = 0x10;
      /**
       * IDNA option for nontransitional processing in ToUnicode().
-     * By default, ToUnicode() uses transitional processing.
-     * This option is ignored by the IDNA2003 implementation.
+     * For use in static worker and factory methods.
+     * <p>By default, ToUnicode() uses transitional processing.
+     * <p>This option is ignored by the IDNA2003 implementation.
       * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
       * @draft ICU 4.6
       * @provisional This API might change or be removed in a future release.
       */
      public static final int NONTRANSITIONAL_TO_UNICODE = 0x20;
+    /**
+     * IDNA option to check whether the input conforms to the CONTEXTO rules.
+     * For use in static worker and factory methods.
+     * <p>This option is ignored by the IDNA2003 implementation.
+     * (The CONTEXTO check is new in IDNA2008.)
+     * <p>This is for use by registries for IDNA2008 conformance.
+     * UTS #46 does not require the CONTEXTO check.
+     * @draft ICU 49
+     * @provisional This API might change or be removed in a future release.
+     */
+    public static final int CHECK_CONTEXTO = 0x40;
  
      /**
       * Returns an IDNA instance which implements UTS #46.
@@ -440,7 +458,22 @@ public abstract class IDNA {
           * @draft ICU 4.6
           * @provisional This API might change or be removed in a future release.
           */
-        CONTEXTJ
+        CONTEXTJ,
+        /**
+         * A label does not meet the IDNA CONTEXTO requirements for punctuation characters.
+         * Some punctuation characters "Would otherwise have been DISALLOWED"
+         * but are allowed in certain contexts. (RFC 5892)
+         * @draft ICU 49
+         * @provisional This API might change or be removed in a future release.
+         */
+        CONTEXTO_PUNCTUATION,
+        /**
+         * A label does not meet the IDNA CONTEXTO requirements for digits.
+         * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx).
+         * @draft ICU 49
+         * @provisional This API might change or be removed in a future release.
+         */
+        CONTEXTO_DIGITS
      }
  
      /**
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java

index 4ba9a4b91002045412461998790a626623bce5a5..c9a93c660c0c030c546fa17cb1731676757173b4 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@@ -1,6 +1,6 @@
  /*
  *******************************************************************************
-* Copyright (C) 2010, International Business Machines
+* Copyright (C) 2010-2011, International Business Machines
  * Corporation and others.  All Rights Reserved.
  *******************************************************************************
  */
@@ -26,8 +26,11 @@ public class UTS46Test extends TestFmwk {
          new UTS46Test().run(args);
      }
      public UTS46Test() {
-        trans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ);
-        nontrans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ|
+        int commonOptions=
+            IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|
+            IDNA.CHECK_CONTEXTJ|IDNA.CHECK_CONTEXTO;
+        trans=IDNA.getUTS46Instance(commonOptions);
+        nontrans=IDNA.getUTS46Instance(commonOptions|
                                         IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE);
      }
  
@@ -107,6 +110,8 @@ public class UTS46Test extends TestFmwk {
          errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL);
          errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI);
          errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ);
+        errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_PUNCTUATION", IDNA.Error.CONTEXTO_PUNCTUATION);
+        errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_DIGITS", IDNA.Error.CONTEXTO_DIGITS);
      }
  
      private static final class TestCase {
@@ -424,6 +429,29 @@ public class UTS46Test extends TestFmwk {
            "\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" },
          { "\u0644\u200C", "N",  // D ZWNJ
            "\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
+        { "\u0660\u0661", "B",  // Arabic-Indic Digits alone
+          "\u0660\u0661", "UIDNA_ERROR_BIDI" },
+        { "\u06F0\u06F1", "B",  // Extended Arabic-Indic Digits alone
+          "\u06F0\u06F1", "" },
+        { "\u0660\u06F1", "B",  // Mixed Arabic-Indic Digits
+          "\u0660\u06F1", "UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI" },
+        // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
+        // in their correct contexts,
+        // then each in incorrect context.
+        { "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "B",
+          "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "UIDNA_ERROR_BIDI" },
+        { "l\u00B7", "B",
+          "l\u00B7", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
+        { "\u00B7l", "B",
+          "\u00B7l", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
+        { "\u0375", "B",
+          "\u0375", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
+        { "\u03B1\u05F3", "B",
+          "\u03B1\u05F3", "UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI" },
+        { "\u05F4", "B",
+          "\u05F4", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
+        { "l\u30FB", "B",
+          "l\u30FB", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
          // { "", "B",
          //   "", "" },
      };
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 1 Jul 2011 22:17:53 +0000 (22:17 +0000)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 1 Jul 2011 22:17:53 +0000 (22:17 +0000)
icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/IDNA.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java		patch \| blob \| history