ICU-21030 validate ACE label edge cases

author Markus Scherer <markus.icu@gmail.com>

Fri, 14 Aug 2020 20:33:34 +0000 (13:33 -0700)

committer Markus Scherer <markus.icu@gmail.com>

Fri, 14 Aug 2020 21:32:47 +0000 (14:32 -0700)
author Markus Scherer <markus.icu@gmail.com>
Fri, 14 Aug 2020 20:33:34 +0000 (13:33 -0700)
committer Markus Scherer <markus.icu@gmail.com>
Fri, 14 Aug 2020 21:32:47 +0000 (14:32 -0700)
diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp

index b9e6cb023bb379964838caf16ff2285c54cfdbf1..f25b4e12f124ca248446c3c5b3d7891dd8aea1eb 100644 (file)
--- a/icu4c/source/common/uts46.cpp
+++ b/icu4c/source/common/uts46.cpp
@@ -714,6 +714,16 @@ UTS46::processLabel(UnicodeString &dest,
      UBool wasPunycode;
      if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
          // Label starts with "xn--", try to un-Punycode it.
+        // In IDNA2008, labels like "xn--" (decodes to an empty string) and
+        // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
+        // comparing the ToUnicode input with the back-to-ToASCII output.
+        // They are alternate encodings of the respective ASCII labels.
+        // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
+        // the round-trip verification.
+        if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
+            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
+            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
+        }
          wasPunycode=TRUE;
          UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
          if(unicodeBuffer==NULL) {
@@ -925,10 +935,10 @@ UTS46::markBadACELabel(UnicodeString &dest,
      UBool isASCII=TRUE;
      UBool onlyLDH=TRUE;
      const UChar *label=dest.getBuffer()+labelStart;
-    // Ok to cast away const because we own the UnicodeString.
-    UChar *s=(UChar *)label+4;  // After the initial "xn--".
      const UChar *limit=label+labelLength;
-    do {
+    // Start after the initial "xn--".
+    // Ok to cast away const because we own the UnicodeString.
+    for(UChar *s=const_cast<UChar *>(label+4); s<limit; ++s) {
          UChar c=*s;
          if(c<=0x7f) {
              if(c==0x2e) {
@@ -945,7 +955,7 @@ UTS46::markBadACELabel(UnicodeString &dest,
          } else {
              isASCII=onlyLDH=FALSE;
          }
-    } while(++s<limit);
+    }
      if(onlyLDH) {
          dest.insert(labelStart+labelLength, (UChar)0xfffd);
          if(dest.isBogus()) {
diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp

index 39ba01d385c9cf646c80ff208e88384641177ec6..6700d0b15b70599ee2459da7876bd36947fba64a 100644 (file)
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@@ -40,6 +40,7 @@ public:
      void TestAPI();
      void TestNotSTD3();
      void TestInvalidPunycodeDigits();
+    void TestACELabelEdgeCases();
      void TestSomeCases();
      void IdnaTest();
  
@@ -84,6 +85,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
      TESTCASE_AUTO(TestAPI);
      TESTCASE_AUTO(TestNotSTD3);
      TESTCASE_AUTO(TestInvalidPunycodeDigits);
+    TESTCASE_AUTO(TestACELabelEdgeCases);
      TESTCASE_AUTO(TestSomeCases);
      TESTCASE_AUTO(IdnaTest);
      TESTCASE_AUTO_END;
@@ -312,6 +314,35 @@ void UTS46Test::TestInvalidPunycodeDigits() {
      }
  }
  
+void UTS46Test::TestACELabelEdgeCases() {
+    // In IDNA2008, these labels fail the round-trip validation from comparing
+    // the ToUnicode input with the back-to-ToASCII output.
+    IcuTestErrorCode errorCode(*this, "TestACELabelEdgeCases()");
+    LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
+    if(errorCode.isFailure()) {
+        return;
+    }
+    UnicodeString result;
+    {
+        IDNAInfo info;
+        idna->labelToUnicode(u"xn--", result, info, errorCode);
+        assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+    }
+    {
+        IDNAInfo info;
+        idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode);
+        assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+    }
+    {
+        // Different error: The Punycode decoding procedure does not consume the last delimiter
+        // if it is right after the xn-- so the main decoding loop fails because the hyphen
+        // is not a valid Punycode digit.
+        IDNAInfo info;
+        idna->labelToUnicode(u"Xn---", result, info, errorCode);
+        assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+}
+
  struct TestCase {
      // Input string and options string (Nontransitional/Transitional/Both).
      const char *s, *o;
@@ -558,13 +589,13 @@ static const TestCase testCases[]={
        UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
        UIDNA_ERROR_HYPHEN_3_4 },
      { "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
-    { "a.xn--.c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
+    { "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
      { "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN },
      { "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
      { "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
      { "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
      { "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
-    { "\\u00E4.xn--.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
+    { "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
      { "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN },
      { "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
      { "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java

index c830bc3c75e3fc99c852c7ce9c2658b5ddeddb1e..1e52f8f91e555c63b11b934975190e80dace1b03 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
@@ -341,6 +341,16 @@ public final class UTS46 extends IDNA {
              dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
          ) {
              // Label starts with "xn--", try to un-Punycode it.
+            // In IDNA2008, labels like "xn--" (decodes to an empty string) and
+            // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
+            // comparing the ToUnicode input with the back-to-ToASCII output.
+            // They are alternate encodings of the respective ASCII labels.
+            // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
+            // the round-trip verification.
+            if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) {
+                addLabelError(info, Error.INVALID_ACE_LABEL);
+                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
+            }
              wasPunycode=true;
              try {
                  fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
@@ -496,9 +506,9 @@ public final class UTS46 extends IDNA {
          boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
          boolean isASCII=true;
          boolean onlyLDH=true;
-        int i=labelStart+4;  // After the initial "xn--".
          int limit=labelStart+labelLength;
-        do {
+        // Start after the initial "xn--".
+        for(int i=labelStart+4; i<limit; ++i) {
              char c=dest.charAt(i);
              if(c<=0x7f) {
                  if(c=='.') {
@@ -515,7 +525,7 @@ public final class UTS46 extends IDNA {
              } else {
                  isASCII=onlyLDH=false;
              }
-        } while(++i<limit);
+        }
          if(onlyLDH) {
              dest.insert(labelStart+labelLength, '\ufffd');
              ++labelLength;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java

index 8ab2d72314e03cba866b4146ca384d94e37882cd..25ee84af23ccd901c7f36956553df5267162b065 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@@ -152,6 +152,28 @@ public class UTS46Test extends TestFmwk {
                  info.getErrors().contains(IDNA.Error.PUNYCODE));
      }
  
+    @Test
+    public void TestACELabelEdgeCases() {
+        // In IDNA2008, these labels fail the round-trip validation from comparing
+        // the ToUnicode input with the back-to-ToASCII output.
+        IDNA idna=IDNA.getUTS46Instance(0);
+        StringBuilder result=new StringBuilder();
+        IDNA.Info info=new IDNA.Info();
+        idna.labelToUnicode("xn--", result, info);
+        assertTrue("empty xn--", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
+
+        info=new IDNA.Info();
+        idna.labelToUnicode("xN--ASCII-", result, info);
+        assertTrue("nothing but ASCII", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
+
+        // Different error: The Punycode decoding procedure does not consume the last delimiter
+        // if it is right after the xn-- so the main decoding loop fails because the hyphen
+        // is not a valid Punycode digit.
+        info=new IDNA.Info();
+        idna.labelToUnicode("Xn---", result, info);
+        assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
+    }
+
      private static final Map<String, IDNA.Error> errorNamesToErrors;
      static {
          errorNamesToErrors=new TreeMap<>();
@@ -432,13 +454,13 @@ public class UTS46Test extends TestFmwk {
            "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
            "UIDNA_ERROR_HYPHEN_3_4" },
          { "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
-        { "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
+        { "a.xn--.c", "B", "a.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
          { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
          { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
          { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
          { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
          { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
-        { "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
+        { "\u00E4.xn--.c", "B", "\u00E4.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
          { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
          { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
          { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
author	Markus Scherer <markus.icu@gmail.com>
	Fri, 14 Aug 2020 20:33:34 +0000 (13:33 -0700)
committer	Markus Scherer <markus.icu@gmail.com>
	Fri, 14 Aug 2020 21:32:47 +0000 (14:32 -0700)
icu4c/source/common/uts46.cpp		patch \| blob \| history
icu4c/source/test/intltest/uts46test.cpp		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java		patch \| blob \| history