UBool wasPunycode;
if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
// Label starts with "xn--", try to un-Punycode it.
+ // In IDNA2008, labels like "xn--" (decodes to an empty string) and
+ // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
+ // comparing the ToUnicode input with the back-to-ToASCII output.
+ // They are alternate encodings of the respective ASCII labels.
+ // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
+ // the round-trip verification.
+ if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
+ info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
+ return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
+ }
wasPunycode=TRUE;
UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit
if(unicodeBuffer==NULL) {
UBool isASCII=TRUE;
UBool onlyLDH=TRUE;
const UChar *label=dest.getBuffer()+labelStart;
- // Ok to cast away const because we own the UnicodeString.
- UChar *s=(UChar *)label+4; // After the initial "xn--".
const UChar *limit=label+labelLength;
- do {
+ // Start after the initial "xn--".
+ // Ok to cast away const because we own the UnicodeString.
+ for(UChar *s=const_cast<UChar *>(label+4); s<limit; ++s) {
UChar c=*s;
if(c<=0x7f) {
if(c==0x2e) {
} else {
isASCII=onlyLDH=FALSE;
}
- } while(++s<limit);
+ }
if(onlyLDH) {
dest.insert(labelStart+labelLength, (UChar)0xfffd);
if(dest.isBogus()) {
void TestAPI();
void TestNotSTD3();
void TestInvalidPunycodeDigits();
+ void TestACELabelEdgeCases();
void TestSomeCases();
void IdnaTest();
TESTCASE_AUTO(TestAPI);
TESTCASE_AUTO(TestNotSTD3);
TESTCASE_AUTO(TestInvalidPunycodeDigits);
+ TESTCASE_AUTO(TestACELabelEdgeCases);
TESTCASE_AUTO(TestSomeCases);
TESTCASE_AUTO(IdnaTest);
TESTCASE_AUTO_END;
}
}
+void UTS46Test::TestACELabelEdgeCases() {
+ // Edge cases of "xn--" labels passed to labelToUnicode(). In IDNA2008, the first two labels below fail
+ // the round-trip validation from comparing the ToUnicode input with the back-to-ToASCII output.
+ IcuTestErrorCode errorCode(*this, "TestACELabelEdgeCases()");
+ LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
+ if(errorCode.isFailure()) {
+ return;
+ }
+ UnicodeString result;
+ {
+ IDNAInfo info;
+ idna->labelToUnicode(u"xn--", result, info, errorCode);  // nothing after the ACE prefix
+ assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+ }
+ {
+ IDNAInfo info;
+ idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode);  // only ASCII plus a trailing hyphen
+ assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+ }
+ {
+ // Different error (UIDNA_ERROR_PUNYCODE, not UIDNA_ERROR_INVALID_ACE_LABEL): The Punycode decoding
+ // procedure does not consume the last delimiter if it is right after the xn-- so the main decoding
+ // loop fails because the hyphen is not a valid Punycode digit.
+ IDNAInfo info;
+ idna->labelToUnicode(u"Xn---", result, info, errorCode);
+ assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+}
+
struct TestCase {
// Input string and options string (Nontransitional/Transitional/Both).
const char *s, *o;
UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
UIDNA_ERROR_HYPHEN_3_4 },
{ "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
- { "a.xn--.c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
+ { "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
{ "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN },
{ "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
{ "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
{ "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
{ "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
- { "\\u00E4.xn--.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
+ { "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
{ "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN },
{ "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
{ "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
) {
// Label starts with "xn--", try to un-Punycode it.
+ // In IDNA2008, labels like "xn--" (decodes to an empty string) and
+ // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
+ // comparing the ToUnicode input with the back-to-ToASCII output.
+ // They are alternate encodings of the respective ASCII labels.
+ // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
+ // the round-trip verification.
+ if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) {
+ addLabelError(info, Error.INVALID_ACE_LABEL);
+ return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
+ }
wasPunycode=true;
try {
fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
boolean isASCII=true;
boolean onlyLDH=true;
- int i=labelStart+4; // After the initial "xn--".
int limit=labelStart+labelLength;
- do {
+ // Start after the initial "xn--".
+ for(int i=labelStart+4; i<limit; ++i) {
char c=dest.charAt(i);
if(c<=0x7f) {
if(c=='.') {
} else {
isASCII=onlyLDH=false;
}
- } while(++i<limit);
+ }
if(onlyLDH) {
dest.insert(labelStart+labelLength, '\ufffd');
++labelLength;
info.getErrors().contains(IDNA.Error.PUNYCODE));
}
+ @Test
+ public void TestACELabelEdgeCases() {
+ // Edge cases of "xn--" labels passed to labelToUnicode(). In IDNA2008, the first two labels below fail
+ // the round-trip validation from comparing the ToUnicode input with the back-to-ToASCII output.
+ IDNA idna=IDNA.getUTS46Instance(0);
+ StringBuilder result=new StringBuilder();
+ IDNA.Info info=new IDNA.Info();
+ idna.labelToUnicode("xn--", result, info);  // nothing after the ACE prefix
+ assertTrue("empty xn--", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
+
+ info=new IDNA.Info();
+ idna.labelToUnicode("xN--ASCII-", result, info);  // only ASCII plus a trailing hyphen
+ assertTrue("nothing but ASCII", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
+
+ // Different error (Error.PUNYCODE, not Error.INVALID_ACE_LABEL): The Punycode decoding procedure
+ // does not consume the last delimiter if it is right after the xn-- so the main decoding loop
+ // fails because the hyphen is not a valid Punycode digit.
+ info=new IDNA.Info();
+ idna.labelToUnicode("Xn---", result, info);
+ assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
+ }
+
private static final Map<String, IDNA.Error> errorNamesToErrors;
static {
errorNamesToErrors=new TreeMap<>();
"UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
"UIDNA_ERROR_HYPHEN_3_4" },
{ "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
- { "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
+ { "a.xn--.c", "B", "a.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
{ "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
{ "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
{ "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
{ "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
{ "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
- { "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
+ { "\u00E4.xn--.c", "B", "\u00E4.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
{ "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
{ "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
{ "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },