* Gets the unicode additional properties.
- * C version getUnicodeProperties.
+ * Java version of C u_getUnicodeProperties().
* @param codepoint codepoint whose additional properties are to be
* retrieved
* @param column The column index.
* @return unicode properties
- public int getAdditional(int codepoint, int column) {
- if (column == -1) {
- return getProperty(codepoint);
+ public int getAdditional(int codepoint, int column) {
+ assert column >= 0;
+ if (column >= m_additionalColumnsCount_) {
+ return 0;
- if (column < 0 || column >= m_additionalColumnsCount_) {
- return 0;
- }
- return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
- }
+ return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
+ }
static final int MY_MASK = UCharacterProperty.TYPE_MASK
& ((1<<UCharacterCategory.UPPERCASE_LETTER) |
-* Copyright (C) 2011, International Business Machines
+* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UCharacterDirection;
+import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringPrepParseException;
) {
addLabelError(info, Error.CONTEXTJ);
+ if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
+ checkLabelContextO(labelString, labelStart, labelLength, info);
+ }
if(toASCII) {
if(wasPunycode) {
// Leave a Punycode label unchanged if it has no severe errors.
return true;
+ private void
+ checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
+ int labelEnd=labelStart+labelLength-1; // inclusive
+ int arabicDigits=0; // -1 for 066x, +1 for 06Fx
+ for(int i=labelStart; i<=labelEnd; ++i) {
+ int c=label.charAt(i);
+ if(c<0xb7) {
+ // ASCII fastpath
+ } else if(c<=0x6f9) {
+ if(c==0xb7) {
+ // Appendix A.3. MIDDLE DOT (U+00B7)
+ // Rule Set:
+ // False;
+ // If Before(cp) .eq. U+006C And
+ // After(cp) .eq. U+006C Then True;
+ if(!(labelStart<i && label.charAt(i-1)=='l' &&
+ i<labelEnd && label.charAt(i+1)=='l')) {
+ addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+ }
+ } else if(c==0x375) {
+ // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
+ // Rule Set:
+ // False;
+ // If Script(After(cp)) .eq. Greek Then True;
+ if(!(i<labelEnd &&
+ UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
+ addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+ }
+ } else if(c==0x5f3 || c==0x5f4) {
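+ // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
+ // Rule Set:
+ // False;
+ // If Script(Before(cp)) .eq. Hebrew Then True;
+ //
+ // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
+ // Rule Set:
+ // False;
+ // If Script(Before(cp)) .eq. Hebrew Then True;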
+ if(!(labelStart<i &&
+ UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
+ addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+ }
+ } else if(0x660<=c /* && c<=0x6f9 */) {
+ // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
+ // Rule Set:
+ // True;
+ // For All Characters:
+ // If cp .in. 06F0..06F9 Then False;
+ // End For;
+ //
+ // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
+ // Rule Set:
+ // True;
+ // For All Characters:
+ // If cp .in. 0660..0669 Then False;
+ // End For;
+ if(c<=0x669) {
+ if(arabicDigits>0) {
+ addLabelError(info, Error.CONTEXTO_DIGITS);
+ }
+ arabicDigits=-1;
+ } else if(0x6f0<=c) {
+ if(arabicDigits<0) {
+ addLabelError(info, Error.CONTEXTO_DIGITS);
+ }
+ arabicDigits=1;
+ }
+ }
+ } else if(c==0x30fb) {
+ // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
+ // Rule Set:
+ // False;
+ // For All Characters:
+ // If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
+ // End For;
+ for(int j=labelStart;; j+=Character.charCount(c)) {
+ if(j>labelEnd) {
+ addLabelError(info, Error.CONTEXTO_PUNCTUATION);
+ break;
+ }
+ c=Character.codePointAt(label, j);
+ int script=UScript.getScript(c);
+ if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
+ break;
+ }
+ }
+ }
+ }
+ }
// TODO: make public(?) -- in C, these are public in uchar.h
private static int U_MASK(int x) {
return 1<<x;
- * Copyright (C) 2003-2010, International Business Machines Corporation and *
+ * Copyright (C) 2003-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
public abstract class IDNA {
* Default options value: None of the other options are set.
+ * For use in static worker and factory methods.
* @stable ICU 2.8
public static final int DEFAULT = 0;
* Option to allow unassigned code points in domain names and labels.
- * This option is ignored by the UTS46 implementation.
+ * For use in static worker and factory methods.
+ * <p>This option is ignored by the UTS46 implementation.
* (UTS #46 disallows unassigned code points.)
* @stable ICU 2.8
* Option to check whether the input conforms to the STD3 ASCII rules,
* for example the restriction of labels to LDH characters
* (ASCII Letters, Digits and Hyphen-Minus).
+ * For use in static worker and factory methods.
* @stable ICU 2.8
public static final int USE_STD3_RULES = 2;
* IDNA option to check whether the input conforms to the BiDi rules.
- * This option is ignored by the IDNA2003 implementation.
+ * For use in static worker and factory methods.
+ * <p>This option is ignored by the IDNA2003 implementation.
* (IDNA2003 always performs a BiDi check.)
* @draft ICU 4.6
* @provisional This API might change or be removed in a future release.
public static final int CHECK_BIDI = 4;
* IDNA option to check whether the input conforms to the CONTEXTJ rules.
- * This option is ignored by the IDNA2003 implementation.
+ * For use in static worker and factory methods.
+ * <p>This option is ignored by the IDNA2003 implementation.
* (The CONTEXTJ check is new in IDNA2008.)
* @draft ICU 4.6
* @provisional This API might change or be removed in a future release.
public static final int CHECK_CONTEXTJ = 8;
* IDNA option for nontransitional processing in ToASCII().
- * By default, ToASCII() uses transitional processing.
- * This option is ignored by the IDNA2003 implementation.
+ * For use in static worker and factory methods.
+ * <p>By default, ToASCII() uses transitional processing.
+ * <p>This option is ignored by the IDNA2003 implementation.
* (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
* @draft ICU 4.6
* @provisional This API might change or be removed in a future release.
public static final int NONTRANSITIONAL_TO_ASCII = 0x10;
* IDNA option for nontransitional processing in ToUnicode().
- * By default, ToUnicode() uses transitional processing.
- * This option is ignored by the IDNA2003 implementation.
+ * For use in static worker and factory methods.
+ * <p>By default, ToUnicode() uses transitional processing.
+ * <p>This option is ignored by the IDNA2003 implementation.
* (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
* @draft ICU 4.6
* @provisional This API might change or be removed in a future release.
public static final int NONTRANSITIONAL_TO_UNICODE = 0x20;
+ /**
+ * IDNA option to check whether the input conforms to the CONTEXTO rules.
+ * For use in static worker and factory methods.
+ * <p>This option is ignored by the IDNA2003 implementation.
+ * (The CONTEXTO check is new in IDNA2008.)
+ * <p>This is for use by registries for IDNA2008 conformance.
+ * UTS #46 does not require the CONTEXTO check.
+ * @draft ICU 49
+ * @provisional This API might change or be removed in a future release.
+ */
+ public static final int CHECK_CONTEXTO = 0x40;
* Returns an IDNA instance which implements UTS #46.
* @draft ICU 4.6
* @provisional This API might change or be removed in a future release.
+ /**
+ * A label does not meet the IDNA CONTEXTO requirements for punctuation characters.
+ * Some punctuation characters "Would otherwise have been DISALLOWED"
+ * but are allowed in certain contexts. (RFC 5892)
+ * @draft ICU 49
+ * @provisional This API might change or be removed in a future release.
+ */
+ /**
+ * A label does not meet the IDNA CONTEXTO requirements for digits.
+ * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx).
+ * @draft ICU 49
+ * @provisional This API might change or be removed in a future release.
+ */
-* Copyright (C) 2010, International Business Machines
+* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
new UTS46Test().run(args);
public UTS46Test() {
+ int commonOptions=
+ trans=IDNA.getUTS46Instance(commonOptions);
+ nontrans=IDNA.getUTS46Instance(commonOptions|
errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI);
errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ);
private static final class TestCase {
"\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" },
{ "\u0644\u200C", "N", // D ZWNJ
+ { "\u0660\u0661", "B", // Arabic-Indic Digits alone
+ "\u0660\u0661", "UIDNA_ERROR_BIDI" },
+ { "\u06F0\u06F1", "B", // Extended Arabic-Indic Digits alone
+ "\u06F0\u06F1", "" },
+ { "\u0660\u06F1", "B", // Mixed Arabic-Indic Digits
+ // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
+ // in their correct contexts,
+ // then each in incorrect context.
+ { "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "B",
+ "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "UIDNA_ERROR_BIDI" },
+ { "l\u00B7", "B",
+ { "\u00B7l", "B",
+ { "\u0375", "B",
+ { "\u03B1\u05F3", "B",
+ { "\u05F4", "B",
+ { "l\u30FB", "B",
// { "", "B",
// "", "" },