11C34;BHAIKSUKI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
11C35;BHAIKSUKI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
11C36;BHAIKSUKI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
-11C38;BHAIKSUKI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
-11C39;BHAIKSUKI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
-11C3A;BHAIKSUKI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
-11C3B;BHAIKSUKI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;
+11C38;BHAIKSUKI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
+11C39;BHAIKSUKI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
+11C3A;BHAIKSUKI VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;
+11C3B;BHAIKSUKI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;
11C3C;BHAIKSUKI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
11C3D;BHAIKSUKI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
11C3E;BHAIKSUKI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
int breakPos = -1;
int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
+ int cBase; // for (X Extend*) patterns, the X character.
// Previous break at end of string. return DONE.
if (prevPos >= fText.length()) {
}
/* p0 = */ p1 = p2 = p3 = prevPos;
c3 = UTF16.charAt(fText, prevPos);
- c0 = c1 = c2 = 0;
+ c0 = c1 = c2 = cBase = 0;
// Loop runs once per "significant" character position in the input text.
for (;;) {
continue;
}
- // Rule (GB8a) Regional_Indicator x Regional_Indicator
- // Note: The first if condition is a little tricky. We only need to force
- // a break if there are three or more contiguous RIs. If there are
- // only two, a break following will occur via other rules, and will include
- // any trailing extend characters, which is needed behavior.
- if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
- && fRegionalIndicatorSet.contains(c2)) {
- break;
- }
-
- if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
- continue;
- }
-
// Rule (GB9) x (Extend | ZWJ)
if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) {
+ if (!fExtendSet.contains(c1)) {
+ cBase = c1;
+ }
continue;
}
if (fPrependSet.contains(c1)) {
continue;
}
- // Rule (GB10) (Emoji_Base | EBG) x Emoji_Modifier
+ // Rule (GB10) (Emoji_Base | EBG) Extend* x Emoji_Modifier
if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
continue;
}
+ if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
+ fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
+ continue;
+ }
// Rule (GB11) ZWJ x (Glue_After_Zwj | EBG)
if (fZWJSet.contains(c1) && (fGAZSet.contains(c2) || fEBGSet.contains(c2))) {
continue;
}
+ // Rule (GB12-13) Regional_Indicator x Regional_Indicator
+ // Note: The first if condition is a little tricky. We only need to force
+ // a break if there are three or more contiguous RIs. If there are
+ // only two, a break following will occur via other rules, and will include
+ // any trailing extend characters, which is needed behavior.
+ if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
+ && fRegionalIndicatorSet.contains(c2)) {
+ break;
+ }
+ if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
+ continue;
+ }
+
// Rule (GB999) Any <break> Any
break;
}
String[][] tests = {
// String, restriction-level, numerics, scripts, alternates, common-alternates
{"a♥", "UNRESTRICTIVE", "[]", "Latn", "", ""},
- {"a〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
- {"aー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"},
- {"aー〆ア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
- {"アaー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
+ {"a\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
+ {"aー\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"},
+ {"aー\u303cア", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
+ { "アaー\u303c", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
{"a1١", "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"},
{"a1١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""},
- {"١ー〆aア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
- {"aアー〆1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
+ {"١ー\u303caア1१۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
+ {"aアー\u303c1१١۱", "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
};
for (String[] test : tests) {
String testString = test[0];
IdentifierInfo idInfo = new IdentifierInfo();
- idInfo.setIdentifierProfile(SpoofChecker.RECOMMENDED);
+ UnicodeSet allowedChars = new UnicodeSet();
+ // Allowed Identifier Characters. In addition to the Recommended Set,
+ // allow u303c, which has an interesting script extension of Hani Hira Kana.
+ allowedChars.addAll(SpoofChecker.RECOMMENDED).add(0x303c);
+ idInfo.setIdentifierProfile(allowedChars);
idInfo.setIdentifier(testString);
assertEquals("Identifier " + testString, testString, idInfo.getIdentifier());