* @author markdavis
*/
public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
+ private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})");
+
// Note: we don't currently have any state, but intend to in the future,
// particularly for the regex style supported.
* <p>Not thread-safe; create a separate copy for different threads.
* <p>In the future, we may extend this to support other regex packages.
*
- * @regex A modified Java regex pattern, as in the input to
+ * @param regex A modified Java regex pattern, as in the input to
* Pattern.compile(), except that all "character classes" are
* processed as if they were UnicodeSet patterns. Example:
* "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
*/
public String compileBnf(List<String> lines) {
Map<String, String> variables = getVariables(lines);
- Set<String> unused = new LinkedHashSet<String>(variables.keySet());
+ Set<String> unused = new LinkedHashSet<>(variables.keySet());
// brute force replacement; do twice to allow for different order
// later on can optimize
for (int i = 0; i < 2; ++i) {
pos.setIndex(i);
UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
x.complement().complement(); // hack to fix toPattern
- result.append(x.toPattern(false));
+ String pattern = x.toPattern(false);
+ // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex.
+ if (pattern.contains("\\U")) {
+ pattern = SUPP_ESCAPE.matcher(pattern).replaceAll("\\\\x{$1}");
+ }
+ result.append(pattern);
i = pos.getIndex() - 1; // allow for the loop increment
return i;
} catch (Exception e) {
};
private Map<String, String> getVariables(List<String> lines) {
- Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
+ Map<String, String> variables = new TreeMap<>(LongestFirst);
String variable = null;
StringBuffer definition = new StringBuffer();
int count = 0;
UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
boolean skip = TestFmwk.getExhaustiveness() < 10;
for (int cp = 0; cp < 0x110000; ++cp) {
- if (cp > 0xFF && skip && (cp % 37 != 0)) {
+ // Do always test U+1FFFE to cover UnicodeSet escaping a supplementary noncharacter.
+ if (cp > 0xFF && skip && (cp % 37 != 0) && cp != 0x1fffe) {
continue;
}
String cpString = UTF16.valueOf(cp);
String expected = "[" + s + "]"; // Try this first for faster testing.
boolean ok = pattern.equals(expected);
if (!ok) {
- expected = new UnicodeSet(expected).toPattern(false);
+ // Escape like in UnicodeSet, and change supplementary escapes to Java regex syntax.
+ expected = new UnicodeSet(expected).toPattern(false).
+ replaceAll("\\\\U00([0-9a-fA-F]{6})", "\\\\x{$1}");
ok = pattern.equals(expected);
}
assertTrue("Doubled character works " + hex.transform(s), ok);