/**
* A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
* An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
+ * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with
+ * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on.
+ * For the inverse, the following mapping is used:</p>
+ * <ul>
+ * <li>{@link SpanCondition#SIMPLE} → {@link SpanCondition#NOT_CONTAINED}</li>
+ * <li>{@link SpanCondition#CONTAINED} → {@link SpanCondition#NOT_CONTAINED}</li>
+ * <li>{@link SpanCondition#NOT_CONTAINED} → {@link SpanCondition#SIMPLE}</li>
+ * </ul>
+ * These are actually not complete inverses. However, the alternating works because there are no gaps.
+ * For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
+ * <p>
+ * <table border="1">
+ * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
+ * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
+ * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
+ * </table>
+ * <p>So here is what happens when you alternate:
+ * <p>
+ * <table border="1">
+ * <tr><th>start</th><td>|xxxabcyyy</td></tr>
+ * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
+ * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
+ * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
+ * </table>
+ * <p>The entire string is traversed.</p>
*/
public class UnicodeSetSpanner {
* since it is similar to whether one is replacing [abc] by x, or [abc]* by x.
*
*/
- public enum Quantifier {
+ public enum CountMethod {
/**
* Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
* code points.
*
*/
- SPAN,
+ WHOLE_SPAN,
/**
- * Use the smallest number of elements in the spanned range for counting and modification. In other words, the "longest matches" are
- * used where possible. If there are no strings, this will be the same as code points.
- * <p>For example, in the string "abab":
+ * Use the smallest number of elements in the spanned range for counting and modification,
+ * based on the {@link UnicodeSet.SpanCondition}.
+ * If the set has no strings, this will be the same as the number of spanned code points.
+ * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
* <ul>
- * <li>spanning with [ab] will also count four MIN_ELEMENTS.</li>
+ * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
* <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
* <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
* </ul>
}
/**
- * Returns the number of matching characters found in a character sequence, counting by Quantifier.ELEMENT using SpanCondition.CONTAINED.
- *
+ * Returns the number of matching characters found in a character sequence,
+ * counting by {@link CountMethod#MIN_ELEMENTS} using {@link SpanCondition#SIMPLE}.
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* the sequence to count characters in
* @return the count. Zero if there are none.
*/
public int countIn(CharSequence sequence) {
- return countIn(sequence, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
+ return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
}
/**
- * Returns the number of matching characters found in a character sequence, using SpanCondition.CONTAINED
- *
+ * Returns the number of matching characters found in a character sequence, using {@link SpanCondition#SIMPLE}.
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* the sequence to count characters in
+ * @param quantifier
+ * whether to count each entire span as a single match ({@link CountMethod#WHOLE_SPAN}),
+ * or to count the elements within each span
* @return the count. Zero if there are none.
*/
- public int countIn(CharSequence sequence, Quantifier quantifier) {
- return countIn(sequence, quantifier, SpanCondition.CONTAINED);
+ public int countIn(CharSequence sequence, CountMethod quantifier) {
+ return countIn(sequence, quantifier, SpanCondition.SIMPLE);
}
/**
* Returns the number of matching characters found in a character sequence.
- *
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* the sequence to count characters in
- * @param quantifier
- * (optional) whether to treat the entire span as a match, or individual code points
- * @param countSpan
- * (optional) the spanCondition to use. CONTAINED means only count the code points in the CONTAINED span;
+ * @param quantifier whether to treat an entire span as a match, or individual code points
+ * @param spanCondition
+ * the spanCondition to use. SIMPLE or CONTAINED means only count the code points in the span;
* NOT_CONTAINED is the reverse.
+ * <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases.
* @return the count. Zero if there are none.
*/
- public int countIn(CharSequence sequence, Quantifier quantifier, SpanCondition countSpan) {
+ public int countIn(CharSequence sequence, CountMethod quantifier, SpanCondition spanCondition) {
int count = 0;
int start = 0;
- SpanCondition skipSpan = countSpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED
- : SpanCondition.CONTAINED;
+ SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
+ : SpanCondition.NOT_CONTAINED;
final int length = sequence.length();
OutputInt spanCount = new OutputInt();
while (start != length) {
if (endNotContained == length) {
break;
}
- start = unicodeSet.spanAndCount(sequence, endNotContained, countSpan, spanCount);
- count += quantifier == Quantifier.SPAN ? 1 : spanCount.value;
+ start = unicodeSet.spanAndCount(sequence, endNotContained, spanCondition, spanCount);
+ count += quantifier == CountMethod.WHOLE_SPAN ? 1 : spanCount.value;
}
return count;
}
/**
- * Delete all the matching spans in sequence, using SpanCondition.CONTAINED
- *
+ * Delete all the matching spans in sequence, using {@link SpanCondition#SIMPLE}.
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* charsequence to replace matching spans in.
* @return modified string.
*/
public String deleteFrom(CharSequence sequence) {
- return replaceFrom(sequence, "", Quantifier.SPAN, SpanCondition.CONTAINED);
+ return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
}
/**
* Delete all matching spans in sequence, according to the operations.
- *
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* charsequence to replace matching spans in.
- * @param modifySpan
- * specify whether to modify the matching spans (CONTAINED) or the non-matching (NOT_CONTAINED)
+ * @param spanCondition
+ * specify whether to delete the matching spans (CONTAINED or SIMPLE) or the non-matching spans (NOT_CONTAINED)
* @return modified string.
*/
- public String deleteFrom(CharSequence sequence, SpanCondition modifySpan) {
- return replaceFrom(sequence, "", Quantifier.SPAN, modifySpan);
+ public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
+ return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
}
/**
* Replace all matching spans in sequence by the replacement,
- * counting by Quantifier.ELEMENT using SpanCondition.CONTAINED.
- *
+ * counting by {@link CountMethod#MIN_ELEMENTS} using {@link SpanCondition#SIMPLE}.
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* charsequence to replace matching spans in.
* @param replacement
+ * replacement sequence. To delete, use ""
* @return modified string.
*/
public String replaceFrom(CharSequence sequence, CharSequence replacement) {
- return replaceFrom(sequence, replacement, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
+ return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
}
/**
- * Replace all matching spans in sequence by replacement, according to the Quantifier, using SpanCondition.CONTAINED.
- *
+ * Replace all matching spans in sequence by replacement, according to the {@link CountMethod},
+ * using {@link SpanCondition#SIMPLE}.
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* charsequence to replace matching spans in.
* @param replacement
* replacement sequence. To delete, use ""
* @param quantifier
- * whether to treat the entire span as a match, or individual code points
+ * whether to treat an entire span as a match, or individual code points
* @return modified string.
*/
- public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier) {
- return replaceFrom(sequence, replacement, quantifier, SpanCondition.CONTAINED);
+ public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod quantifier) {
+ return replaceFrom(sequence, replacement, quantifier, SpanCondition.SIMPLE);
}
/**
- * Replace all matching spans in sequence by replacement, according to the operations quantifier and modifySpan.
- *
+ * Replace all matching spans in sequence by replacement, according to the operations quantifier and spanCondition.
+ * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
* @param sequence
* charsequence to replace matching spans in.
* @param replacement
* replacement sequence. To delete, use ""
- * @param modifySpan
- * (optional) specify whether to modify the matching spans (CONTAINED) or the non-matching
+ * @param spanCondition
+ * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching
* (NOT_CONTAINED)
* @param quantifier
- * (optional) specify whether to collapse or do codepoint by codepoint.
+ * specify whether to collapse or do codepoint by codepoint.
* @return modified string.
*/
- public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier,
- SpanCondition modifySpan) {
- SpanCondition copySpan = modifySpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED
- : SpanCondition.CONTAINED;
+ public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod quantifier,
+ SpanCondition spanCondition) {
+ SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
+ : SpanCondition.NOT_CONTAINED;
final boolean remove = replacement.length() == 0;
StringBuilder result = new StringBuilder();
// TODO, we can optimize this to
final int length = sequence.length();
OutputInt spanCount = new OutputInt();
for (int endCopy = 0; endCopy != length;) {
- int endModify = unicodeSet.spanAndCount(sequence, endCopy, modifySpan, spanCount);
+ int endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
if (remove || endModify == 0) {
// do nothing
- } else if (quantifier == Quantifier.SPAN) {
+ } else if (quantifier == CountMethod.WHOLE_SPAN) {
result.append(replacement);
} else {
for (int i = spanCount.value; i > 0; --i) {
*/
public enum TrimOption {
/**
- * Trim leading spans (subject to INVERT).
+ * Trim leading spans.
*
*/
LEADING,
/**
- * Trim leading and trailing spans (subject to INVERT).
+ * Trim leading and trailing spans.
*
*/
BOTH,
/**
- * Trim trailing spans (subject to INVERT).
+ * Trim trailing spans.
*
*/
TRAILING;
/**
* Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
- * end of the string, using TrimOption.BOTH and SpanCondition.CONTAINED. For example:
+ * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example:
*
* <pre>
* {@code
*
*/
public CharSequence trim(CharSequence sequence) {
- return trim(sequence, TrimOption.BOTH, SpanCondition.CONTAINED);
+ return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
}
/**
* Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
- * end of the string, using the trimOption and SpanCondition.CONTAINED. For example:
+ * end of the string, using the trimOption and SpanCondition.SIMPLE. For example:
*
* <pre>
* {@code
*
*/
public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
- return trim(sequence, trimOption, SpanCondition.CONTAINED);
+ return trim(sequence, trimOption, SpanCondition.SIMPLE);
}
/**
* Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
- * end of the string, depending on the trimOption and modifySpan. For example:
+ * end of the string, depending on the trimOption and spanCondition. For example:
*
* <pre>
* {@code
* @param sequence
* the sequence to trim
* @param trimOption
- * (optional) LEADING, TRAILING, or BOTH
- * @param modifySpan
- * (optional) CONTAINED or NOT_CONTAINED
+ * LEADING, TRAILING, or BOTH
+ * @param spanCondition
+ * SIMPLE, CONTAINED or NOT_CONTAINED
* @return a subsequence
*/
- public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition modifySpan) {
+ public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
int endLeadContained, startTrailContained;
final int length = sequence.length();
if (trimOption != TrimOption.TRAILING) {
- endLeadContained = unicodeSet.span(sequence, modifySpan);
+ endLeadContained = unicodeSet.span(sequence, spanCondition);
if (endLeadContained == length) {
return "";
}
endLeadContained = 0;
}
if (trimOption != TrimOption.LEADING) {
- startTrailContained = unicodeSet.spanBack(sequence, modifySpan);
+ startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
} else {
startTrailContained = length;
}
import com.ibm.icu.text.UnicodeSet.ComparisonStyle;
import com.ibm.icu.text.UnicodeSet.EntryRange;
import com.ibm.icu.text.UnicodeSetSpanner;
-import com.ibm.icu.text.UnicodeSetSpanner.Quantifier;
+import com.ibm.icu.text.UnicodeSetSpanner.CountMethod;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSetSpanner.TrimOption;
import com.ibm.icu.text.UnicodeSetIterator;
assertEquals("", "a_._b_._c_._", m.trim("_._a_._b_._c_._", TrimOption.LEADING));
assertEquals("", "_._a_._b_._c", m.trim("_._a_._b_._c_._", TrimOption.TRAILING));
- assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", Quantifier.SPAN));
- assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", Quantifier.SPAN));
+ assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", CountMethod.WHOLE_SPAN));
+ assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", CountMethod.WHOLE_SPAN));
assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", m.replaceFrom("_._a_._b_._c_._", "XY"));
- assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", Quantifier.SPAN));
+ assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", CountMethod.WHOLE_SPAN));
m = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}"));
assertEquals("", "TQBF", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED));
m = new UnicodeSetSpanner(new UnicodeSet("[{ab}]"));
assertEquals("", "XXc acb", m.replaceFrom("ababc acb", "X"));
- assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", Quantifier.SPAN));
+ assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", CountMethod.WHOLE_SPAN));
}
+ /**
+ * Runs checkCodePoints on inputs that occupy more than one UTF-16 code unit: a base letter
+ * followed by a combining mark (U+0308), and supplementary code points. Every case uses
+ * CountMethod.MIN_ELEMENTS, passes null as the expected replacement, and expects a count of 1.
+ */
public void TestCodePoints() {
// test supplemental code points and strings clusters
- checkCodePoints("x\u0308", "z\u0308", Quantifier.MIN_ELEMENTS, null, 1);
- checkCodePoints("𣿡", "𣿢", Quantifier.MIN_ELEMENTS, null, 1);
- checkCodePoints("👦", "👧", Quantifier.MIN_ELEMENTS, null, 1);
+ checkCodePoints("x\u0308", "z\u0308", CountMethod.MIN_ELEMENTS, null, 1);
+ checkCodePoints("𣿡", "𣿢", CountMethod.MIN_ELEMENTS, null, 1);
+ checkCodePoints("👦", "👧", CountMethod.MIN_ELEMENTS, null, 1);
}
- private void checkCodePoints(String a, String b, Quantifier quantifier, String expectedReplaced, int expectedCount) {
+ private void checkCodePoints(String a, String b, CountMethod quantifier, String expectedReplaced, int expectedCount) {
final String ab = a+b;
UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]"));
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")",
expectedReplaced, m.replaceFrom(ab, "-", quantifier));
}
+ /**
+  * Checks that alternating a span condition with NOT_CONTAINED leaves no gaps, for both
+  * SpanCondition.SIMPLE and SpanCondition.CONTAINED.
+  * <p>Every non-empty combination of the items below is turned into a UnicodeSet and scanned
+  * over one shared text; the scan itself is done by checkSpan. The elapsed time of each
+  * pass is logged.
+  */
+ public void testForSpanGaps() {
+     String[] items = {"a", "b", "c", "{ab}", "{bc}", "{cd}", "{abc}", "{bcd}"};
+     final int limit = 1 << items.length;
+     // Build the text to scan: every non-empty combination of items, each preceded by "x"
+     // (which is in none of the sets), with the braces of the set-string syntax removed.
+     StringBuilder longBuffer = new StringBuilder();
+     for (int i = 1; i < limit; ++i) {
+         longBuffer.append("x");
+         longBuffer.append(getCombinations(items, i));
+     }
+     String longString = longBuffer.toString().replace("{", "").replace("}", "");
+
+     checkAllCombinationsForGaps(items, longString, SpanCondition.SIMPLE, "Time for SIMPLE :\t");
+     checkAllCombinationsForGaps(items, longString, SpanCondition.CONTAINED, "Time for CONTAINED:\t");
+ }
+
+ /**
+  * Scans text with a UnicodeSet built from each non-empty combination of items, stopping at
+  * the first set for which checkSpan reports a gap, then logs the elapsed nanoseconds.
+  * @param items set elements to combine
+  * @param text the text to scan
+  * @param spanCondition the condition alternated with NOT_CONTAINED
+  * @param timingLabel prefix for the logged elapsed time
+  */
+ private void checkAllCombinationsForGaps(String[] items, String text, SpanCondition spanCondition,
+         String timingLabel) {
+     final int limit = 1 << items.length;
+     final long start = System.nanoTime();
+     for (int i = 1; i < limit; ++i) {
+         UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]");
+         int problemFound = checkSpan(text, us, spanCondition);
+         if (problemFound >= 0) {
+             // Name the failing set as well as the text; the text alone is the same for every set.
+             assertEquals("Testing " + us + " against " + text + ", found gap at", -1, problemFound);
+             break;
+         }
+     }
+     final long end = System.nanoTime();
+     logln(timingLabel + (end - start));
+ }
+
+ /**
+  * Walks the text by alternating a span using spanCondition with a span using
+  * SpanCondition.NOT_CONTAINED, and reports the first place where a span makes no progress.
+  * A zero-length span is tolerated only for the spanCondition span at offset 0.
+  * @param longString the text to scan
+  * @param us the set to span with
+  * @param spanCondition the condition alternated with NOT_CONTAINED
+  * @return the offset at which a zero-length span occurred, or -1 if there was none
+  */
+ private int checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition) {
+     final int length = longString.length();
+     int pos = 0;
+     while (pos < length) {
+         final int matchedEnd = us.span(longString, pos, spanCondition);
+         if (matchedEnd == length) {
+             return -1; // reached the end of the text: all ok
+         }
+         if (pos != 0 && matchedEnd == pos) {
+             return pos;
+         }
+         final int skippedEnd = us.span(longString, matchedEnd, SpanCondition.NOT_CONTAINED);
+         if (skippedEnd == matchedEnd) {
+             return matchedEnd;
+         }
+         pos = skippedEnd;
+     }
+     return -1; // all ok
+ }
+
+ /**
+  * Concatenates the items selected by a bit mask, in index order.
+  * @param items the candidate strings
+  * @param bitset mask in which bit i selects items[i]
+  * @return the selected items joined with no separator; empty if bitset is 0
+  */
+ private String getCombinations(String[] items, int bitset) {
+     StringBuilder joined = new StringBuilder();
+     int remaining = bitset;
+     int index = 0;
+     while (remaining != 0) {
+         if ((remaining & 1) != 0) {
+             joined.append(items[index]);
+         }
+         // Unsigned shift so a set sign bit cannot keep the loop alive.
+         remaining >>>= 1;
+         index++;
+     }
+     return joined.toString();
+ }
}