ICU-9131 incorporated final changes from review

author Mark Davis <mark@macchiato.com>

Fri, 22 Aug 2014 11:08:47 +0000 (11:08 +0000)

committer Mark Davis <mark@macchiato.com>

Fri, 22 Aug 2014 11:08:47 +0000 (11:08 +0000)
author Mark Davis <mark@macchiato.com>
Fri, 22 Aug 2014 11:08:47 +0000 (11:08 +0000)
committer Mark Davis <mark@macchiato.com>
Fri, 22 Aug 2014 11:08:47 +0000 (11:08 +0000)
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java

index 0f3c77a7acec2a2d645d595e38b0d7bfa4d7d2b5..18972aed0a15cad6f19d08a99960256d4f970f94 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java
@@ -2614,7 +2614,7 @@ public final class UTF16 {
  
      /**
       * Utility for getting a code point from a CharSequence that contains exactly one code point.
-     * @return a code point IF the string is non-null and consists of a single code point.
+     * @return the code point IF the string is non-null and consists of a single code point.
       * otherwise returns -1.
       * @param s to test
       */
@@ -2640,7 +2640,7 @@ public final class UTF16 {
       * as a code point comparison of UTF16.valueOf(codePoint) and s.toString(). More specifically, if
       * <pre>
       * sc = new StringComparator(true,false,0);
-     * fast = UTF16.compare(codePoint, charSequence)
+     * fast = UTF16.compareCodePoint(codePoint, charSequence)
       * slower = sc.compare(UTF16.valueOf(codePoint), charSequence == null ? "" : charSequence.toString())
       * </pre>
       * then
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java

index a6cf2707b4f10cc8c1b011a9d478a061c2946468..f8d24e5313fda581fe72938868e3a7e0e368f9d1 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -4128,14 +4128,15 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
      }
  
      /**
-     * Provide for faster iteration than by String. Returns an iterator over a range values. The UnicodeSet
-     * must not be altered during the iteration. The EntryRange is the same each time; the contents are just reset.
-     * <br><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
+     * Provide for faster iteration than by String. Returns an iterator over ranges of code points.
+     * The UnicodeSet must not be altered during the iteration.
+     * The EntryRange is the same each time; the contents are just reset.<br>
+     * <b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
       * 
       * <pre>
       * // Sample code
       * for (EntryRange range : us1.ranges()) {
-     *     // do something with code points between range.codepointEnd and range.codepointEnd;
+     *     // do something with code points between range.codepoint and range.codepointEnd;
       * }
       * for (String s : us1.strings()) {
       *     // do something with each string;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java

index cac14805b84c7f3ad89c61182a9c1233073bd13a..80c637cdae5dbecceee152d2aa8f6709f844fa95 100644 (file)
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java
@@ -12,6 +12,31 @@ import com.ibm.icu.util.OutputInt;
  /**
   * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
   * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
+ * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with
+ * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on.
+ * For the inverse, the following mapping is used:</p>
+ * <ul>
+ * <li>{@link SpanCondition#SIMPLE} → {@link SpanCondition#NOT_CONTAINED}</li>
+ * <li>{@link SpanCondition#CONTAINED} → {@link SpanCondition#NOT_CONTAINED}</li>
+ * <li>{@link SpanCondition#NOT_CONTAINED} → {@link SpanCondition#SIMPLE}</li>
+ * </ul>
+ * These are actually not complete inverses. However, the alternating works because there are no gaps.
+ * For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
+ * <p>
+ * <table border="1">
+ * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
+ * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
+ * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
+ * </table>
+ * <p>So here is what happens when you alternate:
+ * <p>
+ * <table border="1">
+ * <tr><th>start</th><td>|xxxabcyyy</td></tr>
+ * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
+ * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
+ * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
+ * </table>
+ * <p>The entire string is traversed.</p>
   */
  public class UnicodeSetSpanner {
  
@@ -63,19 +88,20 @@ public class UnicodeSetSpanner {
       * since it is similar to whether one is replacing [abc] by x, or [abc]* by x.
       * 
       */
-    public enum Quantifier {
+    public enum CountMethod {
          /**
           * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
           * code points.
           * 
           */
-        SPAN,
+        WHOLE_SPAN,
          /**
-         * Use the smallest number of elements in the spanned range for counting and modification. In other words, the "longest matches" are
-         * used where possible. If there are no strings, this will be the same as code points.
-         * <p>For example, in the string "abab":
+         * Use the smallest number of elements in the spanned range for counting and modification,
+         * based on the {@link UnicodeSet.SpanCondition}.
+         * If the set has no strings, this will be the same as the number of spanned code points.
+         * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
           * <ul>
-         * <li>spanning with [ab] will also count four MIN_ELEMENTS.</li>
+         * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
           * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
           * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
           * </ul>
@@ -85,44 +111,45 @@ public class UnicodeSetSpanner {
      }
  
      /**
-     * Returns the number of matching characters found in a character sequence, counting by Quantifier.ELEMENT using SpanCondition.CONTAINED.
-     * 
+     * Returns the number of matching characters found in a character sequence, 
+     * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            the sequence to count characters in
       * @return the count. Zero if there are none.
       */
      public int countIn(CharSequence sequence) {
-        return countIn(sequence, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
+        return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
      }
  
      /**
-     * Returns the number of matching characters found in a character sequence, using SpanCondition.CONTAINED
-     * 
+     * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE.
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            the sequence to count characters in
       * @return the count. Zero if there are none.
       */
-    public int countIn(CharSequence sequence, Quantifier quantifier) {
-        return countIn(sequence, quantifier, SpanCondition.CONTAINED);
+    public int countIn(CharSequence sequence, CountMethod quantifier) {
+        return countIn(sequence, quantifier, SpanCondition.SIMPLE);
      }
  
      /**
       * Returns the number of matching characters found in a character sequence.
-     * 
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            the sequence to count characters in
-     * @param quantifier
-     *            (optional) whether to treat the entire span as a match, or individual code points
-     * @param countSpan
-     *            (optional) the spanCondition to use. CONTAINED means only count the code points in the CONTAINED span;
+     * @param quantifier whether to treat an entire span as a match, or individual code points
+     * @param spanCondition
+     *            the spanCondition to use. SIMPLE or CONTAINED means only count the code points in the span;
       *            NOT_CONTAINED is the reverse.
+     *            <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases.
       * @return the count. Zero if there are none.
       */
-    public int countIn(CharSequence sequence, Quantifier quantifier, SpanCondition countSpan) {
+    public int countIn(CharSequence sequence, CountMethod quantifier, SpanCondition spanCondition) {
          int count = 0;
          int start = 0;
-        SpanCondition skipSpan = countSpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED
-                : SpanCondition.CONTAINED;
+        SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
+                : SpanCondition.NOT_CONTAINED;
          final int length = sequence.length();
          OutputInt spanCount = new OutputInt();
          while (start != length) {
@@ -130,40 +157,40 @@ public class UnicodeSetSpanner {
              if (endNotContained == length) {
                  break;
              }
-            start = unicodeSet.spanAndCount(sequence, endNotContained, countSpan, spanCount);
-            count += quantifier == Quantifier.SPAN ? 1 : spanCount.value;
+            start = unicodeSet.spanAndCount(sequence, endNotContained, spanCondition, spanCount);
+            count += quantifier == CountMethod.WHOLE_SPAN ? 1 : spanCount.value;
          }
          return count;
      }
  
      /**
-     * Delete all the matching spans in sequence, using SpanCondition.CONTAINED
-     * 
+     * Delete all the matching spans in sequence, using SpanCondition.SIMPLE.
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            charsequence to replace matching spans in.
       * @return modified string.
       */
      public String deleteFrom(CharSequence sequence) {
-        return replaceFrom(sequence, "", Quantifier.SPAN, SpanCondition.CONTAINED);
+        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
      }
  
      /**
       * Delete all matching spans in sequence, according to the operations.
-     * 
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            charsequence to replace matching spans in.
-     * @param modifySpan
-     *            specify whether to modify the matching spans (CONTAINED) or the non-matching (NOT_CONTAINED)
+     * @param spanCondition
+     *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED)
       * @return modified string.
       */
-    public String deleteFrom(CharSequence sequence, SpanCondition modifySpan) {
-        return replaceFrom(sequence, "", Quantifier.SPAN, modifySpan);
+    public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
+        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
      }
  
      /**
       * Replace all matching spans in sequence by the replacement,
-     * counting by Quantifier.ELEMENT using SpanCondition.CONTAINED.
-     * 
+     * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            charsequence to replace matching spans in.
       * @param replacement
@@ -171,42 +198,42 @@ public class UnicodeSetSpanner {
       * @return modified string.
       */
      public String replaceFrom(CharSequence sequence, CharSequence replacement) {
-        return replaceFrom(sequence, replacement, Quantifier.MIN_ELEMENTS, SpanCondition.CONTAINED);
+        return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
      }
  
      /**
-     * Replace all matching spans in sequence by replacement, according to the Quantifier, using SpanCondition.CONTAINED. 
-     * 
+     * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE.
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            charsequence to replace matching spans in.
       * @param replacement
       *            replacement sequence. To delete, use ""
       * @param quantifier
-     *            whether to treat the entire span as a match, or individual code points
+     *            whether to treat an entire span as a match, or individual code points
       * @return modified string.
       */
-    public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier) {
-        return replaceFrom(sequence, replacement, quantifier, SpanCondition.CONTAINED);
+    public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod quantifier) {
+        return replaceFrom(sequence, replacement, quantifier, SpanCondition.SIMPLE);
      }
  
      /**
-     * Replace all matching spans in sequence by replacement, according to the operations quantifier and modifySpan.
-     * 
+     * Replace all matching spans in sequence by replacement, according to the operations quantifier and spanCondition.
+     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
       * @param sequence
       *            charsequence to replace matching spans in.
       * @param replacement
       *            replacement sequence. To delete, use ""
-     * @param modifySpan
-     *            (optional) specify whether to modify the matching spans (CONTAINED) or the non-matching
+     * @param spanCondition
+     *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching
       *            (NOT_CONTAINED)
       * @param quantifier
-     *            (optional) specify whether to collapse or do codepoint by codepoint.
+     *            specify whether to collapse or do codepoint by codepoint.
       * @return modified string.
       */
-    public String replaceFrom(CharSequence sequence, CharSequence replacement, Quantifier quantifier,
-            SpanCondition modifySpan) {
-        SpanCondition copySpan = modifySpan == SpanCondition.CONTAINED ? SpanCondition.NOT_CONTAINED
-                : SpanCondition.CONTAINED;
+    public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod quantifier,
+            SpanCondition spanCondition) {
+        SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
+                : SpanCondition.NOT_CONTAINED;
          final boolean remove = replacement.length() == 0;
          StringBuilder result = new StringBuilder();
          // TODO, we can optimize this to
@@ -215,10 +242,10 @@ public class UnicodeSetSpanner {
          final int length = sequence.length();
          OutputInt spanCount = new OutputInt();
          for (int endCopy = 0; endCopy != length;) {
-            int endModify = unicodeSet.spanAndCount(sequence, endCopy, modifySpan, spanCount);
+            int endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
              if (remove || endModify == 0) {
                  // do nothing
-            } else if (quantifier == Quantifier.SPAN) {
+            } else if (quantifier == CountMethod.WHOLE_SPAN) {
                  result.append(replacement);
              } else {
                  for (int i = spanCount.value; i > 0; --i) {
@@ -240,17 +267,17 @@ public class UnicodeSetSpanner {
       */
      public enum TrimOption {
          /**
-         * Trim leading spans (subject to INVERT).
+         * Trim leading spans.
           * 
           */
          LEADING,
          /**
-         * Trim leading and trailing spans (subject to INVERT).
+         * Trim leading and trailing spans.
           * 
           */
          BOTH,
          /**
-         * Trim trailing spans (subject to INVERT).
+         * Trim trailing spans.
           * 
           */
          TRAILING;
@@ -258,7 +285,7 @@ public class UnicodeSetSpanner {
  
      /**
       * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
-     * end of the string, using TrimOption.BOTH and SpanCondition.CONTAINED. For example:
+     * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example:
       * 
       * <pre>
       * {@code
@@ -270,12 +297,12 @@ public class UnicodeSetSpanner {
       * 
       */
      public CharSequence trim(CharSequence sequence) {
-        return trim(sequence, TrimOption.BOTH, SpanCondition.CONTAINED);
+        return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
      }
  
      /**
       * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
-     * end of the string, using the trimOption and SpanCondition.CONTAINED. For example:
+     * end of the string, using the trimOption and SpanCondition.SIMPLE. For example:
       * 
       * <pre>
       * {@code
@@ -287,12 +314,12 @@ public class UnicodeSetSpanner {
       * 
       */
      public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
-        return trim(sequence, trimOption, SpanCondition.CONTAINED);
+        return trim(sequence, trimOption, SpanCondition.SIMPLE);
      }
  
      /**
       * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching code points at the start or
-     * end of the string, depending on the trimOption and modifySpan. For example:
+     * end of the string, depending on the trimOption and spanCondition. For example:
       * 
       * <pre>
       * {@code
@@ -305,16 +332,16 @@ public class UnicodeSetSpanner {
       * @param sequence
       *            the sequence to trim
       * @param trimOption
-     *            (optional) LEADING, TRAILING, or BOTH
-     * @param modifySpan
-     *            (optional) CONTAINED or NOT_CONTAINED
+     *            LEADING, TRAILING, or BOTH
+     * @param spanCondition
+     *            SIMPLE, CONTAINED or NOT_CONTAINED
       * @return a subsequence
       */
-    public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition modifySpan) {
+    public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
          int endLeadContained, startTrailContained;
          final int length = sequence.length();
          if (trimOption != TrimOption.TRAILING) {
-            endLeadContained = unicodeSet.span(sequence, modifySpan);
+            endLeadContained = unicodeSet.span(sequence, spanCondition);
              if (endLeadContained == length) {
                  return "";
              }
@@ -322,7 +349,7 @@ public class UnicodeSetSpanner {
              endLeadContained = 0;
          }
          if (trimOption != TrimOption.LEADING) {
-            startTrailContained = unicodeSet.spanBack(sequence, modifySpan);
+            startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
          } else {
              startTrailContained = length;
          }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java

index e598f3985ccd218f001adb4d3ec0cd91fc13b807..8ba4cb4a060376902d4006f5e490e338900c95a6 100644 (file)
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@@ -37,7 +37,7 @@ import com.ibm.icu.text.UnicodeSet;
  import com.ibm.icu.text.UnicodeSet.ComparisonStyle;
  import com.ibm.icu.text.UnicodeSet.EntryRange;
  import com.ibm.icu.text.UnicodeSetSpanner;
-import com.ibm.icu.text.UnicodeSetSpanner.Quantifier;
+import com.ibm.icu.text.UnicodeSetSpanner.CountMethod;
  import com.ibm.icu.text.UnicodeSet.SpanCondition;
  import com.ibm.icu.text.UnicodeSetSpanner.TrimOption;
  import com.ibm.icu.text.UnicodeSetIterator;
@@ -2455,10 +2455,10 @@ public class UnicodeSetTest extends TestFmwk {
          assertEquals("", "a_._b_._c_._", m.trim("_._a_._b_._c_._", TrimOption.LEADING));
          assertEquals("", "_._a_._b_._c", m.trim("_._a_._b_._c_._", TrimOption.TRAILING));
  
-        assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", Quantifier.SPAN));
-        assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", Quantifier.SPAN));
+        assertEquals("", "a??b??c", m.replaceFrom("a_._b_._c", "??", CountMethod.WHOLE_SPAN));
+        assertEquals("", "a??b??c", m.replaceFrom(m.trim("_._a_._b_._c_._"), "??", CountMethod.WHOLE_SPAN));
          assertEquals("", "XYXYXYaXYXYXYbXYXYXYcXYXYXY", m.replaceFrom("_._a_._b_._c_._", "XY"));
-        assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", Quantifier.SPAN));
+        assertEquals("", "XYaXYbXYcXY", m.replaceFrom("_._a_._b_._c_._", "XY", CountMethod.WHOLE_SPAN));
  
          m = new UnicodeSetSpanner(new UnicodeSet("\\p{uppercase}"));
          assertEquals("", "TQBF", m.deleteFrom("The Quick Brown Fox.", SpanCondition.NOT_CONTAINED));
@@ -2468,17 +2468,17 @@ public class UnicodeSetTest extends TestFmwk {
  
          m = new UnicodeSetSpanner(new UnicodeSet("[{ab}]"));
          assertEquals("", "XXc acb", m.replaceFrom("ababc acb", "X"));
-        assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", Quantifier.SPAN));
+        assertEquals("", "Xc acb", m.replaceFrom("ababc acb", "X", CountMethod.WHOLE_SPAN));
      }
  
      public void TestCodePoints() {
          // test supplemental code points and strings clusters
-        checkCodePoints("x\u0308", "z\u0308", Quantifier.MIN_ELEMENTS, null, 1);
-        checkCodePoints("𣿡", "𣿢", Quantifier.MIN_ELEMENTS, null, 1);
-        checkCodePoints("👦", "👧", Quantifier.MIN_ELEMENTS, null, 1);
+        checkCodePoints("x\u0308", "z\u0308", CountMethod.MIN_ELEMENTS, null, 1);
+        checkCodePoints("𣿡", "𣿢", CountMethod.MIN_ELEMENTS, null, 1);
+        checkCodePoints("👦", "👧", CountMethod.MIN_ELEMENTS, null, 1);
      }
  
-    private void checkCodePoints(String a, String b, Quantifier quantifier, String expectedReplaced, int expectedCount) {
+    private void checkCodePoints(String a, String b, CountMethod quantifier, String expectedReplaced, int expectedCount) {
          final String ab = a+b;
          UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]"));
          assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")", 
@@ -2492,4 +2492,77 @@ public class UnicodeSetTest extends TestFmwk {
                  expectedReplaced, m.replaceFrom(ab, "-", quantifier));
      }
  
+    public void testForSpanGaps() {
+        String[] items = {"a", "b", "c", "{ab}", "{bc}", "{cd}", "{abc}", "{bcd}"};
+        final int limit = 1<<items.length;
+        // build long string for testing
+        StringBuilder longBuffer = new StringBuilder();
+        for (int i = 1; i < limit; ++i) {
+            longBuffer.append("x");
+            longBuffer.append(getCombinations(items, i));
+        }
+        String longString = longBuffer.toString();
+        longString = longString.replace("{","").replace("}","");
+
+        long start = System.nanoTime();
+        for (int i = 1; i < limit; ++i) {
+            UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]");
+            int problemFound = checkSpan(longString, us, SpanCondition.SIMPLE);
+            if (problemFound >= 0) {
+                assertEquals("Testing " + longString + ", found gap at", -1, problemFound);
+                break;
+            }
+        }
+        long end = System.nanoTime();
+        logln("Time for SIMPLE   :\t" + (end-start));
+        start = System.nanoTime();
+        for (int i = 1; i < limit; ++i) {
+            UnicodeSet us = new UnicodeSet("[" + getCombinations(items, i) + "]");
+            int problemFound = checkSpan(longString, us, SpanCondition.CONTAINED);
+            if (problemFound >= 0) {
+                assertEquals("Testing " + longString + ", found gap at", -1, problemFound);
+                break;
+            }
+        }
+        end = System.nanoTime();
+        logln("Time for CONTAINED:\t" + (end-start));
+    }
+
+    /**
+     * Check that there are no gaps, when we alternate spanning. That is, there
+     * should only be a zero length span at the very start.
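+     * @param longString the text to span over
+     * @param us the set used for spanning
+     * @param spanCondition the condition (SIMPLE or CONTAINED) that is alternated with NOT_CONTAINED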
+     */
+    private int checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition) {
+        int start = 0;
+        while (start < longString.length()) {
+            int limit = us.span(longString, start, spanCondition);
+            if (limit == longString.length()) {
+                break;
+            } else if (limit == start && start != 0) {
+                return start;
+            }
+            start = limit;
+            limit = us.span(longString, start, SpanCondition.NOT_CONTAINED);
+            if (limit == start) {
+                return start;
+            }
+            start = limit;
+        }
+        return -1; // all ok
+    }
+
+    private String getCombinations(String[] items, int bitset) {
+        StringBuilder result = new StringBuilder();
+        for (int i = 0; bitset != 0; ++i) {
+            int other = bitset & (1 << i);
+            if (other != 0) {
+                bitset ^= other;
+                result.append(items[i]);
+            }
+        }
+        return result.toString();
+    }
  }
author	Mark Davis <mark@macchiato.com>
	Fri, 22 Aug 2014 11:08:47 +0000 (11:08 +0000)
committer	Mark Davis <mark@macchiato.com>
	Fri, 22 Aug 2014 11:08:47 +0000 (11:08 +0000)
icu4j/main/classes/core/src/com/ibm/icu/text/UTF16.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java		patch \| blob \| history
icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSetSpanner.java		patch \| blob \| history
icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetTest.java		patch \| blob \| history