// is a leftover from already-disabled Boyer-Moore search code. This Java implementation
// preserves the code, but we should clean this up later.
-/**
+/**
*
* <tt>StringSearch</tt> is a {@link SearchIterator} that provides
* language-sensitive text searching based on the comparison rules defined
* <br>
* A pattern string P matches a text string S at the offsets [start, end]
* if
- * <pre>
+ * <pre>
* option 1. Some canonical equivalent of P matches some canonical equivalent
* of S'
* option 2. P matches S' and if P starts or ends with a combining mark,
* </pre>
* Option 2. is the default.
* <p>
- * This search has APIs similar to that of other text iteration mechanisms
- * such as the break iterators in {@link BreakIterator}. Using these
- * APIs, it is easy to scan through text looking for all occurrences of
- * a given pattern. This search iterator allows changing of direction by
+ * This search has APIs similar to that of other text iteration mechanisms
+ * such as the break iterators in {@link BreakIterator}. Using these
+ * APIs, it is easy to scan through text looking for all occurrences of
+ * a given pattern. This search iterator allows changing of direction by
* calling a {@link #reset} followed by a {@link #next} or {@link #previous}.
* Though a direction change can occur without calling {@link #reset} first,
* this operation comes with some speed penalty.
* from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}.
* Lastly to update <tt>StringSearch</tt> to the new collator attributes,
* {@link #reset} has to be called.
- * <p>
+ * <p>
* Restriction: <br>
* Currently there are no composite characters that consist of a
* character with combining class > 0 before a character with combining
* @author Laura Werner, synwee
* @stable ICU 2.0
*/
-// internal notes: all methods do not guarantee the correct status of the
+// internal notes: no method guarantees the correct status of the
// character iterator. The caller has to maintain the original index position
// if necessary. Methods may change the index position as they deem fit.
public final class StringSearch extends SearchIterator {
// private char[] canonicalSuffixAccents_;
/**
- * Initializes the iterator to use the language-specific rules defined in
- * the argument collator to search for argument pattern in the argument
+ * Initializes the iterator to use the language-specific rules defined in
+ * the argument collator to search for argument pattern in the argument
* target text. The argument <code>breakiter</code> is used to define logical matches.
- * See super class documentation for more details on the use of the target
+ * See super class documentation for more details on the use of the target
* text and {@link BreakIterator}.
* @param pattern text to look for.
- * @param target target text to search for pattern.
+ * @param target target text to search for pattern.
* @param collator {@link RuleBasedCollator} that defines the language rules
- * @param breakiter A {@link BreakIterator} that is used to determine the
+ * @param breakiter A {@link BreakIterator} that is used to determine the
* boundaries of a logical match. This argument can be null.
* @throws IllegalArgumentException thrown when argument target is null,
* or of length 0
}
/**
- * Initializes the iterator to use the language-specific rules defined in
- * the argument collator to search for argument pattern in the argument
+ * Initializes the iterator to use the language-specific rules defined in
+ * the argument collator to search for argument pattern in the argument
* target text. No {@link BreakIterator}s are set to test for logical matches.
* @param pattern text to look for.
- * @param target target text to search for pattern.
+ * @param target target text to search for pattern.
* @param collator {@link RuleBasedCollator} that defines the language rules
* @throws IllegalArgumentException thrown when argument target is null,
* or of length 0
}
/**
- * Initializes the iterator to use the language-specific rules and
- * break iterator rules defined in the argument locale to search for
- * argument pattern in the argument target text.
+ * Initializes the iterator to use the language-specific rules and
+ * break iterator rules defined in the argument locale to search for
+ * argument pattern in the argument target text.
* @param pattern text to look for.
- * @param target target text to search for pattern.
+ * @param target target text to search for pattern.
* @param locale locale to use for language and break iterator rules
* @throws IllegalArgumentException thrown when argument target is null,
- * or of length 0. ClassCastException thrown if the collator for
+ * or of length 0. ClassCastException thrown if the collator for
* the specified locale is not a RuleBasedCollator.
* @stable ICU 2.0
*/
}
/**
- * Initializes the iterator to use the language-specific rules and
- * break iterator rules defined in the argument locale to search for
- * argument pattern in the argument target text.
- * See super class documentation for more details on the use of the target
+ * Initializes the iterator to use the language-specific rules and
+ * break iterator rules defined in the argument locale to search for
+ * argument pattern in the argument target text.
+ * See super class documentation for more details on the use of the target
* text and {@link BreakIterator}.
* @param pattern text to look for.
- * @param target target text to search for pattern.
+ * @param target target text to search for pattern.
* @param locale locale to use for language and break iterator rules
* @throws IllegalArgumentException thrown when argument target is null,
- * or of length 0. ClassCastException thrown if the collator for
+ * or of length 0. ClassCastException thrown if the collator for
* the specified locale is not a RuleBasedCollator.
* @see BreakIterator
* @see RuleBasedCollator
}
/**
- * Initializes the iterator to use the language-specific rules and
- * break iterator rules defined in the default locale to search for
+ * Initializes the iterator to use the language-specific rules and
+ * break iterator rules defined in the default locale to search for
* argument pattern in the argument target text.
* @param pattern text to look for.
- * @param target target text to search for pattern.
+ * @param target target text to search for pattern.
* @throws IllegalArgumentException thrown when argument target is null,
- * or of length 0. ClassCastException thrown if the collator for
+ * or of length 0. ClassCastException thrown if the collator for
* the default locale is not a RuleBasedCollator.
* @stable ICU 2.0
*/
/**
* Gets the {@link RuleBasedCollator} used for the language rules.
* <p>
- * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any
- * changes to the {@link RuleBasedCollator} result should follow with a call to
- * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct
+ * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any
+ * changes to the {@link RuleBasedCollator} result should follow with a call to
+ * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct
* search behavior.
* </p>
* @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt>
}
/**
- * Set the pattern to search for.
+ * Set the pattern to search for.
* The iterator's position will not be changed by this method.
* @param pattern for searching
* @see #getPattern
}
/**
- * Determines whether canonical matches (option 1, as described in the
+ * Determines whether canonical matches (option 1, as described in the
* class documentation) is set.
* See setCanonical(boolean) for more information.
* @see #setCanonical
textIter_.setOffset(position);
}
- /**
+ /**
* {@inheritDoc}
* @stable ICU 2.8
*/
/**
* Getting the modified collation elements taking into account the collation
* attributes.
- *
+ *
* @param sourcece
* @return the modified collation element
*/
}
/**
- * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp.
+ * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp
+ * (except not taking destination buffer size and status param).
* This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
* implement this in Pattern class.
- *
+ *
* @param destination target array
* @param offset destination offset to add value
- * @param destinationlength target array size
* @param value to be added
* @param increments incremental size expected
* @return new destination array, destination if there was no new allocation
*/
- private static int[] addToIntArray(int[] destination, int offset, int destinationlength,
- int value, int increments) {
- int newlength = destinationlength;
+ private static int[] addToIntArray(int[] destination, int offset, int value, int increments) {
+ int newlength = destination.length;
if (offset + 1 == newlength) {
newlength += increments;
int temp[] = new int[newlength];
* Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp.
* This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
* implement this in Pattern class.
- *
+ *
* @param destination target array
* @param offset destination offset to add value
* @param destinationlength target array size
// TODO: We probably do not need Pattern CE table.
private int initializePatternCETable() {
int[] cetable = new int[INITIAL_ARRAY_SIZE_];
- int cetablesize = cetable.length;
int patternlength = pattern_.text_.length();
CollationElementIterator coleiter = utilIter_;
while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) {
int newce = getCE(ce);
if (newce != CollationElementIterator.IGNORABLE /* 0 */) {
- int[] temp = addToIntArray(cetable, offset, cetablesize, newce,
+ int[] temp = addToIntArray(cetable, offset, newce,
patternlength - coleiter.getOffset() + 1);
offset++;
cetable = temp;
// *** Boyer-Moore ***
/*
- private final void setShiftTable(char shift[],
- char backshift[],
- int cetable[], int cesize,
+ private final void setShiftTable(char shift[],
+ char backshift[],
+ int cetable[], int cesize,
int expansionsize,
int defaultforward,
int defaultbackward) {
* @internal
* @deprecated This API is ICU internal only.
*/
+ @Override
@Deprecated
protected void setMatchNotFound() {
super.setMatchNotFound();
//
// ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical()
// for the linear search implementation. The differences are addressed in search().
- //
+ //
private boolean handleNextExact() {
    // The exact-match step has no logic of its own here: it forwards to the
    // common implementation and reports that routine's result unchanged.
    final boolean matched = handleNextCommonImpl();
    return matched;
}
/**
* Gets a substring out of a CharacterIterator
- *
+ *
* Java porting note: Not available in ICU4C
- *
+ *
* @param text CharacterIterator
* @param start start offset
* @param length of substring
/**
* Get the processed ordering priority of the next collation element in the text.
* A single character may contain more than one collation element.
- *
+ *
* Note: This is equivalent to
* UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
*
* @param range receiving the iterator index before/after fetching the CE.
- * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER
+ * @return The next collation element's ordering, otherwise returns PROCESSED_NULLORDER
* if an error has occurred or if the end of string has been reached
*/
public long nextProcessed(Range range) {
* UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
*
* @param range receiving the iterator index before/after fetching the CE.
- * @return The previous collation elements ordering, otherwise returns
+ * @return The previous collation element's ordering, otherwise returns
* PROCESSED_NULLORDER if an error has occurred or if the start of
* string has been reached.
*/
/**
* Java port of ICU4C CEI (usearch.cpp)
- *
+ *
* CEI Collation Element + source text index.
* These structs are kept in the circular buffer.
*/
import static com.ibm.icu.text.SearchIterator.ElementComparisonType.PATTERN_BASE_WEIGHT_IS_WILDCARD;
import static com.ibm.icu.text.SearchIterator.ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
+import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;
text = buffer.toString();
targetText.setIndex(targetText.getBeginIndex());
}
+ @Override
protected int handleNext(int start)
{
int match = text.indexOf(pattern, start);
setMatchLength(pattern.length());
return match;
}
+ @Override
protected int handlePrevious(int start)
{
int match = text.lastIndexOf(pattern, start - 1);
return match;
}
+ @Override
public int getIndex()
{
int result = targetText.getIndex();
errln("Error should have reached the start of the iteration");
}
}
-
+
//Test for ticket 5024
@Test
public void TestDiactricMatch() {
public String getPattern() { return pattern; }
public int[] getOffsets() { return offsets; }
}
- final PatternAndOffsets[] scKoSrchPatternsOffsets = {
+ final PatternAndOffsets[] scKoSrchPatternsOffsets = {
new PatternAndOffsets( scKoPat0, scKoSrchOff01 ),
new PatternAndOffsets( scKoPat1, scKoSrchOff01 ),
new PatternAndOffsets( scKoPat2, scKoSrchOff23 ),
new PatternAndOffsets( scKoPat4, scKoSrchOff45 ),
new PatternAndOffsets( scKoPat5, scKoSrchOff45 ),
};
- final PatternAndOffsets[] scKoStndPatternsOffsets = {
+ final PatternAndOffsets[] scKoStndPatternsOffsets = {
new PatternAndOffsets( scKoPat0, scKoStndOff01 ),
new PatternAndOffsets( scKoPat1, scKoStndOff01 ),
new PatternAndOffsets( scKoPat2, scKoStndOff2 ),
public String getText() { return text; }
public PatternAndOffsets[] getPatternsAndOffsets() { return patternsAndOffsets; }
}
- final TUSCItem[] tuscItems = {
+ final TUSCItem[] tuscItems = {
new TUSCItem( "root", scKoText, scKoStndPatternsOffsets ),
new TUSCItem( "root@collation=search", scKoText, scKoSrchPatternsOffsets ),
new TUSCItem( "ko@collation=search", scKoText, scKoSrchPatternsOffsets ),
};
-
+
String dummyPat = "a";
for (TUSCItem tuscItem: tuscItems) {
}
}
-
+    // Test case for ticket#12555
+    @Test
+    public void TestLongPattern() {
+        // Build a pattern of 255 plain letters followed by one accented letter.
+        StringBuilder pattern = new StringBuilder();
+        for (int i = 0; i < 255; i++) {
+            pattern.append('a');
+        }
+        // Append U+00E1 (LATIN SMALL LETTER A WITH ACUTE) as the 256th character
+        // (zero-based index 255); it is meant to produce multiple ce32 values.
+        // Written as a Unicode escape so the literal cannot be damaged by a
+        // source-file encoding mismatch.
+        pattern.append('\u00e1');
+
+        CharacterIterator target = new StringCharacterIterator("not important");
+        try {
+            StringSearch ss = new StringSearch(pattern.toString(), target, Locale.ENGLISH);
+            assertNotNull("Non-null StringSearch instance", ss);
+        } catch (Exception e) {
+            // Report the cause along with the failure instead of dropping it.
+            errln("Error initializing a new StringSearch object: " + e);
+        }
+    }
}