typedef struct varlena unknown;
typedef struct varlena VarString;
+/*
+ * State for text_position_* functions.
+ *
+ * Filled in by text_position_setup(), advanced by text_position_next(), and
+ * read by text_position_get_match_ptr() / text_position_get_match_pos().
+ * The string pointers refer to the caller's data; nothing here is owned by
+ * the state itself.
+ */
typedef struct
{
- bool use_wchar; /* T if multibyte encoding */
- char *str1; /* use these if not use_wchar */
- char *str2; /* note: these point to original texts */
- pg_wchar *wstr1; /* use these if use_wchar */
- pg_wchar *wstr2; /* note: these are palloc'd */
- int len1; /* string lengths in logical characters */
+ bool is_multibyte; /* true if the database encoding is multibyte */
+ bool is_multibyte_char_in_char; /* true if one character's byte sequence
+ * can appear inside another multibyte
+ * character, so byte matches must be
+ * verified against character boundaries
+ * (set for multibyte encodings other than
+ * UTF-8) */
+
+ char *str1; /* haystack string */
+ char *str2; /* needle string */
+ int len1; /* string lengths in bytes */
int len2;
+
/* Skip table for Boyer-Moore-Horspool search algorithm: */
int skiptablemask; /* mask for ANDing with skiptable subscripts */
int skiptable[256]; /* skip distance for given mismatched char */
+
+ char *last_match; /* pointer to start of last match in 'str1',
+ * or NULL if there has been no match yet */
+
+ /*
+ * Sometimes we need to convert the byte position of a match to a
+ * character position. These store the last position that was converted,
+ * so that on the next call, we can continue from that point, rather than
+ * count characters from the very beginning.
+ */
+ char *refpoint; /* pointer within original haystack string */
+ int refpos; /* 0-based character offset of the same point */
} TextPositionState;
typedef struct
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int text_position(text *t1, text *t2);
static void text_position_setup(text *t1, text *t2, TextPositionState *state);
-static int text_position_next(int start_pos, TextPositionState *state);
+static bool text_position_next(TextPositionState *state);
+static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
+static char *text_position_get_match_ptr(TextPositionState *state);
+static int text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static int text_cmp(text *arg1, text *arg2, Oid collid);
static bytea *bytea_catenate(bytea *t1, bytea *t2);
TextPositionState state;
int result;
+ if (VARSIZE_ANY_EXHDR(t1) < 1 || VARSIZE_ANY_EXHDR(t2) < 1)
+ return 0;
+
text_position_setup(t1, t2, &state);
- result = text_position_next(1, &state);
+ if (!text_position_next(&state))
+ result = 0;
+ else
+ result = text_position_get_match_pos(&state);
text_position_cleanup(&state);
return result;
}
*
* These are broken out so that a string can be efficiently searched for
* multiple occurrences of the same pattern. text_position_next may be
- * called multiple times with increasing values of start_pos, which is
- * the 1-based character position to start the search from. The "state"
- * variable is normally just a local variable in the caller.
+ * called multiple times, and it advances to the next match on each call.
+ * text_position_get_match_ptr() and text_position_get_match_pos() return
+ * a pointer or 1-based character position of the last match, respectively.
+ *
+ * The "state" variable is normally just a local variable in the caller.
+ *
+ * NOTE: text_position_next skips over the matched portion. For example,
+ * searching for "xx" in "xxx" returns only one match, not two.
*/
static void
int len1 = VARSIZE_ANY_EXHDR(t1);
int len2 = VARSIZE_ANY_EXHDR(t2);
+ Assert(len1 > 0);
+ Assert(len2 > 0);
+
+ /*
+ * Even with a multi-byte encoding, we perform the search using the raw
+ * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
+ * because in UTF-8 the byte sequence of one character cannot contain
+ * another character. For other multi-byte encodings, we do the search
+ * initially as a simple byte search, ignoring multibyte issues, but
+ * verify afterwards that the match we found is at a character boundary,
+ * and continue the search if it was a false match.
+ */
if (pg_database_encoding_max_length() == 1)
{
- /* simple case - single byte encoding */
- state->use_wchar = false;
- state->str1 = VARDATA_ANY(t1);
- state->str2 = VARDATA_ANY(t2);
- state->len1 = len1;
- state->len2 = len2;
+ state->is_multibyte = false;
+ state->is_multibyte_char_in_char = false;
+ }
+ else if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ state->is_multibyte = true;
+ state->is_multibyte_char_in_char = false;
}
else
{
- /* not as simple - multibyte encoding */
- pg_wchar *p1,
- *p2;
-
- p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
- len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
- p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
- len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
-
- state->use_wchar = true;
- state->wstr1 = p1;
- state->wstr2 = p2;
- state->len1 = len1;
- state->len2 = len2;
+ state->is_multibyte = true;
+ state->is_multibyte_char_in_char = true;
}
+ state->str1 = VARDATA_ANY(t1);
+ state->str2 = VARDATA_ANY(t2);
+ state->len1 = len1;
+ state->len2 = len2;
+ state->last_match = NULL;
+ state->refpoint = state->str1;
+ state->refpos = 0;
+
/*
* Prepare the skip table for Boyer-Moore-Horspool searching. In these
* notes we use the terminology that the "haystack" is the string to be
int skiptablemask;
int last;
int i;
+ const char *str2 = state->str2;
/*
* First we must determine how much of the skip table to use. The
*/
last = len2 - 1;
- if (!state->use_wchar)
- {
- const char *str2 = state->str2;
-
- for (i = 0; i < last; i++)
- state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
- }
- else
- {
- const pg_wchar *wstr2 = state->wstr2;
-
- for (i = 0; i < last; i++)
- state->skiptable[wstr2[i] & skiptablemask] = last - i;
- }
+ for (i = 0; i < last; i++)
+ state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
}
}
-static int
-text_position_next(int start_pos, TextPositionState *state)
+/*
+ * Advance to the next match, starting from the end of the previous match
+ * (or the beginning of the string, on first call). Returns true if a match
+ * is found, in which case its start is remembered in state->last_match;
+ * returns false if there are no more matches or the needle is empty.
+ *
+ * For encodings flagged is_multibyte_char_in_char, state->refpoint and
+ * state->refpos are advanced as a side effect while verifying that the
+ * match lies on a character boundary.
+ */
+static bool
+text_position_next(TextPositionState *state)
{
- int haystack_len = state->len1;
int needle_len = state->len2;
- int skiptablemask = state->skiptablemask;
-
- Assert(start_pos > 0); /* else caller error */
+ char *start_ptr;
+ char *matchptr;
if (needle_len <= 0)
- return start_pos; /* result for empty pattern */
+ return false; /* an empty needle never produces a match here */
- start_pos--; /* adjust for zero based arrays */
+ /*
+ * Start from the point right after the previous match, so that matches
+ * never overlap.
+ */
+ if (state->last_match)
+ start_ptr = state->last_match + needle_len;
+ else
+ start_ptr = state->str1;
- /* Done if the needle can't possibly fit */
- if (haystack_len < start_pos + needle_len)
- return 0;
+retry:
+ matchptr = text_position_next_internal(start_ptr, state);
- if (!state->use_wchar)
+ if (!matchptr)
+ return false;
+
+ /*
+ * Found a match for the byte sequence. If this is a multibyte encoding,
+ * where one character's byte sequence can appear inside a longer
+ * multi-byte character, we need to verify that the match was at a
+ * character boundary, not in the middle of a multi-byte character.
+ */
+ if (state->is_multibyte_char_in_char)
{
- /* simple case - single byte encoding */
- const char *haystack = state->str1;
- const char *needle = state->str2;
- const char *haystack_end = &haystack[haystack_len];
- const char *hptr;
+ /*
+ * Walk one character at a time from the last known character
+ * boundary (refpoint), until we reach the match.
+ */
- if (needle_len == 1)
+ /* the search should never move backwards. */
+ Assert(state->refpoint <= matchptr);
+
+ while (state->refpoint < matchptr)
{
- /* No point in using B-M-H for a one-character needle */
- char nchar = *needle;
+ /* step to next character, keeping refpos in sync with refpoint. */
+ state->refpoint += pg_mblen(state->refpoint);
+ state->refpos++;
- hptr = &haystack[start_pos];
- while (hptr < haystack_end)
+ /*
+ * If we stepped over the match's start position, then it was a
+ * false positive, where the byte sequence appeared in the middle
+ * of a multi-byte character. Skip it, and continue the search at
+ * the next character boundary.
+ */
+ if (state->refpoint > matchptr)
{
- if (*hptr == nchar)
- return hptr - haystack + 1;
- hptr++;
+ start_ptr = state->refpoint;
+ goto retry;
}
}
- else
- {
- const char *needle_last = &needle[needle_len - 1];
+ }
- /* Start at startpos plus the length of the needle */
- hptr = &haystack[start_pos + needle_len - 1];
- while (hptr < haystack_end)
- {
- /* Match the needle scanning *backward* */
- const char *nptr;
- const char *p;
+ state->last_match = matchptr;
+ return true;
+}
- nptr = needle_last;
- p = hptr;
- while (*nptr == *p)
- {
- /* Matched it all? If so, return 1-based position */
- if (nptr == needle)
- return p - haystack + 1;
- nptr--, p--;
- }
+/*
+ * Subroutine of text_position_next(). This searches for the raw byte
+ * sequence, ignoring any multi-byte encoding issues. Returns a pointer to
+ * the start of the first match at or after 'start_ptr', or NULL if no match
+ * is found.
+ *
+ * 'start_ptr' must point into the haystack (state->str1), or one past its
+ * end. The search uses the Boyer-Moore-Horspool skip table stored in
+ * 'state', except for one-byte needles, which are scanned for directly.
+ */
+static char *
+text_position_next_internal(char *start_ptr, TextPositionState *state)
+{
+ int haystack_len = state->len1;
+ int needle_len = state->len2;
+ int skiptablemask = state->skiptablemask;
+ const char *haystack = state->str1;
+ const char *needle = state->str2;
+ const char *haystack_end = &haystack[haystack_len];
+ const char *hptr;
- /*
- * No match, so use the haystack char at hptr to decide how
- * far to advance. If the needle had any occurrence of that
- * character (or more precisely, one sharing the same
- * skiptable entry) before its last character, then we advance
- * far enough to align the last such needle character with
- * that haystack position. Otherwise we can advance by the
- * whole needle length.
- */
- hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
- }
+ Assert(start_ptr >= haystack && start_ptr <= haystack_end);
+
+ /*
+ * Done if the needle can't possibly fit in what remains. This also
+ * avoids forming a pointer far beyond the end of the haystack below.
+ */
+ if (haystack_end - start_ptr < needle_len)
+ return NULL;
+
+ if (needle_len == 1)
+ {
+ /* No point in using B-M-H for a one-character needle */
+ char nchar = *needle;
+
+ hptr = start_ptr;
+ while (hptr < haystack_end)
+ {
+ if (*hptr == nchar)
+ return (char *) hptr;
+ hptr++;
}
}
else
{
- /* The multibyte char version. This works exactly the same way. */
- const pg_wchar *haystack = state->wstr1;
- const pg_wchar *needle = state->wstr2;
- const pg_wchar *haystack_end = &haystack[haystack_len];
- const pg_wchar *hptr;
+ const char *needle_last = &needle[needle_len - 1];
- if (needle_len == 1)
+ /*
+ * Start with the needle's last byte aligned at start_ptr plus the
+ * length of the needle, minus one.
+ */
+ hptr = start_ptr + needle_len - 1;
+ while (hptr < haystack_end)
{
- /* No point in using B-M-H for a one-character needle */
- pg_wchar nchar = *needle;
+ /* Match the needle scanning *backward* */
+ const char *nptr;
+ const char *p;
- hptr = &haystack[start_pos];
- while (hptr < haystack_end)
+ nptr = needle_last;
+ p = hptr;
+ while (*nptr == *p)
{
- if (*hptr == nchar)
- return hptr - haystack + 1;
- hptr++;
+ /* Matched it all? If so, return pointer to start of match */
+ if (nptr == needle)
+ return (char *) p;
+ nptr--, p--;
}
+
+ /*
+ * No match, so use the haystack char at hptr to decide how far to
+ * advance. If the needle had any occurrence of that character
+ * (or more precisely, one sharing the same skiptable entry)
+ * before its last character, then we advance far enough to align
+ * the last such needle character with that haystack position.
+ * Otherwise we can advance by the whole needle length.
+ */
+ hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
}
- else
- {
- const pg_wchar *needle_last = &needle[needle_len - 1];
+ }
- /* Start at startpos plus the length of the needle */
- hptr = &haystack[start_pos + needle_len - 1];
- while (hptr < haystack_end)
- {
- /* Match the needle scanning *backward* */
- const pg_wchar *nptr;
- const pg_wchar *p;
+ return NULL; /* not found */
+}
- nptr = needle_last;
- p = hptr;
- while (*nptr == *p)
- {
- /* Matched it all? If so, return 1-based position */
- if (nptr == needle)
- return p - haystack + 1;
- nptr--, p--;
- }
+/*
+ * Return a pointer to the current match.
+ *
+ * The returned pointer points into the original haystack string, at the
+ * position where the last match found by text_position_next() starts. It
+ * is NULL if no match has been found since text_position_setup().
+ */
+static char *
+text_position_get_match_ptr(TextPositionState *state)
+{
+ return state->last_match;
+}
- /*
- * No match, so use the haystack char at hptr to decide how
- * far to advance. If the needle had any occurrence of that
- * character (or more precisely, one sharing the same
- * skiptable entry) before its last character, then we advance
- * far enough to align the last such needle character with
- * that haystack position. Otherwise we can advance by the
- * whole needle length.
- */
- hptr += state->skiptable[*hptr & skiptablemask];
- }
+/*
+ * Return the offset of the current match.
+ *
+ * The offset is in characters, 1-based.
+ *
+ * For a single-byte encoding this is plain pointer arithmetic. For a
+ * multibyte encoding the byte position is converted by stepping forward
+ * from state->refpoint, and refpoint/refpos are updated as a side effect so
+ * that a later call can continue from there.
+ */
+static int
+text_position_get_match_pos(TextPositionState *state)
+{
+ if (!state->is_multibyte)
+ return state->last_match - state->str1 + 1;
+ else
+ {
+ /*
+ * Convert the byte position to char position, counting characters
+ * from the last converted position up to the match.
+ */
+ while (state->refpoint < state->last_match)
+ {
+ state->refpoint += pg_mblen(state->refpoint);
+ state->refpos++;
}
+ Assert(state->refpoint == state->last_match);
+ return state->refpos + 1;
}
-
- return 0; /* not found */
}
static void
text_position_cleanup(TextPositionState *state)
{
- if (state->use_wchar)
- {
- pfree(state->wstr1);
- pfree(state->wstr2);
- }
+ /*
+ * no cleanup needed: the state no longer holds palloc'd wide-character
+ * copies of the strings, only pointers into the caller's texts
+ */
}
/* varstr_cmp()
int from_sub_text_len;
TextPositionState state;
text *ret_text;
- int start_posn;
- int curr_posn;
int chunk_len;
+ char *curr_ptr;
char *start_ptr;
StringInfoData str;
+ bool found;
- text_position_setup(src_text, from_sub_text, &state);
-
- /*
- * Note: we check the converted string length, not the original, because
- * they could be different if the input contained invalid encoding.
- */
- src_text_len = state.len1;
- from_sub_text_len = state.len2;
+ src_text_len = VARSIZE_ANY_EXHDR(src_text);
+ from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
/* Return unmodified source string if empty source or pattern */
if (src_text_len < 1 || from_sub_text_len < 1)
{
- text_position_cleanup(&state);
PG_RETURN_TEXT_P(src_text);
}
- start_posn = 1;
- curr_posn = text_position_next(1, &state);
+ text_position_setup(src_text, from_sub_text, &state);
+
+ found = text_position_next(&state);
/* When the from_sub_text is not found, there is nothing to do. */
- if (curr_posn == 0)
+ if (!found)
{
text_position_cleanup(&state);
PG_RETURN_TEXT_P(src_text);
}
-
- /* start_ptr points to the start_posn'th character of src_text */
+ curr_ptr = text_position_get_match_ptr(&state);
start_ptr = VARDATA_ANY(src_text);
initStringInfo(&str);
CHECK_FOR_INTERRUPTS();
/* copy the data skipped over by last text_position_next() */
- chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
+ chunk_len = curr_ptr - start_ptr;
appendBinaryStringInfo(&str, start_ptr, chunk_len);
appendStringInfoText(&str, to_sub_text);
- start_posn = curr_posn;
- start_ptr += chunk_len;
- start_posn += from_sub_text_len;
- start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
+ start_ptr = curr_ptr + from_sub_text_len;
- curr_posn = text_position_next(start_posn, &state);
+ found = text_position_next(&state);
+ if (found)
+ curr_ptr = text_position_get_match_ptr(&state);
}
- while (curr_posn > 0);
+ while (found);
/* copy trailing data */
chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
int inputstring_len;
int fldsep_len;
TextPositionState state;
- int start_posn;
- int end_posn;
+ char *start_ptr;
+ char *end_ptr;
text *result_text;
+ bool found;
/* field number is 1 based */
if (fldnum < 1)
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("field position must be greater than zero")));
- text_position_setup(inputstring, fldsep, &state);
-
- /*
- * Note: we check the converted string length, not the original, because
- * they could be different if the input contained invalid encoding.
- */
- inputstring_len = state.len1;
- fldsep_len = state.len2;
+ inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
+ fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
/* return empty string for empty input string */
if (inputstring_len < 1)
- {
- text_position_cleanup(&state);
PG_RETURN_TEXT_P(cstring_to_text(""));
- }
/* empty field separator */
if (fldsep_len < 1)
PG_RETURN_TEXT_P(cstring_to_text(""));
}
+ text_position_setup(inputstring, fldsep, &state);
+
/* identify bounds of first field */
- start_posn = 1;
- end_posn = text_position_next(1, &state);
+ start_ptr = VARDATA_ANY(inputstring);
+ found = text_position_next(&state);
/* special case if fldsep not found at all */
- if (end_posn == 0)
+ if (!found)
{
text_position_cleanup(&state);
/* if field 1 requested, return input string, else empty string */
else
PG_RETURN_TEXT_P(cstring_to_text(""));
}
+ end_ptr = text_position_get_match_ptr(&state);
- while (end_posn > 0 && --fldnum > 0)
+ while (found && --fldnum > 0)
{
/* identify bounds of next field */
- start_posn = end_posn + fldsep_len;
- end_posn = text_position_next(start_posn, &state);
+ start_ptr = end_ptr + fldsep_len;
+ found = text_position_next(&state);
+ if (found)
+ end_ptr = text_position_get_match_ptr(&state);
}
text_position_cleanup(&state);
/* N'th field separator not found */
/* if last field requested, return it, else empty string */
if (fldnum == 1)
- result_text = text_substring(PointerGetDatum(inputstring),
- start_posn,
- -1,
- true);
+ {
+ int last_len = start_ptr - VARDATA_ANY(inputstring);
+
+ result_text = cstring_to_text_with_len(start_ptr,
+ inputstring_len - last_len);
+ }
else
result_text = cstring_to_text("");
}
else
{
/* non-last field requested */
- result_text = text_substring(PointerGetDatum(inputstring),
- start_posn,
- end_posn - start_posn,
- false);
+ result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
}
PG_RETURN_TEXT_P(result_text);
*/
TextPositionState state;
int fldnum;
- int start_posn;
- int end_posn;
int chunk_len;
- text_position_setup(inputstring, fldsep, &state);
-
- /*
- * Note: we check the converted string length, not the original,
- * because they could be different if the input contained invalid
- * encoding.
- */
- inputstring_len = state.len1;
- fldsep_len = state.len2;
+ inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
+ fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
/* return empty array for empty input string */
if (inputstring_len < 1)
- {
- text_position_cleanup(&state);
PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
- }
/*
* empty field separator: return the input string as a one-element
int dims[1];
int lbs[1];
- text_position_cleanup(&state);
/* single element can be a NULL too */
is_null = null_string ? text_isequal(inputstring, null_string) : false;
TEXTOID, -1, false, 'i'));
}
- start_posn = 1;
- /* start_ptr points to the start_posn'th character of inputstring */
+ text_position_setup(inputstring, fldsep, &state);
+
start_ptr = VARDATA_ANY(inputstring);
for (fldnum = 1;; fldnum++) /* field number is 1 based */
{
- CHECK_FOR_INTERRUPTS();
+ bool found;
+ char *end_ptr;
- end_posn = text_position_next(start_posn, &state);
+ CHECK_FOR_INTERRUPTS();
- if (end_posn == 0)
+ found = text_position_next(&state);
+ if (!found)
{
/* fetch last field */
chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
else
{
/* fetch non-last field */
- chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
+ end_ptr = text_position_get_match_ptr(&state);
+ chunk_len = end_ptr - start_ptr;
}
/* must build a temp text datum to pass to accumArrayResult */
pfree(result_text);
- if (end_posn == 0)
+ if (!found)
break;
- start_posn = end_posn;
- start_ptr += chunk_len;
- start_posn += fldsep_len;
- start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
+ start_ptr = end_ptr + fldsep_len;
}
text_position_cleanup(&state);