1 /*-------------------------------------------------------------------------
4 * Functions for the variable-length built-in types.
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/utils/adt/varlena.c
13 *-------------------------------------------------------------------------
20 #include "access/hash.h"
21 #include "access/tuptoaster.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/md5.h"
25 #include "lib/hyperloglog.h"
26 #include "libpq/pqformat.h"
27 #include "miscadmin.h"
28 #include "parser/scansup.h"
29 #include "port/pg_bswap.h"
30 #include "regex/regex.h"
31 #include "utils/builtins.h"
32 #include "utils/bytea.h"
33 #include "utils/lsyscache.h"
34 #include "utils/memutils.h"
35 #include "utils/pg_locale.h"
36 #include "utils/sortsupport.h"
37 #include "utils/varlena.h"
41 int bytea_output = BYTEA_OUTPUT_HEX;
43 typedef struct varlena unknown;
44 typedef struct varlena VarString;
48 bool use_wchar; /* T if multibyte encoding */
49 char *str1; /* use these if not use_wchar */
50 char *str2; /* note: these point to original texts */
51 pg_wchar *wstr1; /* use these if use_wchar */
52 pg_wchar *wstr2; /* note: these are palloc'd */
53 int len1; /* string lengths in logical characters */
55 /* Skip table for Boyer-Moore-Horspool search algorithm: */
56 int skiptablemask; /* mask for ANDing with skiptable subscripts */
57 int skiptable[256]; /* skip distance for given mismatched char */
62 char *buf1; /* 1st string, or abbreviation original string
64 char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
67 int last_len1; /* Length of last buf1 string/strxfrm() input */
68 int last_len2; /* Length of last buf2 string/strxfrm() blob */
69 int last_returned; /* Last comparison result (cache) */
70 bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
72 bool bpchar; /* Sorting bpchar, not varchar/text/bytea? */
73 hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
74 hyperLogLogState full_card; /* Full key cardinality state */
75 double prop_card; /* Required cardinality proportion */
79 } VarStringSortSupport;
82 * This should be large enough that most strings will fit, but small enough
83 * that we feel comfortable putting it on the stack
85 #define TEXTBUFLEN 1024
/* Accessors for the file-local "unknown" alias of struct varlena: detoast a Datum (optionally into a copy), fetch function argument n that way, or return such a pointer. */
87 #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
88 #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
89 #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
90 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
91 #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
/* Same idea for VarString; the "PP" variant goes through PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM. */
93 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
94 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
96 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
97 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
98 static int varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup);
99 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
100 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
101 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
102 static int32 text_length(Datum str);
103 static text *text_catenate(text *t1, text *t2);
104 static text *text_substring(Datum str,
107 bool length_not_specified);
108 static text *text_overlay(text *t1, text *t2, int sp, int sl);
109 static int text_position(text *t1, text *t2);
110 static void text_position_setup(text *t1, text *t2, TextPositionState *state);
111 static int text_position_next(int start_pos, TextPositionState *state);
112 static void text_position_cleanup(TextPositionState *state);
113 static int text_cmp(text *arg1, text *arg2, Oid collid);
114 static bytea *bytea_catenate(bytea *t1, bytea *t2);
115 static bytea *bytea_substring(Datum str,
118 bool length_not_specified);
119 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
120 static void appendStringInfoText(StringInfo str, const text *t);
121 static Datum text_to_array_internal(PG_FUNCTION_ARGS);
122 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
123 const char *fldsep, const char *null_string);
124 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
125 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
127 static const char *text_format_parse_format(const char *start_ptr,
129 int *argpos, int *widthpos,
130 int *flags, int *width);
131 static void text_format_string_conversion(StringInfo buf, char conversion,
132 FmgrInfo *typOutputInfo,
133 Datum value, bool isNull,
134 int flags, int width);
135 static void text_format_append_string(StringInfo buf, const char *str,
136 int flags, int width);
139 /*****************************************************************************
140 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
141 *****************************************************************************/
146 * Create a text value from a null-terminated C string.
148 * The new text value is freshly palloc'd with a full-size VARHDR.
151 cstring_to_text(const char *s)
153 return cstring_to_text_with_len(s, strlen(s));
157 * cstring_to_text_with_len
159 * Same as cstring_to_text except the caller specifies the string length;
160 * the string need not be null-terminated.
163 cstring_to_text_with_len(const char *s, int len)
165 text *result = (text *) palloc(len + VARHDRSZ);
167 SET_VARSIZE(result, len + VARHDRSZ);
168 memcpy(VARDATA(result), s, len);
176 * Create a palloc'd, null-terminated C string from a text value.
178 * We support being passed a compressed or toasted text value.
179 * This is a bit bogus since such values shouldn't really be referred to as
180 * "text *", but it seems useful for robustness. If we didn't handle that
181 * case here, we'd need another routine that did, anyway.
184 text_to_cstring(const text *t)
186 /* must cast away the const, unfortunately */
187 text *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
188 int len = VARSIZE_ANY_EXHDR(tunpacked);
191 result = (char *) palloc(len + 1);
192 memcpy(result, VARDATA_ANY(tunpacked), len);
202 * text_to_cstring_buffer
204 * Copy a text value into a caller-supplied buffer of size dst_len.
206 * The text string is truncated if necessary to fit. The result is
207 * guaranteed null-terminated (unless dst_len == 0).
209 * We support being passed a compressed or toasted text value.
210 * This is a bit bogus since such values shouldn't really be referred to as
211 * "text *", but it seems useful for robustness. If we didn't handle that
212 * case here, we'd need another routine that did, anyway.
215 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
217 /* must cast away the const, unfortunately */
218 text *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
219 size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
224 if (dst_len >= src_len)
226 else /* ensure truncation is encoding-safe */
227 dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
228 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
232 if (srcunpacked != src)
237 /*****************************************************************************
238 * USER I/O ROUTINES *
239 *****************************************************************************/
/* VAL: numeric value of the ASCII digit character CH (subtracts '0'). */
242 #define VAL(CH) ((CH) - '0')
/* DIG: ASCII digit character for the small numeric value VAL (adds '0'). */
243 #define DIG(VAL) ((VAL) + '0')
246 * byteain - converts from printable representation of byte array
248 * Non-printable characters must be passed as '\nnn' (octal) and are
249 * converted to internal form. '\' must be passed as '\\'.
250 * ereport(ERROR, ...) if bad form.
253 * The input is scanned twice.
254 * The error checking of input is minimal.
257 byteain(PG_FUNCTION_ARGS)
259 char *inputText = PG_GETARG_CSTRING(0);
265 /* Recognize hex input */
266 if (inputText[0] == '\\' && inputText[1] == 'x')
268 size_t len = strlen(inputText);
270 bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
272 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
273 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
275 PG_RETURN_BYTEA_P(result);
278 /* Else, it's the traditional escaped style */
279 for (bc = 0, tp = inputText; *tp != '\0'; bc++)
283 else if ((tp[0] == '\\') &&
284 (tp[1] >= '0' && tp[1] <= '3') &&
285 (tp[2] >= '0' && tp[2] <= '7') &&
286 (tp[3] >= '0' && tp[3] <= '7'))
288 else if ((tp[0] == '\\') &&
294 * a lone backslash, followed by neither a second backslash nor a valid three-digit octal escape (\nnn)
297 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
298 errmsg("invalid input syntax for type %s", "bytea")));
304 result = (bytea *) palloc(bc);
305 SET_VARSIZE(result, bc);
308 rp = VARDATA(result);
313 else if ((tp[0] == '\\') &&
314 (tp[1] >= '0' && tp[1] <= '3') &&
315 (tp[2] >= '0' && tp[2] <= '7') &&
316 (tp[3] >= '0' && tp[3] <= '7'))
322 *rp++ = bc + VAL(tp[3]);
326 else if ((tp[0] == '\\') &&
335 * We should never get here. The first pass should not allow it.
338 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
339 errmsg("invalid input syntax for type %s", "bytea")));
343 PG_RETURN_BYTEA_P(result);
347 * byteaout - converts to printable representation of byte array
349 * In the traditional escaped format, non-printable characters are
350 * printed as '\nnn' (octal) and '\' as '\\'.
353 byteaout(PG_FUNCTION_ARGS)
355 bytea *vlena = PG_GETARG_BYTEA_PP(0);
359 if (bytea_output == BYTEA_OUTPUT_HEX)
361 /* Print hex format */
362 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
365 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
367 else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
369 /* Print traditional escaped format */
374 len = 1; /* empty string has 1 char */
375 vp = VARDATA_ANY(vlena);
376 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
380 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
385 rp = result = (char *) palloc(len);
386 vp = VARDATA_ANY(vlena);
387 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
394 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
396 int val; /* holds unprintable chars */
400 rp[3] = DIG(val & 07);
402 rp[2] = DIG(val & 07);
404 rp[1] = DIG(val & 03);
413 elog(ERROR, "unrecognized bytea_output setting: %d",
415 rp = result = NULL; /* keep compiler quiet */
418 PG_RETURN_CSTRING(result);
422 * bytearecv - converts external binary format to bytea
425 bytearecv(PG_FUNCTION_ARGS)
427 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
431 nbytes = buf->len - buf->cursor;
432 result = (bytea *) palloc(nbytes + VARHDRSZ);
433 SET_VARSIZE(result, nbytes + VARHDRSZ);
434 pq_copymsgbytes(buf, VARDATA(result), nbytes);
435 PG_RETURN_BYTEA_P(result);
439 * byteasend - converts bytea to binary format
441 * This is a special case: just copy the input...
444 byteasend(PG_FUNCTION_ARGS)
446 bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
448 PG_RETURN_BYTEA_P(vlena);
452 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
456 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
458 /* Append the value unless null. */
459 if (!PG_ARGISNULL(1))
461 bytea *value = PG_GETARG_BYTEA_PP(1);
463 /* On the first time through, we ignore the delimiter. */
465 state = makeStringAggState(fcinfo);
466 else if (!PG_ARGISNULL(2))
468 bytea *delim = PG_GETARG_BYTEA_PP(2);
470 appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
473 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
477 * The transition type for string_agg() is declared to be "internal",
478 * which is a pass-by-value type the same size as a pointer.
480 PG_RETURN_POINTER(state);
484 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
488 /* cannot be called directly because of internal-type argument */
489 Assert(AggCheckCallContext(fcinfo, NULL));
491 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
497 result = (bytea *) palloc(state->len + VARHDRSZ);
498 SET_VARSIZE(result, state->len + VARHDRSZ);
499 memcpy(VARDATA(result), state->data, state->len);
500 PG_RETURN_BYTEA_P(result);
507 * textin - converts "..." to internal representation
510 textin(PG_FUNCTION_ARGS)
512 char *inputText = PG_GETARG_CSTRING(0);
514 PG_RETURN_TEXT_P(cstring_to_text(inputText));
518 * textout - converts internal representation to "..."
521 textout(PG_FUNCTION_ARGS)
523 Datum txt = PG_GETARG_DATUM(0);
525 PG_RETURN_CSTRING(TextDatumGetCString(txt));
529 * textrecv - converts external binary format to text
532 textrecv(PG_FUNCTION_ARGS)
534 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
539 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
541 result = cstring_to_text_with_len(str, nbytes);
543 PG_RETURN_TEXT_P(result);
547 * textsend - converts text to binary format
550 textsend(PG_FUNCTION_ARGS)
552 text *t = PG_GETARG_TEXT_PP(0);
555 pq_begintypsend(&buf);
556 pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
557 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
562 * unknownin - converts "..." to internal representation
565 unknownin(PG_FUNCTION_ARGS)
567 char *str = PG_GETARG_CSTRING(0);
569 /* representation is same as cstring */
570 PG_RETURN_CSTRING(pstrdup(str));
574 * unknownout - converts internal representation to "..."
577 unknownout(PG_FUNCTION_ARGS)
579 /* representation is same as cstring */
580 char *str = PG_GETARG_CSTRING(0);
582 PG_RETURN_CSTRING(pstrdup(str));
586 * unknownrecv - converts external binary format to unknown
589 unknownrecv(PG_FUNCTION_ARGS)
591 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
595 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
596 /* representation is same as cstring */
597 PG_RETURN_CSTRING(str);
601 * unknownsend - converts unknown to binary format
604 unknownsend(PG_FUNCTION_ARGS)
606 /* representation is same as cstring */
607 char *str = PG_GETARG_CSTRING(0);
610 pq_begintypsend(&buf);
611 pq_sendtext(&buf, str, strlen(str));
612 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
616 /* ========== PUBLIC ROUTINES ========== */
620 * returns the logical length of a text*, i.e. the number of characters
621 * (which is less than the VARSIZE of the text*, as that also counts the header)
624 textlen(PG_FUNCTION_ARGS)
626 Datum str = PG_GETARG_DATUM(0);
628 /* try to avoid decompressing argument */
629 PG_RETURN_INT32(text_length(str));
634 * Does the real work for textlen()
636 * This is broken out so it can be called directly by other string processing
637 * functions. Note that the argument is passed as a Datum, to indicate that
638 * it may still be in compressed form. We can avoid decompressing it at all
642 text_length(Datum str)
644 /* fastpath when max encoding length is one */
645 if (pg_database_encoding_max_length() == 1)
646 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
649 text *t = DatumGetTextPP(str);
651 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
652 VARSIZE_ANY_EXHDR(t)));
658 * returns the physical length of a text*, i.e. its data length in bytes
659 * (which is less than the VARSIZE of the text*, as the header is excluded)
662 textoctetlen(PG_FUNCTION_ARGS)
664 Datum str = PG_GETARG_DATUM(0);
666 /* We need not detoast the input at all */
667 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
672 * takes two text* and returns a text* that is the concatenation of
675 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
676 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
677 * Allocate space for output in all cases.
678 * XXX - thomas 1997-07-10
681 textcat(PG_FUNCTION_ARGS)
683 text *t1 = PG_GETARG_TEXT_PP(0);
684 text *t2 = PG_GETARG_TEXT_PP(1);
686 PG_RETURN_TEXT_P(text_catenate(t1, t2));
691 * Guts of textcat(), broken out so it can be used by other functions
693 * Arguments can be in short-header form, but not compressed or out-of-line
696 text_catenate(text *t1, text *t2)
704 len1 = VARSIZE_ANY_EXHDR(t1);
705 len2 = VARSIZE_ANY_EXHDR(t2);
707 /* paranoia ... probably should throw error instead? */
713 len = len1 + len2 + VARHDRSZ;
714 result = (text *) palloc(len);
716 /* Set size of result string... */
717 SET_VARSIZE(result, len);
719 /* Fill data field of result string... */
720 ptr = VARDATA(result);
722 memcpy(ptr, VARDATA_ANY(t1), len1);
724 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
730 * charlen_to_bytelen()
731 * Compute the number of bytes occupied by n characters starting at *p
733 * It is the caller's responsibility to ensure that there actually are n characters;
734 * the string need not be null-terminated.
737 charlen_to_bytelen(const char *p, int n)
739 if (pg_database_encoding_max_length() == 1)
741 /* Optimization for single-byte encodings */
748 for (s = p; n > 0; n--)
757 * Return a substring starting at the specified position.
758 * - thomas 1997-12-31
762 * - starting position (is one-based)
765 * If the starting position is zero or less, then return from the start of the string
766 * adjusting the length to be consistent with the "negative start" per SQL.
767 * If the length is negative, an error is raised (SQL99); if no length is given, the rest of the string is returned.
769 * Added multibyte support.
770 * - Tatsuo Ishii 1998-4-21
771 * Changed behavior if starting position is less than one to conform to SQL behavior.
772 * Formerly returned the entire string; now returns a portion.
773 * - Thomas Lockhart 1998-12-10
774 * Now uses faster TOAST-slicing interface
775 * - John Gray 2002-02-22
776 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
777 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
778 * error; if E < 1, return '', not entire string). Fixed a multibyte-related bug
779 * where garbage characters were sometimes returned when S > LC and S < LC + 4.
780 * - Joe Conway 2002-08-10
783 text_substr(PG_FUNCTION_ARGS)
785 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
792 * text_substr_no_len -
793 * Wrapper to avoid opr_sanity failure due to
794 * one function accepting a different number of args.
797 text_substr_no_len(PG_FUNCTION_ARGS)
799 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
806 * Does the real work for text_substr() and text_substr_no_len()
808 * This is broken out so it can be called directly by other string processing
809 * functions. Note that the argument is passed as a Datum, to indicate that
810 * it may still be in compressed/toasted form. We can avoid detoasting all
811 * of it in some cases.
813 * The result is always a freshly palloc'd datum.
816 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
818 int32 eml = pg_database_encoding_max_length();
819 int32 S = start; /* start position */
820 int32 S1; /* adjusted start position */
821 int32 L1; /* adjusted substring length */
823 /* life is easy if the encoding max length is 1 */
828 if (length_not_specified) /* special case - get length to end of
837 * A negative value for L is the only way for the end position to
838 * be before the start. SQL99 says to throw an error.
842 (errcode(ERRCODE_SUBSTRING_ERROR),
843 errmsg("negative substring length not allowed")));
846 * A zero or negative value for the end position can happen if the
847 * start was negative or one. SQL99 says to return a zero-length
851 return cstring_to_text("");
857 * If the start position is past the end of the string, SQL99 says to
858 * return a zero-length string -- DatumGetTextPSlice() will do
859 * that for us. Convert to zero-based starting position.
861 return DatumGetTextPSlice(str, S1 - 1, L1);
866 * When encoding max length is > 1, we can't get LC without
867 * detoasting, so we'll grab a conservatively large slice now and go
868 * back later to do the right thing
881 * if S is past the end of the string, the tuple toaster will return a
882 * zero-length string to us
887 * We need to start at position zero because there is no way to know
888 * in advance which byte offset corresponds to the supplied start
893 if (length_not_specified) /* special case - get length to end of
895 slice_size = L1 = -1;
901 * A negative value for L is the only way for the end position to
902 * be before the start. SQL99 says to throw an error.
906 (errcode(ERRCODE_SUBSTRING_ERROR),
907 errmsg("negative substring length not allowed")));
910 * A zero or negative value for the end position can happen if the
911 * start was negative or one. SQL99 says to return a zero-length
915 return cstring_to_text("");
918 * if E is past the end of the string, the tuple toaster will
919 * truncate the length for us
924 * Total slice size in bytes can't be any longer than the start
925 * position plus substring length times the encoding max length.
927 slice_size = (S1 + L1) * eml;
931 * If we're working with an untoasted source, no need to do an extra
934 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
935 VARATT_IS_EXTERNAL(DatumGetPointer(str)))
936 slice = DatumGetTextPSlice(str, slice_start, slice_size);
938 slice = (text *) DatumGetPointer(str);
940 /* see if we got back an empty string */
941 if (VARSIZE_ANY_EXHDR(slice) == 0)
943 if (slice != (text *) DatumGetPointer(str))
945 return cstring_to_text("");
948 /* Now we can get the actual length of the slice in MB characters */
949 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
950 VARSIZE_ANY_EXHDR(slice));
953 * Check that the start position wasn't > slice_strlen. If so, SQL99
954 * says to return a zero-length string.
956 if (S1 > slice_strlen)
958 if (slice != (text *) DatumGetPointer(str))
960 return cstring_to_text("");
964 * Adjust L1 and E1 now that we know the slice string length. Again
965 * remember that S1 is one based, and slice_start is zero based.
968 E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
970 E1 = slice_start + 1 + slice_strlen;
973 * Find the start position in the slice; remember S1 is not zero based
975 p = VARDATA_ANY(slice);
976 for (i = 0; i < S1 - 1; i++)
979 /* hang onto a pointer to our start position */
983 * Count the actual bytes used by the substring of the requested
986 for (i = S1; i < E1; i++)
989 ret = (text *) palloc(VARHDRSZ + (p - s));
990 SET_VARSIZE(ret, VARHDRSZ + (p - s));
991 memcpy(VARDATA(ret), s, (p - s));
993 if (slice != (text *) DatumGetPointer(str))
999 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1001 /* not reached: suppress compiler warning */
1007 * Replace specified substring of first string with second
1009 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1010 * This code is a direct implementation of what the standard says.
1013 textoverlay(PG_FUNCTION_ARGS)
1015 text *t1 = PG_GETARG_TEXT_PP(0);
1016 text *t2 = PG_GETARG_TEXT_PP(1);
1017 int sp = PG_GETARG_INT32(2); /* substring start position */
1018 int sl = PG_GETARG_INT32(3); /* substring length */
1020 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1024 textoverlay_no_len(PG_FUNCTION_ARGS)
1026 text *t1 = PG_GETARG_TEXT_PP(0);
1027 text *t2 = PG_GETARG_TEXT_PP(1);
1028 int sp = PG_GETARG_INT32(2); /* substring start position */
1031 sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1032 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1036 text_overlay(text *t1, text *t2, int sp, int sl)
1044 * Check for possible integer-overflow cases. For negative sp, throw a
1045 * "substring length" error because that's what should be expected
1046 * according to the spec's definition of OVERLAY().
1050 (errcode(ERRCODE_SUBSTRING_ERROR),
1051 errmsg("negative substring length not allowed")));
1055 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1056 errmsg("integer out of range")));
1058 s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1059 s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1060 result = text_catenate(s1, t2);
1061 result = text_catenate(result, s2);
1068 * Return the position of the specified substring.
1069 * Implements the SQL POSITION() function.
1070 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1071 * - thomas 1997-07-27
1074 textpos(PG_FUNCTION_ARGS)
1076 text *str = PG_GETARG_TEXT_PP(0);
1077 text *search_str = PG_GETARG_TEXT_PP(1);
1079 PG_RETURN_INT32((int32) text_position(str, search_str));
1084 * Does the real work for textpos()
1087 * t1 - string to be searched
1088 * t2 - pattern to match within t1
1090 * Character index of the first matched char, starting from 1,
1093 * This is broken out so it can be called directly by other string processing
1097 text_position(text *t1, text *t2)
1099 TextPositionState state;
1102 text_position_setup(t1, t2, &state);
1103 result = text_position_next(1, &state);
1104 text_position_cleanup(&state);
1110 * text_position_setup, text_position_next, text_position_cleanup -
1111 * Component steps of text_position()
1113 * These are broken out so that a string can be efficiently searched for
1114 * multiple occurrences of the same pattern. text_position_next may be
1115 * called multiple times with increasing values of start_pos, which is
1116 * the 1-based character position to start the search from. The "state"
1117 * variable is normally just a local variable in the caller.
1121 text_position_setup(text *t1, text *t2, TextPositionState *state)
1123 int len1 = VARSIZE_ANY_EXHDR(t1);
1124 int len2 = VARSIZE_ANY_EXHDR(t2);
1126 if (pg_database_encoding_max_length() == 1)
1128 /* simple case - single byte encoding */
1129 state->use_wchar = false;
1130 state->str1 = VARDATA_ANY(t1);
1131 state->str2 = VARDATA_ANY(t2);
1137 /* not as simple - multibyte encoding */
1141 p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1142 len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1143 p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1144 len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1146 state->use_wchar = true;
1154 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1155 * notes we use the terminology that the "haystack" is the string to be
1156 * searched (t1) and the "needle" is the pattern being sought (t2).
1158 * If the needle is empty or bigger than the haystack then there is no
1159 * point in wasting cycles initializing the table. We also choose not to
1160 * use B-M-H for needles of length 1, since the skip table can't possibly
1161 * save anything in that case.
1163 if (len1 >= len2 && len2 > 1)
1165 int searchlength = len1 - len2;
1171 * First we must determine how much of the skip table to use. The
1172 * declaration of TextPositionState allows up to 256 elements, but for
1173 * short search problems we don't really want to have to initialize so
1174 * many elements --- it would take too long in comparison to the
1175 * actual search time. So we choose a useful skip table size based on
1176 * the haystack length minus the needle length. The closer the needle
1177 * length is to the haystack length the less useful skipping becomes.
1179 * Note: since we use bit-masking to select table elements, the skip
1180 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1182 if (searchlength < 16)
1184 else if (searchlength < 64)
1186 else if (searchlength < 128)
1188 else if (searchlength < 512)
1190 else if (searchlength < 2048)
1192 else if (searchlength < 4096)
1193 skiptablemask = 127;
1195 skiptablemask = 255;
1196 state->skiptablemask = skiptablemask;
1199 * Initialize the skip table. We set all elements to the needle
1200 * length, since this is the correct skip distance for any character
1201 * not found in the needle.
1203 for (i = 0; i <= skiptablemask; i++)
1204 state->skiptable[i] = len2;
1207 * Now examine the needle. For each character except the last one,
1208 * set the corresponding table element to the appropriate skip
1209 * distance. Note that when two characters share the same skip table
1210 * entry, the one later in the needle must determine the skip
1215 if (!state->use_wchar)
1217 const char *str2 = state->str2;
1219 for (i = 0; i < last; i++)
1220 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1224 const pg_wchar *wstr2 = state->wstr2;
1226 for (i = 0; i < last; i++)
1227 state->skiptable[wstr2[i] & skiptablemask] = last - i;
1233 text_position_next(int start_pos, TextPositionState *state)
1235 int haystack_len = state->len1;
1236 int needle_len = state->len2;
1237 int skiptablemask = state->skiptablemask;
1239 Assert(start_pos > 0); /* else caller error */
1241 if (needle_len <= 0)
1242 return start_pos; /* result for empty pattern */
1244 start_pos--; /* adjust for zero based arrays */
1246 /* Done if the needle can't possibly fit */
1247 if (haystack_len < start_pos + needle_len)
1250 if (!state->use_wchar)
1252 /* simple case - single byte encoding */
1253 const char *haystack = state->str1;
1254 const char *needle = state->str2;
1255 const char *haystack_end = &haystack[haystack_len];
1258 if (needle_len == 1)
1260 /* No point in using B-M-H for a one-character needle */
1261 char nchar = *needle;
1263 hptr = &haystack[start_pos];
1264 while (hptr < haystack_end)
1267 return hptr - haystack + 1;
1273 const char *needle_last = &needle[needle_len - 1];
1275 /* Start at the haystack position aligned with the needle's last character */
1276 hptr = &haystack[start_pos + needle_len - 1];
1277 while (hptr < haystack_end)
1279 /* Match the needle scanning *backward* */
1287 /* Matched it all? If so, return 1-based position */
1289 return p - haystack + 1;
1294 * No match, so use the haystack char at hptr to decide how
1295 * far to advance. If the needle had any occurrence of that
1296 * character (or more precisely, one sharing the same
1297 * skiptable entry) before its last character, then we advance
1298 * far enough to align the last such needle character with
1299 * that haystack position. Otherwise we can advance by the
1300 * whole needle length.
1302 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1308 /* The multibyte char version. This works exactly the same way. */
1309 const pg_wchar *haystack = state->wstr1;
1310 const pg_wchar *needle = state->wstr2;
1311 const pg_wchar *haystack_end = &haystack[haystack_len];
1312 const pg_wchar *hptr;
1314 if (needle_len == 1)
1316 /* No point in using B-M-H for a one-character needle */
1317 pg_wchar nchar = *needle;
1319 hptr = &haystack[start_pos];
1320 while (hptr < haystack_end)
1323 return hptr - haystack + 1;
1329 const pg_wchar *needle_last = &needle[needle_len - 1];
1331 /* Start at the haystack position aligned with the needle's last character */
1332 hptr = &haystack[start_pos + needle_len - 1];
1333 while (hptr < haystack_end)
1335 /* Match the needle scanning *backward* */
1336 const pg_wchar *nptr;
1343 /* Matched it all? If so, return 1-based position */
1345 return p - haystack + 1;
1350 * No match, so use the haystack char at hptr to decide how
1351 * far to advance. If the needle had any occurrence of that
1352 * character (or more precisely, one sharing the same
1353 * skiptable entry) before its last character, then we advance
1354 * far enough to align the last such needle character with
1355 * that haystack position. Otherwise we can advance by the
1356 * whole needle length.
1358 hptr += state->skiptable[*hptr & skiptablemask];
1363 return 0; /* not found */
1367 text_position_cleanup(TextPositionState *state)
1369 if (state->use_wchar)
1371 pfree(state->wstr1);
1372 pfree(state->wstr2);
1377 * Comparison function for text strings with given lengths.
1378 * Includes locale support, but must copy strings to temporary memory
1379 * to allow null-termination for inputs to strcoll().
1380 * Returns an integer less than, equal to, or greater than zero, indicating
1381 * whether arg1 is less than, equal to, or greater than arg2.
/*
 * varstr_cmp: three-way comparison of two byte strings given with lengths.
 *
 * Parameters:
 *   arg1, len1 - first string and its length in bytes
 *   arg2, len2 - second string and its length in bytes
 *   collid     - collation OID; when it is neither DEFAULT_COLLATION_OID nor
 *                a valid OID, ERRCODE_INDETERMINATE_COLLATION is reported
 *
 * Result: negative, zero or positive, ordered as arg1 versus arg2.
 *
 * Paths in this block:
 *   - lc_collate_is_c(collid): memcmp() over the common prefix, then the
 *     shorter string sorts first (-1 / 1).
 *   - otherwise, equal-length inputs are first checked for byte equality
 *     with memcmp().
 *   - when the database encoding is PG_UTF8 (Win32 code path), both inputs
 *     are converted with MultiByteToWideChar() into buffers sized
 *     len * 2 + 2 bytes (allocated with palloc() once len >= TEXTBUFLEN / 2),
 *     NUL-terminated, and compared with wcscoll_l() or wcscoll(); a memcmp()
 *     plus length comparison on the original bytes is the tie-breaker.
 *   - in the remaining case the inputs are copied (into palloc(len + 1)
 *     memory once len >= TEXTBUFLEN) and compared with strcoll_l() or
 *     strcoll(), with strcmp() as the tie-breaker.
 */
1384 varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
1389 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1390 * have to do some memory copying. This turns out to be significantly
1391 * slower, so we optimize the case where LC_COLLATE is C. We also try to
1392 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1394 if (lc_collate_is_c(collid))
1396 result = memcmp(arg1, arg2, Min(len1, len2));
1397 if ((result == 0) && (len1 != len2))
1398 result = (len1 < len2) ? -1 : 1;
1402 char a1buf[TEXTBUFLEN];
1403 char a2buf[TEXTBUFLEN];
1407 #ifdef HAVE_LOCALE_T
1408 pg_locale_t mylocale = 0;
1411 if (collid != DEFAULT_COLLATION_OID)
1413 if (!OidIsValid(collid))
1416 * This typically means that the parser could not resolve a
1417 * conflict of implicit collations, so report it that way.
1420 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1421 errmsg("could not determine which collation to use for string comparison"),
1422 errhint("Use the COLLATE clause to set the collation explicitly.")));
1424 #ifdef HAVE_LOCALE_T
1425 mylocale = pg_newlocale_from_collation(collid);
1430 * memcmp() can't tell us which of two unequal strings sorts first,
1431 * but it's a cheap way to tell if they're equal. Testing shows that
1432 * memcmp() followed by strcoll() is only trivially slower than
1433 * strcoll() by itself, so we don't lose much if this doesn't work out
1434 * very often, and if it does - for example, because there are many
1435 * equal strings in the input - then we win big by avoiding expensive
1436 * collation-aware comparisons.
1438 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1442 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1443 if (GetDatabaseEncoding() == PG_UTF8)
1449 if (len1 >= TEXTBUFLEN / 2)
1451 a1len = len1 * 2 + 2;
1452 a1p = palloc(a1len);
1459 if (len2 >= TEXTBUFLEN / 2)
1461 a2len = len2 * 2 + 2;
1462 a2p = palloc(a2len);
1470 /* stupid Microsloth API does not work for zero-length input */
1475 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1476 (LPWSTR) a1p, a1len / 2);
1479 (errmsg("could not convert string to UTF-16: error code %lu",
1482 ((LPWSTR) a1p)[r] = 0;
1488 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1489 (LPWSTR) a2p, a2len / 2);
1492 (errmsg("could not convert string to UTF-16: error code %lu",
1495 ((LPWSTR) a2p)[r] = 0;
1498 #ifdef HAVE_LOCALE_T
1500 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);
1503 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1504 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1507 (errmsg("could not compare Unicode strings: %m")));
1510 * In some locales wcscoll() can claim that nonidentical strings
1511 * are equal. Believing that would be bad news for a number of
1512 * reasons, so we follow Perl's lead and sort "equal" strings
1513 * according to strcmp (on the UTF-8 representation).
1517 result = memcmp(arg1, arg2, Min(len1, len2));
1518 if ((result == 0) && (len1 != len2))
1519 result = (len1 < len2) ? -1 : 1;
1531 if (len1 >= TEXTBUFLEN)
1532 a1p = (char *) palloc(len1 + 1);
1535 if (len2 >= TEXTBUFLEN)
1536 a2p = (char *) palloc(len2 + 1);
1540 memcpy(a1p, arg1, len1);
1542 memcpy(a2p, arg2, len2);
1545 #ifdef HAVE_LOCALE_T
1547 result = strcoll_l(a1p, a2p, mylocale);
1550 result = strcoll(a1p, a2p);
1553 * In some locales strcoll() can claim that nonidentical strings are
1554 * equal. Believing that would be bad news for a number of reasons,
1555 * so we follow Perl's lead and sort "equal" strings according to
1559 result = strcmp(a1p, a2p);
1571 * Internal comparison function for text strings.
1572 * Returns a negative, zero, or positive integer (the varstr_cmp() result)
/*
 * text_cmp: compare two text values under the given collation.
 *
 * arg1, arg2 - text values; read through VARDATA_ANY() and
 *              VARSIZE_ANY_EXHDR()
 * collid     - collation OID, passed through to varstr_cmp()
 *
 * Returns the varstr_cmp() result unchanged.
 */
1575 text_cmp(text *arg1, text *arg2, Oid collid)
1582 a1p = VARDATA_ANY(arg1);
1583 a2p = VARDATA_ANY(arg2);
1585 len1 = VARSIZE_ANY_EXHDR(arg1);
1586 len2 = VARSIZE_ANY_EXHDR(arg2);
1588 return varstr_cmp(a1p, len1, a2p, len2, collid);
1592 * Comparison functions for text strings.
1594 * Note: btree indexes need these routines not to leak memory; therefore,
1595 * be careful to free working copies of toasted datums. Most places don't
1596 * need to be so careful.
/*
 * texteq: SQL-callable equality test for two text arguments.
 *
 * Both arguments are fetched as raw Datums and sized with
 * toast_raw_datum_size(), so that (as the comment below explains) a length
 * mismatch can be decided before any detoasting.  Otherwise the values are
 * obtained with DatumGetTextPP() and compared bytewise with memcmp() over
 * len1 - VARHDRSZ bytes; PG_FREE_IF_COPY() is then applied to both.
 *
 * Returns a boolean Datum; collation is not consulted in this block.
 */
1600 texteq(PG_FUNCTION_ARGS)
1602 Datum arg1 = PG_GETARG_DATUM(0);
1603 Datum arg2 = PG_GETARG_DATUM(1);
1609 * Since we only care about equality or not-equality, we can avoid all the
1610 * expense of strcoll() here, and just do bitwise comparison. In fact, we
1611 * don't even have to do a bitwise comparison if we can show the lengths
1612 * of the strings are unequal; which might save us from having to detoast
1613 * one or both values.
1615 len1 = toast_raw_datum_size(arg1);
1616 len2 = toast_raw_datum_size(arg2);
1621 text *targ1 = DatumGetTextPP(arg1);
1622 text *targ2 = DatumGetTextPP(arg2);
1624 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1625 len1 - VARHDRSZ) == 0);
1627 PG_FREE_IF_COPY(targ1, 0);
1628 PG_FREE_IF_COPY(targ2, 1);
1631 PG_RETURN_BOOL(result);
/*
 * textne: SQL-callable inequality test for two text arguments.
 *
 * Mirror image of texteq(): sizes come from toast_raw_datum_size(), and the
 * bytewise memcmp() over len1 - VARHDRSZ bytes yields true when the contents
 * differ.  PG_FREE_IF_COPY() is applied to both detoasted values.
 *
 * Returns a boolean Datum.
 */
1635 textne(PG_FUNCTION_ARGS)
1637 Datum arg1 = PG_GETARG_DATUM(0);
1638 Datum arg2 = PG_GETARG_DATUM(1);
1643 /* See comment in texteq() */
1644 len1 = toast_raw_datum_size(arg1);
1645 len2 = toast_raw_datum_size(arg2);
1650 text *targ1 = DatumGetTextPP(arg1);
1651 text *targ2 = DatumGetTextPP(arg2);
1653 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1654 len1 - VARHDRSZ) != 0);
1656 PG_FREE_IF_COPY(targ1, 0);
1657 PG_FREE_IF_COPY(targ2, 1);
1660 PG_RETURN_BOOL(result);
/*
 * text_lt: SQL-callable "less than" for text.
 *
 * Returns true when text_cmp() of argument 0 against argument 1, under the
 * collation from PG_GET_COLLATION(), is negative.  PG_FREE_IF_COPY() is
 * applied to both arguments before returning.
 */
1664 text_lt(PG_FUNCTION_ARGS)
1666 text *arg1 = PG_GETARG_TEXT_PP(0);
1667 text *arg2 = PG_GETARG_TEXT_PP(1);
1670 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1672 PG_FREE_IF_COPY(arg1, 0);
1673 PG_FREE_IF_COPY(arg2, 1);
1675 PG_RETURN_BOOL(result);
/*
 * text_le: SQL-callable "less than or equal" for text.
 *
 * Returns true when text_cmp() of argument 0 against argument 1, under the
 * collation from PG_GET_COLLATION(), is zero or negative.
 * PG_FREE_IF_COPY() is applied to both arguments before returning.
 */
1679 text_le(PG_FUNCTION_ARGS)
1681 text *arg1 = PG_GETARG_TEXT_PP(0);
1682 text *arg2 = PG_GETARG_TEXT_PP(1);
1685 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1687 PG_FREE_IF_COPY(arg1, 0);
1688 PG_FREE_IF_COPY(arg2, 1);
1690 PG_RETURN_BOOL(result);
/*
 * text_gt: SQL-callable "greater than" for text.
 *
 * Returns true when text_cmp() of argument 0 against argument 1, under the
 * collation from PG_GET_COLLATION(), is positive.  PG_FREE_IF_COPY() is
 * applied to both arguments before returning.
 */
1694 text_gt(PG_FUNCTION_ARGS)
1696 text *arg1 = PG_GETARG_TEXT_PP(0);
1697 text *arg2 = PG_GETARG_TEXT_PP(1);
1700 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1702 PG_FREE_IF_COPY(arg1, 0);
1703 PG_FREE_IF_COPY(arg2, 1);
1705 PG_RETURN_BOOL(result);
/*
 * text_ge: SQL-callable "greater than or equal" for text.
 *
 * Returns true when text_cmp() of argument 0 against argument 1, under the
 * collation from PG_GET_COLLATION(), is zero or positive.
 * PG_FREE_IF_COPY() is applied to both arguments before returning.
 */
1709 text_ge(PG_FUNCTION_ARGS)
1711 text *arg1 = PG_GETARG_TEXT_PP(0);
1712 text *arg2 = PG_GETARG_TEXT_PP(1);
1715 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1717 PG_FREE_IF_COPY(arg1, 0);
1718 PG_FREE_IF_COPY(arg2, 1);
1720 PG_RETURN_BOOL(result);
/*
 * bttextcmp: SQL-callable three-way comparison for text.
 *
 * Returns, as an int32 Datum, the text_cmp() result for argument 0 against
 * argument 1 under the collation from PG_GET_COLLATION().
 * PG_FREE_IF_COPY() is applied to both arguments before returning.
 */
1724 bttextcmp(PG_FUNCTION_ARGS)
1726 text *arg1 = PG_GETARG_TEXT_PP(0);
1727 text *arg2 = PG_GETARG_TEXT_PP(1);
1730 result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1732 PG_FREE_IF_COPY(arg1, 0);
1733 PG_FREE_IF_COPY(arg2, 1);
1735 PG_RETURN_INT32(result);
/*
 * bttextsortsupport: sortsupport entry point for text.
 *
 * Argument 0 is the SortSupport to fill in.  The setup is delegated to
 * varstr_sortsupport() with the collation taken from ssup->ssup_collation
 * and bpchar = false.  The call is made with ssup->ssup_cxt as the current
 * memory context; the previous context is restored afterwards.
 */
1739 bttextsortsupport(PG_FUNCTION_ARGS)
1741 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1742 Oid collid = ssup->ssup_collation;
1743 MemoryContext oldcontext;
1745 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1747 /* Use generic string SortSupport */
1748 varstr_sortsupport(ssup, collid, false);
1750 MemoryContextSwitchTo(oldcontext);
1756 * Generic sortsupport interface for character type's operator classes.
1757 * Includes locale support, and support for BpChar semantics (i.e. removing
1758 * trailing spaces before comparison).
1760 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1761 * same representation. Callers that always use the C collation (e.g.
1762 * non-collatable type callers like bytea) may have NUL bytes in their strings;
1763 * this will not work with any other collation, though.
/*
 * varstr_sortsupport: configure *ssup for sorting string-like datums.
 *
 * Parameters:
 *   ssup   - SortSupport being set up.  This function assigns
 *            ssup->comparator, may assign ssup->ssup_extra, and on the
 *            abbreviation path also assigns abbrev_full_comparator,
 *            abbrev_converter and abbrev_abort.
 *   collid - collation OID.  When it is neither DEFAULT_COLLATION_OID nor a
 *            valid OID, ERRCODE_INDETERMINATE_COLLATION is reported.
 *   bpchar - recorded in the per-sort state as sss->bpchar.
 *
 * Comparator choice in this block: varstrfastcmp_c or bpcharfastcmp_c when
 * lc_collate_is_c(collid), varstrfastcmp_locale otherwise.  On the
 * abbreviation path the chosen comparator is saved in
 * abbrev_full_comparator and ssup->comparator becomes varstrcmp_abbrev.
 *
 * Per-sort state (built when abbreviate || !collate_c): a
 * VarStringSortSupport with two scratch buffers of TEXTBUFLEN bytes,
 * last_len1 and last_len2 set to -1, last_returned set to 0 and cache_blob
 * set to true.  The abbreviation path additionally sets prop_card to 0.20
 * and initializes the abbr_card and full_card HyperLogLog counters.
 *
 * NOTE(review): the state is allocated with plain palloc(); the callers
 * visible in this file switch to ssup->ssup_cxt before calling, presumably
 * so that the state lives as long as the sort -- confirm for new callers.
 */
1766 varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
1768 bool abbreviate = ssup->abbreviate;
1769 bool collate_c = false;
1770 VarStringSortSupport *sss;
1772 #ifdef HAVE_LOCALE_T
1773 pg_locale_t locale = 0;
1777 * If possible, set ssup->comparator to a function which can be used to
1778 * directly compare two datums. If we can do this, we'll avoid the
1779 * overhead of a trip through the fmgr layer for every comparison, which
1780 * can be substantial.
1782 * Most typically, we'll set the comparator to varstrfastcmp_locale, which
1783 * uses strcoll() to perform comparisons and knows about the special
1784 * requirements of BpChar callers. However, if LC_COLLATE = C, we can
1785 * make things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c,
1786 * both of which use memcmp() rather than strcoll().
1788 * There is a further exception on Windows. When the database encoding is
1789 * UTF-8 and we are not using the C collation, complex hacks are required.
1790 * We don't currently have a comparator that handles that case, so we fall
1791 * back on the slow method of having the sort code invoke bttextcmp() (in
1792 * the case of text) via the fmgr trampoline.
1794 if (lc_collate_is_c(collid))
1797 ssup->comparator = varstrfastcmp_c;
1799 ssup->comparator = bpcharfastcmp_c;
1804 else if (GetDatabaseEncoding() == PG_UTF8)
1809 ssup->comparator = varstrfastcmp_locale;
1812 * We need a collation-sensitive comparison. To make things faster,
1813 * we'll figure out the collation based on the locale id and cache the
1816 if (collid != DEFAULT_COLLATION_OID)
1818 if (!OidIsValid(collid))
1821 * This typically means that the parser could not resolve a
1822 * conflict of implicit collations, so report it that way.
1825 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1826 errmsg("could not determine which collation to use for string comparison"),
1827 errhint("Use the COLLATE clause to set the collation explicitly.")));
1829 #ifdef HAVE_LOCALE_T
1830 locale = pg_newlocale_from_collation(collid);
1836 * Unfortunately, it seems that abbreviation for non-C collations is
1837 * broken on many common platforms; testing of multiple versions of glibc
1838 * reveals that, for many locales, strcoll() and strxfrm() do not return
1839 * consistent results, which is fatal to this optimization. While no
1840 * other libc other than Cygwin has so far been shown to have a problem,
1841 * we take the conservative course of action for right now and disable
1842 * this categorically. (Users who are certain this isn't a problem on
1843 * their system can define TRUST_STRXFRM.)
1845 * Even apart from the risk of broken locales, it's possible that there
1846 * are platforms where the use of abbreviated keys should be disabled at
1847 * compile time. Having only 4 byte datums could make worst-case
1848 * performance drastically more likely, for example. Moreover, macOS's
1849 * strxfrm() implementation is known to not effectively concentrate a
1850 * significant amount of entropy from the original string in earlier
1851 * transformed blobs. It's possible that other supported platforms are
1852 * similarly encumbered. So, if we ever get past disabling this
1853 * categorically, we may still want or need to disable it for particular
1856 #ifndef TRUST_STRXFRM
1862 * If we're using abbreviated keys, or if we're using a locale-aware
1863 * comparison, we need to initialize a StringSortSupport object. Both
1864 * cases will make use of the temporary buffers we initialize here for
1865 * scratch space (and to detect requirement for BpChar semantics from
1866 * caller), and the abbreviation case requires additional state.
1868 if (abbreviate || !collate_c)
1870 sss = palloc(sizeof(VarStringSortSupport));
1871 sss->buf1 = palloc(TEXTBUFLEN);
1872 sss->buflen1 = TEXTBUFLEN;
1873 sss->buf2 = palloc(TEXTBUFLEN);
1874 sss->buflen2 = TEXTBUFLEN;
1875 /* Start with invalid values */
1876 sss->last_len1 = -1;
1877 sss->last_len2 = -1;
1879 sss->last_returned = 0;
1880 #ifdef HAVE_LOCALE_T
1881 sss->locale = locale;
1885 * To avoid somehow confusing a strxfrm() blob and an original string,
1886 * constantly keep track of the variety of data that buf1 and buf2
1887 * currently contain.
1889 * Comparisons may be interleaved with conversion calls. Frequently,
1890 * conversions and comparisons are batched into two distinct phases,
1891 * but the correctness of caching cannot hinge upon this. For
1892 * comparison caching, buffer state is only trusted if cache_blob is
1893 * found set to false, whereas strxfrm() caching only trusts the state
1894 * when cache_blob is found set to true.
1896 * Arbitrarily initialize cache_blob to true.
1898 sss->cache_blob = true;
1899 sss->collate_c = collate_c;
1900 sss->bpchar = bpchar;
1901 ssup->ssup_extra = sss;
1904 * If possible, plan to use the abbreviated keys optimization. The
1905 * core code may switch back to authoritative comparator should
1906 * abbreviation be aborted.
1910 sss->prop_card = 0.20;
1911 initHyperLogLog(&sss->abbr_card, 10);
1912 initHyperLogLog(&sss->full_card, 10);
1913 ssup->abbrev_full_comparator = ssup->comparator;
1914 ssup->comparator = varstrcmp_abbrev;
1915 ssup->abbrev_converter = varstr_abbrev_convert;
1916 ssup->abbrev_abort = varstr_abbrev_abort;
1922 * sortsupport comparison func (for C locale case)
/*
 * varstrfastcmp_c: sortsupport comparator using plain byte order.
 *
 * x, y - datums to compare, converted with DatumGetVarStringPP()
 * ssup - sort state; not referenced in the lines of this block
 *
 * The common prefix is compared with memcmp(); when it is equal and the
 * lengths differ, the shorter value sorts first (-1 / 1).  Each converted
 * pointer is then compared against its input datum to detect a detoasted
 * copy, which per the comment below must not be leaked.
 */
1925 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1927 VarString *arg1 = DatumGetVarStringPP(x);
1928 VarString *arg2 = DatumGetVarStringPP(y);
1935 a1p = VARDATA_ANY(arg1);
1936 a2p = VARDATA_ANY(arg2);
1938 len1 = VARSIZE_ANY_EXHDR(arg1);
1939 len2 = VARSIZE_ANY_EXHDR(arg2);
1941 result = memcmp(a1p, a2p, Min(len1, len2));
1942 if ((result == 0) && (len1 != len2))
1943 result = (len1 < len2) ? -1 : 1;
1945 /* We can't afford to leak memory here. */
1946 if (PointerGetDatum(arg1) != x)
1948 if (PointerGetDatum(arg2) != y)
1955 * sortsupport comparison func (for BpChar C locale case)
1957 * BpChar outsources its sortsupport to this module. Specialization for the
1958 * varstr_sortsupport BpChar case, modeled on
1959 * internal_bpchar_pattern_compare().
/*
 * bpcharfastcmp_c: sortsupport comparator for BpChar using plain byte order.
 *
 * x, y - datums to compare, converted with DatumGetBpCharPP()
 * ssup - sort state; not referenced in the lines of this block
 *
 * Same scheme as varstrfastcmp_c(), except that the lengths compared are
 * those returned by bpchartruelen() rather than the raw data lengths.
 * The common prefix is compared with memcmp(); on a tie the shorter value
 * sorts first.  Each converted pointer is then compared against its input
 * datum to detect a detoasted copy that must not be leaked.
 */
1962 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
1964 BpChar *arg1 = DatumGetBpCharPP(x);
1965 BpChar *arg2 = DatumGetBpCharPP(y);
1972 a1p = VARDATA_ANY(arg1);
1973 a2p = VARDATA_ANY(arg2);
1975 len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
1976 len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
1978 result = memcmp(a1p, a2p, Min(len1, len2));
1979 if ((result == 0) && (len1 != len2))
1980 result = (len1 < len2) ? -1 : 1;
1982 /* We can't afford to leak memory here. */
1983 if (PointerGetDatum(arg1) != x)
1985 if (PointerGetDatum(arg2) != y)
1992 * sortsupport comparison func (for locale case)
/*
 * varstrfastcmp_locale: sortsupport comparator for collation-aware ordering.
 *
 * x, y - datums to compare, converted with DatumGetVarStringPP()
 * ssup - sort state; ssup->ssup_extra holds the VarStringSortSupport
 *
 * Outline of the visible logic:
 *   1. Equal-length inputs are first checked for byte equality with
 *      memcmp(), without touching the buffers or the cached lengths.
 *   2. bpchartruelen() may shorten both lengths (trailing spaces ignored).
 *   3. buf1 / buf2 are replaced when an input does not fit
 *      (len >= buflen); the new size is
 *      Max(len + 1, Min(buflen * 2, MaxAllocSize)), allocated in
 *      ssup->ssup_cxt.
 *   4. Each argument is copied into its buffer and NUL-terminated unless
 *      its length and bytes match what the buffer already holds.
 *   5. When both arguments matched and cache_blob is false, the previous
 *      result (last_returned) is reused instead of calling strcoll().
 *   6. Otherwise strcoll_l() or strcoll() compares the buffers, with
 *      strcmp() as tie-breaker; cache_blob is then cleared and the result
 *      is stored in last_returned.
 *   7. Detoasted copies are detected by comparing each pointer against its
 *      input datum, so that they are not leaked.
 */
1995 varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
1997 VarString *arg1 = DatumGetVarStringPP(x);
1998 VarString *arg2 = DatumGetVarStringPP(y);
2000 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2009 a1p = VARDATA_ANY(arg1);
2010 a2p = VARDATA_ANY(arg2);
2012 len1 = VARSIZE_ANY_EXHDR(arg1);
2013 len2 = VARSIZE_ANY_EXHDR(arg2);
2015 /* Fast pre-check for equality, as discussed in varstr_cmp() */
2016 if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2019 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2020 * last_len2. Existing contents of buffers might still be used by
2023 * It's fine to allow the comparison of BpChar padding bytes here,
2024 * even though that implies that the memcmp() will usually be
2025 * performed for BpChar callers (though multibyte characters could
2026 * still prevent that from occurring). The memcmp() is still very
2027 * cheap, and BpChar's funny semantics have us remove trailing spaces
2028 * (not limited to padding), so we need make no distinction between
2029 * padding space characters and "real" space characters.
2037 /* Get true number of bytes, ignoring trailing spaces */
2038 len1 = bpchartruelen(a1p, len1);
2039 len2 = bpchartruelen(a2p, len2);
2042 if (len1 >= sss->buflen1)
2045 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2046 sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2048 if (len2 >= sss->buflen2)
2051 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2052 sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2056 * We're likely to be asked to compare the same strings repeatedly, and
2057 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2058 * comparisons, even though in general there is no reason to think that
2059 * that will work out (every string datum may be unique). Caching does
2060 * not slow things down measurably when it doesn't work out, and can speed
2061 * things up by rather a lot when it does. In part, this is because the
2062 * memcmp() compares data from cachelines that are needed in L1 cache even
2063 * when the last comparison's result cannot be reused.
2066 if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2069 memcpy(sss->buf1, a1p, len1);
2070 sss->buf1[len1] = '\0';
2071 sss->last_len1 = len1;
2075 * If we're comparing the same two strings as last time, we can return the
2076 * same answer without calling strcoll() again. This is more likely than
2077 * it seems (at least with moderate to low cardinality sets), because
2078 * quicksort compares the same pivot against many values.
2080 if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2082 memcpy(sss->buf2, a2p, len2);
2083 sss->buf2[len2] = '\0';
2084 sss->last_len2 = len2;
2086 else if (arg1_match && !sss->cache_blob)
2088 /* Use result cached following last actual strcoll() call */
2089 result = sss->last_returned;
2093 #ifdef HAVE_LOCALE_T
2095 result = strcoll_l(sss->buf1, sss->buf2, sss->locale);
2098 result = strcoll(sss->buf1, sss->buf2);
2101 * In some locales strcoll() can claim that nonidentical strings are
2102 * equal. Believing that would be bad news for a number of reasons, so we
2103 * follow Perl's lead and sort "equal" strings according to strcmp().
2106 result = strcmp(sss->buf1, sss->buf2);
2108 /* Cache result, perhaps saving an expensive strcoll() call next time */
2109 sss->cache_blob = false;
2110 sss->last_returned = result;
2112 /* We can't afford to leak memory here. */
2113 if (PointerGetDatum(arg1) != x)
2115 if (PointerGetDatum(arg2) != y)
2122 * Abbreviated key comparison func
/*
 * varstrcmp_abbrev: comparator for abbreviated keys.
 *
 * x, y - abbreviated keys; varstr_sortsupport() installs this comparator
 *        together with varstr_abbrev_convert(), which produces such keys
 * ssup - sort state
 *
 * A result of zero is not authoritative: as the comment below explains,
 * the core sort code then falls back to the full comparator.
 */
2125 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2128 * When 0 is returned, the core system will call varstrfastcmp_c()
2129 * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale(). Even a
2130 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2131 * authoritatively, for the same reason that there is a strcoll()
2132 * tie-breaker call to strcmp() in varstr_cmp().
2143 * Conversion routine for sortsupport. Converts original to abbreviated key
2144 * representation. Our encoding strategy is simple -- pack the first 8 bytes
2145 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2146 * stored in reverse order), and treat it as an unsigned integer. When the "C"
2147 * locale is used, or in case of bytea, just memcpy() from original instead.
/*
 * varstr_abbrev_convert: build the abbreviated key for one datum.
 *
 * original - the authoritative datum, converted with DatumGetVarStringPP()
 * ssup     - sort state; ssup->ssup_extra holds the VarStringSortSupport
 *
 * The key is assembled in the local Datum res through the byte pointer
 * pres, which is zero-filled first so that unused bytes are NUL.
 *
 * Outline of the visible logic:
 *   - The length may be shortened by bpchartruelen().
 *   - C-collation path: the first Min(len, sizeof(Datum)) bytes of the
 *     string are copied into the key.
 *   - Otherwise the string is copied into buf1 and NUL-terminated, and
 *     strxfrm_l() or strxfrm() writes the transformed blob into buf2.
 *     When the blob does not fit (bsize >= buflen2), buf2 is reallocated
 *     and the call is retried.  The first Min(sizeof(Datum), bsize) blob
 *     bytes become the key.
 *   - Blob reuse: when the input equals the string already in buf1 and
 *     cache_blob is true, the key is taken from the existing buf2 blob
 *     without calling strxfrm().
 *   - Two HyperLogLog counters are fed for varstr_abbrev_abort():
 *     full_card with a hash of the first Min(len, PG_CACHE_LINE_SIZE)
 *     bytes (mixed with a hash of len for longer strings), and abbr_card
 *     with a hash of the key (both 32-bit halves folded together when
 *     SIZEOF_DATUM is 8).
 *   - cache_blob is set to true, the key goes through
 *     DatumBigEndianToNative(), and a detoasted copy of the input is freed.
 *
 * NOTE(review): buf1 and buf2 are regrown with plain palloc() here, whereas
 * varstrfastcmp_locale() uses MemoryContextAlloc(ssup->ssup_cxt, ...).
 * Confirm that callers invoke this in a sufficiently long-lived context.
 */
2150 varstr_abbrev_convert(Datum original, SortSupport ssup)
2152 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2153 VarString *authoritative = DatumGetVarStringPP(original);
2154 char *authoritative_data = VARDATA_ANY(authoritative);
2162 pres = (char *) &res;
2163 /* memset(), so any non-overwritten bytes are NUL */
2164 memset(pres, 0, sizeof(Datum));
2165 len = VARSIZE_ANY_EXHDR(authoritative);
2167 /* Get number of bytes, ignoring trailing spaces */
2169 len = bpchartruelen(authoritative_data, len);
2172 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2173 * abbreviate keys. The full comparator for the C locale is always
2174 * memcmp(). It would be incorrect to allow bytea callers (callers that
2175 * always force the C collation -- bytea isn't a collatable type, but this
2176 * approach is convenient) to use strxfrm(). This is because bytea
2177 * strings may contain NUL bytes. Besides, this should be faster, too.
2179 * More generally, it's okay that bytea callers can have NUL bytes in
2180 * strings because varstrcmp_abbrev() need not make a distinction between
2181 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2182 * authoritative representation. Hopefully a comparison at or past one
2183 * abbreviated key's terminating NUL byte will resolve the comparison
2184 * without consulting the authoritative representation; specifically, some
2185 * later non-NUL byte in the longer string can resolve the comparison
2186 * against a subsequent terminating NUL in the shorter string. There will
2187 * usually be what is effectively a "length-wise" resolution there and
2190 * If that doesn't work out -- if all bytes in the longer string
2191 * positioned at or past the offset of the smaller string's (first)
2192 * terminating NUL are actually representative of NUL bytes in the
2193 * authoritative binary string (perhaps with some *terminating* NUL bytes
2194 * towards the end of the longer string iff it happens to still be small)
2195 * -- then an authoritative tie-breaker will happen, and do the right
2196 * thing: explicitly consider string length.
2199 memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2205 * We're not using the C collation, so fall back on strxfrm.
2208 /* By convention, we use buffer 1 to store and NUL-terminate */
2209 if (len >= sss->buflen1)
2212 sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2213 sss->buf1 = palloc(sss->buflen1);
2216 /* Might be able to reuse strxfrm() blob from last call */
2217 if (sss->last_len1 == len && sss->cache_blob &&
2218 memcmp(sss->buf1, authoritative_data, len) == 0)
2220 memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2221 /* No change affecting cardinality, so no hashing required */
2225 /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
2226 memcpy(sss->buf1, authoritative_data, len);
2227 sss->buf1[len] = '\0';
2228 sss->last_len1 = len;
2232 #ifdef HAVE_LOCALE_T
2234 bsize = strxfrm_l(sss->buf2, sss->buf1,
2235 sss->buflen2, sss->locale);
2238 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2240 sss->last_len2 = bsize;
2241 if (bsize < sss->buflen2)
2245 * The C standard states that the contents of the buffer is now
2246 * unspecified. Grow buffer, and retry.
2249 sss->buflen2 = Max(bsize + 1,
2250 Min(sss->buflen2 * 2, MaxAllocSize));
2251 sss->buf2 = palloc(sss->buflen2);
2255 * Every Datum byte is always compared. This is safe because the
2256 * strxfrm() blob is itself NUL terminated, leaving no danger of
2257 * misinterpreting any NUL bytes not intended to be interpreted as
2258 * logically representing termination.
2260 * (Actually, even if there were NUL bytes in the blob it would be
2261 * okay. See remarks on bytea case above.)
2263 memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2267 * Maintain approximate cardinality of both abbreviated keys and original,
2268 * authoritative keys using HyperLogLog. Used as cheap insurance against
2269 * the worst case, where we do many string transformations for no saving
2270 * in full strcoll()-based comparisons. These statistics are used by
2271 * varstr_abbrev_abort().
2273 * First, Hash key proper, or a significant fraction of it. Mix in length
2274 * in order to compensate for cases where differences are past
2275 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2277 hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2278 Min(len, PG_CACHE_LINE_SIZE)));
2280 if (len > PG_CACHE_LINE_SIZE)
2281 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2283 addHyperLogLog(&sss->full_card, hash);
2285 /* Hash abbreviated key */
2286 #if SIZEOF_DATUM == 8
2291 lohalf = (uint32) res;
2292 hihalf = (uint32) (res >> 32);
2293 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2295 #else /* SIZEOF_DATUM != 8 */
2296 hash = DatumGetUInt32(hash_uint32((uint32) res));
2299 addHyperLogLog(&sss->abbr_card, hash);
2301 /* Cache result, perhaps saving an expensive strxfrm() call next time */
2302 sss->cache_blob = true;
2306 * Byteswap on little-endian machines.
2308 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2309 * comparator) works correctly on all platforms. If we didn't do this,
2310 * the comparator would have to call memcmp() with a pair of pointers to
2311 * the first byte of each abbreviated key, which is slower.
2313 res = DatumBigEndianToNative(res);
2315 /* Don't leak memory here */
2316 if (PointerGetDatum(authoritative) != original)
2317 pfree(authoritative);
2323 * Callback for estimating effectiveness of abbreviated key optimization, using
2324 * heuristic rules. Returns value indicating if the abbreviation optimization
2325 * should be aborted, based on its projected effectiveness.
/*
 * varstr_abbrev_abort: decide whether abbreviation is still worthwhile.
 *
 * memtupcount - number of tuples seen so far
 * ssup        - sort state; ssup->abbreviate is asserted, and
 *               ssup->ssup_extra holds the VarStringSortSupport
 *
 * Outline of the visible logic:
 *   - Fewer than 100 tuples: too early to judge.
 *   - The distinct counts of abbreviated keys (abbr_card) and of full keys
 *     (full_card) are estimated with estimateHyperLogLog(), each clamped
 *     to at least 1.0.
 *   - Abbreviation is kept when
 *     abbrev_distinct > key_distinct * sss->prop_card.  On that path,
 *     once memtupcount exceeds 10000, prop_card is multiplied by 0.65 so
 *     that the threshold decays for the next call.
 *   - Otherwise abbreviation is abandoned.
 *
 * Two elog(LOG, ...) calls report the estimates; the conditions guarding
 * them are outside the visible lines.
 *
 * NOTE(review): presumably returns false to continue and true to abort --
 * confirm against the return statements.
 */
2328 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2330 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2331 double abbrev_distinct,
2334 Assert(ssup->abbreviate);
2336 /* Have a little patience */
2337 if (memtupcount < 100)
2340 abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2341 key_distinct = estimateHyperLogLog(&sss->full_card);
2344 * Clamp cardinality estimates to at least one distinct value. While
2345 * NULLs are generally disregarded, if only NULL values were seen so far,
2346 * that might misrepresent costs if we failed to clamp.
2348 if (abbrev_distinct <= 1.0)
2349 abbrev_distinct = 1.0;
2351 if (key_distinct <= 1.0)
2355 * In the worst case all abbreviated keys are identical, while at the same
2356 * time there are differences within full key strings not captured in
2362 double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2364 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2365 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2366 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2372 * If the number of distinct abbreviated keys approximately matches the
2373 * number of distinct authoritative original keys, that's reason enough to
2374 * proceed. We can win even with a very low cardinality set if most
2375 * tie-breakers only memcmp(). This is by far the most important
2378 * While comparisons that are resolved at the abbreviated key level are
2379 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2380 * those two outcomes are so much cheaper than a full strcoll() once
2381 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2382 * cardinality against the overall size of the set in order to more
2383 * accurately model costs. Assume that an abbreviated comparison, and an
2384 * abbreviated comparison with a cheap memcmp()-based authoritative
2385 * resolution are equivalent.
2387 if (abbrev_distinct > key_distinct * sss->prop_card)
2390 * When we have exceeded 10,000 tuples, decay required cardinality
2391 * aggressively for next call.
2393 * This is useful because the number of comparisons required on
2394 * average increases at a linearithmic rate, and at roughly 10,000
2395 * tuples that factor will start to dominate over the linear costs of
2396 * string transformation (this is a conservative estimate). The decay
2397 * rate is chosen to be a little less aggressive than halving -- which
2398 * (since we're called at points at which memtupcount has doubled)
2399 * would never see the cost model actually abort past the first call
2400 * following a decay. This decay rate is mostly a precaution against
2401 * a sudden, violent swing in how well abbreviated cardinality tracks
2402 * full key cardinality. The decay also serves to prevent a marginal
2403 * case from being aborted too late, when too much has already been
2404 * invested in string transformation.
2406 * It's possible for sets of several million distinct strings with
2407 * mere tens of thousands of distinct abbreviated keys to still
2408 * benefit very significantly. This will generally occur provided
2409 * each abbreviated key is a proxy for a roughly uniform number of the
2410 * set's full keys. If it isn't so, we hope to catch that early and
2411 * abort. If it isn't caught early, by the time the problem is
2412 * apparent it's probably not worth aborting.
2414 if (memtupcount > 10000)
2415 sss->prop_card *= 0.65;
2421 * Abort abbreviation strategy.
2423 * The worst case, where all abbreviated keys are identical while all
2424 * original strings differ will typically only see a regression of about
2425 * 10% in execution time for small to medium sized lists of strings.
2426 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2427 * often expect very large improvements, particularly with sets of strings
2428 * of moderately high to high abbreviated cardinality. There is little to
2429 * lose but much to gain, which our strategy reflects.
2433 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2434 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2435 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
/*
 * text_larger: SQL-callable function returning the larger of two texts.
 *
 * Returns argument 0 when text_cmp() under PG_GET_COLLATION() is positive,
 * otherwise argument 1 (so argument 1 is returned on a tie).  The result
 * pointer is one of the two arguments, and neither is freed in this block.
 */
2442 text_larger(PG_FUNCTION_ARGS)
2444 text *arg1 = PG_GETARG_TEXT_PP(0);
2445 text *arg2 = PG_GETARG_TEXT_PP(1);
2448 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2450 PG_RETURN_TEXT_P(result);
/*
 * text_smaller: SQL-callable function returning the smaller of two texts.
 *
 * Returns argument 0 when text_cmp() under PG_GET_COLLATION() is negative,
 * otherwise argument 1 (so argument 1 is returned on a tie).  The result
 * pointer is one of the two arguments, and neither is freed in this block.
 */
2454 text_smaller(PG_FUNCTION_ARGS)
2456 text *arg1 = PG_GETARG_TEXT_PP(0);
2457 text *arg2 = PG_GETARG_TEXT_PP(1);
2460 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2462 PG_RETURN_TEXT_P(result);
2467 * The following operators support character-by-character comparison
2468 * of text datums, to allow building indexes suitable for LIKE clauses.
2469 * Note that the regular texteq/textne comparison operators, and regular
2470 * support functions 1 and 2 with "C" collation are assumed to be
2471 * compatible with these!
/*
 * internal_text_pattern_compare: bytewise three-way comparison of two texts.
 *
 * arg1, arg2 - text values, read through VARDATA_ANY() and
 *              VARSIZE_ANY_EXHDR()
 *
 * The common prefix is compared with memcmp(); the length comparisons that
 * follow act as the tie-breaker between a string and a longer string that
 * starts with it.  No collation is involved: the function takes none.
 */
2475 internal_text_pattern_compare(text *arg1, text *arg2)
2481 len1 = VARSIZE_ANY_EXHDR(arg1);
2482 len2 = VARSIZE_ANY_EXHDR(arg2);
2484 result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2487 else if (len1 < len2)
2489 else if (len1 > len2)
/*
 * text_pattern_lt: SQL-callable bytewise "less than" for text.
 *
 * Returns true when internal_text_pattern_compare() of argument 0 against
 * argument 1 is negative.  PG_FREE_IF_COPY() is applied to both arguments
 * before returning.
 */
2497 text_pattern_lt(PG_FUNCTION_ARGS)
2499 text *arg1 = PG_GETARG_TEXT_PP(0);
2500 text *arg2 = PG_GETARG_TEXT_PP(1);
2503 result = internal_text_pattern_compare(arg1, arg2);
2505 PG_FREE_IF_COPY(arg1, 0);
2506 PG_FREE_IF_COPY(arg2, 1);
2508 PG_RETURN_BOOL(result < 0);
/*
 * text_pattern_le: SQL-callable bytewise "less than or equal" for text.
 *
 * Returns true when internal_text_pattern_compare() of argument 0 against
 * argument 1 is zero or negative.  PG_FREE_IF_COPY() is applied to both
 * arguments before returning.
 */
2513 text_pattern_le(PG_FUNCTION_ARGS)
2515 text *arg1 = PG_GETARG_TEXT_PP(0);
2516 text *arg2 = PG_GETARG_TEXT_PP(1);
2519 result = internal_text_pattern_compare(arg1, arg2);
2521 PG_FREE_IF_COPY(arg1, 0);
2522 PG_FREE_IF_COPY(arg2, 1);
2524 PG_RETURN_BOOL(result <= 0);
/*
 * text_pattern_ge: SQL-callable bytewise "greater than or equal" for text.
 *
 * Returns true when internal_text_pattern_compare() of argument 0 against
 * argument 1 is zero or positive.  PG_FREE_IF_COPY() is applied to both
 * arguments before returning.
 */
2529 text_pattern_ge(PG_FUNCTION_ARGS)
2531 text *arg1 = PG_GETARG_TEXT_PP(0);
2532 text *arg2 = PG_GETARG_TEXT_PP(1);
2535 result = internal_text_pattern_compare(arg1, arg2);
2537 PG_FREE_IF_COPY(arg1, 0);
2538 PG_FREE_IF_COPY(arg2, 1);
2540 PG_RETURN_BOOL(result >= 0);
/*
 * text_pattern_gt: SQL-callable bytewise "greater than" for text.
 *
 * Returns true when internal_text_pattern_compare() of argument 0 against
 * argument 1 is positive.  PG_FREE_IF_COPY() is applied to both arguments
 * before returning.
 */
2545 text_pattern_gt(PG_FUNCTION_ARGS)
2547 text *arg1 = PG_GETARG_TEXT_PP(0);
2548 text *arg2 = PG_GETARG_TEXT_PP(1);
2551 result = internal_text_pattern_compare(arg1, arg2);
2553 PG_FREE_IF_COPY(arg1, 0);
2554 PG_FREE_IF_COPY(arg2, 1);
2556 PG_RETURN_BOOL(result > 0);
/*
 * bttext_pattern_cmp: SQL-callable bytewise three-way comparison for text.
 *
 * Returns, as an int32 Datum, the internal_text_pattern_compare() result
 * for argument 0 against argument 1.  PG_FREE_IF_COPY() is applied to both
 * arguments before returning.
 */
2561 bttext_pattern_cmp(PG_FUNCTION_ARGS)
2563 text *arg1 = PG_GETARG_TEXT_PP(0);
2564 text *arg2 = PG_GETARG_TEXT_PP(1);
2567 result = internal_text_pattern_compare(arg1, arg2);
2569 PG_FREE_IF_COPY(arg1, 0);
2570 PG_FREE_IF_COPY(arg2, 1);
2572 PG_RETURN_INT32(result);
/*
 * bttext_pattern_sortsupport: sortsupport entry point for the pattern
 * comparison operators.
 *
 * Argument 0 is the SortSupport to fill in.  The setup is delegated to
 * varstr_sortsupport() with C_COLLATION_OID and bpchar = false, whatever
 * collation the SortSupport itself carries.  The call is made with
 * ssup->ssup_cxt as the current memory context; the previous context is
 * restored afterwards.
 */
2577 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2579 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2580 MemoryContext oldcontext;
2582 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2584 /* Use generic string SortSupport, forcing "C" collation */
2585 varstr_sortsupport(ssup, C_COLLATION_OID, false);
2587 MemoryContextSwitchTo(oldcontext);
2593 /*-------------------------------------------------------------
2596 * get the number of bytes contained in an instance of type 'bytea'
2597 *-------------------------------------------------------------
/*
 * byteaoctetlen: SQL-callable length in bytes of a bytea value.
 *
 * Argument 0 is taken as a raw Datum.  The result is
 * toast_raw_datum_size(str) - VARHDRSZ, returned as an int32 Datum; the
 * value itself is never detoasted.
 */
2600 byteaoctetlen(PG_FUNCTION_ARGS)
2602 Datum str = PG_GETARG_DATUM(0);
2604 /* We need not detoast the input at all */
2605 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2610 * takes two bytea* and returns a bytea* that is the concatenation of
2613 * Cloned from textcat and modified as required.
/*
 * byteacat: SQL-callable concatenation of two bytea values.
 *
 * Fetches both arguments with PG_GETARG_BYTEA_PP() and returns the result
 * of bytea_catenate(t1, t2) as a bytea Datum.
 */
2616 byteacat(PG_FUNCTION_ARGS)
2618 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2619 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2621 PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2626 * Guts of byteacat(), broken out so it can be used by other functions
2628 * Arguments can be in short-header form, but not compressed or out-of-line
/*
 * bytea_catenate: build a new bytea holding t1 followed by t2.
 *
 * t1, t2 - inputs, read through VARDATA_ANY() and VARSIZE_ANY_EXHDR()
 *
 * A result of len1 + len2 + VARHDRSZ bytes is allocated with palloc() and
 * its size header is set with SET_VARSIZE().  The data of t1 is copied to
 * the start of the data area and the data of t2 directly after it.
 *
 * NOTE(review): presumably returns the new value (result) -- confirm
 * against the return statement.
 */
2631 bytea_catenate(bytea *t1, bytea *t2)
2639 len1 = VARSIZE_ANY_EXHDR(t1);
2640 len2 = VARSIZE_ANY_EXHDR(t2);
2642 /* paranoia ... probably should throw error instead? */
2648 len = len1 + len2 + VARHDRSZ;
2649 result = (bytea *) palloc(len);
2651 /* Set size of result string... */
2652 SET_VARSIZE(result, len);
2654 /* Fill data field of result string... */
2655 ptr = VARDATA(result);
2657 memcpy(ptr, VARDATA_ANY(t1), len1);
2659 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2664 #define PG_STR_GET_BYTEA(str_) \
2665 DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2669 * Return a substring starting at the specified position.
2670 * Cloned from text_substr and modified as required.
2674 * - starting position (is one-based)
2675 * - string length (optional)
2677 * If the starting position is zero or less, then return from the start of the string
2678 * adjusting the length to be consistent with the "negative start" per SQL.
2679 * If the length is less than zero, an ERROR is thrown. If no third argument
2680 * (length) is provided, the length to the end of the string is assumed.
2683 bytea_substr(PG_FUNCTION_ARGS)
2685 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2692 * bytea_substr_no_len -
2693 * Wrapper to avoid opr_sanity failure due to
2694 * one function accepting a different number of args.
2697 bytea_substr_no_len(PG_FUNCTION_ARGS)
2699 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2706 bytea_substring(Datum str,
2709 bool length_not_specified)
2711 int S1; /* adjusted start position */
2712 int L1; /* adjusted substring length */
2716 if (length_not_specified)
2719 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2720 * end of the string if we pass it a negative value for length.
2730 * A negative value for L is the only way for the end position to be
2731 * before the start. SQL99 says to throw an error.
2735 (errcode(ERRCODE_SUBSTRING_ERROR),
2736 errmsg("negative substring length not allowed")));
2739 * A zero or negative value for the end position can happen if the
2740 * start was negative or one. SQL99 says to return a zero-length
2744 return PG_STR_GET_BYTEA("");
2750 * If the start position is past the end of the string, SQL99 says to
2751 * return a zero-length string -- DatumGetByteaPSlice() will do that for
2752 * us. Convert to zero-based starting position
2754 return DatumGetByteaPSlice(str, S1 - 1, L1);
2759 * Replace specified substring of first string with second
2761 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2762 * This code is a direct implementation of what the standard says.
2765 byteaoverlay(PG_FUNCTION_ARGS)
2767 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2768 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2769 int sp = PG_GETARG_INT32(2); /* substring start position */
2770 int sl = PG_GETARG_INT32(3); /* substring length */
2772 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2776 byteaoverlay_no_len(PG_FUNCTION_ARGS)
2778 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2779 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2780 int sp = PG_GETARG_INT32(2); /* substring start position */
2783 sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2784 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2788 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2796 * Check for possible integer-overflow cases. For negative sp, throw a
2797 * "substring length" error because that's what should be expected
2798 * according to the spec's definition of OVERLAY().
2802 (errcode(ERRCODE_SUBSTRING_ERROR),
2803 errmsg("negative substring length not allowed")));
2807 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2808 errmsg("integer out of range")));
2810 s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2811 s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2812 result = bytea_catenate(s1, t2);
2813 result = bytea_catenate(result, s2);
2820 * Return the position of the specified substring.
2821 * Implements the SQL POSITION() function.
2822 * Cloned from textpos and modified as required.
2825 byteapos(PG_FUNCTION_ARGS)
2827 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2828 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2837 len1 = VARSIZE_ANY_EXHDR(t1);
2838 len2 = VARSIZE_ANY_EXHDR(t2);
2841 PG_RETURN_INT32(1); /* result for empty pattern */
2843 p1 = VARDATA_ANY(t1);
2844 p2 = VARDATA_ANY(t2);
2848 for (p = 0; p <= px; p++)
2850 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2858 PG_RETURN_INT32(pos);
2861 /*-------------------------------------------------------------
2864 * this routine treats "bytea" as an array of bytes.
2865 * It returns the Nth byte (a number between 0 and 255).
2866 *-------------------------------------------------------------
2869 byteaGetByte(PG_FUNCTION_ARGS)
2871 bytea *v = PG_GETARG_BYTEA_PP(0);
2872 int32 n = PG_GETARG_INT32(1);
2876 len = VARSIZE_ANY_EXHDR(v);
2878 if (n < 0 || n >= len)
2880 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2881 errmsg("index %d out of valid range, 0..%d",
2884 byte = ((unsigned char *) VARDATA_ANY(v))[n];
2886 PG_RETURN_INT32(byte);
2889 /*-------------------------------------------------------------
2892 * This routine treats a "bytea" type like an array of bits.
2893 * It returns the value of the Nth bit (0 or 1).
2895 *-------------------------------------------------------------
2898 byteaGetBit(PG_FUNCTION_ARGS)
2900 bytea *v = PG_GETARG_BYTEA_PP(0);
2901 int32 n = PG_GETARG_INT32(1);
2907 len = VARSIZE_ANY_EXHDR(v);
2909 if (n < 0 || n >= len * 8)
2911 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2912 errmsg("index %d out of valid range, 0..%d",
2918 byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
2920 if (byte & (1 << bitNo))
2926 /*-------------------------------------------------------------
2929 * Given an instance of type 'bytea' creates a new one with
2930 * the Nth byte set to the given value.
2932 *-------------------------------------------------------------
2935 byteaSetByte(PG_FUNCTION_ARGS)
2937 bytea *v = PG_GETARG_BYTEA_P(0);
2938 int32 n = PG_GETARG_INT32(1);
2939 int32 newByte = PG_GETARG_INT32(2);
2943 len = VARSIZE(v) - VARHDRSZ;
2945 if (n < 0 || n >= len)
2947 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2948 errmsg("index %d out of valid range, 0..%d",
2952 * Make a copy of the original varlena.
2954 res = (bytea *) palloc(VARSIZE(v));
2955 memcpy((char *) res, (char *) v, VARSIZE(v));
2960 ((unsigned char *) VARDATA(res))[n] = newByte;
2962 PG_RETURN_BYTEA_P(res);
2965 /*-------------------------------------------------------------
2968 * Given an instance of type 'bytea' creates a new one with
2969 * the Nth bit set to the given value.
2971 *-------------------------------------------------------------
2974 byteaSetBit(PG_FUNCTION_ARGS)
2976 bytea *v = PG_GETARG_BYTEA_P(0);
2977 int32 n = PG_GETARG_INT32(1);
2978 int32 newBit = PG_GETARG_INT32(2);
2986 len = VARSIZE(v) - VARHDRSZ;
2988 if (n < 0 || n >= len * 8)
2990 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2991 errmsg("index %d out of valid range, 0..%d",
3000 if (newBit != 0 && newBit != 1)
3002 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3003 errmsg("new bit must be 0 or 1")));
3006 * Make a copy of the original varlena.
3008 res = (bytea *) palloc(VARSIZE(v));
3009 memcpy((char *) res, (char *) v, VARSIZE(v));
3014 oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3017 newByte = oldByte & (~(1 << bitNo));
3019 newByte = oldByte | (1 << bitNo);
3021 ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3023 PG_RETURN_BYTEA_P(res);
3028 * Converts a text type to a Name type.
3031 text_name(PG_FUNCTION_ARGS)
3033 text *s = PG_GETARG_TEXT_PP(0);
3037 len = VARSIZE_ANY_EXHDR(s);
3039 /* Truncate oversize input */
3040 if (len >= NAMEDATALEN)
3041 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3043 /* We use palloc0 here to ensure result is zero-padded */
3044 result = (Name) palloc0(NAMEDATALEN);
3045 memcpy(NameStr(*result), VARDATA_ANY(s), len);
3047 PG_RETURN_NAME(result);
3051 * Converts a Name type to a text type.
3054 name_text(PG_FUNCTION_ARGS)
3056 Name s = PG_GETARG_NAME(0);
3058 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3063 * textToQualifiedNameList - convert a text object to list of names
3065 * This implements the input parsing needed by nextval() and other
3066 * functions that take a text parameter representing a qualified name.
3067 * We split the name at dots, downcase if not double-quoted, and
3068 * truncate names if they're too long.
3071 textToQualifiedNameList(text *textval)
3078 /* Convert to C string (handles possible detoasting). */
3079 /* Note we rely on being able to modify rawname below. */
3080 rawname = text_to_cstring(textval);
3082 if (!SplitIdentifierString(rawname, '.', &namelist))
3084 (errcode(ERRCODE_INVALID_NAME),
3085 errmsg("invalid name syntax")));
3087 if (namelist == NIL)
3089 (errcode(ERRCODE_INVALID_NAME),
3090 errmsg("invalid name syntax")));
3092 foreach(l, namelist)
3094 char *curname = (char *) lfirst(l);
3096 result = lappend(result, makeString(pstrdup(curname)));
3100 list_free(namelist);
/*
 * Reviewer summary: the routine walks rawstring once with nextp, isolating
 * one identifier per loop pass.  Quoted identifiers are edited in place to
 * collapse doubled quotes; unquoted ones are downcased in place.  Each
 * identifier is NUL-terminated inside rawstring and the pointer to it is
 * appended to *namelist, so the list entries stay valid only as long as
 * rawstring does.
 */
3106 * SplitIdentifierString --- parse a string containing identifiers
3108 * This is the guts of textToQualifiedNameList, and is exported for use in
3109 * other situations such as parsing GUC variables. In the GUC case, it's
3110 * important to avoid memory leaks, so the API is designed to minimize the
3111 * amount of stuff that needs to be allocated and freed.
3114 * rawstring: the input string; must be overwritable! On return, it's
3115 * been modified to contain the separated identifiers.
3116 * separator: the separator punctuation expected between identifiers
3117 * (typically '.' or ','). Whitespace may also appear around
3120 * namelist: filled with a palloc'd list of pointers to identifiers within
3121 * rawstring. Caller should list_free() this even on error return.
3123 * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3125 * Note that an empty string is considered okay here, though not in
3126 * textToQualifiedNameList.
3129 SplitIdentifierString(char *rawstring, char separator,
3132 char *nextp = rawstring;
/* the cast to unsigned char keeps isspace() defined for high-bit bytes */
3137 while (isspace((unsigned char) *nextp))
3138 nextp++; /* skip leading whitespace */
3141 return true; /* allow empty string */
3143 /* At the top of the loop, we are at start of a new identifier. */
3151 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3152 curname = nextp + 1;
3155 endp = strchr(nextp + 1, '"');
3157 return false; /* mismatched quotes */
3159 break; /* found end of quoted name */
3160 /* Collapse adjacent quotes into one quote, and look again */
/* strlen(endp) bytes starting at endp + 1 include the trailing NUL, so the whole tail shifts left by one */
3161 memmove(endp, endp + 1, strlen(endp));
3164 /* endp now points at the terminating quote */
3169 /* Unquoted name --- extends to separator or whitespace */
3174 while (*nextp && *nextp != separator &&
3175 !isspace((unsigned char) *nextp))
3178 if (curname == nextp)
3179 return false; /* empty unquoted name not allowed */
3182 * Downcase the identifier, using same code as main lexer does.
3184 * XXX because we want to overwrite the input in-place, we cannot
3185 * support a downcasing transformation that increases the string
3186 * length. This is not a problem given the current implementation
3187 * of downcase_truncate_identifier, but we'll probably have to do
3188 * something about this someday.
3190 len = endp - curname;
3191 downname = downcase_truncate_identifier(curname, len, false);
3192 Assert(strlen(downname) <= len);
/* strncpy writes exactly len bytes: it zero-pads a shorter downname and never writes past the identifier's span */
3193 strncpy(curname, downname, len); /* strncpy is required here */
3197 while (isspace((unsigned char) *nextp))
3198 nextp++; /* skip trailing whitespace */
3200 if (*nextp == separator)
3203 while (isspace((unsigned char) *nextp))
3204 nextp++; /* skip leading whitespace for next */
3205 /* we expect another name, so done remains false */
3207 else if (*nextp == '\0')
3210 return false; /* invalid syntax */
3212 /* Now safe to overwrite separator with a null */
3215 /* Truncate name if it's overlength */
3216 truncate_identifier(curname, strlen(curname), false);
3219 * Finished isolating current name --- add it to list
/* the list stores the pointer into rawstring itself, not a copy */
3221 *namelist = lappend(*namelist, curname);
3223 /* Loop back if we didn't reach end of string */
/*
 * Reviewer summary: same single-pass scan as SplitIdentifierString, but
 * each isolated name is clipped to MAXPGPATH - 1 bytes, duplicated with
 * pstrdup() and canonicalized before being appended to *namelist.  The
 * list entries are therefore independent allocations, not pointers into
 * rawstring.
 */
3231 * SplitDirectoriesString --- parse a string containing directory names
3233 * This is similar to SplitIdentifierString, except that the parsing
3234 * rules are meant to handle pathnames instead of identifiers: there is
3235 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3236 * and we apply canonicalize_path() to each extracted string. Because of the
3237 * last, the returned strings are separately palloc'd rather than being
3238 * pointers into rawstring --- but we still scribble on rawstring.
3241 * rawstring: the input string; must be modifiable!
3242 * separator: the separator punctuation expected between directories
3243 * (typically ',' or ';'). Whitespace may also appear around
3246 * namelist: filled with a palloc'd list of directory names.
3247 * Caller should list_free_deep() this even on error return.
3249 * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3251 * Note that an empty string is considered okay here.
3254 SplitDirectoriesString(char *rawstring, char separator,
3257 char *nextp = rawstring;
/* the cast to unsigned char keeps isspace() defined for high-bit bytes */
3262 while (isspace((unsigned char) *nextp))
3263 nextp++; /* skip leading whitespace */
3266 return true; /* allow empty string */
3268 /* At the top of the loop, we are at start of a new directory. */
3276 /* Quoted name --- collapse quote-quote pairs */
3277 curname = nextp + 1;
3280 endp = strchr(nextp + 1, '"');
3282 return false; /* mismatched quotes */
3284 break; /* found end of quoted name */
3285 /* Collapse adjacent quotes into one quote, and look again */
/* strlen(endp) bytes starting at endp + 1 include the trailing NUL, so the whole tail shifts left by one */
3286 memmove(endp, endp + 1, strlen(endp));
3289 /* endp now points at the terminating quote */
3294 /* Unquoted name --- extends to separator or end of string */
3295 curname = endp = nextp;
3296 while (*nextp && *nextp != separator)
3298 /* trailing whitespace should not be included in name */
3299 if (!isspace((unsigned char) *nextp))
3303 if (curname == endp)
3304 return false; /* empty unquoted name not allowed */
3307 while (isspace((unsigned char) *nextp))
3308 nextp++; /* skip trailing whitespace */
3310 if (*nextp == separator)
3313 while (isspace((unsigned char) *nextp))
3314 nextp++; /* skip leading whitespace for next */
3315 /* we expect another name, so done remains false */
3317 else if (*nextp == '\0')
3320 return false; /* invalid syntax */
3322 /* Now safe to overwrite separator with a null */
3325 /* Truncate path if it's overlength */
/* NOTE(review): the cut is at a fixed byte offset, with no visible regard for multibyte character boundaries */
3326 if (strlen(curname) >= MAXPGPATH)
3327 curname[MAXPGPATH - 1] = '\0';
3330 * Finished isolating current name --- add it to list
/* copy first: the list must own a separate string, which canonicalize_path() then edits */
3332 curname = pstrdup(curname);
3333 canonicalize_path(curname);
3334 *namelist = lappend(*namelist, curname);
3336 /* Loop back if we didn't reach end of string */
3343 /*****************************************************************************
3344 * Comparison Functions used for bytea
3346 * Note: btree indexes need these routines not to leak memory; therefore,
3347 * be careful to free working copies of toasted datums. Most places don't
3348 * need to be so careful.
3349 *****************************************************************************/
3352 byteaeq(PG_FUNCTION_ARGS)
3354 Datum arg1 = PG_GETARG_DATUM(0);
3355 Datum arg2 = PG_GETARG_DATUM(1);
3361 * We can use a fast path for unequal lengths, which might save us from
3362 * having to detoast one or both values.
3364 len1 = toast_raw_datum_size(arg1);
3365 len2 = toast_raw_datum_size(arg2);
3370 bytea *barg1 = DatumGetByteaPP(arg1);
3371 bytea *barg2 = DatumGetByteaPP(arg2);
3373 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3374 len1 - VARHDRSZ) == 0);
3376 PG_FREE_IF_COPY(barg1, 0);
3377 PG_FREE_IF_COPY(barg2, 1);
3380 PG_RETURN_BOOL(result);
3384 byteane(PG_FUNCTION_ARGS)
3386 Datum arg1 = PG_GETARG_DATUM(0);
3387 Datum arg2 = PG_GETARG_DATUM(1);
3393 * We can use a fast path for unequal lengths, which might save us from
3394 * having to detoast one or both values.
3396 len1 = toast_raw_datum_size(arg1);
3397 len2 = toast_raw_datum_size(arg2);
3402 bytea *barg1 = DatumGetByteaPP(arg1);
3403 bytea *barg2 = DatumGetByteaPP(arg2);
3405 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3406 len1 - VARHDRSZ) != 0);
3408 PG_FREE_IF_COPY(barg1, 0);
3409 PG_FREE_IF_COPY(barg2, 1);
3412 PG_RETURN_BOOL(result);
3416 bytealt(PG_FUNCTION_ARGS)
3418 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3419 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3424 len1 = VARSIZE_ANY_EXHDR(arg1);
3425 len2 = VARSIZE_ANY_EXHDR(arg2);
3427 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3429 PG_FREE_IF_COPY(arg1, 0);
3430 PG_FREE_IF_COPY(arg2, 1);
3432 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3436 byteale(PG_FUNCTION_ARGS)
3438 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3439 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3444 len1 = VARSIZE_ANY_EXHDR(arg1);
3445 len2 = VARSIZE_ANY_EXHDR(arg2);
3447 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3449 PG_FREE_IF_COPY(arg1, 0);
3450 PG_FREE_IF_COPY(arg2, 1);
3452 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3456 byteagt(PG_FUNCTION_ARGS)
3458 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3459 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3464 len1 = VARSIZE_ANY_EXHDR(arg1);
3465 len2 = VARSIZE_ANY_EXHDR(arg2);
3467 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3469 PG_FREE_IF_COPY(arg1, 0);
3470 PG_FREE_IF_COPY(arg2, 1);
3472 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3476 byteage(PG_FUNCTION_ARGS)
3478 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3479 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3484 len1 = VARSIZE_ANY_EXHDR(arg1);
3485 len2 = VARSIZE_ANY_EXHDR(arg2);
3487 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3489 PG_FREE_IF_COPY(arg1, 0);
3490 PG_FREE_IF_COPY(arg2, 1);
3492 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3496 byteacmp(PG_FUNCTION_ARGS)
3498 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3499 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3504 len1 = VARSIZE_ANY_EXHDR(arg1);
3505 len2 = VARSIZE_ANY_EXHDR(arg2);
3507 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3508 if ((cmp == 0) && (len1 != len2))
3509 cmp = (len1 < len2) ? -1 : 1;
3511 PG_FREE_IF_COPY(arg1, 0);
3512 PG_FREE_IF_COPY(arg2, 1);
3514 PG_RETURN_INT32(cmp);
3518 bytea_sortsupport(PG_FUNCTION_ARGS)
3520 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3521 MemoryContext oldcontext;
3523 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3525 /* Use generic string SortSupport, forcing "C" collation */
3526 varstr_sortsupport(ssup, C_COLLATION_OID, false);
3528 MemoryContextSwitchTo(oldcontext);
3534 * appendStringInfoText
3536 * Append a text to str.
3537 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3540 appendStringInfoText(StringInfo str, const text *t)
3542 appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
/*
 * Reviewer summary: positions (start_posn, curr_posn) are counted in
 * characters and come from the text_position_* search state, while
 * start_ptr is a byte pointer into src_text.  The two are advanced in
 * lockstep, using charlen_to_bytelen() to turn character counts into byte
 * counts.  The result is accumulated in a StringInfo and converted to a
 * text value at the end.
 */
3547 * replace all occurrences of 'old_sub_str' in 'orig_str'
3548 * with 'new_sub_str' to form 'new_str'
3550 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3551 * otherwise returns 'new_str'
3554 replace_text(PG_FUNCTION_ARGS)
3556 text *src_text = PG_GETARG_TEXT_PP(0);
3557 text *from_sub_text = PG_GETARG_TEXT_PP(1);
3558 text *to_sub_text = PG_GETARG_TEXT_PP(2);
3560 int from_sub_text_len;
3561 TextPositionState state;
3569 text_position_setup(src_text, from_sub_text, &state);
3572 * Note: we check the converted string length, not the original, because
3573 * they could be different if the input contained invalid encoding.
3575 src_text_len = state.len1;
3576 from_sub_text_len = state.len2;
3578 /* Return unmodified source string if empty source or pattern */
3579 if (src_text_len < 1 || from_sub_text_len < 1)
/* every exit path after text_position_setup() runs text_position_cleanup() */
3581 text_position_cleanup(&state);
3582 PG_RETURN_TEXT_P(src_text);
3586 curr_posn = text_position_next(1, &state);
3588 /* When the from_sub_text is not found, there is nothing to do. */
3591 text_position_cleanup(&state);
3592 PG_RETURN_TEXT_P(src_text);
3595 /* start_ptr points to the start_posn'th character of src_text */
3596 start_ptr = VARDATA_ANY(src_text);
3598 initStringInfo(&str);
3602 CHECK_FOR_INTERRUPTS();
3604 /* copy the data skipped over by last text_position_next() */
3605 chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3606 appendBinaryStringInfo(&str, start_ptr, chunk_len);
/* emit the replacement in place of the matched pattern */
3608 appendStringInfoText(&str, to_sub_text);
/* step both the character position and the byte pointer past the copied chunk and the matched pattern */
3610 start_posn = curr_posn;
3611 start_ptr += chunk_len;
3612 start_posn += from_sub_text_len;
3613 start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3615 curr_posn = text_position_next(start_posn, &state);
3617 while (curr_posn > 0);
3619 /* copy trailing data */
/* byte count from start_ptr to the end of the varlena */
3620 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3621 appendBinaryStringInfo(&str, start_ptr, chunk_len);
3623 text_position_cleanup(&state);
3625 ret_text = cstring_to_text_with_len(str.data, str.len);
3628 PG_RETURN_TEXT_P(ret_text);
3632 * check_replace_text_has_escape_char
3634 * check whether replace_text contains escape char.
3637 check_replace_text_has_escape_char(const text *replace_text)
3639 const char *p = VARDATA_ANY(replace_text);
3640 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3642 if (pg_database_encoding_max_length() == 1)
3644 for (; p < p_end; p++)
3652 for (; p < p_end; p += pg_mblen(p))
/*
 * Reviewer summary: the replacement text is scanned for backslashes.
 * Literal stretches are appended as-is.  After a backslash, a digit 1-9
 * selects the pmatch[] entry to substitute, a second backslash emits one
 * literal backslash, and anything else leaves the backslash in the output
 * as ordinary data.  Match offsets (so/eo) are character positions; they
 * are converted to byte ranges relative to start_ptr/data_pos before the
 * matched source text is appended.
 */
3663 * appendStringInfoRegexpSubstr
3665 * Append replace_text to str, substituting regexp back references for
3666 * \n escapes. start_ptr is the start of the match in the source string,
3667 * at logical character position data_pos.
3670 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
3672 char *start_ptr, int data_pos)
3674 const char *p = VARDATA_ANY(replace_text);
3675 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
/* maximum bytes per character in the database encoding */
3676 int eml = pg_database_encoding_max_length();
3680 const char *chunk_start = p;
3684 /* Find next escape char. */
/* two scan loops: byte-wise stepping, and character-wise stepping via pg_mblen() */
3687 for (; p < p_end && *p != '\\'; p++)
3692 for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3696 /* Copy the text we just scanned over, if any. */
3697 if (p > chunk_start)
3698 appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3700 /* Done if at end of string, else advance over escape char. */
3707 /* Escape at very end of input. Treat same as unexpected char */
3708 appendStringInfoChar(str, '\\');
3712 if (*p >= '1' && *p <= '9')
3714 /* Use the back reference of regexp. */
3717 so = pmatch[idx].rm_so;
3718 eo = pmatch[idx].rm_eo;
3723 /* Use the entire matched string. */
3724 so = pmatch[0].rm_so;
3725 eo = pmatch[0].rm_eo;
3728 else if (*p == '\\')
3730 /* \\ means transfer one \ to output. */
3731 appendStringInfoChar(str, '\\');
3738 * If escape char is not followed by any expected char, just treat
3739 * it as ordinary data to copy. (XXX would it be better to throw
3742 appendStringInfoChar(str, '\\');
/* -1 in either offset means there is nothing to substitute for this reference */
3746 if (so != -1 && eo != -1)
3749 * Copy the text that is back reference of regexp. Note so and eo
3750 * are counted in characters not bytes.
3755 Assert(so >= data_pos);
/* first skip (so - data_pos) characters from start_ptr, then measure (eo - so) characters in bytes */
3756 chunk_start = start_ptr;
3757 chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3758 chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3759 appendBinaryStringInfo(str, chunk_start, chunk_len);
/* Size of the pmatch[] array handed to pg_regexec(): entry 0 is the whole match, the rest are subexpression matches. */
3764 #define REGEXP_REPLACE_BACKREF_CNT 10
/*
 * Reviewer summary: the source text is converted to pg_wchar for matching,
 * but output is assembled from the original bytes.  data_pos (characters)
 * and start_ptr (bytes) track the same location in the source and are
 * advanced together; search_start is the character offset at which the
 * next pg_regexec() call begins.
 */
3767 * replace_text_regexp
3769 * replace text that matches to regexp in src_text to replace_text.
3771 * Note: to avoid having to include regex.h in builtins.h, we declare
3772 * the regexp argument as void *, but really it's regex_t *.
3775 replace_text_regexp(text *src_text, void *regexp,
3776 text *replace_text, bool glob)
3779 regex_t *re = (regex_t *) regexp;
/* length in bytes, not characters */
3780 int src_text_len = VARSIZE_ANY_EXHDR(src_text);
3782 regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
3790 initStringInfo(&buf);
3792 /* Convert data string to wide characters. */
/* sized by byte count plus one; NOTE(review): the extra slot is presumably for a terminator - confirm against pg_mb2wchar_with_len() */
3793 data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3794 data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3796 /* Check whether replace_text has escape char. */
3797 have_escape = check_replace_text_has_escape_char(replace_text);
3799 /* start_ptr points to the data_pos'th character of src_text */
3800 start_ptr = (char *) VARDATA_ANY(src_text);
/* "<=" rather than "<": a search is still attempted at the very end of the data */
3804 while (search_start <= data_len)
3808 CHECK_FOR_INTERRUPTS();
3810 regexec_result = pg_regexec(re,
3814 NULL, /* no details */
3815 REGEXP_REPLACE_BACKREF_CNT,
3819 if (regexec_result == REG_NOMATCH)
3822 if (regexec_result != REG_OKAY)
3826 CHECK_FOR_INTERRUPTS();
3827 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3829 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3830 errmsg("regular expression failed: %s", errMsg)));
3834 * Copy the text to the left of the match position. Note we are given
3835 * character not byte indexes.
3837 if (pmatch[0].rm_so - data_pos > 0)
3841 chunk_len = charlen_to_bytelen(start_ptr,
3842 pmatch[0].rm_so - data_pos);
3843 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3846 * Advance start_ptr over that text, to avoid multiple rescans of
3847 * it if the replace_text contains multiple back-references.
3849 start_ptr += chunk_len;
3850 data_pos = pmatch[0].rm_so;
3854 * Copy the replace_text. Process back references when the
3855 * replace_text has escape characters.
3858 appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3859 start_ptr, data_pos);
3861 appendStringInfoText(&buf, replace_text);
3863 /* Advance start_ptr and data_pos over the matched text. */
3864 start_ptr += charlen_to_bytelen(start_ptr,
3865 pmatch[0].rm_eo - data_pos);
3866 data_pos = pmatch[0].rm_eo;
3869 * When global option is off, replace the first instance only.
3875 * Advance search position. Normally we start the next search at the
3876 * end of the previous match; but if the match was of zero length, we
3877 * have to advance by one character, or we'd just find the same match
3880 search_start = data_pos;
3881 if (pmatch[0].rm_so == pmatch[0].rm_eo)
3886 * Copy the text to the right of the last match.
3888 if (data_pos < data_len)
/* byte count from start_ptr to the end of the varlena */
3892 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3893 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3896 ret_text = cstring_to_text_with_len(buf.data, buf.len);
/*
 * Reviewer summary: start_posn/end_posn are one-based character positions
 * obtained from the text_position_* search state.  The loop steps from one
 * separator occurrence to the next until the requested field is reached or
 * no further separator is found; the field is then cut out with
 * text_substring().
 */
3905 * parse input string
3906 * return ord item (1 based)
3907 * based on provided field separator
3910 split_text(PG_FUNCTION_ARGS)
3912 text *inputstring = PG_GETARG_TEXT_PP(0);
3913 text *fldsep = PG_GETARG_TEXT_PP(1);
3914 int fldnum = PG_GETARG_INT32(2);
3915 int inputstring_len;
3917 TextPositionState state;
3922 /* field number is 1 based */
3925 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3926 errmsg("field position must be greater than zero")));
3928 text_position_setup(inputstring, fldsep, &state);
3931 * Note: we check the converted string length, not the original, because
3932 * they could be different if the input contained invalid encoding.
3934 inputstring_len = state.len1;
3935 fldsep_len = state.len2;
3937 /* return empty string for empty input string */
3938 if (inputstring_len < 1)
/* every exit path after text_position_setup() runs text_position_cleanup() */
3940 text_position_cleanup(&state);
3941 PG_RETURN_TEXT_P(cstring_to_text(""));
3944 /* empty field separator */
3947 text_position_cleanup(&state);
3948 /* if first field, return input string, else empty string */
3950 PG_RETURN_TEXT_P(inputstring);
3952 PG_RETURN_TEXT_P(cstring_to_text(""));
3955 /* identify bounds of first field */
3957 end_posn = text_position_next(1, &state);
3959 /* special case if fldsep not found at all */
3962 text_position_cleanup(&state);
3963 /* if field 1 requested, return input string, else empty string */
3965 PG_RETURN_TEXT_P(inputstring);
3967 PG_RETURN_TEXT_P(cstring_to_text(""));
/* fldnum is decremented once per separator stepped over; the loop ends when it reaches zero or no separator remains */
3970 while (end_posn > 0 && --fldnum > 0)
3972 /* identify bounds of next field */
/* the next field starts right after the separator just found */
3973 start_posn = end_posn + fldsep_len;
3974 end_posn = text_position_next(start_posn, &state);
3977 text_position_cleanup(&state);
3981 /* N'th field separator not found */
3982 /* if last field requested, return it, else empty string */
3984 result_text = text_substring(PointerGetDatum(inputstring),
3989 result_text = cstring_to_text("");
3993 /* non-last field requested */
3994 result_text = text_substring(PointerGetDatum(inputstring),
3996 end_posn - start_posn,
4000 PG_RETURN_TEXT_P(result_text);
4004 * Convenience function to return true when two text params are equal.
4007 text_isequal(text *txt1, text *txt2)
4009 return DatumGetBool(DirectFunctionCall2(texteq,
4010 PointerGetDatum(txt1),
4011 PointerGetDatum(txt2)));
4016 * parse input string and return text array of elements,
4017 * based on provided field separator
4020 text_to_array(PG_FUNCTION_ARGS)
4022 return text_to_array_internal(fcinfo);
4026 * text_to_array_null
4027 * parse input string and return text array of elements,
4028 * based on provided field separator and null string
4030 * This is a separate entry point only to prevent the regression tests from
4031 * complaining about different argument sets for the same internal function.
4034 text_to_array_null(PG_FUNCTION_ARGS)
4036 return text_to_array_internal(fcinfo);
/*
 * Reviewer summary: two strategies are visible.  With a non-null fldsep,
 * separator occurrences are located with the text_position_* machinery and
 * the pieces between them become array elements.  With a null fldsep,
 * every character (measured with pg_mblen) becomes its own element.  In
 * both cases an element equal to null_string, when one was supplied, is
 * flagged as NULL via is_null.
 */
4040 * common code for text_to_array and text_to_array_null functions
4042 * These are not strict so we have to test for null inputs explicitly.
4045 text_to_array_internal(PG_FUNCTION_ARGS)
4050 int inputstring_len;
4055 ArrayBuildState *astate = NULL;
4057 /* when input string is NULL, then result is NULL too */
4058 if (PG_ARGISNULL(0))
4061 inputstring = PG_GETARG_TEXT_PP(0);
4063 /* fldsep can be NULL */
4064 if (!PG_ARGISNULL(1))
4065 fldsep = PG_GETARG_TEXT_PP(1);
4069 /* null_string can be NULL or omitted */
/* PG_NARGS() is checked first because the two-argument entry point passes no third argument at all */
4070 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4071 null_string = PG_GETARG_TEXT_PP(2);
4078 * Normal case with non-null fldsep. Use the text_position machinery
4079 * to search for occurrences of fldsep.
4081 TextPositionState state;
4087 text_position_setup(inputstring, fldsep, &state);
4090 * Note: we check the converted string length, not the original,
4091 * because they could be different if the input contained invalid
4094 inputstring_len = state.len1;
4095 fldsep_len = state.len2;
4097 /* return empty array for empty input string */
4098 if (inputstring_len < 1)
4100 text_position_cleanup(&state);
4101 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4105 * empty field separator: return the input string as a one-element
4110 text_position_cleanup(&state);
4111 /* single element can be a NULL too */
4112 is_null = null_string ? text_isequal(inputstring, null_string) : false;
4113 PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID,
4114 PointerGetDatum(inputstring),
4119 /* start_ptr points to the start_posn'th character of inputstring */
4120 start_ptr = VARDATA_ANY(inputstring);
4122 for (fldnum = 1;; fldnum++) /* field number is 1 based */
4124 CHECK_FOR_INTERRUPTS();
4126 end_posn = text_position_next(start_posn, &state);
4130 /* fetch last field */
/* byte count from start_ptr to the end of the varlena */
4131 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4135 /* fetch non-last field */
/* positions are in characters, so convert the field width to bytes */
4136 chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
4139 /* must build a temp text datum to pass to accumArrayResult */
4140 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4141 is_null = null_string ? text_isequal(result_text, null_string) : false;
4143 /* stash away this field */
4144 astate = accumArrayResult(astate,
4145 PointerGetDatum(result_text),
4148 CurrentMemoryContext);
/* step both the character position and the byte pointer past the field and the separator that ended it */
4155 start_posn = end_posn;
4156 start_ptr += chunk_len;
4157 start_posn += fldsep_len;
4158 start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
4161 text_position_cleanup(&state);
4166 * When fldsep is NULL, each character in the inputstring becomes an
4167 * element in the result array. The separator is effectively the
4168 * space between characters.
/* in this branch inputstring_len is a byte count and serves as the remaining-bytes counter of the loop below */
4170 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4172 /* return empty array for empty input string */
4173 if (inputstring_len < 1)
4174 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4176 start_ptr = VARDATA_ANY(inputstring);
4178 while (inputstring_len > 0)
/* byte length of the character at start_ptr */
4180 int chunk_len = pg_mblen(start_ptr);
4182 CHECK_FOR_INTERRUPTS();
4184 /* must build a temp text datum to pass to accumArrayResult */
4185 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4186 is_null = null_string ? text_isequal(result_text, null_string) : false;
4188 /* stash away this field */
4189 astate = accumArrayResult(astate,
4190 PointerGetDatum(result_text),
4193 CurrentMemoryContext);
4197 start_ptr += chunk_len;
4198 inputstring_len -= chunk_len;
4202 PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4203 CurrentMemoryContext));
4208 * concatenate Cstring representation of input array elements
4209 * using provided field separator
4212 array_to_text(PG_FUNCTION_ARGS)
4214 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4215 char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4217 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4221 * array_to_text_null
4222 * concatenate Cstring representation of input array elements
4223 * using provided field separator and null string
4225 * This version is not strict so we have to test for null inputs explicitly.
4228 array_to_text_null(PG_FUNCTION_ARGS)
4234 /* returns NULL when first or second parameter is NULL */
4235 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4238 v = PG_GETARG_ARRAYTYPE_P(0);
4239 fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4241 /* NULL null string is passed through as a null pointer */
4242 if (!PG_ARGISNULL(2))
4243 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4247 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4251 * common code for array_to_text and array_to_text_null functions
4254 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4255 const char *fldsep, const char *null_string)
4266 bool printed = false;
4271 ArrayMetaState *my_extra;
4273 ndims = ARR_NDIM(v);
4275 nitems = ArrayGetNItems(ndims, dims);
4277 /* if there are no elements, return an empty string */
4279 return cstring_to_text_with_len("", 0);
4281 element_type = ARR_ELEMTYPE(v);
4282 initStringInfo(&buf);
4285 * We arrange to look up info about element type, including its output
4286 * conversion proc, only once per series of calls, assuming the element
4287 * type doesn't change underneath us.
4289 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4290 if (my_extra == NULL)
4292 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4293 sizeof(ArrayMetaState));
4294 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4295 my_extra->element_type = ~element_type;
4298 if (my_extra->element_type != element_type)
4301 * Get info about element type, including its output conversion proc
4303 get_type_io_data(element_type, IOFunc_output,
4304 &my_extra->typlen, &my_extra->typbyval,
4305 &my_extra->typalign, &my_extra->typdelim,
4306 &my_extra->typioparam, &my_extra->typiofunc);
4307 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4308 fcinfo->flinfo->fn_mcxt);
4309 my_extra->element_type = element_type;
4311 typlen = my_extra->typlen;
4312 typbyval = my_extra->typbyval;
4313 typalign = my_extra->typalign;
4315 p = ARR_DATA_PTR(v);
4316 bitmap = ARR_NULLBITMAP(v);
4319 for (i = 0; i < nitems; i++)
4324 /* Get source element, checking for NULL */
4325 if (bitmap && (*bitmap & bitmask) == 0)
4327 /* if null_string is NULL, we just ignore null elements */
4328 if (null_string != NULL)
4331 appendStringInfo(&buf, "%s%s", fldsep, null_string);
4333 appendStringInfoString(&buf, null_string);
4339 itemvalue = fetch_att(p, typbyval, typlen);
4341 value = OutputFunctionCall(&my_extra->proc, itemvalue);
4344 appendStringInfo(&buf, "%s%s", fldsep, value);
4346 appendStringInfoString(&buf, value);
4349 p = att_addlength_pointer(p, typlen, p);
4350 p = (char *) att_align_nominal(p, typalign);
4353 /* advance bitmap pointer if any */
4357 if (bitmask == 0x100)
4365 result = cstring_to_text_with_len(buf.data, buf.len);
4373 * Convert an int32 to a string containing a base 16 (hex) representation of
4377 to_hex32(PG_FUNCTION_ARGS)
4379 uint32 value = (uint32) PG_GETARG_INT32(0);
4381 const char *digits = "0123456789abcdef";
4382 char buf[32]; /* bigger than needed, but reasonable */
4384 ptr = buf + sizeof(buf) - 1;
4389 *--ptr = digits[value % HEXBASE];
4391 } while (ptr > buf && value);
4393 PG_RETURN_TEXT_P(cstring_to_text(ptr));
4397 * Convert an int64 to a string containing a base 16 (hex) representation of
4401 to_hex64(PG_FUNCTION_ARGS)
4403 uint64 value = (uint64) PG_GETARG_INT64(0);
4405 const char *digits = "0123456789abcdef";
4406 char buf[32]; /* bigger than needed, but reasonable */
4408 ptr = buf + sizeof(buf) - 1;
4413 *--ptr = digits[value % HEXBASE];
4415 } while (ptr > buf && value);
4417 PG_RETURN_TEXT_P(cstring_to_text(ptr));
4421 * Create an md5 hash of a text string and return it as hex
4423 * md5 produces a 16 byte (128 bit) hash; double it for hex
4425 #define MD5_HASH_LEN 32
4428 md5_text(PG_FUNCTION_ARGS)
4430 text *in_text = PG_GETARG_TEXT_PP(0);
4432 char hexsum[MD5_HASH_LEN + 1];
4434 /* Calculate the length of the buffer using varlena metadata */
4435 len = VARSIZE_ANY_EXHDR(in_text);
4437 /* get the hash result */
4438 if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4440 (errcode(ERRCODE_OUT_OF_MEMORY),
4441 errmsg("out of memory")));
4443 /* convert to text and return it */
4444 PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4448 * Create an md5 hash of a bytea field and return it as a hex string:
4449 * 16-byte md5 digest is represented in 32 hex characters.
4452 md5_bytea(PG_FUNCTION_ARGS)
4454 bytea *in = PG_GETARG_BYTEA_PP(0);
4456 char hexsum[MD5_HASH_LEN + 1];
4458 len = VARSIZE_ANY_EXHDR(in);
4459 if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4461 (errcode(ERRCODE_OUT_OF_MEMORY),
4462 errmsg("out of memory")));
4464 PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4468 * Return the size of a datum, possibly compressed
4470 * Works on any data type
4473 pg_column_size(PG_FUNCTION_ARGS)
4475 Datum value = PG_GETARG_DATUM(0);
4479 /* On first call, get the input type's typlen, and save at *fn_extra */
4480 if (fcinfo->flinfo->fn_extra == NULL)
4482 /* Lookup the datatype of the supplied argument */
4483 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4485 typlen = get_typlen(argtypeid);
4486 if (typlen == 0) /* should not happen */
4487 elog(ERROR, "cache lookup failed for type %u", argtypeid);
4489 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4491 *((int *) fcinfo->flinfo->fn_extra) = typlen;
4494 typlen = *((int *) fcinfo->flinfo->fn_extra);
4498 /* varlena type, possibly toasted */
4499 result = toast_datum_size(value);
4501 else if (typlen == -2)
4504 result = strlen(DatumGetCString(value)) + 1;
4508 /* ordinary fixed-width type */
4512 PG_RETURN_INT32(result);
4516 * string_agg - Concatenates values and returns string.
4518 * Syntax: string_agg(value text, delimiter text) RETURNS text
4520 * Note: Any NULL values are ignored. The first-call delimiter isn't
4521 * actually used at all, and on subsequent calls the delimiter precedes
4522 * the associated value.
4525 /* subroutine to initialize state */
4527 makeStringAggState(FunctionCallInfo fcinfo)
4530 MemoryContext aggcontext;
4531 MemoryContext oldcontext;
4533 if (!AggCheckCallContext(fcinfo, &aggcontext))
4535 /* cannot be called directly because of internal-type argument */
4536 elog(ERROR, "string_agg_transfn called in non-aggregate context");
4540 * Create state in aggregate context. It'll stay there across subsequent
4543 oldcontext = MemoryContextSwitchTo(aggcontext);
4544 state = makeStringInfo();
4545 MemoryContextSwitchTo(oldcontext);
4551 string_agg_transfn(PG_FUNCTION_ARGS)
4555 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4557 /* Append the value unless null. */
4558 if (!PG_ARGISNULL(1))
4560 /* On the first time through, we ignore the delimiter. */
4562 state = makeStringAggState(fcinfo);
4563 else if (!PG_ARGISNULL(2))
4564 appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
4566 appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
4570 * The transition type for string_agg() is declared to be "internal",
4571 * which is a pass-by-value type the same size as a pointer.
4573 PG_RETURN_POINTER(state);
4577 string_agg_finalfn(PG_FUNCTION_ARGS)
4581 /* cannot be called directly because of internal-type argument */
4582 Assert(AggCheckCallContext(fcinfo, NULL));
4584 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4587 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
4593 * Implementation of both concat() and concat_ws().
4595 * sepstr is the separator string to place between values.
4596 * argidx identifies the first argument to concatenate (counting from zero).
4597 * Returns NULL if result should be NULL, else text value.
4600 concat_internal(const char *sepstr, int argidx,
4601 FunctionCallInfo fcinfo)
4605 bool first_arg = true;
4609 * concat(VARIADIC some-array) is essentially equivalent to
4610 * array_to_text(), ie concat the array elements with the given separator.
4611 * So we just pass the case off to that code.
4613 if (get_fn_expr_variadic(fcinfo->flinfo))
4617 /* Should have just the one argument */
4618 Assert(argidx == PG_NARGS() - 1);
4620 /* concat(VARIADIC NULL) is defined as NULL */
4621 if (PG_ARGISNULL(argidx))
4625 * Non-null argument had better be an array. We assume that any call
4626 * context that could let get_fn_expr_variadic return true will have
4627 * checked that a VARIADIC-labeled parameter actually is an array. So
4628 * it should be okay to just Assert that it's an array rather than
4629 * doing a full-fledged error check.
4631 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
4633 /* OK, safe to fetch the array value */
4634 arr = PG_GETARG_ARRAYTYPE_P(argidx);
4637 * And serialize the array. We tell array_to_text to ignore null
4638 * elements, which matches the behavior of the loop below.
4640 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4643 /* Normal case without explicit VARIADIC marker */
4644 initStringInfo(&str);
4646 for (i = argidx; i < PG_NARGS(); i++)
4648 if (!PG_ARGISNULL(i))
4650 Datum value = PG_GETARG_DATUM(i);
4655 /* add separator if appropriate */
4659 appendStringInfoString(&str, sepstr);
4661 /* call the appropriate type output function, append the result */
4662 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4663 if (!OidIsValid(valtype))
4664 elog(ERROR, "could not determine data type of concat() input");
4665 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4666 appendStringInfoString(&str,
4667 OidOutputFunctionCall(typOutput, value));
4671 result = cstring_to_text_with_len(str.data, str.len);
4678 * Concatenate all arguments. NULL arguments are ignored.
4681 text_concat(PG_FUNCTION_ARGS)
4685 result = concat_internal("", 0, fcinfo);
4688 PG_RETURN_TEXT_P(result);
4692 * Concatenate all but first argument value with separators. The first
4693 * parameter is used as the separator. NULL arguments are ignored.
4696 text_concat_ws(PG_FUNCTION_ARGS)
4701 /* return NULL when separator is NULL */
4702 if (PG_ARGISNULL(0))
4704 sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
4706 result = concat_internal(sep, 1, fcinfo);
4709 PG_RETURN_TEXT_P(result);
4713 * Return first n characters in the string. When n is negative,
4714 * return all but last |n| characters.
4717 text_left(PG_FUNCTION_ARGS)
4719 text *str = PG_GETARG_TEXT_PP(0);
4720 const char *p = VARDATA_ANY(str);
4721 int len = VARSIZE_ANY_EXHDR(str);
4722 int n = PG_GETARG_INT32(1);
4726 n = pg_mbstrlen_with_len(p, len) + n;
4727 rlen = pg_mbcharcliplen(p, len, n);
4729 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
4733 * Return last n characters in the string. When n is negative,
4734 * return all but first |n| characters.
4737 text_right(PG_FUNCTION_ARGS)
4739 text *str = PG_GETARG_TEXT_PP(0);
4740 const char *p = VARDATA_ANY(str);
4741 int len = VARSIZE_ANY_EXHDR(str);
4742 int n = PG_GETARG_INT32(1);
4748 n = pg_mbstrlen_with_len(p, len) - n;
4749 off = pg_mbcharcliplen(p, len, n);
4751 PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4755 * Return reversed string
4758 text_reverse(PG_FUNCTION_ARGS)
4760 text *str = PG_GETARG_TEXT_PP(0);
4761 const char *p = VARDATA_ANY(str);
4762 int len = VARSIZE_ANY_EXHDR(str);
4763 const char *endp = p + len;
4767 result = palloc(len + VARHDRSZ);
4768 dst = (char *) VARDATA(result) + len;
4769 SET_VARSIZE(result, len + VARHDRSZ);
4771 if (pg_database_encoding_max_length() > 1)
4773 /* multibyte version */
4786 /* single byte version */
4791 PG_RETURN_TEXT_P(result);
/*
 * Support macros for text_format()
 */

/* flag bit: the '-' flag was given in the format specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Step "ptr" to the next character of the format string, raising an error
 * if that reaches or passes "end_ptr", i.e. if the specifier is cut off by
 * the end of the string.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do \
	{ \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
4810 * Returns a formatted string
4813 text_format(PG_FUNCTION_ARGS)
4818 const char *start_ptr;
4819 const char *end_ptr;
4824 Datum *elements = NULL;
4826 Oid element_type = InvalidOid;
4827 Oid prev_type = InvalidOid;
4828 Oid prev_width_type = InvalidOid;
4829 FmgrInfo typoutputfinfo;
4830 FmgrInfo typoutputinfo_width;
4832 /* When format string is null, immediately return null */
4833 if (PG_ARGISNULL(0))
4836 /* If argument is marked VARIADIC, expand array into elements */
4837 if (get_fn_expr_variadic(fcinfo->flinfo))
4845 /* Should have just the one argument */
4846 Assert(PG_NARGS() == 2);
4848 /* If argument is NULL, we treat it as zero-length array */
4849 if (PG_ARGISNULL(1))
4854 * Non-null argument had better be an array. We assume that any
4855 * call context that could let get_fn_expr_variadic return true
4856 * will have checked that a VARIADIC-labeled parameter actually is
4857 * an array. So it should be okay to just Assert that it's an
4858 * array rather than doing a full-fledged error check.
4860 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
4862 /* OK, safe to fetch the array value */
4863 arr = PG_GETARG_ARRAYTYPE_P(1);
4865 /* Get info about array element type */
4866 element_type = ARR_ELEMTYPE(arr);
4867 get_typlenbyvalalign(element_type,
4868 &elmlen, &elmbyval, &elmalign);
4870 /* Extract all array elements */
4871 deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
4872 &elements, &nulls, &nitems);
4876 funcvariadic = true;
4880 /* Non-variadic case, we'll process the arguments individually */
4882 funcvariadic = false;
4885 /* Setup for main loop. */
4886 fmt = PG_GETARG_TEXT_PP(0);
4887 start_ptr = VARDATA_ANY(fmt);
4888 end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
4889 initStringInfo(&str);
4890 arg = 1; /* next argument position to print */
4892 /* Scan format string, looking for conversion specifiers. */
4893 for (cp = start_ptr; cp < end_ptr; cp++)
4904 * If it's not the start of a conversion specifier, just copy it to
4905 * the output buffer.
4909 appendStringInfoCharMacro(&str, *cp);
4913 ADVANCE_PARSE_POINTER(cp, end_ptr);
4915 /* Easy case: %% outputs a single % */
4918 appendStringInfoCharMacro(&str, *cp);
4922 /* Parse the optional portions of the format specifier */
4923 cp = text_format_parse_format(cp, end_ptr,
4928 * Next we should see the main conversion specifier. Whether or not
4929 * an argument position was present, it's known that at least one
4930 * character remains in the string at this point. Experience suggests
4931 * that it's worth checking that that character is one of the expected
4932 * ones before we try to fetch arguments, so as to produce the least
4933 * confusing response to a mis-formatted specifier.
4935 if (strchr("sIL", *cp) == NULL)
4937 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4938 errmsg("unrecognized format() type specifier \"%c\"",
4940 errhint("For a single \"%%\" use \"%%%%\".")));
4942 /* If indirect width was specified, get its value */
4945 /* Collect the specified or next argument position */
4950 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4951 errmsg("too few arguments for format()")));
4953 /* Get the value and type of the selected argument */
4956 value = PG_GETARG_DATUM(arg);
4957 isNull = PG_ARGISNULL(arg);
4958 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
4962 value = elements[arg - 1];
4963 isNull = nulls[arg - 1];
4964 typid = element_type;
4966 if (!OidIsValid(typid))
4967 elog(ERROR, "could not determine data type of format() input");
4971 /* We can treat NULL width the same as zero */
4974 else if (typid == INT4OID)
4975 width = DatumGetInt32(value);
4976 else if (typid == INT2OID)
4977 width = DatumGetInt16(value);
4980 /* For less-usual datatypes, convert to text then to int */
4983 if (typid != prev_width_type)
4988 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
4989 fmgr_info(typoutputfunc, &typoutputinfo_width);
4990 prev_width_type = typid;
4993 str = OutputFunctionCall(&typoutputinfo_width, value);
4995 /* pg_atoi will complain about bad data or overflow */
4996 width = pg_atoi(str, sizeof(int), '\0');
5002 /* Collect the specified or next argument position */
5007 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5008 errmsg("too few arguments for format()")));
5010 /* Get the value and type of the selected argument */
5013 value = PG_GETARG_DATUM(arg);
5014 isNull = PG_ARGISNULL(arg);
5015 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5019 value = elements[arg - 1];
5020 isNull = nulls[arg - 1];
5021 typid = element_type;
5023 if (!OidIsValid(typid))
5024 elog(ERROR, "could not determine data type of format() input");
5029 * Get the appropriate typOutput function, reusing previous one if
5030 * same type as previous argument. That's particularly useful in the
5031 * variadic-array case, but often saves work even for ordinary calls.
5033 if (typid != prev_type)
5038 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5039 fmgr_info(typoutputfunc, &typoutputfinfo);
5044 * And now we can format the value.
5051 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5056 /* should not get here, because of previous check */
5058 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5059 errmsg("unrecognized format() type specifier \"%c\"",
5061 errhint("For a single \"%%\" use \"%%%%\".")));
5066 /* Don't need deconstruct_array results anymore. */
5067 if (elements != NULL)
5072 /* Generate results. */
5073 result = cstring_to_text_with_len(str.data, str.len);
5076 PG_RETURN_TEXT_P(result);
5080 * Parse contiguous digits as a decimal number.
5082 * Returns true if some digits could be parsed.
5083 * The value is returned into *value, and *ptr is advanced to the next
5084 * character to be parsed.
5086 * Note parsing invariant: at least one character is known available before
5087 * string end (end_ptr) at entry, and this is still true at exit.
5090 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5093 const char *cp = *ptr;
5096 while (*cp >= '0' && *cp <= '9')
5098 int newval = val * 10 + (*cp - '0');
5100 if (newval / 10 != val) /* overflow? */
5102 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5103 errmsg("number is out of range")));
5105 ADVANCE_PARSE_POINTER(cp, end_ptr);
5116 * Parse a format specifier (generally following the SUS printf spec).
5118 * We have already advanced over the initial '%', and we are looking for
5119 * [argpos][flags][width]type (but the type character is not consumed here).
5121 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5122 * Output parameters:
5123 * argpos: argument position for value to be printed. -1 means unspecified.
5124 * widthpos: argument position for width. Zero means the argument position
5125 * was unspecified (ie, take the next arg) and -1 means no width
5126 * argument (width was omitted or specified as a constant).
5127 * flags: bitmask of flags.
5128 * width: directly-specified width value. Zero means the width was omitted
5129 * (note it's not necessary to distinguish this case from an explicit
5130 * zero width value).
5132 * The function result is the next character position to be parsed, ie, the
5133 * location where the type character is/should be.
5135 * Note parsing invariant: at least one character is known available before
5136 * string end (end_ptr) at entry, and this is still true at exit.
5139 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5140 int *argpos, int *widthpos,
5141 int *flags, int *width)
5143 const char *cp = start_ptr;
5146 /* set defaults for output parameters */
5152 /* try to identify first number */
5153 if (text_format_parse_digits(&cp, end_ptr, &n))
5157 /* Must be just a width and a type, so we're done */
5161 /* The number was argument position */
5163 /* Explicit 0 for argument index is immediately refused */
5166 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5167 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5168 ADVANCE_PARSE_POINTER(cp, end_ptr);
5171 /* Handle flags (only minus is supported now) */
5174 *flags |= TEXT_FORMAT_FLAG_MINUS;
5175 ADVANCE_PARSE_POINTER(cp, end_ptr);
5180 /* Handle indirect width */
5181 ADVANCE_PARSE_POINTER(cp, end_ptr);
5182 if (text_format_parse_digits(&cp, end_ptr, &n))
5184 /* number in this position must be closed by $ */
5187 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5188 errmsg("width argument position must be ended by \"$\"")));
5189 /* The number was width argument position */
5191 /* Explicit 0 for argument index is immediately refused */
5194 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5195 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5196 ADVANCE_PARSE_POINTER(cp, end_ptr);
5199 *widthpos = 0; /* width's argument position is unspecified */
5203 /* Check for direct width specification */
5204 if (text_format_parse_digits(&cp, end_ptr, &n))
5208 /* cp should now be pointing at type character */
5213 * Format a %s, %I, or %L conversion
5216 text_format_string_conversion(StringInfo buf, char conversion,
5217 FmgrInfo *typOutputInfo,
5218 Datum value, bool isNull,
5219 int flags, int width)
5223 /* Handle NULL arguments before trying to stringify the value. */
5226 if (conversion == 's')
5227 text_format_append_string(buf, "", flags, width);
5228 else if (conversion == 'L')
5229 text_format_append_string(buf, "NULL", flags, width);
5230 else if (conversion == 'I')
5232 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5233 errmsg("null values cannot be formatted as an SQL identifier")));
5238 str = OutputFunctionCall(typOutputInfo, value);
5241 if (conversion == 'I')
5243 /* quote_identifier may or may not allocate a new string. */
5244 text_format_append_string(buf, quote_identifier(str), flags, width);
5246 else if (conversion == 'L')
5248 char *qstr = quote_literal_cstr(str);
5250 text_format_append_string(buf, qstr, flags, width);
5251 /* quote_literal_cstr() always allocates a new string */
5255 text_format_append_string(buf, str, flags, width);
5262 * Append str to buf, padding as directed by flags/width
5265 text_format_append_string(StringInfo buf, const char *str,
5266 int flags, int width)
5268 bool align_to_left = false;
5271 /* fast path for typical easy case */
5274 appendStringInfoString(buf, str);
5280 /* Negative width: implicit '-' flag, then take absolute value */
5281 align_to_left = true;
5282 /* -INT_MIN is undefined */
5283 if (width <= INT_MIN)
5285 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5286 errmsg("number is out of range")));
5289 else if (flags & TEXT_FORMAT_FLAG_MINUS)
5290 align_to_left = true;
5292 len = pg_mbstrlen(str);
5296 appendStringInfoString(buf, str);
5298 appendStringInfoSpaces(buf, width - len);
5304 appendStringInfoSpaces(buf, width - len);
5305 appendStringInfoString(buf, str);
5310 * text_format_nv - nonvariadic wrapper for text_format function.
5312 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5313 * which checks that all built-in functions that share the implementing C
5314 * function take the same number of arguments.
5317 text_format_nv(PG_FUNCTION_ARGS)
5319 return text_format(fcinfo);
/*
 * rest_of_char_same
 *		Helper function for Levenshtein distance functions.  Faster than
 *		memcmp(), for this use case.
 *
 * Returns true when the first len bytes of s1 and s2 are equal.  Bytes
 * are compared from the last one backwards; a len of zero or less
 * compares nothing and returns true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5338 /* Expand each Levenshtein distance variant */
5339 #include "levenshtein.c"
5340 #define LEVENSHTEIN_LESS_EQUAL
5341 #include "levenshtein.c"