granicus.if.org Git - postgresql/blob - src/backend/utils/adt/varlena.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * varlena.c
   4  *        Functions for the variable-length built-in types.
   5  *
   6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/utils/adt/varlena.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <limits.h>
  19
  20 #include "access/hash.h"
  21 #include "access/tuptoaster.h"
  22 #include "catalog/pg_collation.h"
  23 #include "catalog/pg_type.h"
  24 #include "common/md5.h"
  25 #include "lib/hyperloglog.h"
  26 #include "libpq/pqformat.h"
  27 #include "miscadmin.h"
  28 #include "parser/scansup.h"
  29 #include "port/pg_bswap.h"
  30 #include "regex/regex.h"
  31 #include "utils/builtins.h"
  32 #include "utils/bytea.h"
  33 #include "utils/lsyscache.h"
  34 #include "utils/memutils.h"
  35 #include "utils/pg_locale.h"
  36 #include "utils/sortsupport.h"
  37 #include "utils/varlena.h"
  38
  39
/*
 * GUC variable: selects the textual format produced by byteaout(), which
 * tests it against BYTEA_OUTPUT_HEX and BYTEA_OUTPUT_ESCAPE.
 */
int			bytea_output = BYTEA_OUTPUT_HEX;

/*
 * File-local aliases for struct varlena; they are the cast targets of the
 * DatumGetUnknownP* and DatumGetVarStringP* macros defined below.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
  45
/*
 * Working state for substring searching.  A pointer to this struct is what
 * text_position_setup(), text_position_next() and text_position_cleanup()
 * take as their "state" argument.
 */
typedef struct
{
	bool		use_wchar;		/* T if multibyte encoding */
	char	   *str1;			/* use these if not use_wchar */
	char	   *str2;			/* note: these point to original texts */
	pg_wchar   *wstr1;			/* use these if use_wchar */
	pg_wchar   *wstr2;			/* note: these are palloc'd */
	int			len1;			/* string lengths in logical characters */
	int			len2;
	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */
} TextPositionState;
  59
/*
 * Working state for sorting variable-length strings: scratch buffers, a
 * one-entry cache of the last comparison, and cardinality estimators for
 * abbreviated keys.
 *
 * NOTE(review): presumably reached through the SortSupport argument of the
 * varstrfastcmp_*() and varstr_abbrev_*() routines declared below; the setup
 * code is outside this view, so confirm there.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;
	int			buflen2;
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;
	bool		bpchar;			/* Sorting bpchar, not varchar/text/bytea? */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
#ifdef HAVE_LOCALE_T
	pg_locale_t locale;
#endif
} VarStringSortSupport;
  80
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Argument/result accessors for the file-local "unknown" alias of struct
 * varlena.  The plain forms detoast via PG_DETOAST_DATUM, the Copy forms via
 * PG_DETOAST_DATUM_COPY.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Datum-to-pointer conversions for the VarString alias; the PP form uses
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
  95
/* Prototypes for routines private to this file */

/* sort support for variable-length strings */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* text helpers */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
			   int32 start,
			   int32 length,
			   bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int	text_position(text *t1, text *t2);
static void text_position_setup(text *t1, text *t2, TextPositionState *state);
static int	text_position_next(int start_pos, TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static int	text_cmp(text *arg1, text *arg2, Oid collid);

/* bytea helpers */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
				int S,
				int L,
				bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* array conversion, string aggregation and format() helpers */
static void appendStringInfoText(StringInfo str, const text *t);
static Datum text_to_array_internal(PG_FUNCTION_ARGS);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
					   const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
						 int *value);
static const char *text_format_parse_format(const char *start_ptr,
						 const char *end_ptr,
						 int *argpos, int *widthpos,
						 int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
							  FmgrInfo *typOutputInfo,
							  Datum value, bool isNull,
							  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
						  int flags, int width);
 137
 138
 139 /*****************************************************************************
 140  *       CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                                                  *
 141  *****************************************************************************/
 142
 143 /*
 144  * cstring_to_text
 145  *
 146  * Create a text value from a null-terminated C string.
 147  *
 148  * The new text value is freshly palloc'd with a full-size VARHDR.
 149  */
 150 text *
 151 cstring_to_text(const char *s)
 152 {
 153         return cstring_to_text_with_len(s, strlen(s));
 154 }
 155
 156 /*
 157  * cstring_to_text_with_len
 158  *
 159  * Same as cstring_to_text except the caller specifies the string length;
 160  * the string need not be null_terminated.
 161  */
 162 text *
 163 cstring_to_text_with_len(const char *s, int len)
 164 {
 165         text       *result = (text *) palloc(len + VARHDRSZ);
 166
 167         SET_VARSIZE(result, len + VARHDRSZ);
 168         memcpy(VARDATA(result), s, len);
 169
 170         return result;
 171 }
 172
 173 /*
 174  * text_to_cstring
 175  *
 176  * Create a palloc'd, null-terminated C string from a text value.
 177  *
 178  * We support being passed a compressed or toasted text value.
 179  * This is a bit bogus since such values shouldn't really be referred to as
 180  * "text *", but it seems useful for robustness.  If we didn't handle that
 181  * case here, we'd need another routine that did, anyway.
 182  */
 183 char *
 184 text_to_cstring(const text *t)
 185 {
 186         /* must cast away the const, unfortunately */
 187         text       *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
 188         int                     len = VARSIZE_ANY_EXHDR(tunpacked);
 189         char       *result;
 190
 191         result = (char *) palloc(len + 1);
 192         memcpy(result, VARDATA_ANY(tunpacked), len);
 193         result[len] = '\0';
 194
 195         if (tunpacked != t)
 196                 pfree(tunpacked);
 197
 198         return result;
 199 }
 200
 201 /*
 202  * text_to_cstring_buffer
 203  *
 204  * Copy a text value into a caller-supplied buffer of size dst_len.
 205  *
 206  * The text string is truncated if necessary to fit.  The result is
 207  * guaranteed null-terminated (unless dst_len == 0).
 208  *
 209  * We support being passed a compressed or toasted text value.
 210  * This is a bit bogus since such values shouldn't really be referred to as
 211  * "text *", but it seems useful for robustness.  If we didn't handle that
 212  * case here, we'd need another routine that did, anyway.
 213  */
 214 void
 215 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
 216 {
 217         /* must cast away the const, unfortunately */
 218         text       *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
 219         size_t          src_len = VARSIZE_ANY_EXHDR(srcunpacked);
 220
 221         if (dst_len > 0)
 222         {
 223                 dst_len--;
 224                 if (dst_len >= src_len)
 225                         dst_len = src_len;
 226                 else    /* ensure truncation is encoding-safe */
 227                         dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
 228                 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
 229                 dst[dst_len] = '\0';
 230         }
 231
 232         if (srcunpacked != src)
 233                 pfree(srcunpacked);
 234 }
 235
 236
 237 /*****************************************************************************
 238  *       USER I/O ROUTINES                                                                                                               *
 239  *****************************************************************************/
 240
 241
 242 #define VAL(CH)                 ((CH) - '0')
 243 #define DIG(VAL)                ((VAL) + '0')
 244
 245 /*
 246  *              byteain                 - converts from printable representation of byte array
 247  *
 248  *              Non-printable characters must be passed as '\nnn' (octal) and are
 249  *              converted to internal form.  '\' must be passed as '\\'.
 250  *              ereport(ERROR, ...) if bad form.
 251  *
 252  *              BUGS:
 253  *                              The input is scanned twice.
 254  *                              The error checking of input is minimal.
 255  */
 256 Datum
 257 byteain(PG_FUNCTION_ARGS)
 258 {
 259         char       *inputText = PG_GETARG_CSTRING(0);
 260         char       *tp;
 261         char       *rp;
 262         int                     bc;
 263         bytea      *result;
 264
 265         /* Recognize hex input */
 266         if (inputText[0] == '\\' && inputText[1] == 'x')
 267         {
 268                 size_t          len = strlen(inputText);
 269
 270                 bc = (len - 2) / 2 + VARHDRSZ;  /* maximum possible length */
 271                 result = palloc(bc);
 272                 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
 273                 SET_VARSIZE(result, bc + VARHDRSZ);             /* actual length */
 274
 275                 PG_RETURN_BYTEA_P(result);
 276         }
 277
 278         /* Else, it's the traditional escaped style */
 279         for (bc = 0, tp = inputText; *tp != '\0'; bc++)
 280         {
 281                 if (tp[0] != '\\')
 282                         tp++;
 283                 else if ((tp[0] == '\\') &&
 284                                  (tp[1] >= '0' && tp[1] <= '3') &&
 285                                  (tp[2] >= '0' && tp[2] <= '7') &&
 286                                  (tp[3] >= '0' && tp[3] <= '7'))
 287                         tp += 4;
 288                 else if ((tp[0] == '\\') &&
 289                                  (tp[1] == '\\'))
 290                         tp += 2;
 291                 else
 292                 {
 293                         /*
 294                          * one backslash, not followed by another or ### valid octal
 295                          */
 296                         ereport(ERROR,
 297                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 298                                          errmsg("invalid input syntax for type %s", "bytea")));
 299                 }
 300         }
 301
 302         bc += VARHDRSZ;
 303
 304         result = (bytea *) palloc(bc);
 305         SET_VARSIZE(result, bc);
 306
 307         tp = inputText;
 308         rp = VARDATA(result);
 309         while (*tp != '\0')
 310         {
 311                 if (tp[0] != '\\')
 312                         *rp++ = *tp++;
 313                 else if ((tp[0] == '\\') &&
 314                                  (tp[1] >= '0' && tp[1] <= '3') &&
 315                                  (tp[2] >= '0' && tp[2] <= '7') &&
 316                                  (tp[3] >= '0' && tp[3] <= '7'))
 317                 {
 318                         bc = VAL(tp[1]);
 319                         bc <<= 3;
 320                         bc += VAL(tp[2]);
 321                         bc <<= 3;
 322                         *rp++ = bc + VAL(tp[3]);
 323
 324                         tp += 4;
 325                 }
 326                 else if ((tp[0] == '\\') &&
 327                                  (tp[1] == '\\'))
 328                 {
 329                         *rp++ = '\\';
 330                         tp += 2;
 331                 }
 332                 else
 333                 {
 334                         /*
 335                          * We should never get here. The first pass should not allow it.
 336                          */
 337                         ereport(ERROR,
 338                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 339                                          errmsg("invalid input syntax for type %s", "bytea")));
 340                 }
 341         }
 342
 343         PG_RETURN_BYTEA_P(result);
 344 }
 345
 346 /*
 347  *              byteaout                - converts to printable representation of byte array
 348  *
 349  *              In the traditional escaped format, non-printable characters are
 350  *              printed as '\nnn' (octal) and '\' as '\\'.
 351  */
 352 Datum
 353 byteaout(PG_FUNCTION_ARGS)
 354 {
 355         bytea      *vlena = PG_GETARG_BYTEA_PP(0);
 356         char       *result;
 357         char       *rp;
 358
 359         if (bytea_output == BYTEA_OUTPUT_HEX)
 360         {
 361                 /* Print hex format */
 362                 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
 363                 *rp++ = '\\';
 364                 *rp++ = 'x';
 365                 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
 366         }
 367         else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
 368         {
 369                 /* Print traditional escaped format */
 370                 char       *vp;
 371                 int                     len;
 372                 int                     i;
 373
 374                 len = 1;                                /* empty string has 1 char */
 375                 vp = VARDATA_ANY(vlena);
 376                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 377                 {
 378                         if (*vp == '\\')
 379                                 len += 2;
 380                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 381                                 len += 4;
 382                         else
 383                                 len++;
 384                 }
 385                 rp = result = (char *) palloc(len);
 386                 vp = VARDATA_ANY(vlena);
 387                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 388                 {
 389                         if (*vp == '\\')
 390                         {
 391                                 *rp++ = '\\';
 392                                 *rp++ = '\\';
 393                         }
 394                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 395                         {
 396                                 int                     val;    /* holds unprintable chars */
 397
 398                                 val = *vp;
 399                                 rp[0] = '\\';
 400                                 rp[3] = DIG(val & 07);
 401                                 val >>= 3;
 402                                 rp[2] = DIG(val & 07);
 403                                 val >>= 3;
 404                                 rp[1] = DIG(val & 03);
 405                                 rp += 4;
 406                         }
 407                         else
 408                                 *rp++ = *vp;
 409                 }
 410         }
 411         else
 412         {
 413                 elog(ERROR, "unrecognized bytea_output setting: %d",
 414                          bytea_output);
 415                 rp = result = NULL;             /* keep compiler quiet */
 416         }
 417         *rp = '\0';
 418         PG_RETURN_CSTRING(result);
 419 }
 420
 421 /*
 422  *              bytearecv                       - converts external binary format to bytea
 423  */
 424 Datum
 425 bytearecv(PG_FUNCTION_ARGS)
 426 {
 427         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 428         bytea      *result;
 429         int                     nbytes;
 430
 431         nbytes = buf->len - buf->cursor;
 432         result = (bytea *) palloc(nbytes + VARHDRSZ);
 433         SET_VARSIZE(result, nbytes + VARHDRSZ);
 434         pq_copymsgbytes(buf, VARDATA(result), nbytes);
 435         PG_RETURN_BYTEA_P(result);
 436 }
 437
 438 /*
 439  *              byteasend                       - converts bytea to binary format
 440  *
 441  * This is a special case: just copy the input...
 442  */
 443 Datum
 444 byteasend(PG_FUNCTION_ARGS)
 445 {
 446         bytea      *vlena = PG_GETARG_BYTEA_P_COPY(0);
 447
 448         PG_RETURN_BYTEA_P(vlena);
 449 }
 450
 451 Datum
 452 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
 453 {
 454         StringInfo      state;
 455
 456         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 457
 458         /* Append the value unless null. */
 459         if (!PG_ARGISNULL(1))
 460         {
 461                 bytea      *value = PG_GETARG_BYTEA_PP(1);
 462
 463                 /* On the first time through, we ignore the delimiter. */
 464                 if (state == NULL)
 465                         state = makeStringAggState(fcinfo);
 466                 else if (!PG_ARGISNULL(2))
 467                 {
 468                         bytea      *delim = PG_GETARG_BYTEA_PP(2);
 469
 470                         appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
 471                 }
 472
 473                 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
 474         }
 475
 476         /*
 477          * The transition type for string_agg() is declared to be "internal",
 478          * which is a pass-by-value type the same size as a pointer.
 479          */
 480         PG_RETURN_POINTER(state);
 481 }
 482
 483 Datum
 484 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
 485 {
 486         StringInfo      state;
 487
 488         /* cannot be called directly because of internal-type argument */
 489         Assert(AggCheckCallContext(fcinfo, NULL));
 490
 491         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 492
 493         if (state != NULL)
 494         {
 495                 bytea      *result;
 496
 497                 result = (bytea *) palloc(state->len + VARHDRSZ);
 498                 SET_VARSIZE(result, state->len + VARHDRSZ);
 499                 memcpy(VARDATA(result), state->data, state->len);
 500                 PG_RETURN_BYTEA_P(result);
 501         }
 502         else
 503                 PG_RETURN_NULL();
 504 }
 505
 506 /*
 507  *              textin                  - converts "..." to internal representation
 508  */
 509 Datum
 510 textin(PG_FUNCTION_ARGS)
 511 {
 512         char       *inputText = PG_GETARG_CSTRING(0);
 513
 514         PG_RETURN_TEXT_P(cstring_to_text(inputText));
 515 }
 516
 517 /*
 518  *              textout                 - converts internal representation to "..."
 519  */
 520 Datum
 521 textout(PG_FUNCTION_ARGS)
 522 {
 523         Datum           txt = PG_GETARG_DATUM(0);
 524
 525         PG_RETURN_CSTRING(TextDatumGetCString(txt));
 526 }
 527
 528 /*
 529  *              textrecv                        - converts external binary format to text
 530  */
 531 Datum
 532 textrecv(PG_FUNCTION_ARGS)
 533 {
 534         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 535         text       *result;
 536         char       *str;
 537         int                     nbytes;
 538
 539         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 540
 541         result = cstring_to_text_with_len(str, nbytes);
 542         pfree(str);
 543         PG_RETURN_TEXT_P(result);
 544 }
 545
 546 /*
 547  *              textsend                        - converts text to binary format
 548  */
 549 Datum
 550 textsend(PG_FUNCTION_ARGS)
 551 {
 552         text       *t = PG_GETARG_TEXT_PP(0);
 553         StringInfoData buf;
 554
 555         pq_begintypsend(&buf);
 556         pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
 557         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 558 }
 559
 560
 561 /*
 562  *              unknownin                       - converts "..." to internal representation
 563  */
 564 Datum
 565 unknownin(PG_FUNCTION_ARGS)
 566 {
 567         char       *str = PG_GETARG_CSTRING(0);
 568
 569         /* representation is same as cstring */
 570         PG_RETURN_CSTRING(pstrdup(str));
 571 }
 572
 573 /*
 574  *              unknownout                      - converts internal representation to "..."
 575  */
 576 Datum
 577 unknownout(PG_FUNCTION_ARGS)
 578 {
 579         /* representation is same as cstring */
 580         char       *str = PG_GETARG_CSTRING(0);
 581
 582         PG_RETURN_CSTRING(pstrdup(str));
 583 }
 584
 585 /*
 586  *              unknownrecv                     - converts external binary format to unknown
 587  */
 588 Datum
 589 unknownrecv(PG_FUNCTION_ARGS)
 590 {
 591         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 592         char       *str;
 593         int                     nbytes;
 594
 595         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 596         /* representation is same as cstring */
 597         PG_RETURN_CSTRING(str);
 598 }
 599
 600 /*
 601  *              unknownsend                     - converts unknown to binary format
 602  */
 603 Datum
 604 unknownsend(PG_FUNCTION_ARGS)
 605 {
 606         /* representation is same as cstring */
 607         char       *str = PG_GETARG_CSTRING(0);
 608         StringInfoData buf;
 609
 610         pq_begintypsend(&buf);
 611         pq_sendtext(&buf, str, strlen(str));
 612         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 613 }
 614
 615
 616 /* ========== PUBLIC ROUTINES ========== */
 617
 618 /*
 619  * textlen -
 620  *        returns the logical length of a text*
 621  *         (which is less than the VARSIZE of the text*)
 622  */
 623 Datum
 624 textlen(PG_FUNCTION_ARGS)
 625 {
 626         Datum           str = PG_GETARG_DATUM(0);
 627
 628         /* try to avoid decompressing argument */
 629         PG_RETURN_INT32(text_length(str));
 630 }
 631
 632 /*
 633  * text_length -
 634  *      Does the real work for textlen()
 635  *
 636  *      This is broken out so it can be called directly by other string processing
 637  *      functions.  Note that the argument is passed as a Datum, to indicate that
 638  *      it may still be in compressed form.  We can avoid decompressing it at all
 639  *      in some cases.
 640  */
 641 static int32
 642 text_length(Datum str)
 643 {
 644         /* fastpath when max encoding length is one */
 645         if (pg_database_encoding_max_length() == 1)
 646                 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 647         else
 648         {
 649                 text       *t = DatumGetTextPP(str);
 650
 651                 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
 652                                                                                          VARSIZE_ANY_EXHDR(t)));
 653         }
 654 }
 655
 656 /*
 657  * textoctetlen -
 658  *        returns the physical length of a text*
 659  *         (which is less than the VARSIZE of the text*)
 660  */
 661 Datum
 662 textoctetlen(PG_FUNCTION_ARGS)
 663 {
 664         Datum           str = PG_GETARG_DATUM(0);
 665
 666         /* We need not detoast the input at all */
 667         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 668 }
 669
 670 /*
 671  * textcat -
 672  *        takes two text* and returns a text* that is the concatenation of
 673  *        the two.
 674  *
 675  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
 676  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
 677  * Allocate space for output in all cases.
 678  * XXX - thomas 1997-07-10
 679  */
 680 Datum
 681 textcat(PG_FUNCTION_ARGS)
 682 {
 683         text       *t1 = PG_GETARG_TEXT_PP(0);
 684         text       *t2 = PG_GETARG_TEXT_PP(1);
 685
 686         PG_RETURN_TEXT_P(text_catenate(t1, t2));
 687 }
 688
 689 /*
 690  * text_catenate
 691  *      Guts of textcat(), broken out so it can be used by other functions
 692  *
 693  * Arguments can be in short-header form, but not compressed or out-of-line
 694  */
 695 static text *
 696 text_catenate(text *t1, text *t2)
 697 {
 698         text       *result;
 699         int                     len1,
 700                                 len2,
 701                                 len;
 702         char       *ptr;
 703
 704         len1 = VARSIZE_ANY_EXHDR(t1);
 705         len2 = VARSIZE_ANY_EXHDR(t2);
 706
 707         /* paranoia ... probably should throw error instead? */
 708         if (len1 < 0)
 709                 len1 = 0;
 710         if (len2 < 0)
 711                 len2 = 0;
 712
 713         len = len1 + len2 + VARHDRSZ;
 714         result = (text *) palloc(len);
 715
 716         /* Set size of result string... */
 717         SET_VARSIZE(result, len);
 718
 719         /* Fill data field of result string... */
 720         ptr = VARDATA(result);
 721         if (len1 > 0)
 722                 memcpy(ptr, VARDATA_ANY(t1), len1);
 723         if (len2 > 0)
 724                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
 725
 726         return result;
 727 }
 728
 729 /*
 730  * charlen_to_bytelen()
 731  *      Compute the number of bytes occupied by n characters starting at *p
 732  *
 733  * It is caller's responsibility that there actually are n characters;
 734  * the string need not be null-terminated.
 735  */
 736 static int
 737 charlen_to_bytelen(const char *p, int n)
 738 {
 739         if (pg_database_encoding_max_length() == 1)
 740         {
 741                 /* Optimization for single-byte encodings */
 742                 return n;
 743         }
 744         else
 745         {
 746                 const char *s;
 747
 748                 for (s = p; n > 0; n--)
 749                         s += pg_mblen(s);
 750
 751                 return s - p;
 752         }
 753 }
 754
 755 /*
 756  * text_substr()
 757  * Return a substring starting at the specified position.
 758  * - thomas 1997-12-31
 759  *
 760  * Input:
 761  *      - string
 762  *      - starting position (is one-based)
 763  *      - string length
 764  *
 765  * If the starting position is zero or less, then return from the start of the string
 766  *      adjusting the length to be consistent with the "negative start" per SQL.
 * If the length is negative, an error is raised; when no length is given
 *	(text_substr_no_len), the remainder of the string is returned.
 768  *
 769  * Added multibyte support.
 770  * - Tatsuo Ishii 1998-4-21
 771  * Changed behavior if starting position is less than one to conform to SQL behavior.
 772  * Formerly returned the entire string; now returns a portion.
 773  * - Thomas Lockhart 1998-12-10
 774  * Now uses faster TOAST-slicing interface
 775  * - John Gray 2002-02-22
 776  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
 777  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
 778  * error; if E < 1, return '', not entire string). Fixed MB related bug when
 779  * S > LC and < LC + 4 sometimes garbage characters are returned.
 780  * - Joe Conway 2002-08-10
 781  */
 782 Datum
 783 text_substr(PG_FUNCTION_ARGS)
 784 {
 785         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 786                                                                         PG_GETARG_INT32(1),
 787                                                                         PG_GETARG_INT32(2),
 788                                                                         false));
 789 }
 790
 791 /*
 792  * text_substr_no_len -
 793  *        Wrapper to avoid opr_sanity failure due to
 794  *        one function accepting a different number of args.
 795  */
 796 Datum
 797 text_substr_no_len(PG_FUNCTION_ARGS)
 798 {
 799         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 800                                                                         PG_GETARG_INT32(1),
 801                                                                         -1, true));
 802 }
 803
 804 /*
 805  * text_substring -
 806  *      Does the real work for text_substr() and text_substr_no_len()
 807  *
 808  *      This is broken out so it can be called directly by other string processing
 809  *      functions.  Note that the argument is passed as a Datum, to indicate that
 810  *      it may still be in compressed/toasted form.  We can avoid detoasting all
 811  *      of it in some cases.
 812  *
 813  *      The result is always a freshly palloc'd datum.
 814  */
 815 static text *
 816 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 817 {
 818         int32           eml = pg_database_encoding_max_length();
 819         int32           S = start;              /* start position */
 820         int32           S1;                             /* adjusted start position */
 821         int32           L1;                             /* adjusted substring length */
 822
 823         /* life is easy if the encoding max length is 1 */
 824         if (eml == 1)
 825         {
 826                 S1 = Max(S, 1);
 827
 828                 if (length_not_specified)               /* special case - get length to end of
 829                                                                                  * string */
 830                         L1 = -1;
 831                 else
 832                 {
 833                         /* end position */
 834                         int                     E = S + length;
 835
 836                         /*
 837                          * A negative value for L is the only way for the end position to
 838                          * be before the start. SQL99 says to throw an error.
 839                          */
 840                         if (E < S)
 841                                 ereport(ERROR,
 842                                                 (errcode(ERRCODE_SUBSTRING_ERROR),
 843                                                  errmsg("negative substring length not allowed")));
 844
 845                         /*
 846                          * A zero or negative value for the end position can happen if the
 847                          * start was negative or one. SQL99 says to return a zero-length
 848                          * string.
 849                          */
 850                         if (E < 1)
 851                                 return cstring_to_text("");
 852
 853                         L1 = E - S1;
 854                 }
 855
 856                 /*
 857                  * If the start position is past the end of the string, SQL99 says to
 858                  * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
 859                  * that for us. Convert to zero-based starting position
 860                  */
 861                 return DatumGetTextPSlice(str, S1 - 1, L1);
 862         }
 863         else if (eml > 1)
 864         {
 865                 /*
 866                  * When encoding max length is > 1, we can't get LC without
 867                  * detoasting, so we'll grab a conservatively large slice now and go
 868                  * back later to do the right thing
 869                  */
 870                 int32           slice_start;
 871                 int32           slice_size;
 872                 int32           slice_strlen;
 873                 text       *slice;
 874                 int32           E1;
 875                 int32           i;
 876                 char       *p;
 877                 char       *s;
 878                 text       *ret;
 879
 880                 /*
 881                  * if S is past the end of the string, the tuple toaster will return a
 882                  * zero-length string to us
 883                  */
 884                 S1 = Max(S, 1);
 885
 886                 /*
 887                  * We need to start at position zero because there is no way to know
 888                  * in advance which byte offset corresponds to the supplied start
 889                  * position.
 890                  */
 891                 slice_start = 0;
 892
 893                 if (length_not_specified)               /* special case - get length to end of
 894                                                                                  * string */
 895                         slice_size = L1 = -1;
 896                 else
 897                 {
 898                         int                     E = S + length;
 899
 900                         /*
 901                          * A negative value for L is the only way for the end position to
 902                          * be before the start. SQL99 says to throw an error.
 903                          */
 904                         if (E < S)
 905                                 ereport(ERROR,
 906                                                 (errcode(ERRCODE_SUBSTRING_ERROR),
 907                                                  errmsg("negative substring length not allowed")));
 908
 909                         /*
 910                          * A zero or negative value for the end position can happen if the
 911                          * start was negative or one. SQL99 says to return a zero-length
 912                          * string.
 913                          */
 914                         if (E < 1)
 915                                 return cstring_to_text("");
 916
 917                         /*
 918                          * if E is past the end of the string, the tuple toaster will
 919                          * truncate the length for us
 920                          */
 921                         L1 = E - S1;
 922
 923                         /*
 924                          * Total slice size in bytes can't be any longer than the start
 925                          * position plus substring length times the encoding max length.
 926                          */
 927                         slice_size = (S1 + L1) * eml;
 928                 }
 929
 930                 /*
 931                  * If we're working with an untoasted source, no need to do an extra
 932                  * copying step.
 933                  */
 934                 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
 935                         VARATT_IS_EXTERNAL(DatumGetPointer(str)))
 936                         slice = DatumGetTextPSlice(str, slice_start, slice_size);
 937                 else
 938                         slice = (text *) DatumGetPointer(str);
 939
 940                 /* see if we got back an empty string */
 941                 if (VARSIZE_ANY_EXHDR(slice) == 0)
 942                 {
 943                         if (slice != (text *) DatumGetPointer(str))
 944                                 pfree(slice);
 945                         return cstring_to_text("");
 946                 }
 947
 948                 /* Now we can get the actual length of the slice in MB characters */
 949                 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
 950                                                                                         VARSIZE_ANY_EXHDR(slice));
 951
 952                 /*
 953                  * Check that the start position wasn't > slice_strlen. If so, SQL99
 954                  * says to return a zero-length string.
 955                  */
 956                 if (S1 > slice_strlen)
 957                 {
 958                         if (slice != (text *) DatumGetPointer(str))
 959                                 pfree(slice);
 960                         return cstring_to_text("");
 961                 }
 962
 963                 /*
 964                  * Adjust L1 and E1 now that we know the slice string length. Again
 965                  * remember that S1 is one based, and slice_start is zero based.
 966                  */
 967                 if (L1 > -1)
 968                         E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
 969                 else
 970                         E1 = slice_start + 1 + slice_strlen;
 971
 972                 /*
 973                  * Find the start position in the slice; remember S1 is not zero based
 974                  */
 975                 p = VARDATA_ANY(slice);
 976                 for (i = 0; i < S1 - 1; i++)
 977                         p += pg_mblen(p);
 978
 979                 /* hang onto a pointer to our start position */
 980                 s = p;
 981
 982                 /*
 983                  * Count the actual bytes used by the substring of the requested
 984                  * length.
 985                  */
 986                 for (i = S1; i < E1; i++)
 987                         p += pg_mblen(p);
 988
 989                 ret = (text *) palloc(VARHDRSZ + (p - s));
 990                 SET_VARSIZE(ret, VARHDRSZ + (p - s));
 991                 memcpy(VARDATA(ret), s, (p - s));
 992
 993                 if (slice != (text *) DatumGetPointer(str))
 994                         pfree(slice);
 995
 996                 return ret;
 997         }
 998         else
 999                 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1000
1001         /* not reached: suppress compiler warning */
1002         return NULL;
1003 }
1004
1005 /*
1006  * textoverlay
1007  *      Replace specified substring of first string with second
1008  *
1009  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1010  * This code is a direct implementation of what the standard says.
1011  */
1012 Datum
1013 textoverlay(PG_FUNCTION_ARGS)
1014 {
1015         text       *t1 = PG_GETARG_TEXT_PP(0);
1016         text       *t2 = PG_GETARG_TEXT_PP(1);
1017         int                     sp = PG_GETARG_INT32(2);                /* substring start position */
1018         int                     sl = PG_GETARG_INT32(3);                /* substring length */
1019
1020         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1021 }
1022
1023 Datum
1024 textoverlay_no_len(PG_FUNCTION_ARGS)
1025 {
1026         text       *t1 = PG_GETARG_TEXT_PP(0);
1027         text       *t2 = PG_GETARG_TEXT_PP(1);
1028         int                     sp = PG_GETARG_INT32(2);                /* substring start position */
1029         int                     sl;
1030
1031         sl = text_length(PointerGetDatum(t2));          /* defaults to length(t2) */
1032         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1033 }
1034
1035 static text *
1036 text_overlay(text *t1, text *t2, int sp, int sl)
1037 {
1038         text       *result;
1039         text       *s1;
1040         text       *s2;
1041         int                     sp_pl_sl;
1042
1043         /*
1044          * Check for possible integer-overflow cases.  For negative sp, throw a
1045          * "substring length" error because that's what should be expected
1046          * according to the spec's definition of OVERLAY().
1047          */
1048         if (sp <= 0)
1049                 ereport(ERROR,
1050                                 (errcode(ERRCODE_SUBSTRING_ERROR),
1051                                  errmsg("negative substring length not allowed")));
1052         sp_pl_sl = sp + sl;
1053         if (sp_pl_sl <= sl)
1054                 ereport(ERROR,
1055                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1056                                  errmsg("integer out of range")));
1057
1058         s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1059         s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1060         result = text_catenate(s1, t2);
1061         result = text_catenate(result, s2);
1062
1063         return result;
1064 }
1065
1066 /*
1067  * textpos -
1068  *        Return the position of the specified substring.
1069  *        Implements the SQL POSITION() function.
1070  *        Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1071  * - thomas 1997-07-27
1072  */
1073 Datum
1074 textpos(PG_FUNCTION_ARGS)
1075 {
1076         text       *str = PG_GETARG_TEXT_PP(0);
1077         text       *search_str = PG_GETARG_TEXT_PP(1);
1078
1079         PG_RETURN_INT32((int32) text_position(str, search_str));
1080 }
1081
1082 /*
1083  * text_position -
1084  *      Does the real work for textpos()
1085  *
1086  * Inputs:
1087  *              t1 - string to be searched
1088  *              t2 - pattern to match within t1
1089  * Result:
1090  *              Character index of the first matched char, starting from 1,
1091  *              or 0 if no match.
1092  *
1093  *      This is broken out so it can be called directly by other string processing
1094  *      functions.
1095  */
1096 static int
1097 text_position(text *t1, text *t2)
1098 {
1099         TextPositionState state;
1100         int                     result;
1101
1102         text_position_setup(t1, t2, &state);
1103         result = text_position_next(1, &state);
1104         text_position_cleanup(&state);
1105         return result;
1106 }
1107
1108
1109 /*
1110  * text_position_setup, text_position_next, text_position_cleanup -
1111  *      Component steps of text_position()
1112  *
1113  * These are broken out so that a string can be efficiently searched for
1114  * multiple occurrences of the same pattern.  text_position_next may be
1115  * called multiple times with increasing values of start_pos, which is
1116  * the 1-based character position to start the search from.  The "state"
1117  * variable is normally just a local variable in the caller.
1118  */
1119
1120 static void
1121 text_position_setup(text *t1, text *t2, TextPositionState *state)
1122 {
1123         int                     len1 = VARSIZE_ANY_EXHDR(t1);
1124         int                     len2 = VARSIZE_ANY_EXHDR(t2);
1125
1126         if (pg_database_encoding_max_length() == 1)
1127         {
1128                 /* simple case - single byte encoding */
1129                 state->use_wchar = false;
1130                 state->str1 = VARDATA_ANY(t1);
1131                 state->str2 = VARDATA_ANY(t2);
1132                 state->len1 = len1;
1133                 state->len2 = len2;
1134         }
1135         else
1136         {
1137                 /* not as simple - multibyte encoding */
1138                 pg_wchar   *p1,
1139                                    *p2;
1140
1141                 p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1142                 len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1143                 p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1144                 len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1145
1146                 state->use_wchar = true;
1147                 state->wstr1 = p1;
1148                 state->wstr2 = p2;
1149                 state->len1 = len1;
1150                 state->len2 = len2;
1151         }
1152
1153         /*
1154          * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1155          * notes we use the terminology that the "haystack" is the string to be
1156          * searched (t1) and the "needle" is the pattern being sought (t2).
1157          *
1158          * If the needle is empty or bigger than the haystack then there is no
1159          * point in wasting cycles initializing the table.  We also choose not to
1160          * use B-M-H for needles of length 1, since the skip table can't possibly
1161          * save anything in that case.
1162          */
1163         if (len1 >= len2 && len2 > 1)
1164         {
1165                 int                     searchlength = len1 - len2;
1166                 int                     skiptablemask;
1167                 int                     last;
1168                 int                     i;
1169
1170                 /*
1171                  * First we must determine how much of the skip table to use.  The
1172                  * declaration of TextPositionState allows up to 256 elements, but for
1173                  * short search problems we don't really want to have to initialize so
1174                  * many elements --- it would take too long in comparison to the
1175                  * actual search time.  So we choose a useful skip table size based on
1176                  * the haystack length minus the needle length.  The closer the needle
1177                  * length is to the haystack length the less useful skipping becomes.
1178                  *
1179                  * Note: since we use bit-masking to select table elements, the skip
1180                  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1181                  */
1182                 if (searchlength < 16)
1183                         skiptablemask = 3;
1184                 else if (searchlength < 64)
1185                         skiptablemask = 7;
1186                 else if (searchlength < 128)
1187                         skiptablemask = 15;
1188                 else if (searchlength < 512)
1189                         skiptablemask = 31;
1190                 else if (searchlength < 2048)
1191                         skiptablemask = 63;
1192                 else if (searchlength < 4096)
1193                         skiptablemask = 127;
1194                 else
1195                         skiptablemask = 255;
1196                 state->skiptablemask = skiptablemask;
1197
1198                 /*
1199                  * Initialize the skip table.  We set all elements to the needle
1200                  * length, since this is the correct skip distance for any character
1201                  * not found in the needle.
1202                  */
1203                 for (i = 0; i <= skiptablemask; i++)
1204                         state->skiptable[i] = len2;
1205
1206                 /*
1207                  * Now examine the needle.  For each character except the last one,
1208                  * set the corresponding table element to the appropriate skip
1209                  * distance.  Note that when two characters share the same skip table
1210                  * entry, the one later in the needle must determine the skip
1211                  * distance.
1212                  */
1213                 last = len2 - 1;
1214
1215                 if (!state->use_wchar)
1216                 {
1217                         const char *str2 = state->str2;
1218
1219                         for (i = 0; i < last; i++)
1220                                 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1221                 }
1222                 else
1223                 {
1224                         const pg_wchar *wstr2 = state->wstr2;
1225
1226                         for (i = 0; i < last; i++)
1227                                 state->skiptable[wstr2[i] & skiptablemask] = last - i;
1228                 }
1229         }
1230 }
1231
1232 static int
1233 text_position_next(int start_pos, TextPositionState *state)
1234 {
1235         int                     haystack_len = state->len1;
1236         int                     needle_len = state->len2;
1237         int                     skiptablemask = state->skiptablemask;
1238
1239         Assert(start_pos > 0);          /* else caller error */
1240
1241         if (needle_len <= 0)
1242                 return start_pos;               /* result for empty pattern */
1243
1244         start_pos--;                            /* adjust for zero based arrays */
1245
1246         /* Done if the needle can't possibly fit */
1247         if (haystack_len < start_pos + needle_len)
1248                 return 0;
1249
1250         if (!state->use_wchar)
1251         {
1252                 /* simple case - single byte encoding */
1253                 const char *haystack = state->str1;
1254                 const char *needle = state->str2;
1255                 const char *haystack_end = &haystack[haystack_len];
1256                 const char *hptr;
1257
1258                 if (needle_len == 1)
1259                 {
1260                         /* No point in using B-M-H for a one-character needle */
1261                         char            nchar = *needle;
1262
1263                         hptr = &haystack[start_pos];
1264                         while (hptr < haystack_end)
1265                         {
1266                                 if (*hptr == nchar)
1267                                         return hptr - haystack + 1;
1268                                 hptr++;
1269                         }
1270                 }
1271                 else
1272                 {
1273                         const char *needle_last = &needle[needle_len - 1];
1274
1275                         /* Start at startpos plus the length of the needle */
1276                         hptr = &haystack[start_pos + needle_len - 1];
1277                         while (hptr < haystack_end)
1278                         {
1279                                 /* Match the needle scanning *backward* */
1280                                 const char *nptr;
1281                                 const char *p;
1282
1283                                 nptr = needle_last;
1284                                 p = hptr;
1285                                 while (*nptr == *p)
1286                                 {
1287                                         /* Matched it all?      If so, return 1-based position */
1288                                         if (nptr == needle)
1289                                                 return p - haystack + 1;
1290                                         nptr--, p--;
1291                                 }
1292
1293                                 /*
1294                                  * No match, so use the haystack char at hptr to decide how
1295                                  * far to advance.  If the needle had any occurrence of that
1296                                  * character (or more precisely, one sharing the same
1297                                  * skiptable entry) before its last character, then we advance
1298                                  * far enough to align the last such needle character with
1299                                  * that haystack position.  Otherwise we can advance by the
1300                                  * whole needle length.
1301                                  */
1302                                 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1303                         }
1304                 }
1305         }
1306         else
1307         {
1308                 /* The multibyte char version. This works exactly the same way. */
1309                 const pg_wchar *haystack = state->wstr1;
1310                 const pg_wchar *needle = state->wstr2;
1311                 const pg_wchar *haystack_end = &haystack[haystack_len];
1312                 const pg_wchar *hptr;
1313
1314                 if (needle_len == 1)
1315                 {
1316                         /* No point in using B-M-H for a one-character needle */
1317                         pg_wchar        nchar = *needle;
1318
1319                         hptr = &haystack[start_pos];
1320                         while (hptr < haystack_end)
1321                         {
1322                                 if (*hptr == nchar)
1323                                         return hptr - haystack + 1;
1324                                 hptr++;
1325                         }
1326                 }
1327                 else
1328                 {
1329                         const pg_wchar *needle_last = &needle[needle_len - 1];
1330
1331                         /* Start at startpos plus the length of the needle */
1332                         hptr = &haystack[start_pos + needle_len - 1];
1333                         while (hptr < haystack_end)
1334                         {
1335                                 /* Match the needle scanning *backward* */
1336                                 const pg_wchar *nptr;
1337                                 const pg_wchar *p;
1338
1339                                 nptr = needle_last;
1340                                 p = hptr;
1341                                 while (*nptr == *p)
1342                                 {
1343                                         /* Matched it all?      If so, return 1-based position */
1344                                         if (nptr == needle)
1345                                                 return p - haystack + 1;
1346                                         nptr--, p--;
1347                                 }
1348
1349                                 /*
1350                                  * No match, so use the haystack char at hptr to decide how
1351                                  * far to advance.  If the needle had any occurrence of that
1352                                  * character (or more precisely, one sharing the same
1353                                  * skiptable entry) before its last character, then we advance
1354                                  * far enough to align the last such needle character with
1355                                  * that haystack position.  Otherwise we can advance by the
1356                                  * whole needle length.
1357                                  */
1358                                 hptr += state->skiptable[*hptr & skiptablemask];
1359                         }
1360                 }
1361         }
1362
1363         return 0;                                       /* not found */
1364 }
1365
1366 static void
1367 text_position_cleanup(TextPositionState *state)
1368 {
1369         if (state->use_wchar)
1370         {
1371                 pfree(state->wstr1);
1372                 pfree(state->wstr2);
1373         }
1374 }
1375
1376 /* varstr_cmp()
1377  * Comparison function for text strings with given lengths.
1378  * Includes locale support, but must copy strings to temporary memory
1379  *      to allow null-termination for inputs to strcoll().
1380  * Returns an integer less than, equal to, or greater than zero, indicating
1381  * whether arg1 is less than, equal to, or greater than arg2.
1382  */
1383 int
1384 varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
1385 {
1386         int                     result;
1387
1388         /*
1389          * Unfortunately, there is no strncoll(), so in the non-C locale case we
1390          * have to do some memory copying.  This turns out to be significantly
1391          * slower, so we optimize the case where LC_COLLATE is C.  We also try to
1392          * optimize relatively-short strings by avoiding palloc/pfree overhead.
1393          */
1394         if (lc_collate_is_c(collid))
1395         {
1396                 result = memcmp(arg1, arg2, Min(len1, len2));
1397                 if ((result == 0) && (len1 != len2))
1398                         result = (len1 < len2) ? -1 : 1;
1399         }
1400         else
1401         {
1402                 char            a1buf[TEXTBUFLEN];
1403                 char            a2buf[TEXTBUFLEN];
1404                 char       *a1p,
1405                                    *a2p;
1406
1407 #ifdef HAVE_LOCALE_T
1408                 pg_locale_t mylocale = 0;
1409 #endif
1410
1411                 if (collid != DEFAULT_COLLATION_OID)
1412                 {
1413                         if (!OidIsValid(collid))
1414                         {
1415                                 /*
1416                                  * This typically means that the parser could not resolve a
1417                                  * conflict of implicit collations, so report it that way.
1418                                  */
1419                                 ereport(ERROR,
1420                                                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1421                                                  errmsg("could not determine which collation to use for string comparison"),
1422                                                  errhint("Use the COLLATE clause to set the collation explicitly.")));
1423                         }
1424 #ifdef HAVE_LOCALE_T
1425                         mylocale = pg_newlocale_from_collation(collid);
1426 #endif
1427                 }
1428
1429                 /*
1430                  * memcmp() can't tell us which of two unequal strings sorts first,
1431                  * but it's a cheap way to tell if they're equal.  Testing shows that
1432                  * memcmp() followed by strcoll() is only trivially slower than
1433                  * strcoll() by itself, so we don't lose much if this doesn't work out
1434                  * very often, and if it does - for example, because there are many
1435                  * equal strings in the input - then we win big by avoiding expensive
1436                  * collation-aware comparisons.
1437                  */
1438                 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1439                         return 0;
1440
1441 #ifdef WIN32
1442                 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1443                 if (GetDatabaseEncoding() == PG_UTF8)
1444                 {
1445                         int                     a1len;
1446                         int                     a2len;
1447                         int                     r;
1448
1449                         if (len1 >= TEXTBUFLEN / 2)
1450                         {
1451                                 a1len = len1 * 2 + 2;
1452                                 a1p = palloc(a1len);
1453                         }
1454                         else
1455                         {
1456                                 a1len = TEXTBUFLEN;
1457                                 a1p = a1buf;
1458                         }
1459                         if (len2 >= TEXTBUFLEN / 2)
1460                         {
1461                                 a2len = len2 * 2 + 2;
1462                                 a2p = palloc(a2len);
1463                         }
1464                         else
1465                         {
1466                                 a2len = TEXTBUFLEN;
1467                                 a2p = a2buf;
1468                         }
1469
1470                         /* stupid Microsloth API does not work for zero-length input */
1471                         if (len1 == 0)
1472                                 r = 0;
1473                         else
1474                         {
1475                                 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1476                                                                                 (LPWSTR) a1p, a1len / 2);
1477                                 if (!r)
1478                                         ereport(ERROR,
1479                                                         (errmsg("could not convert string to UTF-16: error code %lu",
1480                                                                         GetLastError())));
1481                         }
1482                         ((LPWSTR) a1p)[r] = 0;
1483
1484                         if (len2 == 0)
1485                                 r = 0;
1486                         else
1487                         {
1488                                 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1489                                                                                 (LPWSTR) a2p, a2len / 2);
1490                                 if (!r)
1491                                         ereport(ERROR,
1492                                                         (errmsg("could not convert string to UTF-16: error code %lu",
1493                                                                         GetLastError())));
1494                         }
1495                         ((LPWSTR) a2p)[r] = 0;
1496
1497                         errno = 0;
1498 #ifdef HAVE_LOCALE_T
1499                         if (mylocale)
1500                                 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);
1501                         else
1502 #endif
1503                                 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1504                         if (result == 2147483647)       /* _NLSCMPERROR; missing from mingw
1505                                                                                  * headers */
1506                                 ereport(ERROR,
1507                                                 (errmsg("could not compare Unicode strings: %m")));
1508
1509                         /*
1510                          * In some locales wcscoll() can claim that nonidentical strings
1511                          * are equal.  Believing that would be bad news for a number of
1512                          * reasons, so we follow Perl's lead and sort "equal" strings
1513                          * according to strcmp (on the UTF-8 representation).
1514                          */
1515                         if (result == 0)
1516                         {
1517                                 result = memcmp(arg1, arg2, Min(len1, len2));
1518                                 if ((result == 0) && (len1 != len2))
1519                                         result = (len1 < len2) ? -1 : 1;
1520                         }
1521
1522                         if (a1p != a1buf)
1523                                 pfree(a1p);
1524                         if (a2p != a2buf)
1525                                 pfree(a2p);
1526
1527                         return result;
1528                 }
1529 #endif   /* WIN32 */
1530
1531                 if (len1 >= TEXTBUFLEN)
1532                         a1p = (char *) palloc(len1 + 1);
1533                 else
1534                         a1p = a1buf;
1535                 if (len2 >= TEXTBUFLEN)
1536                         a2p = (char *) palloc(len2 + 1);
1537                 else
1538                         a2p = a2buf;
1539
1540                 memcpy(a1p, arg1, len1);
1541                 a1p[len1] = '\0';
1542                 memcpy(a2p, arg2, len2);
1543                 a2p[len2] = '\0';
1544
1545 #ifdef HAVE_LOCALE_T
1546                 if (mylocale)
1547                         result = strcoll_l(a1p, a2p, mylocale);
1548                 else
1549 #endif
1550                         result = strcoll(a1p, a2p);
1551
1552                 /*
1553                  * In some locales strcoll() can claim that nonidentical strings are
1554                  * equal.  Believing that would be bad news for a number of reasons,
1555                  * so we follow Perl's lead and sort "equal" strings according to
1556                  * strcmp().
1557                  */
1558                 if (result == 0)
1559                         result = strcmp(a1p, a2p);
1560
1561                 if (a1p != a1buf)
1562                         pfree(a1p);
1563                 if (a2p != a2buf)
1564                         pfree(a2p);
1565         }
1566
1567         return result;
1568 }
1569
1570 /* text_cmp()
1571  * Internal comparison function for text strings.
1572  * Returns -1, 0 or 1
1573  */
1574 static int
1575 text_cmp(text *arg1, text *arg2, Oid collid)
1576 {
1577         char       *a1p,
1578                            *a2p;
1579         int                     len1,
1580                                 len2;
1581
1582         a1p = VARDATA_ANY(arg1);
1583         a2p = VARDATA_ANY(arg2);
1584
1585         len1 = VARSIZE_ANY_EXHDR(arg1);
1586         len2 = VARSIZE_ANY_EXHDR(arg2);
1587
1588         return varstr_cmp(a1p, len1, a2p, len2, collid);
1589 }
1590
1591 /*
1592  * Comparison functions for text strings.
1593  *
1594  * Note: btree indexes need these routines not to leak memory; therefore,
1595  * be careful to free working copies of toasted datums.  Most places don't
1596  * need to be so careful.
1597  */
1598
1599 Datum
1600 texteq(PG_FUNCTION_ARGS)
1601 {
1602         Datum           arg1 = PG_GETARG_DATUM(0);
1603         Datum           arg2 = PG_GETARG_DATUM(1);
1604         bool            result;
1605         Size            len1,
1606                                 len2;
1607
1608         /*
1609          * Since we only care about equality or not-equality, we can avoid all the
1610          * expense of strcoll() here, and just do bitwise comparison.  In fact, we
1611          * don't even have to do a bitwise comparison if we can show the lengths
1612          * of the strings are unequal; which might save us from having to detoast
1613          * one or both values.
1614          */
1615         len1 = toast_raw_datum_size(arg1);
1616         len2 = toast_raw_datum_size(arg2);
1617         if (len1 != len2)
1618                 result = false;
1619         else
1620         {
1621                 text       *targ1 = DatumGetTextPP(arg1);
1622                 text       *targ2 = DatumGetTextPP(arg2);
1623
1624                 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1625                                                  len1 - VARHDRSZ) == 0);
1626
1627                 PG_FREE_IF_COPY(targ1, 0);
1628                 PG_FREE_IF_COPY(targ2, 1);
1629         }
1630
1631         PG_RETURN_BOOL(result);
1632 }
1633
1634 Datum
1635 textne(PG_FUNCTION_ARGS)
1636 {
1637         Datum           arg1 = PG_GETARG_DATUM(0);
1638         Datum           arg2 = PG_GETARG_DATUM(1);
1639         bool            result;
1640         Size            len1,
1641                                 len2;
1642
1643         /* See comment in texteq() */
1644         len1 = toast_raw_datum_size(arg1);
1645         len2 = toast_raw_datum_size(arg2);
1646         if (len1 != len2)
1647                 result = true;
1648         else
1649         {
1650                 text       *targ1 = DatumGetTextPP(arg1);
1651                 text       *targ2 = DatumGetTextPP(arg2);
1652
1653                 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1654                                                  len1 - VARHDRSZ) != 0);
1655
1656                 PG_FREE_IF_COPY(targ1, 0);
1657                 PG_FREE_IF_COPY(targ2, 1);
1658         }
1659
1660         PG_RETURN_BOOL(result);
1661 }
1662
1663 Datum
1664 text_lt(PG_FUNCTION_ARGS)
1665 {
1666         text       *arg1 = PG_GETARG_TEXT_PP(0);
1667         text       *arg2 = PG_GETARG_TEXT_PP(1);
1668         bool            result;
1669
1670         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1671
1672         PG_FREE_IF_COPY(arg1, 0);
1673         PG_FREE_IF_COPY(arg2, 1);
1674
1675         PG_RETURN_BOOL(result);
1676 }
1677
1678 Datum
1679 text_le(PG_FUNCTION_ARGS)
1680 {
1681         text       *arg1 = PG_GETARG_TEXT_PP(0);
1682         text       *arg2 = PG_GETARG_TEXT_PP(1);
1683         bool            result;
1684
1685         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1686
1687         PG_FREE_IF_COPY(arg1, 0);
1688         PG_FREE_IF_COPY(arg2, 1);
1689
1690         PG_RETURN_BOOL(result);
1691 }
1692
1693 Datum
1694 text_gt(PG_FUNCTION_ARGS)
1695 {
1696         text       *arg1 = PG_GETARG_TEXT_PP(0);
1697         text       *arg2 = PG_GETARG_TEXT_PP(1);
1698         bool            result;
1699
1700         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1701
1702         PG_FREE_IF_COPY(arg1, 0);
1703         PG_FREE_IF_COPY(arg2, 1);
1704
1705         PG_RETURN_BOOL(result);
1706 }
1707
1708 Datum
1709 text_ge(PG_FUNCTION_ARGS)
1710 {
1711         text       *arg1 = PG_GETARG_TEXT_PP(0);
1712         text       *arg2 = PG_GETARG_TEXT_PP(1);
1713         bool            result;
1714
1715         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1716
1717         PG_FREE_IF_COPY(arg1, 0);
1718         PG_FREE_IF_COPY(arg2, 1);
1719
1720         PG_RETURN_BOOL(result);
1721 }
1722
1723 Datum
1724 bttextcmp(PG_FUNCTION_ARGS)
1725 {
1726         text       *arg1 = PG_GETARG_TEXT_PP(0);
1727         text       *arg2 = PG_GETARG_TEXT_PP(1);
1728         int32           result;
1729
1730         result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1731
1732         PG_FREE_IF_COPY(arg1, 0);
1733         PG_FREE_IF_COPY(arg2, 1);
1734
1735         PG_RETURN_INT32(result);
1736 }
1737
1738 Datum
1739 bttextsortsupport(PG_FUNCTION_ARGS)
1740 {
1741         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1742         Oid                     collid = ssup->ssup_collation;
1743         MemoryContext oldcontext;
1744
1745         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1746
1747         /* Use generic string SortSupport */
1748         varstr_sortsupport(ssup, collid, false);
1749
1750         MemoryContextSwitchTo(oldcontext);
1751
1752         PG_RETURN_VOID();
1753 }
1754
1755 /*
1756  * Generic sortsupport interface for character type's operator classes.
1757  * Includes locale support, and support for BpChar semantics (i.e. removing
1758  * trailing spaces before comparison).
1759  *
1760  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1761  * same representation.  Callers that always use the C collation (e.g.
1762  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1763  * this will not work with any other collation, though.
1764  */
1765 void
1766 varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
1767 {
1768         bool            abbreviate = ssup->abbreviate;
1769         bool            collate_c = false;
1770         VarStringSortSupport *sss;
1771
1772 #ifdef HAVE_LOCALE_T
1773         pg_locale_t locale = 0;
1774 #endif
1775
1776         /*
1777          * If possible, set ssup->comparator to a function which can be used to
1778          * directly compare two datums.  If we can do this, we'll avoid the
1779          * overhead of a trip through the fmgr layer for every comparison, which
1780          * can be substantial.
1781          *
1782          * Most typically, we'll set the comparator to varstrfastcmp_locale, which
1783          * uses strcoll() to perform comparisons and knows about the special
1784          * requirements of BpChar callers.  However, if LC_COLLATE = C, we can
1785          * make things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c,
1786          * both of which use memcmp() rather than strcoll().
1787          *
1788          * There is a further exception on Windows.  When the database encoding is
1789          * UTF-8 and we are not using the C collation, complex hacks are required.
1790          * We don't currently have a comparator that handles that case, so we fall
1791          * back on the slow method of having the sort code invoke bttextcmp() (in
1792          * the case of text) via the fmgr trampoline.
1793          */
1794         if (lc_collate_is_c(collid))
1795         {
1796                 if (!bpchar)
1797                         ssup->comparator = varstrfastcmp_c;
1798                 else
1799                         ssup->comparator = bpcharfastcmp_c;
1800
1801                 collate_c = true;
1802         }
1803 #ifdef WIN32
1804         else if (GetDatabaseEncoding() == PG_UTF8)
1805                 return;
1806 #endif
1807         else
1808         {
1809                 ssup->comparator = varstrfastcmp_locale;
1810
1811                 /*
1812                  * We need a collation-sensitive comparison.  To make things faster,
1813                  * we'll figure out the collation based on the locale id and cache the
1814                  * result.
1815                  */
1816                 if (collid != DEFAULT_COLLATION_OID)
1817                 {
1818                         if (!OidIsValid(collid))
1819                         {
1820                                 /*
1821                                  * This typically means that the parser could not resolve a
1822                                  * conflict of implicit collations, so report it that way.
1823                                  */
1824                                 ereport(ERROR,
1825                                                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1826                                                  errmsg("could not determine which collation to use for string comparison"),
1827                                                  errhint("Use the COLLATE clause to set the collation explicitly.")));
1828                         }
1829 #ifdef HAVE_LOCALE_T
1830                         locale = pg_newlocale_from_collation(collid);
1831 #endif
1832                 }
1833         }
1834
1835         /*
1836          * Unfortunately, it seems that abbreviation for non-C collations is
1837          * broken on many common platforms; testing of multiple versions of glibc
1838          * reveals that, for many locales, strcoll() and strxfrm() do not return
1839          * consistent results, which is fatal to this optimization.  While no
1840          * other libc other than Cygwin has so far been shown to have a problem,
1841          * we take the conservative course of action for right now and disable
1842          * this categorically.  (Users who are certain this isn't a problem on
1843          * their system can define TRUST_STRXFRM.)
1844          *
1845          * Even apart from the risk of broken locales, it's possible that there
1846          * are platforms where the use of abbreviated keys should be disabled at
1847          * compile time.  Having only 4 byte datums could make worst-case
1848          * performance drastically more likely, for example.  Moreover, macOS's
1849          * strxfrm() implementation is known to not effectively concentrate a
1850          * significant amount of entropy from the original string in earlier
1851          * transformed blobs.  It's possible that other supported platforms are
1852          * similarly encumbered.  So, if we ever get past disabling this
1853          * categorically, we may still want or need to disable it for particular
1854          * platforms.
1855          */
1856 #ifndef TRUST_STRXFRM
1857         if (!collate_c)
1858                 abbreviate = false;
1859 #endif
1860
1861         /*
1862          * If we're using abbreviated keys, or if we're using a locale-aware
1863          * comparison, we need to initialize a StringSortSupport object.  Both
1864          * cases will make use of the temporary buffers we initialize here for
1865          * scratch space (and to detect requirement for BpChar semantics from
1866          * caller), and the abbreviation case requires additional state.
1867          */
1868         if (abbreviate || !collate_c)
1869         {
1870                 sss = palloc(sizeof(VarStringSortSupport));
1871                 sss->buf1 = palloc(TEXTBUFLEN);
1872                 sss->buflen1 = TEXTBUFLEN;
1873                 sss->buf2 = palloc(TEXTBUFLEN);
1874                 sss->buflen2 = TEXTBUFLEN;
1875                 /* Start with invalid values */
1876                 sss->last_len1 = -1;
1877                 sss->last_len2 = -1;
1878                 /* Initialize */
1879                 sss->last_returned = 0;
1880 #ifdef HAVE_LOCALE_T
1881                 sss->locale = locale;
1882 #endif
1883
1884                 /*
1885                  * To avoid somehow confusing a strxfrm() blob and an original string,
1886                  * constantly keep track of the variety of data that buf1 and buf2
1887                  * currently contain.
1888                  *
1889                  * Comparisons may be interleaved with conversion calls.  Frequently,
1890                  * conversions and comparisons are batched into two distinct phases,
1891                  * but the correctness of caching cannot hinge upon this.  For
1892                  * comparison caching, buffer state is only trusted if cache_blob is
1893                  * found set to false, whereas strxfrm() caching only trusts the state
1894                  * when cache_blob is found set to true.
1895                  *
1896                  * Arbitrarily initialize cache_blob to true.
1897                  */
1898                 sss->cache_blob = true;
1899                 sss->collate_c = collate_c;
1900                 sss->bpchar = bpchar;
1901                 ssup->ssup_extra = sss;
1902
1903                 /*
1904                  * If possible, plan to use the abbreviated keys optimization.  The
1905                  * core code may switch back to authoritative comparator should
1906                  * abbreviation be aborted.
1907                  */
1908                 if (abbreviate)
1909                 {
1910                         sss->prop_card = 0.20;
1911                         initHyperLogLog(&sss->abbr_card, 10);
1912                         initHyperLogLog(&sss->full_card, 10);
1913                         ssup->abbrev_full_comparator = ssup->comparator;
1914                         ssup->comparator = varstrcmp_abbrev;
1915                         ssup->abbrev_converter = varstr_abbrev_convert;
1916                         ssup->abbrev_abort = varstr_abbrev_abort;
1917                 }
1918         }
1919 }
1920
1921 /*
1922  * sortsupport comparison func (for C locale case)
1923  */
1924 static int
1925 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1926 {
1927         VarString  *arg1 = DatumGetVarStringPP(x);
1928         VarString  *arg2 = DatumGetVarStringPP(y);
1929         char       *a1p,
1930                            *a2p;
1931         int                     len1,
1932                                 len2,
1933                                 result;
1934
1935         a1p = VARDATA_ANY(arg1);
1936         a2p = VARDATA_ANY(arg2);
1937
1938         len1 = VARSIZE_ANY_EXHDR(arg1);
1939         len2 = VARSIZE_ANY_EXHDR(arg2);
1940
1941         result = memcmp(a1p, a2p, Min(len1, len2));
1942         if ((result == 0) && (len1 != len2))
1943                 result = (len1 < len2) ? -1 : 1;
1944
1945         /* We can't afford to leak memory here. */
1946         if (PointerGetDatum(arg1) != x)
1947                 pfree(arg1);
1948         if (PointerGetDatum(arg2) != y)
1949                 pfree(arg2);
1950
1951         return result;
1952 }
1953
1954 /*
1955  * sortsupport comparison func (for BpChar C locale case)
1956  *
1957  * BpChar outsources its sortsupport to this module.  Specialization for the
1958  * varstr_sortsupport BpChar case, modeled on
1959  * internal_bpchar_pattern_compare().
1960  */
1961 static int
1962 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
1963 {
1964         BpChar     *arg1 = DatumGetBpCharPP(x);
1965         BpChar     *arg2 = DatumGetBpCharPP(y);
1966         char       *a1p,
1967                            *a2p;
1968         int                     len1,
1969                                 len2,
1970                                 result;
1971
1972         a1p = VARDATA_ANY(arg1);
1973         a2p = VARDATA_ANY(arg2);
1974
1975         len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
1976         len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
1977
1978         result = memcmp(a1p, a2p, Min(len1, len2));
1979         if ((result == 0) && (len1 != len2))
1980                 result = (len1 < len2) ? -1 : 1;
1981
1982         /* We can't afford to leak memory here. */
1983         if (PointerGetDatum(arg1) != x)
1984                 pfree(arg1);
1985         if (PointerGetDatum(arg2) != y)
1986                 pfree(arg2);
1987
1988         return result;
1989 }
1990
1991 /*
1992  * sortsupport comparison func (for locale case)
1993  */
1994 static int
1995 varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
1996 {
1997         VarString  *arg1 = DatumGetVarStringPP(x);
1998         VarString  *arg2 = DatumGetVarStringPP(y);
1999         bool            arg1_match;
2000         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2001
2002         /* working state */
2003         char       *a1p,
2004                            *a2p;
2005         int                     len1,
2006                                 len2,
2007                                 result;
2008
2009         a1p = VARDATA_ANY(arg1);
2010         a2p = VARDATA_ANY(arg2);
2011
2012         len1 = VARSIZE_ANY_EXHDR(arg1);
2013         len2 = VARSIZE_ANY_EXHDR(arg2);
2014
2015         /* Fast pre-check for equality, as discussed in varstr_cmp() */
2016         if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2017         {
2018                 /*
2019                  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2020                  * last_len2.  Existing contents of buffers might still be used by
2021                  * next call.
2022                  *
2023                  * It's fine to allow the comparison of BpChar padding bytes here,
2024                  * even though that implies that the memcmp() will usually be
2025                  * performed for BpChar callers (though multibyte characters could
2026                  * still prevent that from occurring).  The memcmp() is still very
2027                  * cheap, and BpChar's funny semantics have us remove trailing spaces
2028                  * (not limited to padding), so we need make no distinction between
2029                  * padding space characters and "real" space characters.
2030                  */
2031                 result = 0;
2032                 goto done;
2033         }
2034
2035         if (sss->bpchar)
2036         {
2037                 /* Get true number of bytes, ignoring trailing spaces */
2038                 len1 = bpchartruelen(a1p, len1);
2039                 len2 = bpchartruelen(a2p, len2);
2040         }
2041
2042         if (len1 >= sss->buflen1)
2043         {
2044                 pfree(sss->buf1);
2045                 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2046                 sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2047         }
2048         if (len2 >= sss->buflen2)
2049         {
2050                 pfree(sss->buf2);
2051                 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2052                 sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2053         }
2054
2055         /*
2056          * We're likely to be asked to compare the same strings repeatedly, and
2057          * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2058          * comparisons, even though in general there is no reason to think that
2059          * that will work out (every string datum may be unique).  Caching does
2060          * not slow things down measurably when it doesn't work out, and can speed
2061          * things up by rather a lot when it does.  In part, this is because the
2062          * memcmp() compares data from cachelines that are needed in L1 cache even
2063          * when the last comparison's result cannot be reused.
2064          */
2065         arg1_match = true;
2066         if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2067         {
2068                 arg1_match = false;
2069                 memcpy(sss->buf1, a1p, len1);
2070                 sss->buf1[len1] = '\0';
2071                 sss->last_len1 = len1;
2072         }
2073
2074         /*
2075          * If we're comparing the same two strings as last time, we can return the
2076          * same answer without calling strcoll() again.  This is more likely than
2077          * it seems (at least with moderate to low cardinality sets), because
2078          * quicksort compares the same pivot against many values.
2079          */
2080         if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2081         {
2082                 memcpy(sss->buf2, a2p, len2);
2083                 sss->buf2[len2] = '\0';
2084                 sss->last_len2 = len2;
2085         }
2086         else if (arg1_match && !sss->cache_blob)
2087         {
2088                 /* Use result cached following last actual strcoll() call */
2089                 result = sss->last_returned;
2090                 goto done;
2091         }
2092
2093 #ifdef HAVE_LOCALE_T
2094         if (sss->locale)
2095                 result = strcoll_l(sss->buf1, sss->buf2, sss->locale);
2096         else
2097 #endif
2098                 result = strcoll(sss->buf1, sss->buf2);
2099
2100         /*
2101          * In some locales strcoll() can claim that nonidentical strings are
2102          * equal. Believing that would be bad news for a number of reasons, so we
2103          * follow Perl's lead and sort "equal" strings according to strcmp().
2104          */
2105         if (result == 0)
2106                 result = strcmp(sss->buf1, sss->buf2);
2107
2108         /* Cache result, perhaps saving an expensive strcoll() call next time */
2109         sss->cache_blob = false;
2110         sss->last_returned = result;
2111 done:
2112         /* We can't afford to leak memory here. */
2113         if (PointerGetDatum(arg1) != x)
2114                 pfree(arg1);
2115         if (PointerGetDatum(arg2) != y)
2116                 pfree(arg2);
2117
2118         return result;
2119 }
2120
2121 /*
2122  * Abbreviated key comparison func
2123  */
2124 static int
2125 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2126 {
2127         /*
2128          * When 0 is returned, the core system will call varstrfastcmp_c()
2129          * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale().  Even a
2130          * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2131          * authoritatively, for the same reason that there is a strcoll()
2132          * tie-breaker call to strcmp() in varstr_cmp().
2133          */
2134         if (x > y)
2135                 return 1;
2136         else if (x == y)
2137                 return 0;
2138         else
2139                 return -1;
2140 }
2141
2142 /*
2143  * Conversion routine for sortsupport.  Converts original to abbreviated key
2144  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2145  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2146  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2147  * locale is used, or in case of bytea, just memcpy() from original instead.
2148  */
2149 static Datum
2150 varstr_abbrev_convert(Datum original, SortSupport ssup)
2151 {
2152         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2153         VarString  *authoritative = DatumGetVarStringPP(original);
2154         char       *authoritative_data = VARDATA_ANY(authoritative);
2155
2156         /* working state */
2157         Datum           res;
2158         char       *pres;
2159         int                     len;
2160         uint32          hash;
2161
2162         pres = (char *) &res;
2163         /* memset(), so any non-overwritten bytes are NUL */
2164         memset(pres, 0, sizeof(Datum));
2165         len = VARSIZE_ANY_EXHDR(authoritative);
2166
2167         /* Get number of bytes, ignoring trailing spaces */
2168         if (sss->bpchar)
2169                 len = bpchartruelen(authoritative_data, len);
2170
2171         /*
2172          * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2173          * abbreviate keys.  The full comparator for the C locale is always
2174          * memcmp().  It would be incorrect to allow bytea callers (callers that
2175          * always force the C collation -- bytea isn't a collatable type, but this
2176          * approach is convenient) to use strxfrm().  This is because bytea
2177          * strings may contain NUL bytes.  Besides, this should be faster, too.
2178          *
2179          * More generally, it's okay that bytea callers can have NUL bytes in
2180          * strings because varstrcmp_abbrev() need not make a distinction between
2181          * terminating NUL bytes, and NUL bytes representing actual NULs in the
2182          * authoritative representation.  Hopefully a comparison at or past one
2183          * abbreviated key's terminating NUL byte will resolve the comparison
2184          * without consulting the authoritative representation; specifically, some
2185          * later non-NUL byte in the longer string can resolve the comparison
2186          * against a subsequent terminating NUL in the shorter string.  There will
2187          * usually be what is effectively a "length-wise" resolution there and
2188          * then.
2189          *
2190          * If that doesn't work out -- if all bytes in the longer string
2191          * positioned at or past the offset of the smaller string's (first)
2192          * terminating NUL are actually representative of NUL bytes in the
2193          * authoritative binary string (perhaps with some *terminating* NUL bytes
2194          * towards the end of the longer string iff it happens to still be small)
2195          * -- then an authoritative tie-breaker will happen, and do the right
2196          * thing: explicitly consider string length.
2197          */
2198         if (sss->collate_c)
2199                 memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2200         else
2201         {
2202                 Size            bsize;
2203
2204                 /*
2205                  * We're not using the C collation, so fall back on strxfrm.
2206                  */
2207
2208                 /* By convention, we use buffer 1 to store and NUL-terminate */
2209                 if (len >= sss->buflen1)
2210                 {
2211                         pfree(sss->buf1);
2212                         sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2213                         sss->buf1 = palloc(sss->buflen1);
2214                 }
2215
2216                 /* Might be able to reuse strxfrm() blob from last call */
2217                 if (sss->last_len1 == len && sss->cache_blob &&
2218                         memcmp(sss->buf1, authoritative_data, len) == 0)
2219                 {
2220                         memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2221                         /* No change affecting cardinality, so no hashing required */
2222                         goto done;
2223                 }
2224
2225                 /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
2226                 memcpy(sss->buf1, authoritative_data, len);
2227                 sss->buf1[len] = '\0';
2228                 sss->last_len1 = len;
2229
2230                 for (;;)
2231                 {
2232 #ifdef HAVE_LOCALE_T
2233                         if (sss->locale)
2234                                 bsize = strxfrm_l(sss->buf2, sss->buf1,
2235                                                                   sss->buflen2, sss->locale);
2236                         else
2237 #endif
2238                                 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2239
2240                         sss->last_len2 = bsize;
2241                         if (bsize < sss->buflen2)
2242                                 break;
2243
2244                         /*
2245                          * The C standard states that the contents of the buffer is now
2246                          * unspecified.  Grow buffer, and retry.
2247                          */
2248                         pfree(sss->buf2);
2249                         sss->buflen2 = Max(bsize + 1,
2250                                                            Min(sss->buflen2 * 2, MaxAllocSize));
2251                         sss->buf2 = palloc(sss->buflen2);
2252                 }
2253
2254                 /*
2255                  * Every Datum byte is always compared.  This is safe because the
2256                  * strxfrm() blob is itself NUL terminated, leaving no danger of
2257                  * misinterpreting any NUL bytes not intended to be interpreted as
2258                  * logically representing termination.
2259                  *
2260                  * (Actually, even if there were NUL bytes in the blob it would be
2261                  * okay.  See remarks on bytea case above.)
2262                  */
2263                 memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2264         }
2265
2266         /*
2267          * Maintain approximate cardinality of both abbreviated keys and original,
2268          * authoritative keys using HyperLogLog.  Used as cheap insurance against
2269          * the worst case, where we do many string transformations for no saving
2270          * in full strcoll()-based comparisons.  These statistics are used by
2271          * varstr_abbrev_abort().
2272          *
2273          * First, Hash key proper, or a significant fraction of it.  Mix in length
2274          * in order to compensate for cases where differences are past
2275          * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2276          */
2277         hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2278                                                                    Min(len, PG_CACHE_LINE_SIZE)));
2279
2280         if (len > PG_CACHE_LINE_SIZE)
2281                 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2282
2283         addHyperLogLog(&sss->full_card, hash);
2284
2285         /* Hash abbreviated key */
2286 #if SIZEOF_DATUM == 8
2287         {
2288                 uint32          lohalf,
2289                                         hihalf;
2290
2291                 lohalf = (uint32) res;
2292                 hihalf = (uint32) (res >> 32);
2293                 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2294         }
2295 #else                                                   /* SIZEOF_DATUM != 8 */
2296         hash = DatumGetUInt32(hash_uint32((uint32) res));
2297 #endif
2298
2299         addHyperLogLog(&sss->abbr_card, hash);
2300
2301         /* Cache result, perhaps saving an expensive strxfrm() call next time */
2302         sss->cache_blob = true;
2303 done:
2304
2305         /*
2306          * Byteswap on little-endian machines.
2307          *
2308          * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2309          * comparator) works correctly on all platforms.  If we didn't do this,
2310          * the comparator would have to call memcmp() with a pair of pointers to
2311          * the first byte of each abbreviated key, which is slower.
2312          */
2313         res = DatumBigEndianToNative(res);
2314
2315         /* Don't leak memory here */
2316         if (PointerGetDatum(authoritative) != original)
2317                 pfree(authoritative);
2318
2319         return res;
2320 }
2321
2322 /*
2323  * Callback for estimating effectiveness of abbreviated key optimization, using
2324  * heuristic rules.  Returns value indicating if the abbreviation optimization
2325  * should be aborted, based on its projected effectiveness.
2326  */
2327 static bool
2328 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2329 {
2330         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2331         double          abbrev_distinct,
2332                                 key_distinct;
2333
2334         Assert(ssup->abbreviate);
2335
2336         /* Have a little patience */
2337         if (memtupcount < 100)
2338                 return false;
2339
2340         abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2341         key_distinct = estimateHyperLogLog(&sss->full_card);
2342
2343         /*
2344          * Clamp cardinality estimates to at least one distinct value.  While
2345          * NULLs are generally disregarded, if only NULL values were seen so far,
2346          * that might misrepresent costs if we failed to clamp.
2347          */
2348         if (abbrev_distinct <= 1.0)
2349                 abbrev_distinct = 1.0;
2350
2351         if (key_distinct <= 1.0)
2352                 key_distinct = 1.0;
2353
2354         /*
2355          * In the worst case all abbreviated keys are identical, while at the same
2356          * time there are differences within full key strings not captured in
2357          * abbreviations.
2358          */
2359 #ifdef TRACE_SORT
2360         if (trace_sort)
2361         {
2362                 double          norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2363
2364                 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2365                          "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2366                          memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2367                          sss->prop_card);
2368         }
2369 #endif
2370
2371         /*
2372          * If the number of distinct abbreviated keys approximately matches the
2373          * number of distinct authoritative original keys, that's reason enough to
2374          * proceed.  We can win even with a very low cardinality set if most
2375          * tie-breakers only memcmp().  This is by far the most important
2376          * consideration.
2377          *
2378          * While comparisons that are resolved at the abbreviated key level are
2379          * considerably cheaper than tie-breakers resolved with memcmp(), both of
2380          * those two outcomes are so much cheaper than a full strcoll() once
2381          * sorting is underway that it doesn't seem worth it to weigh abbreviated
2382          * cardinality against the overall size of the set in order to more
2383          * accurately model costs.  Assume that an abbreviated comparison, and an
2384          * abbreviated comparison with a cheap memcmp()-based authoritative
2385          * resolution are equivalent.
2386          */
2387         if (abbrev_distinct > key_distinct * sss->prop_card)
2388         {
2389                 /*
2390                  * When we have exceeded 10,000 tuples, decay required cardinality
2391                  * aggressively for next call.
2392                  *
2393                  * This is useful because the number of comparisons required on
2394                  * average increases at a linearithmic rate, and at roughly 10,000
2395                  * tuples that factor will start to dominate over the linear costs of
2396                  * string transformation (this is a conservative estimate).  The decay
2397                  * rate is chosen to be a little less aggressive than halving -- which
2398                  * (since we're called at points at which memtupcount has doubled)
2399                  * would never see the cost model actually abort past the first call
2400                  * following a decay.  This decay rate is mostly a precaution against
2401                  * a sudden, violent swing in how well abbreviated cardinality tracks
2402                  * full key cardinality.  The decay also serves to prevent a marginal
2403                  * case from being aborted too late, when too much has already been
2404                  * invested in string transformation.
2405                  *
2406                  * It's possible for sets of several million distinct strings with
2407                  * mere tens of thousands of distinct abbreviated keys to still
2408                  * benefit very significantly.  This will generally occur provided
2409                  * each abbreviated key is a proxy for a roughly uniform number of the
2410                  * set's full keys. If it isn't so, we hope to catch that early and
2411                  * abort.  If it isn't caught early, by the time the problem is
2412                  * apparent it's probably not worth aborting.
2413                  */
2414                 if (memtupcount > 10000)
2415                         sss->prop_card *= 0.65;
2416
2417                 return false;
2418         }
2419
2420         /*
2421          * Abort abbreviation strategy.
2422          *
2423          * The worst case, where all abbreviated keys are identical while all
2424          * original strings differ will typically only see a regression of about
2425          * 10% in execution time for small to medium sized lists of strings.
2426          * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2427          * often expect very large improvements, particularly with sets of strings
2428          * of moderately high to high abbreviated cardinality.  There is little to
2429          * lose but much to gain, which our strategy reflects.
2430          */
2431 #ifdef TRACE_SORT
2432         if (trace_sort)
2433                 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2434                          "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2435                          memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2436 #endif
2437
2438         return true;
2439 }
2440
2441 Datum
2442 text_larger(PG_FUNCTION_ARGS)
2443 {
2444         text       *arg1 = PG_GETARG_TEXT_PP(0);
2445         text       *arg2 = PG_GETARG_TEXT_PP(1);
2446         text       *result;
2447
2448         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2449
2450         PG_RETURN_TEXT_P(result);
2451 }
2452
2453 Datum
2454 text_smaller(PG_FUNCTION_ARGS)
2455 {
2456         text       *arg1 = PG_GETARG_TEXT_PP(0);
2457         text       *arg2 = PG_GETARG_TEXT_PP(1);
2458         text       *result;
2459
2460         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2461
2462         PG_RETURN_TEXT_P(result);
2463 }
2464
2465
2466 /*
2467  * The following operators support character-by-character comparison
2468  * of text datums, to allow building indexes suitable for LIKE clauses.
2469  * Note that the regular texteq/textne comparison operators, and regular
2470  * support functions 1 and 2 with "C" collation are assumed to be
2471  * compatible with these!
2472  */
2473
2474 static int
2475 internal_text_pattern_compare(text *arg1, text *arg2)
2476 {
2477         int                     result;
2478         int                     len1,
2479                                 len2;
2480
2481         len1 = VARSIZE_ANY_EXHDR(arg1);
2482         len2 = VARSIZE_ANY_EXHDR(arg2);
2483
2484         result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2485         if (result != 0)
2486                 return result;
2487         else if (len1 < len2)
2488                 return -1;
2489         else if (len1 > len2)
2490                 return 1;
2491         else
2492                 return 0;
2493 }
2494
2495
2496 Datum
2497 text_pattern_lt(PG_FUNCTION_ARGS)
2498 {
2499         text       *arg1 = PG_GETARG_TEXT_PP(0);
2500         text       *arg2 = PG_GETARG_TEXT_PP(1);
2501         int                     result;
2502
2503         result = internal_text_pattern_compare(arg1, arg2);
2504
2505         PG_FREE_IF_COPY(arg1, 0);
2506         PG_FREE_IF_COPY(arg2, 1);
2507
2508         PG_RETURN_BOOL(result < 0);
2509 }
2510
2511
2512 Datum
2513 text_pattern_le(PG_FUNCTION_ARGS)
2514 {
2515         text       *arg1 = PG_GETARG_TEXT_PP(0);
2516         text       *arg2 = PG_GETARG_TEXT_PP(1);
2517         int                     result;
2518
2519         result = internal_text_pattern_compare(arg1, arg2);
2520
2521         PG_FREE_IF_COPY(arg1, 0);
2522         PG_FREE_IF_COPY(arg2, 1);
2523
2524         PG_RETURN_BOOL(result <= 0);
2525 }
2526
2527
2528 Datum
2529 text_pattern_ge(PG_FUNCTION_ARGS)
2530 {
2531         text       *arg1 = PG_GETARG_TEXT_PP(0);
2532         text       *arg2 = PG_GETARG_TEXT_PP(1);
2533         int                     result;
2534
2535         result = internal_text_pattern_compare(arg1, arg2);
2536
2537         PG_FREE_IF_COPY(arg1, 0);
2538         PG_FREE_IF_COPY(arg2, 1);
2539
2540         PG_RETURN_BOOL(result >= 0);
2541 }
2542
2543
2544 Datum
2545 text_pattern_gt(PG_FUNCTION_ARGS)
2546 {
2547         text       *arg1 = PG_GETARG_TEXT_PP(0);
2548         text       *arg2 = PG_GETARG_TEXT_PP(1);
2549         int                     result;
2550
2551         result = internal_text_pattern_compare(arg1, arg2);
2552
2553         PG_FREE_IF_COPY(arg1, 0);
2554         PG_FREE_IF_COPY(arg2, 1);
2555
2556         PG_RETURN_BOOL(result > 0);
2557 }
2558
2559
2560 Datum
2561 bttext_pattern_cmp(PG_FUNCTION_ARGS)
2562 {
2563         text       *arg1 = PG_GETARG_TEXT_PP(0);
2564         text       *arg2 = PG_GETARG_TEXT_PP(1);
2565         int                     result;
2566
2567         result = internal_text_pattern_compare(arg1, arg2);
2568
2569         PG_FREE_IF_COPY(arg1, 0);
2570         PG_FREE_IF_COPY(arg2, 1);
2571
2572         PG_RETURN_INT32(result);
2573 }
2574
2575
2576 Datum
2577 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2578 {
2579         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2580         MemoryContext oldcontext;
2581
2582         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2583
2584         /* Use generic string SortSupport, forcing "C" collation */
2585         varstr_sortsupport(ssup, C_COLLATION_OID, false);
2586
2587         MemoryContextSwitchTo(oldcontext);
2588
2589         PG_RETURN_VOID();
2590 }
2591
2592
2593 /*-------------------------------------------------------------
2594  * byteaoctetlen
2595  *
2596  * get the number of bytes contained in an instance of type 'bytea'
2597  *-------------------------------------------------------------
2598  */
2599 Datum
2600 byteaoctetlen(PG_FUNCTION_ARGS)
2601 {
2602         Datum           str = PG_GETARG_DATUM(0);
2603
2604         /* We need not detoast the input at all */
2605         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2606 }
2607
2608 /*
2609  * byteacat -
2610  *        takes two bytea* and returns a bytea* that is the concatenation of
2611  *        the two.
2612  *
2613  * Cloned from textcat and modified as required.
2614  */
2615 Datum
2616 byteacat(PG_FUNCTION_ARGS)
2617 {
2618         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
2619         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
2620
2621         PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2622 }
2623
2624 /*
2625  * bytea_catenate
2626  *      Guts of byteacat(), broken out so it can be used by other functions
2627  *
2628  * Arguments can be in short-header form, but not compressed or out-of-line
2629  */
2630 static bytea *
2631 bytea_catenate(bytea *t1, bytea *t2)
2632 {
2633         bytea      *result;
2634         int                     len1,
2635                                 len2,
2636                                 len;
2637         char       *ptr;
2638
2639         len1 = VARSIZE_ANY_EXHDR(t1);
2640         len2 = VARSIZE_ANY_EXHDR(t2);
2641
2642         /* paranoia ... probably should throw error instead? */
2643         if (len1 < 0)
2644                 len1 = 0;
2645         if (len2 < 0)
2646                 len2 = 0;
2647
2648         len = len1 + len2 + VARHDRSZ;
2649         result = (bytea *) palloc(len);
2650
2651         /* Set size of result string... */
2652         SET_VARSIZE(result, len);
2653
2654         /* Fill data field of result string... */
2655         ptr = VARDATA(result);
2656         if (len1 > 0)
2657                 memcpy(ptr, VARDATA_ANY(t1), len1);
2658         if (len2 > 0)
2659                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2660
2661         return result;
2662 }
2663
/* Build a bytea from a C string by passing it through byteain() */
#define PG_STR_GET_BYTEA(cstr) \
	DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(cstr)))
2666
2667 /*
2668  * bytea_substr()
2669  * Return a substring starting at the specified position.
2670  * Cloned from text_substr and modified as required.
2671  *
2672  * Input:
2673  *      - string
2674  *      - starting position (is one-based)
2675  *      - string length (optional)
2676  *
2677  * If the starting position is zero or less, then return from the start of the string
2678  * adjusting the length to be consistent with the "negative start" per SQL.
2679  * If the length is less than zero, an ERROR is thrown. If no third argument
2680  * (length) is provided, the length to the end of the string is assumed.
2681  */
2682 Datum
2683 bytea_substr(PG_FUNCTION_ARGS)
2684 {
2685         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2686                                                                           PG_GETARG_INT32(1),
2687                                                                           PG_GETARG_INT32(2),
2688                                                                           false));
2689 }
2690
2691 /*
2692  * bytea_substr_no_len -
2693  *        Wrapper to avoid opr_sanity failure due to
2694  *        one function accepting a different number of args.
2695  */
2696 Datum
2697 bytea_substr_no_len(PG_FUNCTION_ARGS)
2698 {
2699         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2700                                                                           PG_GETARG_INT32(1),
2701                                                                           -1,
2702                                                                           true));
2703 }
2704
2705 static bytea *
2706 bytea_substring(Datum str,
2707                                 int S,
2708                                 int L,
2709                                 bool length_not_specified)
2710 {
2711         int                     S1;                             /* adjusted start position */
2712         int                     L1;                             /* adjusted substring length */
2713
2714         S1 = Max(S, 1);
2715
2716         if (length_not_specified)
2717         {
2718                 /*
2719                  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2720                  * end of the string if we pass it a negative value for length.
2721                  */
2722                 L1 = -1;
2723         }
2724         else
2725         {
2726                 /* end position */
2727                 int                     E = S + L;
2728
2729                 /*
2730                  * A negative value for L is the only way for the end position to be
2731                  * before the start. SQL99 says to throw an error.
2732                  */
2733                 if (E < S)
2734                         ereport(ERROR,
2735                                         (errcode(ERRCODE_SUBSTRING_ERROR),
2736                                          errmsg("negative substring length not allowed")));
2737
2738                 /*
2739                  * A zero or negative value for the end position can happen if the
2740                  * start was negative or one. SQL99 says to return a zero-length
2741                  * string.
2742                  */
2743                 if (E < 1)
2744                         return PG_STR_GET_BYTEA("");
2745
2746                 L1 = E - S1;
2747         }
2748
2749         /*
2750          * If the start position is past the end of the string, SQL99 says to
2751          * return a zero-length string -- DatumGetByteaPSlice() will do that for
2752          * us. Convert to zero-based starting position
2753          */
2754         return DatumGetByteaPSlice(str, S1 - 1, L1);
2755 }
2756
2757 /*
2758  * byteaoverlay
2759  *      Replace specified substring of first string with second
2760  *
2761  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2762  * This code is a direct implementation of what the standard says.
2763  */
2764 Datum
2765 byteaoverlay(PG_FUNCTION_ARGS)
2766 {
2767         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
2768         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
2769         int                     sp = PG_GETARG_INT32(2);                /* substring start position */
2770         int                     sl = PG_GETARG_INT32(3);                /* substring length */
2771
2772         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2773 }
2774
2775 Datum
2776 byteaoverlay_no_len(PG_FUNCTION_ARGS)
2777 {
2778         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
2779         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
2780         int                     sp = PG_GETARG_INT32(2);                /* substring start position */
2781         int                     sl;
2782
2783         sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2784         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2785 }
2786
2787 static bytea *
2788 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2789 {
2790         bytea      *result;
2791         bytea      *s1;
2792         bytea      *s2;
2793         int                     sp_pl_sl;
2794
2795         /*
2796          * Check for possible integer-overflow cases.  For negative sp, throw a
2797          * "substring length" error because that's what should be expected
2798          * according to the spec's definition of OVERLAY().
2799          */
2800         if (sp <= 0)
2801                 ereport(ERROR,
2802                                 (errcode(ERRCODE_SUBSTRING_ERROR),
2803                                  errmsg("negative substring length not allowed")));
2804         sp_pl_sl = sp + sl;
2805         if (sp_pl_sl <= sl)
2806                 ereport(ERROR,
2807                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2808                                  errmsg("integer out of range")));
2809
2810         s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2811         s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2812         result = bytea_catenate(s1, t2);
2813         result = bytea_catenate(result, s2);
2814
2815         return result;
2816 }
2817
2818 /*
2819  * byteapos -
2820  *        Return the position of the specified substring.
2821  *        Implements the SQL POSITION() function.
2822  * Cloned from textpos and modified as required.
2823  */
2824 Datum
2825 byteapos(PG_FUNCTION_ARGS)
2826 {
2827         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
2828         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
2829         int                     pos;
2830         int                     px,
2831                                 p;
2832         int                     len1,
2833                                 len2;
2834         char       *p1,
2835                            *p2;
2836
2837         len1 = VARSIZE_ANY_EXHDR(t1);
2838         len2 = VARSIZE_ANY_EXHDR(t2);
2839
2840         if (len2 <= 0)
2841                 PG_RETURN_INT32(1);             /* result for empty pattern */
2842
2843         p1 = VARDATA_ANY(t1);
2844         p2 = VARDATA_ANY(t2);
2845
2846         pos = 0;
2847         px = (len1 - len2);
2848         for (p = 0; p <= px; p++)
2849         {
2850                 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2851                 {
2852                         pos = p + 1;
2853                         break;
2854                 };
2855                 p1++;
2856         };
2857
2858         PG_RETURN_INT32(pos);
2859 }
2860
2861 /*-------------------------------------------------------------
2862  * byteaGetByte
2863  *
2864  * this routine treats "bytea" as an array of bytes.
2865  * It returns the Nth byte (a number between 0 and 255).
2866  *-------------------------------------------------------------
2867  */
2868 Datum
2869 byteaGetByte(PG_FUNCTION_ARGS)
2870 {
2871         bytea      *v = PG_GETARG_BYTEA_PP(0);
2872         int32           n = PG_GETARG_INT32(1);
2873         int                     len;
2874         int                     byte;
2875
2876         len = VARSIZE_ANY_EXHDR(v);
2877
2878         if (n < 0 || n >= len)
2879                 ereport(ERROR,
2880                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2881                                  errmsg("index %d out of valid range, 0..%d",
2882                                                 n, len - 1)));
2883
2884         byte = ((unsigned char *) VARDATA_ANY(v))[n];
2885
2886         PG_RETURN_INT32(byte);
2887 }
2888
2889 /*-------------------------------------------------------------
2890  * byteaGetBit
2891  *
2892  * This routine treats a "bytea" type like an array of bits.
2893  * It returns the value of the Nth bit (0 or 1).
2894  *
2895  *-------------------------------------------------------------
2896  */
2897 Datum
2898 byteaGetBit(PG_FUNCTION_ARGS)
2899 {
2900         bytea      *v = PG_GETARG_BYTEA_PP(0);
2901         int32           n = PG_GETARG_INT32(1);
2902         int                     byteNo,
2903                                 bitNo;
2904         int                     len;
2905         int                     byte;
2906
2907         len = VARSIZE_ANY_EXHDR(v);
2908
2909         if (n < 0 || n >= len * 8)
2910                 ereport(ERROR,
2911                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2912                                  errmsg("index %d out of valid range, 0..%d",
2913                                                 n, len * 8 - 1)));
2914
2915         byteNo = n / 8;
2916         bitNo = n % 8;
2917
2918         byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
2919
2920         if (byte & (1 << bitNo))
2921                 PG_RETURN_INT32(1);
2922         else
2923                 PG_RETURN_INT32(0);
2924 }
2925
2926 /*-------------------------------------------------------------
2927  * byteaSetByte
2928  *
2929  * Given an instance of type 'bytea' creates a new one with
2930  * the Nth byte set to the given value.
2931  *
2932  *-------------------------------------------------------------
2933  */
2934 Datum
2935 byteaSetByte(PG_FUNCTION_ARGS)
2936 {
2937         bytea      *v = PG_GETARG_BYTEA_P(0);
2938         int32           n = PG_GETARG_INT32(1);
2939         int32           newByte = PG_GETARG_INT32(2);
2940         int                     len;
2941         bytea      *res;
2942
2943         len = VARSIZE(v) - VARHDRSZ;
2944
2945         if (n < 0 || n >= len)
2946                 ereport(ERROR,
2947                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2948                                  errmsg("index %d out of valid range, 0..%d",
2949                                                 n, len - 1)));
2950
2951         /*
2952          * Make a copy of the original varlena.
2953          */
2954         res = (bytea *) palloc(VARSIZE(v));
2955         memcpy((char *) res, (char *) v, VARSIZE(v));
2956
2957         /*
2958          * Now set the byte.
2959          */
2960         ((unsigned char *) VARDATA(res))[n] = newByte;
2961
2962         PG_RETURN_BYTEA_P(res);
2963 }
2964
2965 /*-------------------------------------------------------------
2966  * byteaSetBit
2967  *
2968  * Given an instance of type 'bytea' creates a new one with
2969  * the Nth bit set to the given value.
2970  *
2971  *-------------------------------------------------------------
2972  */
2973 Datum
2974 byteaSetBit(PG_FUNCTION_ARGS)
2975 {
2976         bytea      *v = PG_GETARG_BYTEA_P(0);
2977         int32           n = PG_GETARG_INT32(1);
2978         int32           newBit = PG_GETARG_INT32(2);
2979         bytea      *res;
2980         int                     len;
2981         int                     oldByte,
2982                                 newByte;
2983         int                     byteNo,
2984                                 bitNo;
2985
2986         len = VARSIZE(v) - VARHDRSZ;
2987
2988         if (n < 0 || n >= len * 8)
2989                 ereport(ERROR,
2990                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2991                                  errmsg("index %d out of valid range, 0..%d",
2992                                                 n, len * 8 - 1)));
2993
2994         byteNo = n / 8;
2995         bitNo = n % 8;
2996
2997         /*
2998          * sanity check!
2999          */
3000         if (newBit != 0 && newBit != 1)
3001                 ereport(ERROR,
3002                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3003                                  errmsg("new bit must be 0 or 1")));
3004
3005         /*
3006          * Make a copy of the original varlena.
3007          */
3008         res = (bytea *) palloc(VARSIZE(v));
3009         memcpy((char *) res, (char *) v, VARSIZE(v));
3010
3011         /*
3012          * Update the byte.
3013          */
3014         oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3015
3016         if (newBit == 0)
3017                 newByte = oldByte & (~(1 << bitNo));
3018         else
3019                 newByte = oldByte | (1 << bitNo);
3020
3021         ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3022
3023         PG_RETURN_BYTEA_P(res);
3024 }
3025
3026
3027 /* text_name()
3028  * Converts a text type to a Name type.
3029  */
3030 Datum
3031 text_name(PG_FUNCTION_ARGS)
3032 {
3033         text       *s = PG_GETARG_TEXT_PP(0);
3034         Name            result;
3035         int                     len;
3036
3037         len = VARSIZE_ANY_EXHDR(s);
3038
3039         /* Truncate oversize input */
3040         if (len >= NAMEDATALEN)
3041                 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3042
3043         /* We use palloc0 here to ensure result is zero-padded */
3044         result = (Name) palloc0(NAMEDATALEN);
3045         memcpy(NameStr(*result), VARDATA_ANY(s), len);
3046
3047         PG_RETURN_NAME(result);
3048 }
3049
3050 /* name_text()
3051  * Converts a Name type to a text type.
3052  */
3053 Datum
3054 name_text(PG_FUNCTION_ARGS)
3055 {
3056         Name            s = PG_GETARG_NAME(0);
3057
3058         PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3059 }
3060
3061
3062 /*
3063  * textToQualifiedNameList - convert a text object to list of names
3064  *
3065  * This implements the input parsing needed by nextval() and other
3066  * functions that take a text parameter representing a qualified name.
3067  * We split the name at dots, downcase if not double-quoted, and
3068  * truncate names if they're too long.
3069  */
3070 List *
3071 textToQualifiedNameList(text *textval)
3072 {
3073         char       *rawname;
3074         List       *result = NIL;
3075         List       *namelist;
3076         ListCell   *l;
3077
3078         /* Convert to C string (handles possible detoasting). */
3079         /* Note we rely on being able to modify rawname below. */
3080         rawname = text_to_cstring(textval);
3081
3082         if (!SplitIdentifierString(rawname, '.', &namelist))
3083                 ereport(ERROR,
3084                                 (errcode(ERRCODE_INVALID_NAME),
3085                                  errmsg("invalid name syntax")));
3086
3087         if (namelist == NIL)
3088                 ereport(ERROR,
3089                                 (errcode(ERRCODE_INVALID_NAME),
3090                                  errmsg("invalid name syntax")));
3091
3092         foreach(l, namelist)
3093         {
3094                 char       *curname = (char *) lfirst(l);
3095
3096                 result = lappend(result, makeString(pstrdup(curname)));
3097         }
3098
3099         pfree(rawname);
3100         list_free(namelist);
3101
3102         return result;
3103 }
3104
3105 /*
3106  * SplitIdentifierString --- parse a string containing identifiers
3107  *
3108  * This is the guts of textToQualifiedNameList, and is exported for use in
3109  * other situations such as parsing GUC variables.  In the GUC case, it's
3110  * important to avoid memory leaks, so the API is designed to minimize the
3111  * amount of stuff that needs to be allocated and freed.
3112  *
3113  * Inputs:
3114  *      rawstring: the input string; must be overwritable!      On return, it's
3115  *                         been modified to contain the separated identifiers.
3116  *      separator: the separator punctuation expected between identifiers
3117  *                         (typically '.' or ',').  Whitespace may also appear around
3118  *                         identifiers.
3119  * Outputs:
3120  *      namelist: filled with a palloc'd list of pointers to identifiers within
3121  *                        rawstring.  Caller should list_free() this even on error return.
3122  *
3123  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3124  *
3125  * Note that an empty string is considered okay here, though not in
3126  * textToQualifiedNameList.
3127  */
3128 bool
3129 SplitIdentifierString(char *rawstring, char separator,
3130                                           List **namelist)
3131 {
3132         char       *nextp = rawstring;
3133         bool            done = false;
3134
3135         *namelist = NIL;
3136
3137         while (isspace((unsigned char) *nextp))
3138                 nextp++;                                /* skip leading whitespace */
3139
3140         if (*nextp == '\0')
3141                 return true;                    /* allow empty string */
3142
3143         /* At the top of the loop, we are at start of a new identifier. */
3144         do
3145         {
3146                 char       *curname;
3147                 char       *endp;
3148
3149                 if (*nextp == '"')
3150                 {
3151                         /* Quoted name --- collapse quote-quote pairs, no downcasing */
3152                         curname = nextp + 1;
3153                         for (;;)
3154                         {
3155                                 endp = strchr(nextp + 1, '"');
3156                                 if (endp == NULL)
3157                                         return false;           /* mismatched quotes */
3158                                 if (endp[1] != '"')
3159                                         break;          /* found end of quoted name */
3160                                 /* Collapse adjacent quotes into one quote, and look again */
3161                                 memmove(endp, endp + 1, strlen(endp));
3162                                 nextp = endp;
3163                         }
3164                         /* endp now points at the terminating quote */
3165                         nextp = endp + 1;
3166                 }
3167                 else
3168                 {
3169                         /* Unquoted name --- extends to separator or whitespace */
3170                         char       *downname;
3171                         int                     len;
3172
3173                         curname = nextp;
3174                         while (*nextp && *nextp != separator &&
3175                                    !isspace((unsigned char) *nextp))
3176                                 nextp++;
3177                         endp = nextp;
3178                         if (curname == nextp)
3179                                 return false;   /* empty unquoted name not allowed */
3180
3181                         /*
3182                          * Downcase the identifier, using same code as main lexer does.
3183                          *
3184                          * XXX because we want to overwrite the input in-place, we cannot
3185                          * support a downcasing transformation that increases the string
3186                          * length.  This is not a problem given the current implementation
3187                          * of downcase_truncate_identifier, but we'll probably have to do
3188                          * something about this someday.
3189                          */
3190                         len = endp - curname;
3191                         downname = downcase_truncate_identifier(curname, len, false);
3192                         Assert(strlen(downname) <= len);
3193                         strncpy(curname, downname, len);        /* strncpy is required here */
3194                         pfree(downname);
3195                 }
3196
3197                 while (isspace((unsigned char) *nextp))
3198                         nextp++;                        /* skip trailing whitespace */
3199
3200                 if (*nextp == separator)
3201                 {
3202                         nextp++;
3203                         while (isspace((unsigned char) *nextp))
3204                                 nextp++;                /* skip leading whitespace for next */
3205                         /* we expect another name, so done remains false */
3206                 }
3207                 else if (*nextp == '\0')
3208                         done = true;
3209                 else
3210                         return false;           /* invalid syntax */
3211
3212                 /* Now safe to overwrite separator with a null */
3213                 *endp = '\0';
3214
3215                 /* Truncate name if it's overlength */
3216                 truncate_identifier(curname, strlen(curname), false);
3217
3218                 /*
3219                  * Finished isolating current name --- add it to list
3220                  */
3221                 *namelist = lappend(*namelist, curname);
3222
3223                 /* Loop back if we didn't reach end of string */
3224         } while (!done);
3225
3226         return true;
3227 }
3228
3229
3230 /*
3231  * SplitDirectoriesString --- parse a string containing directory names
3232  *
3233  * This is similar to SplitIdentifierString, except that the parsing
3234  * rules are meant to handle pathnames instead of identifiers: there is
3235  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3236  * and we apply canonicalize_path() to each extracted string.  Because of the
3237  * last, the returned strings are separately palloc'd rather than being
3238  * pointers into rawstring --- but we still scribble on rawstring.
3239  *
3240  * Inputs:
3241  *      rawstring: the input string; must be modifiable!
3242  *      separator: the separator punctuation expected between directories
3243  *                         (typically ',' or ';').  Whitespace may also appear around
3244  *                         directories.
3245  * Outputs:
3246  *      namelist: filled with a palloc'd list of directory names.
3247  *                        Caller should list_free_deep() this even on error return.
3248  *
3249  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3250  *
3251  * Note that an empty string is considered okay here.
3252  */
3253 bool
3254 SplitDirectoriesString(char *rawstring, char separator,
3255                                            List **namelist)
3256 {
3257         char       *nextp = rawstring;
3258         bool            done = false;
3259
3260         *namelist = NIL;
3261
3262         while (isspace((unsigned char) *nextp))
3263                 nextp++;                                /* skip leading whitespace */
3264
3265         if (*nextp == '\0')
3266                 return true;                    /* allow empty string */
3267
3268         /* At the top of the loop, we are at start of a new directory. */
3269         do
3270         {
3271                 char       *curname;
3272                 char       *endp;
3273
3274                 if (*nextp == '"')
3275                 {
3276                         /* Quoted name --- collapse quote-quote pairs */
3277                         curname = nextp + 1;
3278                         for (;;)
3279                         {
3280                                 endp = strchr(nextp + 1, '"');
3281                                 if (endp == NULL)
3282                                         return false;           /* mismatched quotes */
3283                                 if (endp[1] != '"')
3284                                         break;          /* found end of quoted name */
3285                                 /* Collapse adjacent quotes into one quote, and look again */
3286                                 memmove(endp, endp + 1, strlen(endp));
3287                                 nextp = endp;
3288                         }
3289                         /* endp now points at the terminating quote */
3290                         nextp = endp + 1;
3291                 }
3292                 else
3293                 {
3294                         /* Unquoted name --- extends to separator or end of string */
3295                         curname = endp = nextp;
3296                         while (*nextp && *nextp != separator)
3297                         {
3298                                 /* trailing whitespace should not be included in name */
3299                                 if (!isspace((unsigned char) *nextp))
3300                                         endp = nextp + 1;
3301                                 nextp++;
3302                         }
3303                         if (curname == endp)
3304                                 return false;   /* empty unquoted name not allowed */
3305                 }
3306
3307                 while (isspace((unsigned char) *nextp))
3308                         nextp++;                        /* skip trailing whitespace */
3309
3310                 if (*nextp == separator)
3311                 {
3312                         nextp++;
3313                         while (isspace((unsigned char) *nextp))
3314                                 nextp++;                /* skip leading whitespace for next */
3315                         /* we expect another name, so done remains false */
3316                 }
3317                 else if (*nextp == '\0')
3318                         done = true;
3319                 else
3320                         return false;           /* invalid syntax */
3321
3322                 /* Now safe to overwrite separator with a null */
3323                 *endp = '\0';
3324
3325                 /* Truncate path if it's overlength */
3326                 if (strlen(curname) >= MAXPGPATH)
3327                         curname[MAXPGPATH - 1] = '\0';
3328
3329                 /*
3330                  * Finished isolating current name --- add it to list
3331                  */
3332                 curname = pstrdup(curname);
3333                 canonicalize_path(curname);
3334                 *namelist = lappend(*namelist, curname);
3335
3336                 /* Loop back if we didn't reach end of string */
3337         } while (!done);
3338
3339         return true;
3340 }
3341
3342
3343 /*****************************************************************************
3344  *      Comparison Functions used for bytea
3345  *
3346  * Note: btree indexes need these routines not to leak memory; therefore,
3347  * be careful to free working copies of toasted datums.  Most places don't
3348  * need to be so careful.
3349  *****************************************************************************/
3350
3351 Datum
3352 byteaeq(PG_FUNCTION_ARGS)
3353 {
3354         Datum           arg1 = PG_GETARG_DATUM(0);
3355         Datum           arg2 = PG_GETARG_DATUM(1);
3356         bool            result;
3357         Size            len1,
3358                                 len2;
3359
3360         /*
3361          * We can use a fast path for unequal lengths, which might save us from
3362          * having to detoast one or both values.
3363          */
3364         len1 = toast_raw_datum_size(arg1);
3365         len2 = toast_raw_datum_size(arg2);
3366         if (len1 != len2)
3367                 result = false;
3368         else
3369         {
3370                 bytea      *barg1 = DatumGetByteaPP(arg1);
3371                 bytea      *barg2 = DatumGetByteaPP(arg2);
3372
3373                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3374                                                  len1 - VARHDRSZ) == 0);
3375
3376                 PG_FREE_IF_COPY(barg1, 0);
3377                 PG_FREE_IF_COPY(barg2, 1);
3378         }
3379
3380         PG_RETURN_BOOL(result);
3381 }
3382
3383 Datum
3384 byteane(PG_FUNCTION_ARGS)
3385 {
3386         Datum           arg1 = PG_GETARG_DATUM(0);
3387         Datum           arg2 = PG_GETARG_DATUM(1);
3388         bool            result;
3389         Size            len1,
3390                                 len2;
3391
3392         /*
3393          * We can use a fast path for unequal lengths, which might save us from
3394          * having to detoast one or both values.
3395          */
3396         len1 = toast_raw_datum_size(arg1);
3397         len2 = toast_raw_datum_size(arg2);
3398         if (len1 != len2)
3399                 result = true;
3400         else
3401         {
3402                 bytea      *barg1 = DatumGetByteaPP(arg1);
3403                 bytea      *barg2 = DatumGetByteaPP(arg2);
3404
3405                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3406                                                  len1 - VARHDRSZ) != 0);
3407
3408                 PG_FREE_IF_COPY(barg1, 0);
3409                 PG_FREE_IF_COPY(barg2, 1);
3410         }
3411
3412         PG_RETURN_BOOL(result);
3413 }
3414
3415 Datum
3416 bytealt(PG_FUNCTION_ARGS)
3417 {
3418         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3419         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3420         int                     len1,
3421                                 len2;
3422         int                     cmp;
3423
3424         len1 = VARSIZE_ANY_EXHDR(arg1);
3425         len2 = VARSIZE_ANY_EXHDR(arg2);
3426
3427         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3428
3429         PG_FREE_IF_COPY(arg1, 0);
3430         PG_FREE_IF_COPY(arg2, 1);
3431
3432         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3433 }
3434
3435 Datum
3436 byteale(PG_FUNCTION_ARGS)
3437 {
3438         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3439         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3440         int                     len1,
3441                                 len2;
3442         int                     cmp;
3443
3444         len1 = VARSIZE_ANY_EXHDR(arg1);
3445         len2 = VARSIZE_ANY_EXHDR(arg2);
3446
3447         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3448
3449         PG_FREE_IF_COPY(arg1, 0);
3450         PG_FREE_IF_COPY(arg2, 1);
3451
3452         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3453 }
3454
3455 Datum
3456 byteagt(PG_FUNCTION_ARGS)
3457 {
3458         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3459         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3460         int                     len1,
3461                                 len2;
3462         int                     cmp;
3463
3464         len1 = VARSIZE_ANY_EXHDR(arg1);
3465         len2 = VARSIZE_ANY_EXHDR(arg2);
3466
3467         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3468
3469         PG_FREE_IF_COPY(arg1, 0);
3470         PG_FREE_IF_COPY(arg2, 1);
3471
3472         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3473 }
3474
3475 Datum
3476 byteage(PG_FUNCTION_ARGS)
3477 {
3478         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3479         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3480         int                     len1,
3481                                 len2;
3482         int                     cmp;
3483
3484         len1 = VARSIZE_ANY_EXHDR(arg1);
3485         len2 = VARSIZE_ANY_EXHDR(arg2);
3486
3487         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3488
3489         PG_FREE_IF_COPY(arg1, 0);
3490         PG_FREE_IF_COPY(arg2, 1);
3491
3492         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3493 }
3494
3495 Datum
3496 byteacmp(PG_FUNCTION_ARGS)
3497 {
3498         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3499         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3500         int                     len1,
3501                                 len2;
3502         int                     cmp;
3503
3504         len1 = VARSIZE_ANY_EXHDR(arg1);
3505         len2 = VARSIZE_ANY_EXHDR(arg2);
3506
3507         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3508         if ((cmp == 0) && (len1 != len2))
3509                 cmp = (len1 < len2) ? -1 : 1;
3510
3511         PG_FREE_IF_COPY(arg1, 0);
3512         PG_FREE_IF_COPY(arg2, 1);
3513
3514         PG_RETURN_INT32(cmp);
3515 }
3516
3517 Datum
3518 bytea_sortsupport(PG_FUNCTION_ARGS)
3519 {
3520         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3521         MemoryContext oldcontext;
3522
3523         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3524
3525         /* Use generic string SortSupport, forcing "C" collation */
3526         varstr_sortsupport(ssup, C_COLLATION_OID, false);
3527
3528         MemoryContextSwitchTo(oldcontext);
3529
3530         PG_RETURN_VOID();
3531 }
3532
3533 /*
3534  * appendStringInfoText
3535  *
3536  * Append a text to str.
3537  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3538  */
3539 static void
3540 appendStringInfoText(StringInfo str, const text *t)
3541 {
3542         appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3543 }
3544
3545 /*
3546  * replace_text
3547  * replace all occurrences of 'old_sub_str' in 'orig_str'
3548  * with 'new_sub_str' to form 'new_str'
3549  *
3550  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3551  * otherwise returns 'new_str'
3552  */
3553 Datum
3554 replace_text(PG_FUNCTION_ARGS)
3555 {
3556         text       *src_text = PG_GETARG_TEXT_PP(0);
3557         text       *from_sub_text = PG_GETARG_TEXT_PP(1);
3558         text       *to_sub_text = PG_GETARG_TEXT_PP(2);
3559         int                     src_text_len;
3560         int                     from_sub_text_len;
3561         TextPositionState state;
3562         text       *ret_text;
3563         int                     start_posn;
3564         int                     curr_posn;
3565         int                     chunk_len;
3566         char       *start_ptr;
3567         StringInfoData str;
3568
3569         text_position_setup(src_text, from_sub_text, &state);
3570
3571         /*
3572          * Note: we check the converted string length, not the original, because
3573          * they could be different if the input contained invalid encoding.
3574          */
3575         src_text_len = state.len1;
3576         from_sub_text_len = state.len2;
3577
3578         /* Return unmodified source string if empty source or pattern */
3579         if (src_text_len < 1 || from_sub_text_len < 1)
3580         {
3581                 text_position_cleanup(&state);
3582                 PG_RETURN_TEXT_P(src_text);
3583         }
3584
3585         start_posn = 1;
3586         curr_posn = text_position_next(1, &state);
3587
3588         /* When the from_sub_text is not found, there is nothing to do. */
3589         if (curr_posn == 0)
3590         {
3591                 text_position_cleanup(&state);
3592                 PG_RETURN_TEXT_P(src_text);
3593         }
3594
3595         /* start_ptr points to the start_posn'th character of src_text */
3596         start_ptr = VARDATA_ANY(src_text);
3597
3598         initStringInfo(&str);
3599
3600         do
3601         {
3602                 CHECK_FOR_INTERRUPTS();
3603
3604                 /* copy the data skipped over by last text_position_next() */
3605                 chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3606                 appendBinaryStringInfo(&str, start_ptr, chunk_len);
3607
3608                 appendStringInfoText(&str, to_sub_text);
3609
3610                 start_posn = curr_posn;
3611                 start_ptr += chunk_len;
3612                 start_posn += from_sub_text_len;
3613                 start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3614
3615                 curr_posn = text_position_next(start_posn, &state);
3616         }
3617         while (curr_posn > 0);
3618
3619         /* copy trailing data */
3620         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3621         appendBinaryStringInfo(&str, start_ptr, chunk_len);
3622
3623         text_position_cleanup(&state);
3624
3625         ret_text = cstring_to_text_with_len(str.data, str.len);
3626         pfree(str.data);
3627
3628         PG_RETURN_TEXT_P(ret_text);
3629 }
3630
3631 /*
3632  * check_replace_text_has_escape_char
3633  *
3634  * check whether replace_text contains escape char.
3635  */
3636 static bool
3637 check_replace_text_has_escape_char(const text *replace_text)
3638 {
3639         const char *p = VARDATA_ANY(replace_text);
3640         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3641
3642         if (pg_database_encoding_max_length() == 1)
3643         {
3644                 for (; p < p_end; p++)
3645                 {
3646                         if (*p == '\\')
3647                                 return true;
3648                 }
3649         }
3650         else
3651         {
3652                 for (; p < p_end; p += pg_mblen(p))
3653                 {
3654                         if (*p == '\\')
3655                                 return true;
3656                 }
3657         }
3658
3659         return false;
3660 }
3661
3662 /*
3663  * appendStringInfoRegexpSubstr
3664  *
3665  * Append replace_text to str, substituting regexp back references for
3666  * \n escapes.  start_ptr is the start of the match in the source string,
3667  * at logical character position data_pos.
3668  */
3669 static void
3670 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
3671                                                          regmatch_t *pmatch,
3672                                                          char *start_ptr, int data_pos)
3673 {
3674         const char *p = VARDATA_ANY(replace_text);
3675         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3676         int                     eml = pg_database_encoding_max_length();
3677
3678         for (;;)
3679         {
3680                 const char *chunk_start = p;
3681                 int                     so;
3682                 int                     eo;
3683
3684                 /* Find next escape char. */
3685                 if (eml == 1)
3686                 {
3687                         for (; p < p_end && *p != '\\'; p++)
3688                                  /* nothing */ ;
3689                 }
3690                 else
3691                 {
3692                         for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3693                                  /* nothing */ ;
3694                 }
3695
3696                 /* Copy the text we just scanned over, if any. */
3697                 if (p > chunk_start)
3698                         appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3699
3700                 /* Done if at end of string, else advance over escape char. */
3701                 if (p >= p_end)
3702                         break;
3703                 p++;
3704
3705                 if (p >= p_end)
3706                 {
3707                         /* Escape at very end of input.  Treat same as unexpected char */
3708                         appendStringInfoChar(str, '\\');
3709                         break;
3710                 }
3711
3712                 if (*p >= '1' && *p <= '9')
3713                 {
3714                         /* Use the back reference of regexp. */
3715                         int                     idx = *p - '0';
3716
3717                         so = pmatch[idx].rm_so;
3718                         eo = pmatch[idx].rm_eo;
3719                         p++;
3720                 }
3721                 else if (*p == '&')
3722                 {
3723                         /* Use the entire matched string. */
3724                         so = pmatch[0].rm_so;
3725                         eo = pmatch[0].rm_eo;
3726                         p++;
3727                 }
3728                 else if (*p == '\\')
3729                 {
3730                         /* \\ means transfer one \ to output. */
3731                         appendStringInfoChar(str, '\\');
3732                         p++;
3733                         continue;
3734                 }
3735                 else
3736                 {
3737                         /*
3738                          * If escape char is not followed by any expected char, just treat
3739                          * it as ordinary data to copy.  (XXX would it be better to throw
3740                          * an error?)
3741                          */
3742                         appendStringInfoChar(str, '\\');
3743                         continue;
3744                 }
3745
3746                 if (so != -1 && eo != -1)
3747                 {
3748                         /*
3749                          * Copy the text that is back reference of regexp.  Note so and eo
3750                          * are counted in characters not bytes.
3751                          */
3752                         char       *chunk_start;
3753                         int                     chunk_len;
3754
3755                         Assert(so >= data_pos);
3756                         chunk_start = start_ptr;
3757                         chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3758                         chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3759                         appendBinaryStringInfo(str, chunk_start, chunk_len);
3760                 }
3761         }
3762 }
3763
3764 #define REGEXP_REPLACE_BACKREF_CNT              10
3765
3766 /*
3767  * replace_text_regexp
3768  *
3769  * replace text that matches to regexp in src_text to replace_text.
3770  *
3771  * Note: to avoid having to include regex.h in builtins.h, we declare
3772  * the regexp argument as void *, but really it's regex_t *.
3773  */
3774 text *
3775 replace_text_regexp(text *src_text, void *regexp,
3776                                         text *replace_text, bool glob)
3777 {
3778         text       *ret_text;
3779         regex_t    *re = (regex_t *) regexp;
3780         int                     src_text_len = VARSIZE_ANY_EXHDR(src_text);
3781         StringInfoData buf;
3782         regmatch_t      pmatch[REGEXP_REPLACE_BACKREF_CNT];
3783         pg_wchar   *data;
3784         size_t          data_len;
3785         int                     search_start;
3786         int                     data_pos;
3787         char       *start_ptr;
3788         bool            have_escape;
3789
3790         initStringInfo(&buf);
3791
3792         /* Convert data string to wide characters. */
3793         data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3794         data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3795
3796         /* Check whether replace_text has escape char. */
3797         have_escape = check_replace_text_has_escape_char(replace_text);
3798
3799         /* start_ptr points to the data_pos'th character of src_text */
3800         start_ptr = (char *) VARDATA_ANY(src_text);
3801         data_pos = 0;
3802
3803         search_start = 0;
3804         while (search_start <= data_len)
3805         {
3806                 int                     regexec_result;
3807
3808                 CHECK_FOR_INTERRUPTS();
3809
3810                 regexec_result = pg_regexec(re,
3811                                                                         data,
3812                                                                         data_len,
3813                                                                         search_start,
3814                                                                         NULL,           /* no details */
3815                                                                         REGEXP_REPLACE_BACKREF_CNT,
3816                                                                         pmatch,
3817                                                                         0);
3818
3819                 if (regexec_result == REG_NOMATCH)
3820                         break;
3821
3822                 if (regexec_result != REG_OKAY)
3823                 {
3824                         char            errMsg[100];
3825
3826                         CHECK_FOR_INTERRUPTS();
3827                         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3828                         ereport(ERROR,
3829                                         (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3830                                          errmsg("regular expression failed: %s", errMsg)));
3831                 }
3832
3833                 /*
3834                  * Copy the text to the left of the match position.  Note we are given
3835                  * character not byte indexes.
3836                  */
3837                 if (pmatch[0].rm_so - data_pos > 0)
3838                 {
3839                         int                     chunk_len;
3840
3841                         chunk_len = charlen_to_bytelen(start_ptr,
3842                                                                                    pmatch[0].rm_so - data_pos);
3843                         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3844
3845                         /*
3846                          * Advance start_ptr over that text, to avoid multiple rescans of
3847                          * it if the replace_text contains multiple back-references.
3848                          */
3849                         start_ptr += chunk_len;
3850                         data_pos = pmatch[0].rm_so;
3851                 }
3852
3853                 /*
3854                  * Copy the replace_text. Process back references when the
3855                  * replace_text has escape characters.
3856                  */
3857                 if (have_escape)
3858                         appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3859                                                                                  start_ptr, data_pos);
3860                 else
3861                         appendStringInfoText(&buf, replace_text);
3862
3863                 /* Advance start_ptr and data_pos over the matched text. */
3864                 start_ptr += charlen_to_bytelen(start_ptr,
3865                                                                                 pmatch[0].rm_eo - data_pos);
3866                 data_pos = pmatch[0].rm_eo;
3867
3868                 /*
3869                  * When global option is off, replace the first instance only.
3870                  */
3871                 if (!glob)
3872                         break;
3873
3874                 /*
3875                  * Advance search position.  Normally we start the next search at the
3876                  * end of the previous match; but if the match was of zero length, we
3877                  * have to advance by one character, or we'd just find the same match
3878                  * again.
3879                  */
3880                 search_start = data_pos;
3881                 if (pmatch[0].rm_so == pmatch[0].rm_eo)
3882                         search_start++;
3883         }
3884
3885         /*
3886          * Copy the text to the right of the last match.
3887          */
3888         if (data_pos < data_len)
3889         {
3890                 int                     chunk_len;
3891
3892                 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3893                 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3894         }
3895
3896         ret_text = cstring_to_text_with_len(buf.data, buf.len);
3897         pfree(buf.data);
3898         pfree(data);
3899
3900         return ret_text;
3901 }
3902
3903 /*
3904  * split_text
3905  * parse input string
3906  * return ord item (1 based)
3907  * based on provided field separator
3908  */
3909 Datum
3910 split_text(PG_FUNCTION_ARGS)
3911 {
3912         text       *inputstring = PG_GETARG_TEXT_PP(0);
3913         text       *fldsep = PG_GETARG_TEXT_PP(1);
3914         int                     fldnum = PG_GETARG_INT32(2);
3915         int                     inputstring_len;
3916         int                     fldsep_len;
3917         TextPositionState state;
3918         int                     start_posn;
3919         int                     end_posn;
3920         text       *result_text;
3921
3922         /* field number is 1 based */
3923         if (fldnum < 1)
3924                 ereport(ERROR,
3925                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3926                                  errmsg("field position must be greater than zero")));
3927
3928         text_position_setup(inputstring, fldsep, &state);
3929
3930         /*
3931          * Note: we check the converted string length, not the original, because
3932          * they could be different if the input contained invalid encoding.
3933          */
3934         inputstring_len = state.len1;
3935         fldsep_len = state.len2;
3936
3937         /* return empty string for empty input string */
3938         if (inputstring_len < 1)
3939         {
3940                 text_position_cleanup(&state);
3941                 PG_RETURN_TEXT_P(cstring_to_text(""));
3942         }
3943
3944         /* empty field separator */
3945         if (fldsep_len < 1)
3946         {
3947                 text_position_cleanup(&state);
3948                 /* if first field, return input string, else empty string */
3949                 if (fldnum == 1)
3950                         PG_RETURN_TEXT_P(inputstring);
3951                 else
3952                         PG_RETURN_TEXT_P(cstring_to_text(""));
3953         }
3954
3955         /* identify bounds of first field */
3956         start_posn = 1;
3957         end_posn = text_position_next(1, &state);
3958
3959         /* special case if fldsep not found at all */
3960         if (end_posn == 0)
3961         {
3962                 text_position_cleanup(&state);
3963                 /* if field 1 requested, return input string, else empty string */
3964                 if (fldnum == 1)
3965                         PG_RETURN_TEXT_P(inputstring);
3966                 else
3967                         PG_RETURN_TEXT_P(cstring_to_text(""));
3968         }
3969
3970         while (end_posn > 0 && --fldnum > 0)
3971         {
3972                 /* identify bounds of next field */
3973                 start_posn = end_posn + fldsep_len;
3974                 end_posn = text_position_next(start_posn, &state);
3975         }
3976
3977         text_position_cleanup(&state);
3978
3979         if (fldnum > 0)
3980         {
3981                 /* N'th field separator not found */
3982                 /* if last field requested, return it, else empty string */
3983                 if (fldnum == 1)
3984                         result_text = text_substring(PointerGetDatum(inputstring),
3985                                                                                  start_posn,
3986                                                                                  -1,
3987                                                                                  true);
3988                 else
3989                         result_text = cstring_to_text("");
3990         }
3991         else
3992         {
3993                 /* non-last field requested */
3994                 result_text = text_substring(PointerGetDatum(inputstring),
3995                                                                          start_posn,
3996                                                                          end_posn - start_posn,
3997                                                                          false);
3998         }
3999
4000         PG_RETURN_TEXT_P(result_text);
4001 }
4002
4003 /*
4004  * Convenience function to return true when two text params are equal.
4005  */
4006 static bool
4007 text_isequal(text *txt1, text *txt2)
4008 {
4009         return DatumGetBool(DirectFunctionCall2(texteq,
4010                                                                                         PointerGetDatum(txt1),
4011                                                                                         PointerGetDatum(txt2)));
4012 }
4013
4014 /*
4015  * text_to_array
4016  * parse input string and return text array of elements,
4017  * based on provided field separator
4018  */
4019 Datum
4020 text_to_array(PG_FUNCTION_ARGS)
4021 {
4022         return text_to_array_internal(fcinfo);
4023 }
4024
4025 /*
4026  * text_to_array_null
4027  * parse input string and return text array of elements,
4028  * based on provided field separator and null string
4029  *
4030  * This is a separate entry point only to prevent the regression tests from
4031  * complaining about different argument sets for the same internal function.
4032  */
4033 Datum
4034 text_to_array_null(PG_FUNCTION_ARGS)
4035 {
4036         return text_to_array_internal(fcinfo);
4037 }
4038
4039 /*
4040  * common code for text_to_array and text_to_array_null functions
4041  *
4042  * These are not strict so we have to test for null inputs explicitly.
4043  */
4044 static Datum
4045 text_to_array_internal(PG_FUNCTION_ARGS)
4046 {
4047         text       *inputstring;
4048         text       *fldsep;
4049         text       *null_string;
4050         int                     inputstring_len;
4051         int                     fldsep_len;
4052         char       *start_ptr;
4053         text       *result_text;
4054         bool            is_null;
4055         ArrayBuildState *astate = NULL;
4056
4057         /* when input string is NULL, then result is NULL too */
4058         if (PG_ARGISNULL(0))
4059                 PG_RETURN_NULL();
4060
4061         inputstring = PG_GETARG_TEXT_PP(0);
4062
4063         /* fldsep can be NULL */
4064         if (!PG_ARGISNULL(1))
4065                 fldsep = PG_GETARG_TEXT_PP(1);
4066         else
4067                 fldsep = NULL;
4068
4069         /* null_string can be NULL or omitted */
4070         if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4071                 null_string = PG_GETARG_TEXT_PP(2);
4072         else
4073                 null_string = NULL;
4074
4075         if (fldsep != NULL)
4076         {
4077                 /*
4078                  * Normal case with non-null fldsep.  Use the text_position machinery
4079                  * to search for occurrences of fldsep.
4080                  */
4081                 TextPositionState state;
4082                 int                     fldnum;
4083                 int                     start_posn;
4084                 int                     end_posn;
4085                 int                     chunk_len;
4086
4087                 text_position_setup(inputstring, fldsep, &state);
4088
4089                 /*
4090                  * Note: we check the converted string length, not the original,
4091                  * because they could be different if the input contained invalid
4092                  * encoding.
4093                  */
4094                 inputstring_len = state.len1;
4095                 fldsep_len = state.len2;
4096
4097                 /* return empty array for empty input string */
4098                 if (inputstring_len < 1)
4099                 {
4100                         text_position_cleanup(&state);
4101                         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4102                 }
4103
4104                 /*
4105                  * empty field separator: return the input string as a one-element
4106                  * array
4107                  */
4108                 if (fldsep_len < 1)
4109                 {
4110                         text_position_cleanup(&state);
4111                         /* single element can be a NULL too */
4112                         is_null = null_string ? text_isequal(inputstring, null_string) : false;
4113                         PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID,
4114                                                                                                 PointerGetDatum(inputstring),
4115                                                                                                                  is_null, 1));
4116                 }
4117
4118                 start_posn = 1;
4119                 /* start_ptr points to the start_posn'th character of inputstring */
4120                 start_ptr = VARDATA_ANY(inputstring);
4121
4122                 for (fldnum = 1;; fldnum++)             /* field number is 1 based */
4123                 {
4124                         CHECK_FOR_INTERRUPTS();
4125
4126                         end_posn = text_position_next(start_posn, &state);
4127
4128                         if (end_posn == 0)
4129                         {
4130                                 /* fetch last field */
4131                                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4132                         }
4133                         else
4134                         {
4135                                 /* fetch non-last field */
4136                                 chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
4137                         }
4138
4139                         /* must build a temp text datum to pass to accumArrayResult */
4140                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4141                         is_null = null_string ? text_isequal(result_text, null_string) : false;
4142
4143                         /* stash away this field */
4144                         astate = accumArrayResult(astate,
4145                                                                           PointerGetDatum(result_text),
4146                                                                           is_null,
4147                                                                           TEXTOID,
4148                                                                           CurrentMemoryContext);
4149
4150                         pfree(result_text);
4151
4152                         if (end_posn == 0)
4153                                 break;
4154
4155                         start_posn = end_posn;
4156                         start_ptr += chunk_len;
4157                         start_posn += fldsep_len;
4158                         start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
4159                 }
4160
4161                 text_position_cleanup(&state);
4162         }
4163         else
4164         {
4165                 /*
4166                  * When fldsep is NULL, each character in the inputstring becomes an
4167                  * element in the result array.  The separator is effectively the
4168                  * space between characters.
4169                  */
4170                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4171
4172                 /* return empty array for empty input string */
4173                 if (inputstring_len < 1)
4174                         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4175
4176                 start_ptr = VARDATA_ANY(inputstring);
4177
4178                 while (inputstring_len > 0)
4179                 {
4180                         int                     chunk_len = pg_mblen(start_ptr);
4181
4182                         CHECK_FOR_INTERRUPTS();
4183
4184                         /* must build a temp text datum to pass to accumArrayResult */
4185                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4186                         is_null = null_string ? text_isequal(result_text, null_string) : false;
4187
4188                         /* stash away this field */
4189                         astate = accumArrayResult(astate,
4190                                                                           PointerGetDatum(result_text),
4191                                                                           is_null,
4192                                                                           TEXTOID,
4193                                                                           CurrentMemoryContext);
4194
4195                         pfree(result_text);
4196
4197                         start_ptr += chunk_len;
4198                         inputstring_len -= chunk_len;
4199                 }
4200         }
4201
4202         PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4203                                                                                   CurrentMemoryContext));
4204 }
4205
4206 /*
4207  * array_to_text
4208  * concatenate Cstring representation of input array elements
4209  * using provided field separator
4210  */
4211 Datum
4212 array_to_text(PG_FUNCTION_ARGS)
4213 {
4214         ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4215         char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4216
4217         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4218 }
4219
4220 /*
4221  * array_to_text_null
4222  * concatenate Cstring representation of input array elements
4223  * using provided field separator and null string
4224  *
4225  * This version is not strict so we have to test for null inputs explicitly.
4226  */
4227 Datum
4228 array_to_text_null(PG_FUNCTION_ARGS)
4229 {
4230         ArrayType  *v;
4231         char       *fldsep;
4232         char       *null_string;
4233
4234         /* returns NULL when first or second parameter is NULL */
4235         if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4236                 PG_RETURN_NULL();
4237
4238         v = PG_GETARG_ARRAYTYPE_P(0);
4239         fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4240
4241         /* NULL null string is passed through as a null pointer */
4242         if (!PG_ARGISNULL(2))
4243                 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4244         else
4245                 null_string = NULL;
4246
4247         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4248 }
4249
4250 /*
4251  * common code for array_to_text and array_to_text_null functions
4252  */
4253 static text *
4254 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4255                                            const char *fldsep, const char *null_string)
4256 {
4257         text       *result;
4258         int                     nitems,
4259                            *dims,
4260                                 ndims;
4261         Oid                     element_type;
4262         int                     typlen;
4263         bool            typbyval;
4264         char            typalign;
4265         StringInfoData buf;
4266         bool            printed = false;
4267         char       *p;
4268         bits8      *bitmap;
4269         int                     bitmask;
4270         int                     i;
4271         ArrayMetaState *my_extra;
4272
4273         ndims = ARR_NDIM(v);
4274         dims = ARR_DIMS(v);
4275         nitems = ArrayGetNItems(ndims, dims);
4276
4277         /* if there are no elements, return an empty string */
4278         if (nitems == 0)
4279                 return cstring_to_text_with_len("", 0);
4280
4281         element_type = ARR_ELEMTYPE(v);
4282         initStringInfo(&buf);
4283
4284         /*
4285          * We arrange to look up info about element type, including its output
4286          * conversion proc, only once per series of calls, assuming the element
4287          * type doesn't change underneath us.
4288          */
4289         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4290         if (my_extra == NULL)
4291         {
4292                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4293                                                                                                           sizeof(ArrayMetaState));
4294                 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4295                 my_extra->element_type = ~element_type;
4296         }
4297
4298         if (my_extra->element_type != element_type)
4299         {
4300                 /*
4301                  * Get info about element type, including its output conversion proc
4302                  */
4303                 get_type_io_data(element_type, IOFunc_output,
4304                                                  &my_extra->typlen, &my_extra->typbyval,
4305                                                  &my_extra->typalign, &my_extra->typdelim,
4306                                                  &my_extra->typioparam, &my_extra->typiofunc);
4307                 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4308                                           fcinfo->flinfo->fn_mcxt);
4309                 my_extra->element_type = element_type;
4310         }
4311         typlen = my_extra->typlen;
4312         typbyval = my_extra->typbyval;
4313         typalign = my_extra->typalign;
4314
4315         p = ARR_DATA_PTR(v);
4316         bitmap = ARR_NULLBITMAP(v);
4317         bitmask = 1;
4318
4319         for (i = 0; i < nitems; i++)
4320         {
4321                 Datum           itemvalue;
4322                 char       *value;
4323
4324                 /* Get source element, checking for NULL */
4325                 if (bitmap && (*bitmap & bitmask) == 0)
4326                 {
4327                         /* if null_string is NULL, we just ignore null elements */
4328                         if (null_string != NULL)
4329                         {
4330                                 if (printed)
4331                                         appendStringInfo(&buf, "%s%s", fldsep, null_string);
4332                                 else
4333                                         appendStringInfoString(&buf, null_string);
4334                                 printed = true;
4335                         }
4336                 }
4337                 else
4338                 {
4339                         itemvalue = fetch_att(p, typbyval, typlen);
4340
4341                         value = OutputFunctionCall(&my_extra->proc, itemvalue);
4342
4343                         if (printed)
4344                                 appendStringInfo(&buf, "%s%s", fldsep, value);
4345                         else
4346                                 appendStringInfoString(&buf, value);
4347                         printed = true;
4348
4349                         p = att_addlength_pointer(p, typlen, p);
4350                         p = (char *) att_align_nominal(p, typalign);
4351                 }
4352
4353                 /* advance bitmap pointer if any */
4354                 if (bitmap)
4355                 {
4356                         bitmask <<= 1;
4357                         if (bitmask == 0x100)
4358                         {
4359                                 bitmap++;
4360                                 bitmask = 1;
4361                         }
4362                 }
4363         }
4364
4365         result = cstring_to_text_with_len(buf.data, buf.len);
4366         pfree(buf.data);
4367
4368         return result;
4369 }
4370
4371 #define HEXBASE 16
4372 /*
4373  * Convert an int32 to a string containing a base 16 (hex) representation of
4374  * the number.
4375  */
4376 Datum
4377 to_hex32(PG_FUNCTION_ARGS)
4378 {
4379         uint32          value = (uint32) PG_GETARG_INT32(0);
4380         char       *ptr;
4381         const char *digits = "0123456789abcdef";
4382         char            buf[32];                /* bigger than needed, but reasonable */
4383
4384         ptr = buf + sizeof(buf) - 1;
4385         *ptr = '\0';
4386
4387         do
4388         {
4389                 *--ptr = digits[value % HEXBASE];
4390                 value /= HEXBASE;
4391         } while (ptr > buf && value);
4392
4393         PG_RETURN_TEXT_P(cstring_to_text(ptr));
4394 }
4395
4396 /*
4397  * Convert an int64 to a string containing a base 16 (hex) representation of
4398  * the number.
4399  */
4400 Datum
4401 to_hex64(PG_FUNCTION_ARGS)
4402 {
4403         uint64          value = (uint64) PG_GETARG_INT64(0);
4404         char       *ptr;
4405         const char *digits = "0123456789abcdef";
4406         char            buf[32];                /* bigger than needed, but reasonable */
4407
4408         ptr = buf + sizeof(buf) - 1;
4409         *ptr = '\0';
4410
4411         do
4412         {
4413                 *--ptr = digits[value % HEXBASE];
4414                 value /= HEXBASE;
4415         } while (ptr > buf && value);
4416
4417         PG_RETURN_TEXT_P(cstring_to_text(ptr));
4418 }
4419
4420 /*
4421  * Create an md5 hash of a text string and return it as hex
4422  *
4423  * md5 produces a 16 byte (128 bit) hash; double it for hex
4424  */
4425 #define MD5_HASH_LEN  32
4426
4427 Datum
4428 md5_text(PG_FUNCTION_ARGS)
4429 {
4430         text       *in_text = PG_GETARG_TEXT_PP(0);
4431         size_t          len;
4432         char            hexsum[MD5_HASH_LEN + 1];
4433
4434         /* Calculate the length of the buffer using varlena metadata */
4435         len = VARSIZE_ANY_EXHDR(in_text);
4436
4437         /* get the hash result */
4438         if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4439                 ereport(ERROR,
4440                                 (errcode(ERRCODE_OUT_OF_MEMORY),
4441                                  errmsg("out of memory")));
4442
4443         /* convert to text and return it */
4444         PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4445 }
4446
4447 /*
4448  * Create an md5 hash of a bytea field and return it as a hex string:
4449  * 16-byte md5 digest is represented in 32 hex characters.
4450  */
4451 Datum
4452 md5_bytea(PG_FUNCTION_ARGS)
4453 {
4454         bytea      *in = PG_GETARG_BYTEA_PP(0);
4455         size_t          len;
4456         char            hexsum[MD5_HASH_LEN + 1];
4457
4458         len = VARSIZE_ANY_EXHDR(in);
4459         if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4460                 ereport(ERROR,
4461                                 (errcode(ERRCODE_OUT_OF_MEMORY),
4462                                  errmsg("out of memory")));
4463
4464         PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4465 }
4466
4467 /*
4468  * Return the size of a datum, possibly compressed
4469  *
4470  * Works on any data type
4471  */
4472 Datum
4473 pg_column_size(PG_FUNCTION_ARGS)
4474 {
4475         Datum           value = PG_GETARG_DATUM(0);
4476         int32           result;
4477         int                     typlen;
4478
4479         /* On first call, get the input type's typlen, and save at *fn_extra */
4480         if (fcinfo->flinfo->fn_extra == NULL)
4481         {
4482                 /* Lookup the datatype of the supplied argument */
4483                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4484
4485                 typlen = get_typlen(argtypeid);
4486                 if (typlen == 0)                /* should not happen */
4487                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
4488
4489                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4490                                                                                                           sizeof(int));
4491                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
4492         }
4493         else
4494                 typlen = *((int *) fcinfo->flinfo->fn_extra);
4495
4496         if (typlen == -1)
4497         {
4498                 /* varlena type, possibly toasted */
4499                 result = toast_datum_size(value);
4500         }
4501         else if (typlen == -2)
4502         {
4503                 /* cstring */
4504                 result = strlen(DatumGetCString(value)) + 1;
4505         }
4506         else
4507         {
4508                 /* ordinary fixed-width type */
4509                 result = typlen;
4510         }
4511
4512         PG_RETURN_INT32(result);
4513 }
4514
4515 /*
4516  * string_agg - Concatenates values and returns string.
4517  *
4518  * Syntax: string_agg(value text, delimiter text) RETURNS text
4519  *
4520  * Note: Any NULL values are ignored. The first-call delimiter isn't
4521  * actually used at all, and on subsequent calls the delimiter precedes
4522  * the associated value.
4523  */
4524
4525 /* subroutine to initialize state */
4526 static StringInfo
4527 makeStringAggState(FunctionCallInfo fcinfo)
4528 {
4529         StringInfo      state;
4530         MemoryContext aggcontext;
4531         MemoryContext oldcontext;
4532
4533         if (!AggCheckCallContext(fcinfo, &aggcontext))
4534         {
4535                 /* cannot be called directly because of internal-type argument */
4536                 elog(ERROR, "string_agg_transfn called in non-aggregate context");
4537         }
4538
4539         /*
4540          * Create state in aggregate context.  It'll stay there across subsequent
4541          * calls.
4542          */
4543         oldcontext = MemoryContextSwitchTo(aggcontext);
4544         state = makeStringInfo();
4545         MemoryContextSwitchTo(oldcontext);
4546
4547         return state;
4548 }
4549
4550 Datum
4551 string_agg_transfn(PG_FUNCTION_ARGS)
4552 {
4553         StringInfo      state;
4554
4555         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4556
4557         /* Append the value unless null. */
4558         if (!PG_ARGISNULL(1))
4559         {
4560                 /* On the first time through, we ignore the delimiter. */
4561                 if (state == NULL)
4562                         state = makeStringAggState(fcinfo);
4563                 else if (!PG_ARGISNULL(2))
4564                         appendStringInfoText(state, PG_GETARG_TEXT_PP(2));      /* delimiter */
4565
4566                 appendStringInfoText(state, PG_GETARG_TEXT_PP(1));              /* value */
4567         }
4568
4569         /*
4570          * The transition type for string_agg() is declared to be "internal",
4571          * which is a pass-by-value type the same size as a pointer.
4572          */
4573         PG_RETURN_POINTER(state);
4574 }
4575
4576 Datum
4577 string_agg_finalfn(PG_FUNCTION_ARGS)
4578 {
4579         StringInfo      state;
4580
4581         /* cannot be called directly because of internal-type argument */
4582         Assert(AggCheckCallContext(fcinfo, NULL));
4583
4584         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4585
4586         if (state != NULL)
4587                 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
4588         else
4589                 PG_RETURN_NULL();
4590 }
4591
4592 /*
4593  * Implementation of both concat() and concat_ws().
4594  *
4595  * sepstr is the separator string to place between values.
4596  * argidx identifies the first argument to concatenate (counting from zero).
4597  * Returns NULL if result should be NULL, else text value.
4598  */
4599 static text *
4600 concat_internal(const char *sepstr, int argidx,
4601                                 FunctionCallInfo fcinfo)
4602 {
4603         text       *result;
4604         StringInfoData str;
4605         bool            first_arg = true;
4606         int                     i;
4607
4608         /*
4609          * concat(VARIADIC some-array) is essentially equivalent to
4610          * array_to_text(), ie concat the array elements with the given separator.
4611          * So we just pass the case off to that code.
4612          */
4613         if (get_fn_expr_variadic(fcinfo->flinfo))
4614         {
4615                 ArrayType  *arr;
4616
4617                 /* Should have just the one argument */
4618                 Assert(argidx == PG_NARGS() - 1);
4619
4620                 /* concat(VARIADIC NULL) is defined as NULL */
4621                 if (PG_ARGISNULL(argidx))
4622                         return NULL;
4623
4624                 /*
4625                  * Non-null argument had better be an array.  We assume that any call
4626                  * context that could let get_fn_expr_variadic return true will have
4627                  * checked that a VARIADIC-labeled parameter actually is an array.  So
4628                  * it should be okay to just Assert that it's an array rather than
4629                  * doing a full-fledged error check.
4630                  */
4631                 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
4632
4633                 /* OK, safe to fetch the array value */
4634                 arr = PG_GETARG_ARRAYTYPE_P(argidx);
4635
4636                 /*
4637                  * And serialize the array.  We tell array_to_text to ignore null
4638                  * elements, which matches the behavior of the loop below.
4639                  */
4640                 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4641         }
4642
4643         /* Normal case without explicit VARIADIC marker */
4644         initStringInfo(&str);
4645
4646         for (i = argidx; i < PG_NARGS(); i++)
4647         {
4648                 if (!PG_ARGISNULL(i))
4649                 {
4650                         Datum           value = PG_GETARG_DATUM(i);
4651                         Oid                     valtype;
4652                         Oid                     typOutput;
4653                         bool            typIsVarlena;
4654
4655                         /* add separator if appropriate */
4656                         if (first_arg)
4657                                 first_arg = false;
4658                         else
4659                                 appendStringInfoString(&str, sepstr);
4660
4661                         /* call the appropriate type output function, append the result */
4662                         valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4663                         if (!OidIsValid(valtype))
4664                                 elog(ERROR, "could not determine data type of concat() input");
4665                         getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4666                         appendStringInfoString(&str,
4667                                                                    OidOutputFunctionCall(typOutput, value));
4668                 }
4669         }
4670
4671         result = cstring_to_text_with_len(str.data, str.len);
4672         pfree(str.data);
4673
4674         return result;
4675 }
4676
4677 /*
4678  * Concatenate all arguments. NULL arguments are ignored.
4679  */
4680 Datum
4681 text_concat(PG_FUNCTION_ARGS)
4682 {
4683         text       *result;
4684
4685         result = concat_internal("", 0, fcinfo);
4686         if (result == NULL)
4687                 PG_RETURN_NULL();
4688         PG_RETURN_TEXT_P(result);
4689 }
4690
4691 /*
4692  * Concatenate all but first argument value with separators. The first
4693  * parameter is used as the separator. NULL arguments are ignored.
4694  */
4695 Datum
4696 text_concat_ws(PG_FUNCTION_ARGS)
4697 {
4698         char       *sep;
4699         text       *result;
4700
4701         /* return NULL when separator is NULL */
4702         if (PG_ARGISNULL(0))
4703                 PG_RETURN_NULL();
4704         sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
4705
4706         result = concat_internal(sep, 1, fcinfo);
4707         if (result == NULL)
4708                 PG_RETURN_NULL();
4709         PG_RETURN_TEXT_P(result);
4710 }
4711
4712 /*
4713  * Return first n characters in the string. When n is negative,
4714  * return all but last |n| characters.
4715  */
4716 Datum
4717 text_left(PG_FUNCTION_ARGS)
4718 {
4719         text       *str = PG_GETARG_TEXT_PP(0);
4720         const char *p = VARDATA_ANY(str);
4721         int                     len = VARSIZE_ANY_EXHDR(str);
4722         int                     n = PG_GETARG_INT32(1);
4723         int                     rlen;
4724
4725         if (n < 0)
4726                 n = pg_mbstrlen_with_len(p, len) + n;
4727         rlen = pg_mbcharcliplen(p, len, n);
4728
4729         PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
4730 }
4731
4732 /*
4733  * Return last n characters in the string. When n is negative,
4734  * return all but first |n| characters.
4735  */
4736 Datum
4737 text_right(PG_FUNCTION_ARGS)
4738 {
4739         text       *str = PG_GETARG_TEXT_PP(0);
4740         const char *p = VARDATA_ANY(str);
4741         int                     len = VARSIZE_ANY_EXHDR(str);
4742         int                     n = PG_GETARG_INT32(1);
4743         int                     off;
4744
4745         if (n < 0)
4746                 n = -n;
4747         else
4748                 n = pg_mbstrlen_with_len(p, len) - n;
4749         off = pg_mbcharcliplen(p, len, n);
4750
4751         PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4752 }
4753
4754 /*
4755  * Return reversed string
4756  */
4757 Datum
4758 text_reverse(PG_FUNCTION_ARGS)
4759 {
4760         text       *str = PG_GETARG_TEXT_PP(0);
4761         const char *p = VARDATA_ANY(str);
4762         int                     len = VARSIZE_ANY_EXHDR(str);
4763         const char *endp = p + len;
4764         text       *result;
4765         char       *dst;
4766
4767         result = palloc(len + VARHDRSZ);
4768         dst = (char *) VARDATA(result) + len;
4769         SET_VARSIZE(result, len + VARHDRSZ);
4770
4771         if (pg_database_encoding_max_length() > 1)
4772         {
4773                 /* multibyte version */
4774                 while (p < endp)
4775                 {
4776                         int                     sz;
4777
4778                         sz = pg_mblen(p);
4779                         dst -= sz;
4780                         memcpy(dst, p, sz);
4781                         p += sz;
4782                 }
4783         }
4784         else
4785         {
4786                 /* single byte version */
4787                 while (p < endp)
4788                         *(--dst) = *p++;
4789         }
4790
4791         PG_RETURN_TEXT_P(result);
4792 }
4793
4794
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */

/*
 * Step "ptr" forward by one position and raise an error if that reaches or
 * passes "end_ptr", i.e. if the format string ends in the middle of a
 * conversion specifier.  "ptr" must be a modifiable lvalue: it is
 * pre-incremented in place, exactly once.  The do { } while (0) wrapper lets
 * the macro be used like a single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
        do { \
                if (++(ptr) >= (end_ptr)) \
                        ereport(ERROR, \
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                                         errmsg("unterminated format() type specifier"), \
                                         errhint("For a single \"%%\" use \"%%%%\"."))); \
        } while (0)
4808
/*
 * Returns a formatted string
 *
 * Argument 0 is the format string; a NULL format string yields a NULL
 * result.  The remaining arguments are consumed by "%" conversion
 * specifiers.  They are either passed individually or, when the call is
 * marked VARIADIC, taken from the elements of the single array argument
 * (a NULL array is treated as having no elements).
 *
 * Only the conversions 's', 'I' and 'L' are accepted, and "%%" emits a
 * literal percent sign.  An error is raised for a malformed or unterminated
 * specifier, and for a specifier that refers to an argument that was not
 * supplied.
 */
Datum
text_format(PG_FUNCTION_ARGS)
{
        text       *fmt;
        StringInfoData str;
        const char *cp;
        const char *start_ptr;
        const char *end_ptr;
        text       *result;
        int                     arg;
        bool            funcvariadic;
        int                     nargs;
        Datum      *elements = NULL;
        bool       *nulls = NULL;
        Oid                     element_type = InvalidOid;
        Oid                     prev_type = InvalidOid;
        Oid                     prev_width_type = InvalidOid;
        FmgrInfo        typoutputfinfo;
        FmgrInfo        typoutputinfo_width;

        /* When format string is null, immediately return null */
        if (PG_ARGISNULL(0))
                PG_RETURN_NULL();

        /* If argument is marked VARIADIC, expand array into elements */
        if (get_fn_expr_variadic(fcinfo->flinfo))
        {
                ArrayType  *arr;
                int16           elmlen;
                bool            elmbyval;
                char            elmalign;
                int                     nitems;

                /* Should have just the one argument */
                Assert(PG_NARGS() == 2);

                /* If argument is NULL, we treat it as zero-length array */
                if (PG_ARGISNULL(1))
                        nitems = 0;
                else
                {
                        /*
                         * Non-null argument had better be an array.  We assume that any
                         * call context that could let get_fn_expr_variadic return true
                         * will have checked that a VARIADIC-labeled parameter actually is
                         * an array.  So it should be okay to just Assert that it's an
                         * array rather than doing a full-fledged error check.
                         */
                        Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));

                        /* OK, safe to fetch the array value */
                        arr = PG_GETARG_ARRAYTYPE_P(1);

                        /* Get info about array element type */
                        element_type = ARR_ELEMTYPE(arr);
                        get_typlenbyvalalign(element_type,
                                                                 &elmlen, &elmbyval, &elmalign);

                        /* Extract all array elements */
                        deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
                                                          &elements, &nulls, &nitems);
                }

                /*
                 * Count the format string too, so that "arg" is numbered the same
                 * way as in the non-variadic case (slot 0 is the format string).
                 */
                nargs = nitems + 1;
                funcvariadic = true;
        }
        else
        {
                /* Non-variadic case, we'll process the arguments individually */
                nargs = PG_NARGS();
                funcvariadic = false;
        }

        /* Setup for main loop. */
        fmt = PG_GETARG_TEXT_PP(0);
        start_ptr = VARDATA_ANY(fmt);
        end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
        initStringInfo(&str);
        arg = 1;                                        /* next argument position to print */

        /* Scan format string, looking for conversion specifiers. */
        for (cp = start_ptr; cp < end_ptr; cp++)
        {
                int                     argpos;
                int                     widthpos;
                int                     flags;
                int                     width;
                Datum           value;
                bool            isNull;
                Oid                     typid;

                /*
                 * If it's not the start of a conversion specifier, just copy it to
                 * the output buffer.
                 */
                if (*cp != '%')
                {
                        appendStringInfoCharMacro(&str, *cp);
                        continue;
                }

                /* Step past the '%'; errors out if nothing follows it */
                ADVANCE_PARSE_POINTER(cp, end_ptr);

                /* Easy case: %% outputs a single % */
                if (*cp == '%')
                {
                        appendStringInfoCharMacro(&str, *cp);
                        continue;
                }

                /* Parse the optional portions of the format specifier */
                cp = text_format_parse_format(cp, end_ptr,
                                                                          &argpos, &widthpos,
                                                                          &flags, &width);

                /*
                 * Next we should see the main conversion specifier.  Whether or not
                 * an argument position was present, it's known that at least one
                 * character remains in the string at this point.  Experience suggests
                 * that it's worth checking that that character is one of the expected
                 * ones before we try to fetch arguments, so as to produce the least
                 * confusing response to a mis-formatted specifier.
                 */
                if (strchr("sIL", *cp) == NULL)
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("unrecognized format() type specifier \"%c\"",
                                                        *cp),
                                         errhint("For a single \"%%\" use \"%%%%\".")));

                /* If indirect width was specified, get its value */
                if (widthpos >= 0)
                {
                        /* Collect the specified or next argument position */
                        if (widthpos > 0)
                                arg = widthpos;
                        if (arg >= nargs)
                                ereport(ERROR,
                                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                 errmsg("too few arguments for format()")));

                        /* Get the value and type of the selected argument */
                        if (!funcvariadic)
                        {
                                value = PG_GETARG_DATUM(arg);
                                isNull = PG_ARGISNULL(arg);
                                typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
                        }
                        else
                        {
                                /* elements[] is zero-based; arg 1 is the first element */
                                value = elements[arg - 1];
                                isNull = nulls[arg - 1];
                                typid = element_type;
                        }
                        if (!OidIsValid(typid))
                                elog(ERROR, "could not determine data type of format() input");

                        /* The value argument, if not positioned explicitly, comes next */
                        arg++;

                        /* We can treat NULL width the same as zero */
                        if (isNull)
                                width = 0;
                        else if (typid == INT4OID)
                                width = DatumGetInt32(value);
                        else if (typid == INT2OID)
                                width = DatumGetInt16(value);
                        else
                        {
                                /* For less-usual datatypes, convert to text then to int */
                                /* NOTE: this local shadows the function-level StringInfoData "str" */
                                char       *str;

                                /*
                                 * Width arguments have their own cached output function,
                                 * kept apart from the one used for the values themselves.
                                 */
                                if (typid != prev_width_type)
                                {
                                        Oid                     typoutputfunc;
                                        bool            typIsVarlena;

                                        getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
                                        fmgr_info(typoutputfunc, &typoutputinfo_width);
                                        prev_width_type = typid;
                                }

                                str = OutputFunctionCall(&typoutputinfo_width, value);

                                /* pg_atoi will complain about bad data or overflow */
                                width = pg_atoi(str, sizeof(int), '\0');

                                pfree(str);
                        }
                }

                /* Collect the specified or next argument position */
                if (argpos > 0)
                        arg = argpos;
                if (arg >= nargs)
                        ereport(ERROR,
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                         errmsg("too few arguments for format()")));

                /* Get the value and type of the selected argument */
                if (!funcvariadic)
                {
                        value = PG_GETARG_DATUM(arg);
                        isNull = PG_ARGISNULL(arg);
                        typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
                }
                else
                {
                        /* elements[] is zero-based; arg 1 is the first element */
                        value = elements[arg - 1];
                        isNull = nulls[arg - 1];
                        typid = element_type;
                }
                if (!OidIsValid(typid))
                        elog(ERROR, "could not determine data type of format() input");

                /* Implicit numbering continues after the argument just used */
                arg++;

                /*
                 * Get the appropriate typOutput function, reusing previous one if
                 * same type as previous argument.  That's particularly useful in the
                 * variadic-array case, but often saves work even for ordinary calls.
                 */
                if (typid != prev_type)
                {
                        Oid                     typoutputfunc;
                        bool            typIsVarlena;

                        getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
                        fmgr_info(typoutputfunc, &typoutputfinfo);
                        prev_type = typid;
                }

                /*
                 * And now we can format the value.
                 */
                switch (*cp)
                {
                        case 's':
                        case 'I':
                        case 'L':
                                text_format_string_conversion(&str, *cp, &typoutputfinfo,
                                                                                          value, isNull,
                                                                                          flags, width);
                                break;
                        default:
                                /* should not get here, because of previous check */
                                ereport(ERROR,
                                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                                 errmsg("unrecognized format() type specifier \"%c\"",
                                                                *cp),
                                                 errhint("For a single \"%%\" use \"%%%%\".")));
                                break;
                }
        }

        /* Don't need deconstruct_array results anymore. */
        if (elements != NULL)
                pfree(elements);
        if (nulls != NULL)
                pfree(nulls);

        /* Generate results. */
        result = cstring_to_text_with_len(str.data, str.len);
        pfree(str.data);

        PG_RETURN_TEXT_P(result);
}
5078
5079 /*
5080  * Parse contiguous digits as a decimal number.
5081  *
5082  * Returns true if some digits could be parsed.
5083  * The value is returned into *value, and *ptr is advanced to the next
5084  * character to be parsed.
5085  *
5086  * Note parsing invariant: at least one character is known available before
5087  * string end (end_ptr) at entry, and this is still true at exit.
5088  */
5089 static bool
5090 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5091 {
5092         bool            found = false;
5093         const char *cp = *ptr;
5094         int                     val = 0;
5095
5096         while (*cp >= '0' && *cp <= '9')
5097         {
5098                 int                     newval = val * 10 + (*cp - '0');
5099
5100                 if (newval / 10 != val) /* overflow? */
5101                         ereport(ERROR,
5102                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5103                                          errmsg("number is out of range")));
5104                 val = newval;
5105                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5106                 found = true;
5107         }
5108
5109         *ptr = cp;
5110         *value = val;
5111
5112         return found;
5113 }
5114
5115 /*
5116  * Parse a format specifier (generally following the SUS printf spec).
5117  *
5118  * We have already advanced over the initial '%', and we are looking for
5119  * [argpos][flags][width]type (but the type character is not consumed here).
5120  *
5121  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5122  * Output parameters:
5123  *      argpos: argument position for value to be printed.  -1 means unspecified.
5124  *      widthpos: argument position for width.  Zero means the argument position
5125  *                      was unspecified (ie, take the next arg) and -1 means no width
5126  *                      argument (width was omitted or specified as a constant).
5127  *      flags: bitmask of flags.
5128  *      width: directly-specified width value.  Zero means the width was omitted
5129  *                      (note it's not necessary to distinguish this case from an explicit
5130  *                      zero width value).
5131  *
5132  * The function result is the next character position to be parsed, ie, the
5133  * location where the type character is/should be.
5134  *
5135  * Note parsing invariant: at least one character is known available before
5136  * string end (end_ptr) at entry, and this is still true at exit.
5137  */
5138 static const char *
5139 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5140                                                  int *argpos, int *widthpos,
5141                                                  int *flags, int *width)
5142 {
5143         const char *cp = start_ptr;
5144         int                     n;
5145
5146         /* set defaults for output parameters */
5147         *argpos = -1;
5148         *widthpos = -1;
5149         *flags = 0;
5150         *width = 0;
5151
5152         /* try to identify first number */
5153         if (text_format_parse_digits(&cp, end_ptr, &n))
5154         {
5155                 if (*cp != '$')
5156                 {
5157                         /* Must be just a width and a type, so we're done */
5158                         *width = n;
5159                         return cp;
5160                 }
5161                 /* The number was argument position */
5162                 *argpos = n;
5163                 /* Explicit 0 for argument index is immediately refused */
5164                 if (n == 0)
5165                         ereport(ERROR,
5166                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5167                                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
5168                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5169         }
5170
5171         /* Handle flags (only minus is supported now) */
5172         while (*cp == '-')
5173         {
5174                 *flags |= TEXT_FORMAT_FLAG_MINUS;
5175                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5176         }
5177
5178         if (*cp == '*')
5179         {
5180                 /* Handle indirect width */
5181                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5182                 if (text_format_parse_digits(&cp, end_ptr, &n))
5183                 {
5184                         /* number in this position must be closed by $ */
5185                         if (*cp != '$')
5186                                 ereport(ERROR,
5187                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5188                                   errmsg("width argument position must be ended by \"$\"")));
5189                         /* The number was width argument position */
5190                         *widthpos = n;
5191                         /* Explicit 0 for argument index is immediately refused */
5192                         if (n == 0)
5193                                 ereport(ERROR,
5194                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5195                                                  errmsg("format specifies argument 0, but arguments are numbered from 1")));
5196                         ADVANCE_PARSE_POINTER(cp, end_ptr);
5197                 }
5198                 else
5199                         *widthpos = 0;          /* width's argument position is unspecified */
5200         }
5201         else
5202         {
5203                 /* Check for direct width specification */
5204                 if (text_format_parse_digits(&cp, end_ptr, &n))
5205                         *width = n;
5206         }
5207
5208         /* cp should now be pointing at type character */
5209         return cp;
5210 }
5211
5212 /*
5213  * Format a %s, %I, or %L conversion
5214  */
5215 static void
5216 text_format_string_conversion(StringInfo buf, char conversion,
5217                                                           FmgrInfo *typOutputInfo,
5218                                                           Datum value, bool isNull,
5219                                                           int flags, int width)
5220 {
5221         char       *str;
5222
5223         /* Handle NULL arguments before trying to stringify the value. */
5224         if (isNull)
5225         {
5226                 if (conversion == 's')
5227                         text_format_append_string(buf, "", flags, width);
5228                 else if (conversion == 'L')
5229                         text_format_append_string(buf, "NULL", flags, width);
5230                 else if (conversion == 'I')
5231                         ereport(ERROR,
5232                                         (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5233                         errmsg("null values cannot be formatted as an SQL identifier")));
5234                 return;
5235         }
5236
5237         /* Stringify. */
5238         str = OutputFunctionCall(typOutputInfo, value);
5239
5240         /* Escape. */
5241         if (conversion == 'I')
5242         {
5243                 /* quote_identifier may or may not allocate a new string. */
5244                 text_format_append_string(buf, quote_identifier(str), flags, width);
5245         }
5246         else if (conversion == 'L')
5247         {
5248                 char       *qstr = quote_literal_cstr(str);
5249
5250                 text_format_append_string(buf, qstr, flags, width);
5251                 /* quote_literal_cstr() always allocates a new string */
5252                 pfree(qstr);
5253         }
5254         else
5255                 text_format_append_string(buf, str, flags, width);
5256
5257         /* Cleanup. */
5258         pfree(str);
5259 }
5260
5261 /*
5262  * Append str to buf, padding as directed by flags/width
5263  */
5264 static void
5265 text_format_append_string(StringInfo buf, const char *str,
5266                                                   int flags, int width)
5267 {
5268         bool            align_to_left = false;
5269         int                     len;
5270
5271         /* fast path for typical easy case */
5272         if (width == 0)
5273         {
5274                 appendStringInfoString(buf, str);
5275                 return;
5276         }
5277
5278         if (width < 0)
5279         {
5280                 /* Negative width: implicit '-' flag, then take absolute value */
5281                 align_to_left = true;
5282                 /* -INT_MIN is undefined */
5283                 if (width <= INT_MIN)
5284                         ereport(ERROR,
5285                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5286                                          errmsg("number is out of range")));
5287                 width = -width;
5288         }
5289         else if (flags & TEXT_FORMAT_FLAG_MINUS)
5290                 align_to_left = true;
5291
5292         len = pg_mbstrlen(str);
5293         if (align_to_left)
5294         {
5295                 /* left justify */
5296                 appendStringInfoString(buf, str);
5297                 if (len < width)
5298                         appendStringInfoSpaces(buf, width - len);
5299         }
5300         else
5301         {
5302                 /* right justify */
5303                 if (len < width)
5304                         appendStringInfoSpaces(buf, width - len);
5305                 appendStringInfoString(buf, str);
5306         }
5307 }
5308
5309 /*
5310  * text_format_nv - nonvariadic wrapper for text_format function.
5311  *
5312  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5313  * which checks that all built-in functions that share the implementing C
5314  * function take the same number of arguments.
5315  */
5316 Datum
5317 text_format_nv(PG_FUNCTION_ARGS)
5318 {
5319         return text_format(fcinfo);
5320 }
5321
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are identical.  Faster than memcmp() for this
 * use case.
 *
 * Bytes are compared starting from the last one and working backwards.
 * A len of zero or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
        int         remaining;

        for (remaining = len; remaining > 0; remaining--)
        {
                if (s1[remaining - 1] != s2[remaining - 1])
                        return false;
        }

        return true;
}
5337
5338 /* Expand each Levenshtein distance variant */
5339 #include "levenshtein.c"
5340 #define LEVENSHTEIN_LESS_EQUAL
5341 #include "levenshtein.c"