granicus.if.org Git - postgresql/blob - src/backend/utils/adt/oracle_compat.c

   1 /*-------------------------------------------------------------------------
   2  * oracle_compat.c
   3  *      Oracle compatible functions.
   4  *
   5  * Copyright (c) 1996-2008, PostgreSQL Global Development Group
   6  *
   7  *      Author: Edmund Mergl <E.Mergl@bawue.de>
   8  *      Multibyte enhancement: Tatsuo Ishii <ishii@postgresql.org>
   9  *
  10  *
  11  * IDENTIFICATION
  12  *      $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.79 2008/05/19 18:08:16 tgl Exp $
  13  *
  14  *-------------------------------------------------------------------------
  15  */
  16 #include "postgres.h"
  17
  18 #include <ctype.h>
  19 #include <limits.h>
  20 /*
  21  * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
  22  * declare them in <wchar.h>.
  23  */
  24 #ifdef HAVE_WCHAR_H
  25 #include <wchar.h>
  26 #endif
  27 #ifdef HAVE_WCTYPE_H
  28 #include <wctype.h>
  29 #endif
  30
  31 #include "utils/builtins.h"
  32 #include "utils/pg_locale.h"
  33 #include "mb/pg_wchar.h"
  34
  35
  36 /*
  37  * If the system provides the needed functions for wide-character manipulation
  38  * (which are all standardized by C99), then we implement upper/lower/initcap
  39  * using wide-character functions.      Otherwise we use the traditional <ctype.h>
  40  * functions, which of course will not work as desired in multibyte character
  41  * sets.  Note that in either case we are effectively assuming that the
  42  * database character encoding matches the encoding implied by LC_CTYPE.
  43  *
  44  * We assume if we have these two functions, we have their friends too, and
  45  * can use the wide-character method.
  46  */
  47 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
  48 #define USE_WIDE_UPPER_LOWER
  49 char       *wstring_lower(char *str);
  50 char       *wstring_upper(char *str);
  51 wchar_t    *texttowcs(const text *txt);
  52 text       *wcstotext(const wchar_t *str, int ncodes);
  53 #endif
  54
  55 static text *dotrim(const char *string, int stringlen,
  56            const char *set, int setlen,
  57            bool doltrim, bool dortrim);
  58
  59
  60 #ifdef USE_WIDE_UPPER_LOWER
  61
  62 /*
  63  * Convert a TEXT value into a palloc'd wchar string.
  64  */
  65 wchar_t *
  66 texttowcs(const text *txt)
  67 {
  68         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
  69         char       *workstr;
  70         wchar_t    *result;
  71         size_t          ncodes;
  72
  73         /* Overflow paranoia */
  74         if (nbytes < 0 ||
  75                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
  76                 ereport(ERROR,
  77                                 (errcode(ERRCODE_OUT_OF_MEMORY),
  78                                  errmsg("out of memory")));
  79
  80         /* Need a null-terminated version of the input */
  81         workstr = text_to_cstring(txt);
  82
  83         /* Output workspace cannot have more codes than input bytes */
  84         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
  85
  86         /* Do the conversion */
  87         ncodes = mbstowcs(result, workstr, nbytes + 1);
  88
  89         if (ncodes == (size_t) -1)
  90         {
  91                 /*
  92                  * Invalid multibyte character encountered.  We try to give a useful
  93                  * error message by letting pg_verifymbstr check the string.  But it's
  94                  * possible that the string is OK to us, and not OK to mbstowcs ---
  95                  * this suggests that the LC_CTYPE locale is different from the
  96                  * database encoding.  Give a generic error message if verifymbstr
  97                  * can't find anything wrong.
  98                  */
  99                 pg_verifymbstr(workstr, nbytes, false);
 100                 ereport(ERROR,
 101                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 102                                  errmsg("invalid multibyte character for locale"),
 103                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 104         }
 105
 106         Assert(ncodes <= (size_t) nbytes);
 107
 108         return result;
 109 }
 110
 111
 112 /*
 113  * Convert a wchar string into a palloc'd TEXT value.  The wchar string
 114  * must be zero-terminated, but we also require the caller to pass the string
 115  * length, since it will know it anyway in current uses.
 116  */
 117 text *
 118 wcstotext(const wchar_t *str, int ncodes)
 119 {
 120         text       *result;
 121         size_t          nbytes;
 122
 123         /* Overflow paranoia */
 124         if (ncodes < 0 ||
 125                 ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
 126                 ereport(ERROR,
 127                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 128                                  errmsg("out of memory")));
 129
 130         /* Make workspace certainly large enough for result */
 131         result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
 132
 133         /* Do the conversion */
 134         nbytes = wcstombs((char *) VARDATA(result), str,
 135                                           (ncodes + 1) * MB_CUR_MAX);
 136
 137         if (nbytes == (size_t) -1)
 138         {
 139                 /* Invalid multibyte character encountered ... shouldn't happen */
 140                 ereport(ERROR,
 141                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 142                                  errmsg("invalid multibyte character for locale")));
 143         }
 144
 145         Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
 146
 147         SET_VARSIZE(result, nbytes + VARHDRSZ);
 148
 149         return result;
 150 }
 151 #endif   /* USE_WIDE_UPPER_LOWER */
 152
 153
 154 /*
 155  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding.
 156  * To make use of the upper/lower functionality, we need to map UTF8 to
 157  * UTF16, which for some reason mbstowcs and wcstombs won't do for us.
 158  * This conversion layer takes care of it.
 159  */
 160
 161 #ifdef WIN32
 162
 163 /* texttowcs for the case of UTF8 to UTF16 */
 164 static wchar_t *
 165 win32_utf8_texttowcs(const text *txt)
 166 {
 167         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
 168         wchar_t    *result;
 169         int                     r;
 170
 171         /* Overflow paranoia */
 172         if (nbytes < 0 ||
 173                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
 174                 ereport(ERROR,
 175                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 176                                  errmsg("out of memory")));
 177
 178         /* Output workspace cannot have more codes than input bytes */
 179         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 180
 181         /* stupid Microsloth API does not work for zero-length input */
 182         if (nbytes == 0)
 183                 r = 0;
 184         else
 185         {
 186                 /* Do the conversion */
 187                 r = MultiByteToWideChar(CP_UTF8, 0, VARDATA_ANY(txt), nbytes,
 188                                                                 result, nbytes);
 189
 190                 if (r <= 0)                             /* assume it's NO_UNICODE_TRANSLATION */
 191                 {
 192                         /* see notes above about error reporting */
 193                         pg_verifymbstr(VARDATA_ANY(txt), nbytes, false);
 194                         ereport(ERROR,
 195                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 196                                          errmsg("invalid multibyte character for locale"),
 197                                          errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 198                 }
 199         }
 200
 201         /* Append trailing null wchar (MultiByteToWideChar won't have) */
 202         Assert(r <= nbytes);
 203         result[r] = 0;
 204
 205         return result;
 206 }
 207
 208 /* wcstotext for the case of UTF16 to UTF8 */
 209 static text *
 210 win32_utf8_wcstotext(const wchar_t *str)
 211 {
 212         text       *result;
 213         int                     nbytes;
 214         int                     r;
 215
 216         /* Compute size of output string (this *will* include trailing null) */
 217         nbytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
 218         if (nbytes <= 0)                        /* shouldn't happen */
 219                 ereport(ERROR,
 220                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 221                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
 222                                                 GetLastError())));
 223
 224         result = palloc(nbytes + VARHDRSZ);
 225
 226         r = WideCharToMultiByte(CP_UTF8, 0, str, -1, VARDATA(result), nbytes,
 227                                                         NULL, NULL);
 228         if (r != nbytes)                        /* shouldn't happen */
 229                 ereport(ERROR,
 230                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 231                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
 232                                                 GetLastError())));
 233
 234         SET_VARSIZE(result, nbytes + VARHDRSZ - 1); /* -1 to ignore null */
 235
 236         return result;
 237 }
 238
 239 /* interface layer to check which encoding is in use */
 240
 241 static wchar_t *
 242 win32_texttowcs(const text *txt)
 243 {
 244         if (GetDatabaseEncoding() == PG_UTF8)
 245                 return win32_utf8_texttowcs(txt);
 246         else
 247                 return texttowcs(txt);
 248 }
 249
 250 static text *
 251 win32_wcstotext(const wchar_t *str, int ncodes)
 252 {
 253         if (GetDatabaseEncoding() == PG_UTF8)
 254                 return win32_utf8_wcstotext(str);
 255         else
 256                 return wcstotext(str, ncodes);
 257 }
 258
 259 /* use macros to cause routines below to call interface layer */
 260
 261 #define texttowcs       win32_texttowcs
 262 #define wcstotext       win32_wcstotext
 263 #endif   /* WIN32 */
 264
 265 #ifdef USE_WIDE_UPPER_LOWER
/*
 * wstring_upper and wstring_lower perform multibyte-aware upper/lower case
 * transformations of localized strings.  Each returns a pointer to the
 * transformed string.
 */
 271 char *
 272 wstring_upper(char *str)
 273 {
 274         wchar_t    *workspace;
 275         text       *in_text;
 276         text       *out_text;
 277         char       *result;
 278         int                     i;
 279
 280         in_text = cstring_to_text(str);
 281         workspace = texttowcs(in_text);
 282
 283         for (i = 0; workspace[i] != 0; i++)
 284                 workspace[i] = towupper(workspace[i]);
 285
 286         out_text = wcstotext(workspace, i);
 287         result = text_to_cstring(out_text);
 288
 289         pfree(workspace);
 290         pfree(in_text);
 291         pfree(out_text);
 292
 293         return result;
 294 }
 295
 296 char *
 297 wstring_lower(char *str)
 298 {
 299         wchar_t    *workspace;
 300         text       *in_text;
 301         text       *out_text;
 302         char       *result;
 303         int                     i;
 304
 305         in_text = cstring_to_text(str);
 306         workspace = texttowcs(in_text);
 307
 308         for (i = 0; workspace[i] != 0; i++)
 309                 workspace[i] = towlower(workspace[i]);
 310
 311         out_text = wcstotext(workspace, i);
 312         result = text_to_cstring(out_text);
 313
 314         pfree(workspace);
 315         pfree(in_text);
 316         pfree(out_text);
 317
 318         return result;
 319 }
 320 #endif   /* USE_WIDE_UPPER_LOWER */
 321
 322 /********************************************************************
 323  *
 324  * lower
 325  *
 326  * Syntax:
 327  *
 328  *       text lower(text string)
 329  *
 330  * Purpose:
 331  *
 332  *       Returns string, with all letters forced to lowercase.
 333  *
 334  ********************************************************************/
 335
 336 Datum
 337 lower(PG_FUNCTION_ARGS)
 338 {
 339 #ifdef USE_WIDE_UPPER_LOWER
 340
 341         /*
 342          * Use wide char code only when max encoding length > 1 and ctype != C.
 343          * Some operating systems fail with multi-byte encodings and a C locale.
 344          * Also, for a C locale there is no need to process as multibyte.
 345          */
 346         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 347         {
 348                 text       *string = PG_GETARG_TEXT_PP(0);
 349                 text       *result;
 350                 wchar_t    *workspace;
 351                 int                     i;
 352
 353                 workspace = texttowcs(string);
 354
 355                 for (i = 0; workspace[i] != 0; i++)
 356                         workspace[i] = towlower(workspace[i]);
 357
 358                 result = wcstotext(workspace, i);
 359
 360                 pfree(workspace);
 361
 362                 PG_RETURN_TEXT_P(result);
 363         }
 364         else
 365 #endif   /* USE_WIDE_UPPER_LOWER */
 366         {
 367                 text       *string = PG_GETARG_TEXT_P_COPY(0);
 368                 char       *ptr;
 369                 int                     m;
 370
 371                 /*
 372                  * Since we copied the string, we can scribble directly on the value
 373                  */
 374                 ptr = VARDATA(string);
 375                 m = VARSIZE(string) - VARHDRSZ;
 376
 377                 while (m-- > 0)
 378                 {
 379                         *ptr = tolower((unsigned char) *ptr);
 380                         ptr++;
 381                 }
 382
 383                 PG_RETURN_TEXT_P(string);
 384         }
 385 }
 386
 387
 388 /********************************************************************
 389  *
 390  * upper
 391  *
 392  * Syntax:
 393  *
 394  *       text upper(text string)
 395  *
 396  * Purpose:
 397  *
 398  *       Returns string, with all letters forced to uppercase.
 399  *
 400  ********************************************************************/
 401
 402 Datum
 403 upper(PG_FUNCTION_ARGS)
 404 {
 405 #ifdef USE_WIDE_UPPER_LOWER
 406
 407         /*
 408          * Use wide char code only when max encoding length > 1 and ctype != C.
 409          * Some operating systems fail with multi-byte encodings and a C locale.
 410          * Also, for a C locale there is no need to process as multibyte.
 411          */
 412         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 413         {
 414                 text       *string = PG_GETARG_TEXT_PP(0);
 415                 text       *result;
 416                 wchar_t    *workspace;
 417                 int                     i;
 418
 419                 workspace = texttowcs(string);
 420
 421                 for (i = 0; workspace[i] != 0; i++)
 422                         workspace[i] = towupper(workspace[i]);
 423
 424                 result = wcstotext(workspace, i);
 425
 426                 pfree(workspace);
 427
 428                 PG_RETURN_TEXT_P(result);
 429         }
 430         else
 431 #endif   /* USE_WIDE_UPPER_LOWER */
 432         {
 433                 text       *string = PG_GETARG_TEXT_P_COPY(0);
 434                 char       *ptr;
 435                 int                     m;
 436
 437                 /*
 438                  * Since we copied the string, we can scribble directly on the value
 439                  */
 440                 ptr = VARDATA(string);
 441                 m = VARSIZE(string) - VARHDRSZ;
 442
 443                 while (m-- > 0)
 444                 {
 445                         *ptr = toupper((unsigned char) *ptr);
 446                         ptr++;
 447                 }
 448
 449                 PG_RETURN_TEXT_P(string);
 450         }
 451 }
 452
 453
 454 /********************************************************************
 455  *
 456  * initcap
 457  *
 458  * Syntax:
 459  *
 460  *       text initcap(text string)
 461  *
 462  * Purpose:
 463  *
 464  *       Returns string, with first letter of each word in uppercase, all
 465  *       other letters in lowercase. A word is defined as a sequence of
 466  *       alphanumeric characters, delimited by non-alphanumeric
 467  *       characters.
 468  *
 469  ********************************************************************/
 470
 471 Datum
 472 initcap(PG_FUNCTION_ARGS)
 473 {
 474 #ifdef USE_WIDE_UPPER_LOWER
 475
 476         /*
 477          * Use wide char code only when max encoding length > 1 and ctype != C.
 478          * Some operating systems fail with multi-byte encodings and a C locale.
 479          * Also, for a C locale there is no need to process as multibyte.
 480          */
 481         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 482         {
 483                 text       *string = PG_GETARG_TEXT_PP(0);
 484                 text       *result;
 485                 wchar_t    *workspace;
 486                 int                     wasalnum = 0;
 487                 int                     i;
 488
 489                 workspace = texttowcs(string);
 490
 491                 for (i = 0; workspace[i] != 0; i++)
 492                 {
 493                         if (wasalnum)
 494                                 workspace[i] = towlower(workspace[i]);
 495                         else
 496                                 workspace[i] = towupper(workspace[i]);
 497                         wasalnum = iswalnum(workspace[i]);
 498                 }
 499
 500                 result = wcstotext(workspace, i);
 501
 502                 pfree(workspace);
 503
 504                 PG_RETURN_TEXT_P(result);
 505         }
 506         else
 507 #endif   /* USE_WIDE_UPPER_LOWER */
 508         {
 509                 text       *string = PG_GETARG_TEXT_P_COPY(0);
 510                 int                     wasalnum = 0;
 511                 char       *ptr;
 512                 int                     m;
 513
 514                 /*
 515                  * Since we copied the string, we can scribble directly on the value
 516                  */
 517                 ptr = VARDATA(string);
 518                 m = VARSIZE(string) - VARHDRSZ;
 519
 520                 while (m-- > 0)
 521                 {
 522                         if (wasalnum)
 523                                 *ptr = tolower((unsigned char) *ptr);
 524                         else
 525                                 *ptr = toupper((unsigned char) *ptr);
 526                         wasalnum = isalnum((unsigned char) *ptr);
 527                         ptr++;
 528                 }
 529
 530                 PG_RETURN_TEXT_P(string);
 531         }
 532 }
 533
 534
 535 /********************************************************************
 536  *
 537  * lpad
 538  *
 539  * Syntax:
 540  *
 541  *       text lpad(text string1, int4 len, text string2)
 542  *
 543  * Purpose:
 544  *
 545  *       Returns string1, left-padded to length len with the sequence of
 546  *       characters in string2.  If len is less than the length of string1,
 547  *       instead truncate (on the right) to len.
 548  *
 549  ********************************************************************/
 550
 551 Datum
 552 lpad(PG_FUNCTION_ARGS)
 553 {
 554         text       *string1 = PG_GETARG_TEXT_PP(0);
 555         int32           len = PG_GETARG_INT32(1);
 556         text       *string2 = PG_GETARG_TEXT_PP(2);
 557         text       *ret;
 558         char       *ptr1,
 559                            *ptr2,
 560                            *ptr2start,
 561                            *ptr2end,
 562                            *ptr_ret;
 563         int                     m,
 564                                 s1len,
 565                                 s2len;
 566
 567         int                     bytelen;
 568
 569         /* Negative len is silently taken as zero */
 570         if (len < 0)
 571                 len = 0;
 572
 573         s1len = VARSIZE_ANY_EXHDR(string1);
 574         if (s1len < 0)
 575                 s1len = 0;                              /* shouldn't happen */
 576
 577         s2len = VARSIZE_ANY_EXHDR(string2);
 578         if (s2len < 0)
 579                 s2len = 0;                              /* shouldn't happen */
 580
 581         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
 582
 583         if (s1len > len)
 584                 s1len = len;                    /* truncate string1 to len chars */
 585
 586         if (s2len <= 0)
 587                 len = s1len;                    /* nothing to pad with, so don't pad */
 588
 589         bytelen = pg_database_encoding_max_length() * len;
 590
 591         /* check for integer overflow */
 592         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
 593                 ereport(ERROR,
 594                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 595                                  errmsg("requested length too large")));
 596
 597         ret = (text *) palloc(VARHDRSZ + bytelen);
 598
 599         m = len - s1len;
 600
 601         ptr2 = ptr2start = VARDATA_ANY(string2);
 602         ptr2end = ptr2 + s2len;
 603         ptr_ret = VARDATA(ret);
 604
 605         while (m--)
 606         {
 607                 int                     mlen = pg_mblen(ptr2);
 608
 609                 memcpy(ptr_ret, ptr2, mlen);
 610                 ptr_ret += mlen;
 611                 ptr2 += mlen;
 612                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
 613                         ptr2 = ptr2start;
 614         }
 615
 616         ptr1 = VARDATA_ANY(string1);
 617
 618         while (s1len--)
 619         {
 620                 int                     mlen = pg_mblen(ptr1);
 621
 622                 memcpy(ptr_ret, ptr1, mlen);
 623                 ptr_ret += mlen;
 624                 ptr1 += mlen;
 625         }
 626
 627         SET_VARSIZE(ret, ptr_ret - (char *) ret);
 628
 629         PG_RETURN_TEXT_P(ret);
 630 }
 631
 632
 633 /********************************************************************
 634  *
 635  * rpad
 636  *
 637  * Syntax:
 638  *
 639  *       text rpad(text string1, int4 len, text string2)
 640  *
 641  * Purpose:
 642  *
 643  *       Returns string1, right-padded to length len with the sequence of
 644  *       characters in string2.  If len is less than the length of string1,
 645  *       instead truncate (on the right) to len.
 646  *
 647  ********************************************************************/
 648
 649 Datum
 650 rpad(PG_FUNCTION_ARGS)
 651 {
 652         text       *string1 = PG_GETARG_TEXT_PP(0);
 653         int32           len = PG_GETARG_INT32(1);
 654         text       *string2 = PG_GETARG_TEXT_PP(2);
 655         text       *ret;
 656         char       *ptr1,
 657                            *ptr2,
 658                            *ptr2start,
 659                            *ptr2end,
 660                            *ptr_ret;
 661         int                     m,
 662                                 s1len,
 663                                 s2len;
 664
 665         int                     bytelen;
 666
 667         /* Negative len is silently taken as zero */
 668         if (len < 0)
 669                 len = 0;
 670
 671         s1len = VARSIZE_ANY_EXHDR(string1);
 672         if (s1len < 0)
 673                 s1len = 0;                              /* shouldn't happen */
 674
 675         s2len = VARSIZE_ANY_EXHDR(string2);
 676         if (s2len < 0)
 677                 s2len = 0;                              /* shouldn't happen */
 678
 679         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
 680
 681         if (s1len > len)
 682                 s1len = len;                    /* truncate string1 to len chars */
 683
 684         if (s2len <= 0)
 685                 len = s1len;                    /* nothing to pad with, so don't pad */
 686
 687         bytelen = pg_database_encoding_max_length() * len;
 688
 689         /* Check for integer overflow */
 690         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
 691                 ereport(ERROR,
 692                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 693                                  errmsg("requested length too large")));
 694
 695         ret = (text *) palloc(VARHDRSZ + bytelen);
 696         m = len - s1len;
 697
 698         ptr1 = VARDATA_ANY(string1);
 699         ptr_ret = VARDATA(ret);
 700
 701         while (s1len--)
 702         {
 703                 int                     mlen = pg_mblen(ptr1);
 704
 705                 memcpy(ptr_ret, ptr1, mlen);
 706                 ptr_ret += mlen;
 707                 ptr1 += mlen;
 708         }
 709
 710         ptr2 = ptr2start = VARDATA_ANY(string2);
 711         ptr2end = ptr2 + s2len;
 712
 713         while (m--)
 714         {
 715                 int                     mlen = pg_mblen(ptr2);
 716
 717                 memcpy(ptr_ret, ptr2, mlen);
 718                 ptr_ret += mlen;
 719                 ptr2 += mlen;
 720                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
 721                         ptr2 = ptr2start;
 722         }
 723
 724         SET_VARSIZE(ret, ptr_ret - (char *) ret);
 725
 726         PG_RETURN_TEXT_P(ret);
 727 }
 728
 729
 730 /********************************************************************
 731  *
 732  * btrim
 733  *
 734  * Syntax:
 735  *
 736  *       text btrim(text string, text set)
 737  *
 738  * Purpose:
 739  *
 740  *       Returns string with characters removed from the front and back
 741  *       up to the first character not in set.
 742  *
 743  ********************************************************************/
 744
 745 Datum
 746 btrim(PG_FUNCTION_ARGS)
 747 {
 748         text       *string = PG_GETARG_TEXT_PP(0);
 749         text       *set = PG_GETARG_TEXT_PP(1);
 750         text       *ret;
 751
 752         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
 753                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
 754                                  true, true);
 755
 756         PG_RETURN_TEXT_P(ret);
 757 }
 758
 759 /********************************************************************
 760  *
 761  * btrim1 --- btrim with set fixed as ' '
 762  *
 763  ********************************************************************/
 764
 765 Datum
 766 btrim1(PG_FUNCTION_ARGS)
 767 {
 768         text       *string = PG_GETARG_TEXT_PP(0);
 769         text       *ret;
 770
 771         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
 772                                  " ", 1,
 773                                  true, true);
 774
 775         PG_RETURN_TEXT_P(ret);
 776 }
 777
 778 /*
 779  * Common implementation for btrim, ltrim, rtrim
 780  */
 781 static text *
 782 dotrim(const char *string, int stringlen,
 783            const char *set, int setlen,
 784            bool doltrim, bool dortrim)
 785 {
 786         int                     i;
 787
 788         /* Nothing to do if either string or set is empty */
 789         if (stringlen > 0 && setlen > 0)
 790         {
 791                 if (pg_database_encoding_max_length() > 1)
 792                 {
 793                         /*
 794                          * In the multibyte-encoding case, build arrays of pointers to
 795                          * character starts, so that we can avoid inefficient checks in
 796                          * the inner loops.
 797                          */
 798                         const char **stringchars;
 799                         const char **setchars;
 800                         int                *stringmblen;
 801                         int                *setmblen;
 802                         int                     stringnchars;
 803                         int                     setnchars;
 804                         int                     resultndx;
 805                         int                     resultnchars;
 806                         const char *p;
 807                         int                     len;
 808                         int                     mblen;
 809                         const char *str_pos;
 810                         int                     str_len;
 811
 812                         stringchars = (const char **) palloc(stringlen * sizeof(char *));
 813                         stringmblen = (int *) palloc(stringlen * sizeof(int));
 814                         stringnchars = 0;
 815                         p = string;
 816                         len = stringlen;
 817                         while (len > 0)
 818                         {
 819                                 stringchars[stringnchars] = p;
 820                                 stringmblen[stringnchars] = mblen = pg_mblen(p);
 821                                 stringnchars++;
 822                                 p += mblen;
 823                                 len -= mblen;
 824                         }
 825
 826                         setchars = (const char **) palloc(setlen * sizeof(char *));
 827                         setmblen = (int *) palloc(setlen * sizeof(int));
 828                         setnchars = 0;
 829                         p = set;
 830                         len = setlen;
 831                         while (len > 0)
 832                         {
 833                                 setchars[setnchars] = p;
 834                                 setmblen[setnchars] = mblen = pg_mblen(p);
 835                                 setnchars++;
 836                                 p += mblen;
 837                                 len -= mblen;
 838                         }
 839
 840                         resultndx = 0;          /* index in stringchars[] */
 841                         resultnchars = stringnchars;
 842
 843                         if (doltrim)
 844                         {
 845                                 while (resultnchars > 0)
 846                                 {
 847                                         str_pos = stringchars[resultndx];
 848                                         str_len = stringmblen[resultndx];
 849                                         for (i = 0; i < setnchars; i++)
 850                                         {
 851                                                 if (str_len == setmblen[i] &&
 852                                                         memcmp(str_pos, setchars[i], str_len) == 0)
 853                                                         break;
 854                                         }
 855                                         if (i >= setnchars)
 856                                                 break;  /* no match here */
 857                                         string += str_len;
 858                                         stringlen -= str_len;
 859                                         resultndx++;
 860                                         resultnchars--;
 861                                 }
 862                         }
 863
 864                         if (dortrim)
 865                         {
 866                                 while (resultnchars > 0)
 867                                 {
 868                                         str_pos = stringchars[resultndx + resultnchars - 1];
 869                                         str_len = stringmblen[resultndx + resultnchars - 1];
 870                                         for (i = 0; i < setnchars; i++)
 871                                         {
 872                                                 if (str_len == setmblen[i] &&
 873                                                         memcmp(str_pos, setchars[i], str_len) == 0)
 874                                                         break;
 875                                         }
 876                                         if (i >= setnchars)
 877                                                 break;  /* no match here */
 878                                         stringlen -= str_len;
 879                                         resultnchars--;
 880                                 }
 881                         }
 882
 883                         pfree(stringchars);
 884                         pfree(stringmblen);
 885                         pfree(setchars);
 886                         pfree(setmblen);
 887                 }
 888                 else
 889                 {
 890                         /*
 891                          * In the single-byte-encoding case, we don't need such overhead.
 892                          */
 893                         if (doltrim)
 894                         {
 895                                 while (stringlen > 0)
 896                                 {
 897                                         char            str_ch = *string;
 898
 899                                         for (i = 0; i < setlen; i++)
 900                                         {
 901                                                 if (str_ch == set[i])
 902                                                         break;
 903                                         }
 904                                         if (i >= setlen)
 905                                                 break;  /* no match here */
 906                                         string++;
 907                                         stringlen--;
 908                                 }
 909                         }
 910
 911                         if (dortrim)
 912                         {
 913                                 while (stringlen > 0)
 914                                 {
 915                                         char            str_ch = string[stringlen - 1];
 916
 917                                         for (i = 0; i < setlen; i++)
 918                                         {
 919                                                 if (str_ch == set[i])
 920                                                         break;
 921                                         }
 922                                         if (i >= setlen)
 923                                                 break;  /* no match here */
 924                                         stringlen--;
 925                                 }
 926                         }
 927                 }
 928         }
 929
 930         /* Return selected portion of string */
 931         return cstring_to_text_with_len(string, stringlen);
 932 }
 933
 934 /********************************************************************
 935  *
 936  * byteatrim
 937  *
 938  * Syntax:
 939  *
 940  *       bytea byteatrim(byta string, bytea set)
 941  *
 942  * Purpose:
 943  *
 944  *       Returns string with characters removed from the front and back
 945  *       up to the first character not in set.
 946  *
 947  * Cloned from btrim and modified as required.
 948  ********************************************************************/
 949
 950 Datum
 951 byteatrim(PG_FUNCTION_ARGS)
 952 {
 953         bytea      *string = PG_GETARG_BYTEA_PP(0);
 954         bytea      *set = PG_GETARG_BYTEA_PP(1);
 955         bytea      *ret;
 956         char       *ptr,
 957                            *end,
 958                            *ptr2,
 959                            *ptr2start,
 960                            *end2;
 961         int                     m,
 962                                 stringlen,
 963                                 setlen;
 964
 965         stringlen = VARSIZE_ANY_EXHDR(string);
 966         setlen = VARSIZE_ANY_EXHDR(set);
 967
 968         if (stringlen <= 0 || setlen <= 0)
 969                 PG_RETURN_BYTEA_P(string);
 970
 971         m = stringlen;
 972         ptr = VARDATA_ANY(string);
 973         end = ptr + stringlen - 1;
 974         ptr2start = VARDATA_ANY(set);
 975         end2 = ptr2start + setlen - 1;
 976
 977         while (m > 0)
 978         {
 979                 ptr2 = ptr2start;
 980                 while (ptr2 <= end2)
 981                 {
 982                         if (*ptr == *ptr2)
 983                                 break;
 984                         ++ptr2;
 985                 }
 986                 if (ptr2 > end2)
 987                         break;
 988                 ptr++;
 989                 m--;
 990         }
 991
 992         while (m > 0)
 993         {
 994                 ptr2 = ptr2start;
 995                 while (ptr2 <= end2)
 996                 {
 997                         if (*end == *ptr2)
 998                                 break;
 999                         ++ptr2;
1000                 }
1001                 if (ptr2 > end2)
1002                         break;
1003                 end--;
1004                 m--;
1005         }
1006
1007         ret = (bytea *) palloc(VARHDRSZ + m);
1008         SET_VARSIZE(ret, VARHDRSZ + m);
1009         memcpy(VARDATA(ret), ptr, m);
1010
1011         PG_RETURN_BYTEA_P(ret);
1012 }
1013
1014 /********************************************************************
1015  *
1016  * ltrim
1017  *
1018  * Syntax:
1019  *
1020  *       text ltrim(text string, text set)
1021  *
1022  * Purpose:
1023  *
1024  *       Returns string with initial characters removed up to the first
1025  *       character not in set.
1026  *
1027  ********************************************************************/
1028
1029 Datum
1030 ltrim(PG_FUNCTION_ARGS)
1031 {
1032         text       *string = PG_GETARG_TEXT_PP(0);
1033         text       *set = PG_GETARG_TEXT_PP(1);
1034         text       *ret;
1035
1036         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1037                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1038                                  true, false);
1039
1040         PG_RETURN_TEXT_P(ret);
1041 }
1042
1043 /********************************************************************
1044  *
1045  * ltrim1 --- ltrim with set fixed as ' '
1046  *
1047  ********************************************************************/
1048
1049 Datum
1050 ltrim1(PG_FUNCTION_ARGS)
1051 {
1052         text       *string = PG_GETARG_TEXT_PP(0);
1053         text       *ret;
1054
1055         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1056                                  " ", 1,
1057                                  true, false);
1058
1059         PG_RETURN_TEXT_P(ret);
1060 }
1061
1062 /********************************************************************
1063  *
1064  * rtrim
1065  *
1066  * Syntax:
1067  *
1068  *       text rtrim(text string, text set)
1069  *
1070  * Purpose:
1071  *
1072  *       Returns string with final characters removed after the last
1073  *       character not in set.
1074  *
1075  ********************************************************************/
1076
1077 Datum
1078 rtrim(PG_FUNCTION_ARGS)
1079 {
1080         text       *string = PG_GETARG_TEXT_PP(0);
1081         text       *set = PG_GETARG_TEXT_PP(1);
1082         text       *ret;
1083
1084         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1085                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1086                                  false, true);
1087
1088         PG_RETURN_TEXT_P(ret);
1089 }
1090
1091 /********************************************************************
1092  *
1093  * rtrim1 --- rtrim with set fixed as ' '
1094  *
1095  ********************************************************************/
1096
1097 Datum
1098 rtrim1(PG_FUNCTION_ARGS)
1099 {
1100         text       *string = PG_GETARG_TEXT_PP(0);
1101         text       *ret;
1102
1103         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1104                                  " ", 1,
1105                                  false, true);
1106
1107         PG_RETURN_TEXT_P(ret);
1108 }
1109
1110
1111 /********************************************************************
1112  *
1113  * translate
1114  *
1115  * Syntax:
1116  *
1117  *       text translate(text string, text from, text to)
1118  *
1119  * Purpose:
1120  *
1121  *       Returns string after replacing all occurrences of characters in from
1122  *       with the corresponding character in to.  If from is longer than to,
1123  *       occurrences of the extra characters in from are deleted.
1124  *       Improved by Edwin Ramirez <ramirez@doc.mssm.edu>.
1125  *
1126  ********************************************************************/
1127
1128 Datum
1129 translate(PG_FUNCTION_ARGS)
1130 {
1131         text       *string = PG_GETARG_TEXT_PP(0);
1132         text       *from = PG_GETARG_TEXT_PP(1);
1133         text       *to = PG_GETARG_TEXT_PP(2);
1134         text       *result;
1135         char       *from_ptr,
1136                            *to_ptr;
1137         char       *source,
1138                            *target;
1139         int                     m,
1140                                 fromlen,
1141                                 tolen,
1142                                 retlen,
1143                                 i;
1144         int                     worst_len;
1145         int                     len;
1146         int                     source_len;
1147         int                     from_index;
1148
1149         m = VARSIZE_ANY_EXHDR(string);
1150         if (m <= 0)
1151                 PG_RETURN_TEXT_P(string);
1152         source = VARDATA_ANY(string);
1153
1154         fromlen = VARSIZE_ANY_EXHDR(from);
1155         from_ptr = VARDATA_ANY(from);
1156         tolen = VARSIZE_ANY_EXHDR(to);
1157         to_ptr = VARDATA_ANY(to);
1158
1159         /*
1160          * The worst-case expansion is to substitute a max-length character for a
1161          * single-byte character at each position of the string.
1162          */
1163         worst_len = pg_database_encoding_max_length() * m;
1164
1165         /* check for integer overflow */
1166         if (worst_len / pg_database_encoding_max_length() != m)
1167                 ereport(ERROR,
1168                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1169                                  errmsg("requested length too large")));
1170
1171         result = (text *) palloc(worst_len + VARHDRSZ);
1172         target = VARDATA(result);
1173         retlen = 0;
1174
1175         while (m > 0)
1176         {
1177                 source_len = pg_mblen(source);
1178                 from_index = 0;
1179
1180                 for (i = 0; i < fromlen; i += len)
1181                 {
1182                         len = pg_mblen(&from_ptr[i]);
1183                         if (len == source_len &&
1184                                 memcmp(source, &from_ptr[i], len) == 0)
1185                                 break;
1186
1187                         from_index++;
1188                 }
1189                 if (i < fromlen)
1190                 {
1191                         /* substitute */
1192                         char       *p = to_ptr;
1193
1194                         for (i = 0; i < from_index; i++)
1195                         {
1196                                 p += pg_mblen(p);
1197                                 if (p >= (to_ptr + tolen))
1198                                         break;
1199                         }
1200                         if (p < (to_ptr + tolen))
1201                         {
1202                                 len = pg_mblen(p);
1203                                 memcpy(target, p, len);
1204                                 target += len;
1205                                 retlen += len;
1206                         }
1207
1208                 }
1209                 else
1210                 {
1211                         /* no match, so copy */
1212                         memcpy(target, source, source_len);
1213                         target += source_len;
1214                         retlen += source_len;
1215                 }
1216
1217                 source += source_len;
1218                 m -= source_len;
1219         }
1220
1221         SET_VARSIZE(result, retlen + VARHDRSZ);
1222
1223         /*
1224          * The function result is probably much bigger than needed, if we're using
1225          * a multibyte encoding, but it's not worth reallocating it; the result
1226          * probably won't live long anyway.
1227          */
1228
1229         PG_RETURN_TEXT_P(result);
1230 }
1231
1232 /********************************************************************
1233  *
1234  * ascii
1235  *
1236  * Syntax:
1237  *
1238  *       int ascii(text string)
1239  *
1240  * Purpose:
1241  *
1242  *       Returns the decimal representation of the first character from
1243  *       string.
1244  *       If the string is empty we return 0.
1245  *       If the database encoding is UTF8, we return the Unicode codepoint.
1246  *       If the database encoding is any other multi-byte encoding, we
1247  *       return the value of the first byte if it is an ASCII character
1248  *       (range 1 .. 127), or raise an error.
1249  *       For all other encodings we return the value of the first byte,
1250  *       (range 1..255).
1251  *
1252  ********************************************************************/
1253
1254 Datum
1255 ascii(PG_FUNCTION_ARGS)
1256 {
1257         text       *string = PG_GETARG_TEXT_PP(0);
1258         int                     encoding = GetDatabaseEncoding();
1259         unsigned char *data;
1260
1261         if (VARSIZE_ANY_EXHDR(string) <= 0)
1262                 PG_RETURN_INT32(0);
1263
1264         data = (unsigned char *) VARDATA_ANY(string);
1265
1266         if (encoding == PG_UTF8 && *data > 127)
1267         {
1268                 /* return the code point for Unicode */
1269
1270                 int                     result = 0,
1271                                         tbytes = 0,
1272                                         i;
1273
1274                 if (*data >= 0xF0)
1275                 {
1276                         result = *data & 0x07;
1277                         tbytes = 3;
1278                 }
1279                 else if (*data >= 0xE0)
1280                 {
1281                         result = *data & 0x0F;
1282                         tbytes = 2;
1283                 }
1284                 else
1285                 {
1286                         Assert(*data > 0xC0);
1287                         result = *data & 0x1f;
1288                         tbytes = 1;
1289                 }
1290
1291                 Assert(tbytes > 0);
1292
1293                 for (i = 1; i <= tbytes; i++)
1294                 {
1295                         Assert((data[i] & 0xC0) == 0x80);
1296                         result = (result << 6) + (data[i] & 0x3f);
1297                 }
1298
1299                 PG_RETURN_INT32(result);
1300         }
1301         else
1302         {
1303                 if (pg_encoding_max_length(encoding) > 1 && *data > 127)
1304                         ereport(ERROR,
1305                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1306                                          errmsg("requested character too large")));
1307
1308
1309                 PG_RETURN_INT32((int32) *data);
1310         }
1311 }
1312
1313 /********************************************************************
1314  *
1315  * chr
1316  *
1317  * Syntax:
1318  *
1319  *       text chr(int val)
1320  *
1321  * Purpose:
1322  *
1323  *      Returns the character having the binary equivalent to val.
1324  *
1325  * For UTF8 we treat the argumwent as a Unicode code point.
1326  * For other multi-byte encodings we raise an error for arguments
1327  * outside the strict ASCII range (1..127).
1328  *
1329  * It's important that we don't ever return a value that is not valid
1330  * in the database encoding, so that this doesn't become a way for
1331  * invalid data to enter the database.
1332  *
1333  ********************************************************************/
1334
1335 Datum
1336 chr                     (PG_FUNCTION_ARGS)
1337 {
1338         uint32          cvalue = PG_GETARG_UINT32(0);
1339         text       *result;
1340         int                     encoding = GetDatabaseEncoding();
1341
1342         if (encoding == PG_UTF8 && cvalue > 127)
1343         {
1344                 /* for Unicode we treat the argument as a code point */
1345                 int                     bytes;
1346                 char       *wch;
1347
1348                 /* We only allow valid Unicode code points */
1349                 if (cvalue > 0x001fffff)
1350                         ereport(ERROR,
1351                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1352                                          errmsg("requested character too large for encoding: %d",
1353                                                         cvalue)));
1354
1355                 if (cvalue > 0xffff)
1356                         bytes = 4;
1357                 else if (cvalue > 0x07ff)
1358                         bytes = 3;
1359                 else
1360                         bytes = 2;
1361
1362                 result = (text *) palloc(VARHDRSZ + bytes);
1363                 SET_VARSIZE(result, VARHDRSZ + bytes);
1364                 wch = VARDATA(result);
1365
1366                 if (bytes == 2)
1367                 {
1368                         wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
1369                         wch[1] = 0x80 | (cvalue & 0x3F);;
1370                 }
1371                 else if (bytes == 3)
1372                 {
1373                         wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
1374                         wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
1375                         wch[2] = 0x80 | (cvalue & 0x3F);
1376                 }
1377                 else
1378                 {
1379                         wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
1380                         wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
1381                         wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
1382                         wch[3] = 0x80 | (cvalue & 0x3F);
1383                 }
1384
1385         }
1386
1387         else
1388         {
1389                 bool            is_mb;
1390
1391                 /*
1392                  * Error out on arguments that make no sense or that we can't validly
1393                  * represent in the encoding.
1394                  */
1395
1396                 if (cvalue == 0)
1397                         ereport(ERROR,
1398                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1399                                          errmsg("null character not permitted")));
1400
1401                 is_mb = pg_encoding_max_length(encoding) > 1;
1402
1403                 if ((is_mb && (cvalue > 127)) || (!is_mb && (cvalue > 255)))
1404                         ereport(ERROR,
1405                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1406                                          errmsg("requested character too large for encoding: %d",
1407                                                         cvalue)));
1408
1409
1410                 result = (text *) palloc(VARHDRSZ + 1);
1411                 SET_VARSIZE(result, VARHDRSZ + 1);
1412                 *VARDATA(result) = (char) cvalue;
1413         }
1414
1415         PG_RETURN_TEXT_P(result);
1416 }
1417
1418 /********************************************************************
1419  *
1420  * repeat
1421  *
1422  * Syntax:
1423  *
1424  *       text repeat(text string, int val)
1425  *
1426  * Purpose:
1427  *
1428  *      Repeat string by val.
1429  *
1430  ********************************************************************/
1431
1432 Datum
1433 repeat(PG_FUNCTION_ARGS)
1434 {
1435         text       *string = PG_GETARG_TEXT_PP(0);
1436         int32           count = PG_GETARG_INT32(1);
1437         text       *result;
1438         int                     slen,
1439                                 tlen;
1440         int                     i;
1441         char       *cp,
1442                            *sp;
1443
1444         if (count < 0)
1445                 count = 0;
1446
1447         slen = VARSIZE_ANY_EXHDR(string);
1448         tlen = VARHDRSZ + (count * slen);
1449
1450         /* Check for integer overflow */
1451         if (slen != 0 && count != 0)
1452         {
1453                 int                     check = count * slen;
1454                 int                     check2 = check + VARHDRSZ;
1455
1456                 if ((check / slen) != count || check2 <= check)
1457                         ereport(ERROR,
1458                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1459                                          errmsg("requested length too large")));
1460         }
1461
1462         result = (text *) palloc(tlen);
1463
1464         SET_VARSIZE(result, tlen);
1465         cp = VARDATA(result);
1466         sp = VARDATA_ANY(string);
1467         for (i = 0; i < count; i++)
1468         {
1469                 memcpy(cp, sp, slen);
1470                 cp += slen;
1471         }
1472
1473         PG_RETURN_TEXT_P(result);
1474 }