granicus.if.org Git - postgresql/blob - src/backend/utils/adt/oracle_compat.c

   1 /*-------------------------------------------------------------------------
   2  * oracle_compat.c
   3  *      Oracle compatible functions.
   4  *
   5  * Copyright (c) 1996-2007, PostgreSQL Global Development Group
   6  *
   7  *      Author: Edmund Mergl <E.Mergl@bawue.de>
   8  *      Multibyte enhancement: Tatsuo Ishii <ishii@postgresql.org>
   9  *
  10  *
  11  * IDENTIFICATION
  12  *      $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.72 2007/09/21 22:52:52 tgl Exp $
  13  *
  14  *-------------------------------------------------------------------------
  15  */
  16 #include "postgres.h"
  17
  18 #include <ctype.h>
  19 #include <limits.h>
  20 /*
  21  * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
  22  * declare them in <wchar.h>.
  23  */
  24 #ifdef HAVE_WCHAR_H
  25 #include <wchar.h>
  26 #endif
  27 #ifdef HAVE_WCTYPE_H
  28 #include <wctype.h>
  29 #endif
  30
  31 #include "utils/builtins.h"
  32 #include "utils/pg_locale.h"
  33 #include "mb/pg_wchar.h"
  34
  35
  36 /*
  37  * If the system provides the needed functions for wide-character manipulation
  38  * (which are all standardized by C99), then we implement upper/lower/initcap
  39  * using wide-character functions.      Otherwise we use the traditional <ctype.h>
  40  * functions, which of course will not work as desired in multibyte character
  41  * sets.  Note that in either case we are effectively assuming that the
  42  * database character encoding matches the encoding implied by LC_CTYPE.
  43  *
  44  * We assume if we have these two functions, we have their friends too, and
  45  * can use the wide-character method.
  46  */
  47 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
  48 #define USE_WIDE_UPPER_LOWER
  49 char *wstring_lower (char *str);
  50 char *wstring_upper(char *str);
  51 #endif
  52
  53 static text *dotrim(const char *string, int stringlen,
  54            const char *set, int setlen,
  55            bool doltrim, bool dortrim);
  56
  57
  58 #ifdef USE_WIDE_UPPER_LOWER
  59
  60 /*
  61  * Convert a TEXT value into a palloc'd wchar string.
  62  */
  63 static wchar_t *
  64 texttowcs(const text *txt)
  65 {
  66         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
  67         char       *workstr;
  68         wchar_t    *result;
  69         size_t          ncodes;
  70
  71         /* Overflow paranoia */
  72         if (nbytes < 0 ||
  73                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
  74                 ereport(ERROR,
  75                                 (errcode(ERRCODE_OUT_OF_MEMORY),
  76                                  errmsg("out of memory")));
  77
  78         /* Need a null-terminated version of the input */
  79         workstr = (char *) palloc(nbytes + 1);
  80         memcpy(workstr, VARDATA_ANY(txt), nbytes);
  81         workstr[nbytes] = '\0';
  82
  83         /* Output workspace cannot have more codes than input bytes */
  84         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
  85
  86         /* Do the conversion */
  87         ncodes = mbstowcs(result, workstr, nbytes + 1);
  88
  89         if (ncodes == (size_t) -1)
  90         {
  91                 /*
  92                  * Invalid multibyte character encountered.  We try to give a useful
  93                  * error message by letting pg_verifymbstr check the string.  But it's
  94                  * possible that the string is OK to us, and not OK to mbstowcs ---
  95                  * this suggests that the LC_CTYPE locale is different from the
  96                  * database encoding.  Give a generic error message if verifymbstr
  97                  * can't find anything wrong.
  98                  */
  99                 pg_verifymbstr(workstr, nbytes, false);
 100                 ereport(ERROR,
 101                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 102                                  errmsg("invalid multibyte character for locale"),
 103                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 104         }
 105
 106         Assert(ncodes <= (size_t) nbytes);
 107
 108         return result;
 109 }
 110
 111
 112 /*
 113  * Convert a wchar string into a palloc'd TEXT value.  The wchar string
 114  * must be zero-terminated, but we also require the caller to pass the string
 115  * length, since it will know it anyway in current uses.
 116  */
 117 static text *
 118 wcstotext(const wchar_t *str, int ncodes)
 119 {
 120         text       *result;
 121         size_t          nbytes;
 122
 123         /* Overflow paranoia */
 124         if (ncodes < 0 ||
 125                 ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
 126                 ereport(ERROR,
 127                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 128                                  errmsg("out of memory")));
 129
 130         /* Make workspace certainly large enough for result */
 131         result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
 132
 133         /* Do the conversion */
 134         nbytes = wcstombs((char *) VARDATA(result), str,
 135                                           (ncodes + 1) * MB_CUR_MAX);
 136
 137         if (nbytes == (size_t) -1)
 138         {
 139                 /* Invalid multibyte character encountered ... shouldn't happen */
 140                 ereport(ERROR,
 141                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 142                                  errmsg("invalid multibyte character for locale")));
 143         }
 144
 145         Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
 146
 147         SET_VARSIZE(result, nbytes + VARHDRSZ);
 148
 149         return result;
 150 }
 151 #endif   /* USE_WIDE_UPPER_LOWER */
 152
 153
 154 /*
 155  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding.
 156  * To make use of the upper/lower functionality, we need to map UTF8 to
 157  * UTF16, which for some reason mbstowcs and wcstombs won't do for us.
 158  * This conversion layer takes care of it.
 159  */
 160
 161 #ifdef WIN32
 162
 163 /* texttowcs for the case of UTF8 to UTF16 */
 164 static wchar_t *
 165 win32_utf8_texttowcs(const text *txt)
 166 {
 167         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
 168         wchar_t    *result;
 169         int                     r;
 170
 171         /* Overflow paranoia */
 172         if (nbytes < 0 ||
 173                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
 174                 ereport(ERROR,
 175                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 176                                  errmsg("out of memory")));
 177
 178         /* Output workspace cannot have more codes than input bytes */
 179         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 180
 181         /* stupid Microsloth API does not work for zero-length input */
 182         if (nbytes == 0)
 183                 r = 0;
 184         else
 185         {
 186                 /* Do the conversion */
 187                 r = MultiByteToWideChar(CP_UTF8, 0, VARDATA_ANY(txt), nbytes,
 188                                                                 result, nbytes);
 189
 190                 if (!r)                                 /* assume it's NO_UNICODE_TRANSLATION */
 191                 {
 192                         /* see notes above about error reporting */
 193                         pg_verifymbstr(VARDATA_ANY(txt), nbytes, false);
 194                         ereport(ERROR,
 195                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 196                                          errmsg("invalid multibyte character for locale"),
 197                                          errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 198                 }
 199         }
 200
 201         Assert(r <= nbytes);
 202         result[r] = 0;
 203
 204         return result;
 205 }
 206
 207 /* wcstotext for the case of UTF16 to UTF8 */
 208 static text *
 209 win32_utf8_wcstotext(const wchar_t *str)
 210 {
 211         text       *result;
 212         int                     nbytes;
 213         int                     r;
 214
 215         nbytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
 216         if (nbytes == 0)                        /* shouldn't happen */
 217                 ereport(ERROR,
 218                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 219                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
 220                                                 GetLastError())));
 221
 222         result = palloc(nbytes + VARHDRSZ);
 223
 224         r = WideCharToMultiByte(CP_UTF8, 0, str, -1, VARDATA(result), nbytes,
 225                                                         NULL, NULL);
 226         if (r == 0)                                     /* shouldn't happen */
 227                 ereport(ERROR,
 228                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 229                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
 230                                                 GetLastError())));
 231
 232         SET_VARSIZE(result, nbytes + VARHDRSZ - 1);             /* -1 to ignore null */
 233
 234         return result;
 235 }
 236
 237 /* interface layer to check which encoding is in use */
 238
 239 static wchar_t *
 240 win32_texttowcs(const text *txt)
 241 {
 242         if (GetDatabaseEncoding() == PG_UTF8)
 243                 return win32_utf8_texttowcs(txt);
 244         else
 245                 return texttowcs(txt);
 246 }
 247
 248 static text *
 249 win32_wcstotext(const wchar_t *str, int ncodes)
 250 {
 251         if (GetDatabaseEncoding() == PG_UTF8)
 252                 return win32_utf8_wcstotext(str);
 253         else
 254                 return wcstotext(str, ncodes);
 255 }
 256
 257 /* use macros to cause routines below to call interface layer */
 258
 259 #define texttowcs       win32_texttowcs
 260 #define wcstotext       win32_wcstotext
 261 #endif   /* WIN32 */
 262
 263 #ifdef USE_WIDE_UPPER_LOWER
 264 /*
 265  * string_upper and string_lower are used for correct multibyte upper/lower
 266  * transformations localized strings. Returns pointers to transformated
 267  * string.
 268  */
 269 char *
 270 wstring_upper(char *str)
 271 {
 272         wchar_t         *workspace;
 273         text            *in_text;
 274         text            *out_text;
 275         char            *result;
 276         int     nbytes = strlen(str);
 277         int     i;
 278
 279         in_text = palloc(nbytes + VARHDRSZ);
 280         memcpy(VARDATA(in_text), str, nbytes);
 281         SET_VARSIZE(in_text, nbytes + VARHDRSZ);
 282
 283         workspace = texttowcs(in_text);
 284
 285         for (i = 0; workspace[i] != 0; i++)
 286                 workspace[i] = towupper(workspace[i]);
 287
 288         out_text = wcstotext(workspace, i);
 289
 290         nbytes = VARSIZE(out_text) - VARHDRSZ;
 291         result = palloc(nbytes + 1);
 292         memcpy(result, VARDATA(out_text), nbytes);
 293
 294         result[nbytes] = '\0';
 295
 296         pfree(workspace);
 297         pfree(in_text);
 298         pfree(out_text);
 299
 300         return result;
 301 }
 302
 303 char *
 304 wstring_lower(char *str)
 305 {
 306         wchar_t         *workspace;
 307         text            *in_text;
 308         text            *out_text;
 309         char            *result;
 310         int     nbytes = strlen(str);
 311         int     i;
 312
 313         in_text = palloc(nbytes + VARHDRSZ);
 314         memcpy(VARDATA(in_text), str, nbytes);
 315         SET_VARSIZE(in_text, nbytes + VARHDRSZ);
 316
 317         workspace = texttowcs(in_text);
 318
 319         for (i = 0; workspace[i] != 0; i++)
 320                 workspace[i] = towlower(workspace[i]);
 321
 322         out_text = wcstotext(workspace, i);
 323
 324         nbytes = VARSIZE(out_text) - VARHDRSZ;
 325         result = palloc(nbytes + 1);
 326         memcpy(result, VARDATA(out_text), nbytes);
 327
 328         result[nbytes] = '\0';
 329
 330         pfree(workspace);
 331         pfree(in_text);
 332         pfree(out_text);
 333
 334         return result;
 335 }
 336 #endif  /* USE_WIDE_UPPER_LOWER */
 337
 338 /********************************************************************
 339  *
 340  * lower
 341  *
 342  * Syntax:
 343  *
 344  *       text lower(text string)
 345  *
 346  * Purpose:
 347  *
 348  *       Returns string, with all letters forced to lowercase.
 349  *
 350  ********************************************************************/
 351
 352 Datum
 353 lower(PG_FUNCTION_ARGS)
 354 {
 355 #ifdef USE_WIDE_UPPER_LOWER
 356
 357         /*
 358          * Use wide char code only when max encoding length > 1 and ctype != C.
 359          * Some operating systems fail with multi-byte encodings and a C locale.
 360          * Also, for a C locale there is no need to process as multibyte.
 361          */
 362         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 363         {
 364                 text       *string = PG_GETARG_TEXT_PP(0);
 365                 text       *result;
 366                 wchar_t    *workspace;
 367                 int                     i;
 368
 369                 workspace = texttowcs(string);
 370
 371                 for (i = 0; workspace[i] != 0; i++)
 372                         workspace[i] = towlower(workspace[i]);
 373
 374                 result = wcstotext(workspace, i);
 375
 376                 pfree(workspace);
 377
 378                 PG_RETURN_TEXT_P(result);
 379         }
 380         else
 381 #endif   /* USE_WIDE_UPPER_LOWER */
 382         {
 383                 text       *string = PG_GETARG_TEXT_P_COPY(0);
 384                 char       *ptr;
 385                 int                     m;
 386
 387                 /*
 388                  * Since we copied the string, we can scribble directly on the value
 389                  */
 390                 ptr = VARDATA(string);
 391                 m = VARSIZE(string) - VARHDRSZ;
 392
 393                 while (m-- > 0)
 394                 {
 395                         *ptr = tolower((unsigned char) *ptr);
 396                         ptr++;
 397                 }
 398
 399                 PG_RETURN_TEXT_P(string);
 400         }
 401 }
 402
 403
 404 /********************************************************************
 405  *
 406  * upper
 407  *
 408  * Syntax:
 409  *
 410  *       text upper(text string)
 411  *
 412  * Purpose:
 413  *
 414  *       Returns string, with all letters forced to uppercase.
 415  *
 416  ********************************************************************/
 417
 418 Datum
 419 upper(PG_FUNCTION_ARGS)
 420 {
 421 #ifdef USE_WIDE_UPPER_LOWER
 422
 423         /*
 424          * Use wide char code only when max encoding length > 1 and ctype != C.
 425          * Some operating systems fail with multi-byte encodings and a C locale.
 426          * Also, for a C locale there is no need to process as multibyte.
 427          */
 428         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 429         {
 430                 text       *string = PG_GETARG_TEXT_PP(0);
 431                 text       *result;
 432                 wchar_t    *workspace;
 433                 int                     i;
 434
 435                 workspace = texttowcs(string);
 436
 437                 for (i = 0; workspace[i] != 0; i++)
 438                         workspace[i] = towupper(workspace[i]);
 439
 440                 result = wcstotext(workspace, i);
 441
 442                 pfree(workspace);
 443
 444                 PG_RETURN_TEXT_P(result);
 445         }
 446         else
 447 #endif   /* USE_WIDE_UPPER_LOWER */
 448         {
 449                 text       *string = PG_GETARG_TEXT_P_COPY(0);
 450                 char       *ptr;
 451                 int                     m;
 452
 453                 /*
 454                  * Since we copied the string, we can scribble directly on the value
 455                  */
 456                 ptr = VARDATA(string);
 457                 m = VARSIZE(string) - VARHDRSZ;
 458
 459                 while (m-- > 0)
 460                 {
 461                         *ptr = toupper((unsigned char) *ptr);
 462                         ptr++;
 463                 }
 464
 465                 PG_RETURN_TEXT_P(string);
 466         }
 467 }
 468
 469
 470 /********************************************************************
 471  *
 472  * initcap
 473  *
 474  * Syntax:
 475  *
 476  *       text initcap(text string)
 477  *
 478  * Purpose:
 479  *
 480  *       Returns string, with first letter of each word in uppercase, all
 481  *       other letters in lowercase. A word is defined as a sequence of
 482  *       alphanumeric characters, delimited by non-alphanumeric
 483  *       characters.
 484  *
 485  ********************************************************************/
 486
 487 Datum
 488 initcap(PG_FUNCTION_ARGS)
 489 {
 490 #ifdef USE_WIDE_UPPER_LOWER
 491
 492         /*
 493          * Use wide char code only when max encoding length > 1 and ctype != C.
 494          * Some operating systems fail with multi-byte encodings and a C locale.
 495          * Also, for a C locale there is no need to process as multibyte.
 496          */
 497         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 498         {
 499                 text       *string = PG_GETARG_TEXT_PP(0);
 500                 text       *result;
 501                 wchar_t    *workspace;
 502                 int                     wasalnum = 0;
 503                 int                     i;
 504
 505                 workspace = texttowcs(string);
 506
 507                 for (i = 0; workspace[i] != 0; i++)
 508                 {
 509                         if (wasalnum)
 510                                 workspace[i] = towlower(workspace[i]);
 511                         else
 512                                 workspace[i] = towupper(workspace[i]);
 513                         wasalnum = iswalnum(workspace[i]);
 514                 }
 515
 516                 result = wcstotext(workspace, i);
 517
 518                 pfree(workspace);
 519
 520                 PG_RETURN_TEXT_P(result);
 521         }
 522         else
 523 #endif   /* USE_WIDE_UPPER_LOWER */
 524         {
 525                 text       *string = PG_GETARG_TEXT_P_COPY(0);
 526                 int                     wasalnum = 0;
 527                 char       *ptr;
 528                 int                     m;
 529
 530                 /*
 531                  * Since we copied the string, we can scribble directly on the value
 532                  */
 533                 ptr = VARDATA(string);
 534                 m = VARSIZE(string) - VARHDRSZ;
 535
 536                 while (m-- > 0)
 537                 {
 538                         if (wasalnum)
 539                                 *ptr = tolower((unsigned char) *ptr);
 540                         else
 541                                 *ptr = toupper((unsigned char) *ptr);
 542                         wasalnum = isalnum((unsigned char) *ptr);
 543                         ptr++;
 544                 }
 545
 546                 PG_RETURN_TEXT_P(string);
 547         }
 548 }
 549
 550
 551 /********************************************************************
 552  *
 553  * lpad
 554  *
 555  * Syntax:
 556  *
 557  *       text lpad(text string1, int4 len, text string2)
 558  *
 559  * Purpose:
 560  *
 561  *       Returns string1, left-padded to length len with the sequence of
 562  *       characters in string2.  If len is less than the length of string1,
 563  *       instead truncate (on the right) to len.
 564  *
 565  ********************************************************************/
 566
 567 Datum
 568 lpad(PG_FUNCTION_ARGS)
 569 {
 570         text       *string1 = PG_GETARG_TEXT_PP(0);
 571         int32           len = PG_GETARG_INT32(1);
 572         text       *string2 = PG_GETARG_TEXT_PP(2);
 573         text       *ret;
 574         char       *ptr1,
 575                            *ptr2,
 576                            *ptr2start,
 577                            *ptr2end,
 578                            *ptr_ret;
 579         int                     m,
 580                                 s1len,
 581                                 s2len;
 582
 583         int                     bytelen;
 584
 585         /* Negative len is silently taken as zero */
 586         if (len < 0)
 587                 len = 0;
 588
 589         s1len = VARSIZE_ANY_EXHDR(string1);
 590         if (s1len < 0)
 591                 s1len = 0;                              /* shouldn't happen */
 592
 593         s2len = VARSIZE_ANY_EXHDR(string2);
 594         if (s2len < 0)
 595                 s2len = 0;                              /* shouldn't happen */
 596
 597         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
 598
 599         if (s1len > len)
 600                 s1len = len;                    /* truncate string1 to len chars */
 601
 602         if (s2len <= 0)
 603                 len = s1len;                    /* nothing to pad with, so don't pad */
 604
 605         bytelen = pg_database_encoding_max_length() * len;
 606
 607         /* check for integer overflow */
 608         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
 609                 ereport(ERROR,
 610                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 611                                  errmsg("requested length too large")));
 612
 613         ret = (text *) palloc(VARHDRSZ + bytelen);
 614
 615         m = len - s1len;
 616
 617         ptr2 = ptr2start = VARDATA_ANY(string2);
 618         ptr2end = ptr2 + s2len;
 619         ptr_ret = VARDATA(ret);
 620
 621         while (m--)
 622         {
 623                 int                     mlen = pg_mblen(ptr2);
 624
 625                 memcpy(ptr_ret, ptr2, mlen);
 626                 ptr_ret += mlen;
 627                 ptr2 += mlen;
 628                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
 629                         ptr2 = ptr2start;
 630         }
 631
 632         ptr1 = VARDATA_ANY(string1);
 633
 634         while (s1len--)
 635         {
 636                 int                     mlen = pg_mblen(ptr1);
 637
 638                 memcpy(ptr_ret, ptr1, mlen);
 639                 ptr_ret += mlen;
 640                 ptr1 += mlen;
 641         }
 642
 643         SET_VARSIZE(ret, ptr_ret - (char *) ret);
 644
 645         PG_RETURN_TEXT_P(ret);
 646 }
 647
 648
 649 /********************************************************************
 650  *
 651  * rpad
 652  *
 653  * Syntax:
 654  *
 655  *       text rpad(text string1, int4 len, text string2)
 656  *
 657  * Purpose:
 658  *
 659  *       Returns string1, right-padded to length len with the sequence of
 660  *       characters in string2.  If len is less than the length of string1,
 661  *       instead truncate (on the right) to len.
 662  *
 663  ********************************************************************/
 664
 665 Datum
 666 rpad(PG_FUNCTION_ARGS)
 667 {
 668         text       *string1 = PG_GETARG_TEXT_PP(0);
 669         int32           len = PG_GETARG_INT32(1);
 670         text       *string2 = PG_GETARG_TEXT_PP(2);
 671         text       *ret;
 672         char       *ptr1,
 673                            *ptr2,
 674                            *ptr2start,
 675                            *ptr2end,
 676                            *ptr_ret;
 677         int                     m,
 678                                 s1len,
 679                                 s2len;
 680
 681         int                     bytelen;
 682
 683         /* Negative len is silently taken as zero */
 684         if (len < 0)
 685                 len = 0;
 686
 687         s1len = VARSIZE_ANY_EXHDR(string1);
 688         if (s1len < 0)
 689                 s1len = 0;                              /* shouldn't happen */
 690
 691         s2len = VARSIZE_ANY_EXHDR(string2);
 692         if (s2len < 0)
 693                 s2len = 0;                              /* shouldn't happen */
 694
 695         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
 696
 697         if (s1len > len)
 698                 s1len = len;                    /* truncate string1 to len chars */
 699
 700         if (s2len <= 0)
 701                 len = s1len;                    /* nothing to pad with, so don't pad */
 702
 703         bytelen = pg_database_encoding_max_length() * len;
 704
 705         /* Check for integer overflow */
 706         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
 707                 ereport(ERROR,
 708                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 709                                  errmsg("requested length too large")));
 710
 711         ret = (text *) palloc(VARHDRSZ + bytelen);
 712         m = len - s1len;
 713
 714         ptr1 = VARDATA_ANY(string1);
 715         ptr_ret = VARDATA(ret);
 716
 717         while (s1len--)
 718         {
 719                 int                     mlen = pg_mblen(ptr1);
 720
 721                 memcpy(ptr_ret, ptr1, mlen);
 722                 ptr_ret += mlen;
 723                 ptr1 += mlen;
 724         }
 725
 726         ptr2 = ptr2start = VARDATA_ANY(string2);
 727         ptr2end = ptr2 + s2len;
 728
 729         while (m--)
 730         {
 731                 int                     mlen = pg_mblen(ptr2);
 732
 733                 memcpy(ptr_ret, ptr2, mlen);
 734                 ptr_ret += mlen;
 735                 ptr2 += mlen;
 736                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
 737                         ptr2 = ptr2start;
 738         }
 739
 740         SET_VARSIZE(ret, ptr_ret - (char *) ret);
 741
 742         PG_RETURN_TEXT_P(ret);
 743 }
 744
 745
 746 /********************************************************************
 747  *
 748  * btrim
 749  *
 750  * Syntax:
 751  *
 752  *       text btrim(text string, text set)
 753  *
 754  * Purpose:
 755  *
 756  *       Returns string with characters removed from the front and back
 757  *       up to the first character not in set.
 758  *
 759  ********************************************************************/
 760
 761 Datum
 762 btrim(PG_FUNCTION_ARGS)
 763 {
 764         text       *string = PG_GETARG_TEXT_PP(0);
 765         text       *set = PG_GETARG_TEXT_PP(1);
 766         text       *ret;
 767
 768         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
 769                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
 770                                  true, true);
 771
 772         PG_RETURN_TEXT_P(ret);
 773 }
 774
 775 /********************************************************************
 776  *
 777  * btrim1 --- btrim with set fixed as ' '
 778  *
 779  ********************************************************************/
 780
 781 Datum
 782 btrim1(PG_FUNCTION_ARGS)
 783 {
 784         text       *string = PG_GETARG_TEXT_PP(0);
 785         text       *ret;
 786
 787         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
 788                                  " ", 1,
 789                                  true, true);
 790
 791         PG_RETURN_TEXT_P(ret);
 792 }
 793
 794 /*
 795  * Common implementation for btrim, ltrim, rtrim
 796  */
 797 static text *
 798 dotrim(const char *string, int stringlen,
 799            const char *set, int setlen,
 800            bool doltrim, bool dortrim)
 801 {
 802         text       *result;
 803         int                     i;
 804
 805         /* Nothing to do if either string or set is empty */
 806         if (stringlen > 0 && setlen > 0)
 807         {
 808                 if (pg_database_encoding_max_length() > 1)
 809                 {
 810                         /*
 811                          * In the multibyte-encoding case, build arrays of pointers to
 812                          * character starts, so that we can avoid inefficient checks in
 813                          * the inner loops.
 814                          */
 815                         const char **stringchars;
 816                         const char **setchars;
 817                         int                *stringmblen;
 818                         int                *setmblen;
 819                         int                     stringnchars;
 820                         int                     setnchars;
 821                         int                     resultndx;
 822                         int                     resultnchars;
 823                         const char *p;
 824                         int                     len;
 825                         int                     mblen;
 826                         const char *str_pos;
 827                         int                     str_len;
 828
 829                         stringchars = (const char **) palloc(stringlen * sizeof(char *));
 830                         stringmblen = (int *) palloc(stringlen * sizeof(int));
 831                         stringnchars = 0;
 832                         p = string;
 833                         len = stringlen;
 834                         while (len > 0)
 835                         {
 836                                 stringchars[stringnchars] = p;
 837                                 stringmblen[stringnchars] = mblen = pg_mblen(p);
 838                                 stringnchars++;
 839                                 p += mblen;
 840                                 len -= mblen;
 841                         }
 842
 843                         setchars = (const char **) palloc(setlen * sizeof(char *));
 844                         setmblen = (int *) palloc(setlen * sizeof(int));
 845                         setnchars = 0;
 846                         p = set;
 847                         len = setlen;
 848                         while (len > 0)
 849                         {
 850                                 setchars[setnchars] = p;
 851                                 setmblen[setnchars] = mblen = pg_mblen(p);
 852                                 setnchars++;
 853                                 p += mblen;
 854                                 len -= mblen;
 855                         }
 856
 857                         resultndx = 0;          /* index in stringchars[] */
 858                         resultnchars = stringnchars;
 859
 860                         if (doltrim)
 861                         {
 862                                 while (resultnchars > 0)
 863                                 {
 864                                         str_pos = stringchars[resultndx];
 865                                         str_len = stringmblen[resultndx];
 866                                         for (i = 0; i < setnchars; i++)
 867                                         {
 868                                                 if (str_len == setmblen[i] &&
 869                                                         memcmp(str_pos, setchars[i], str_len) == 0)
 870                                                         break;
 871                                         }
 872                                         if (i >= setnchars)
 873                                                 break;  /* no match here */
 874                                         string += str_len;
 875                                         stringlen -= str_len;
 876                                         resultndx++;
 877                                         resultnchars--;
 878                                 }
 879                         }
 880
 881                         if (dortrim)
 882                         {
 883                                 while (resultnchars > 0)
 884                                 {
 885                                         str_pos = stringchars[resultndx + resultnchars - 1];
 886                                         str_len = stringmblen[resultndx + resultnchars - 1];
 887                                         for (i = 0; i < setnchars; i++)
 888                                         {
 889                                                 if (str_len == setmblen[i] &&
 890                                                         memcmp(str_pos, setchars[i], str_len) == 0)
 891                                                         break;
 892                                         }
 893                                         if (i >= setnchars)
 894                                                 break;  /* no match here */
 895                                         stringlen -= str_len;
 896                                         resultnchars--;
 897                                 }
 898                         }
 899
 900                         pfree(stringchars);
 901                         pfree(stringmblen);
 902                         pfree(setchars);
 903                         pfree(setmblen);
 904                 }
 905                 else
 906                 {
 907                         /*
 908                          * In the single-byte-encoding case, we don't need such overhead.
 909                          */
 910                         if (doltrim)
 911                         {
 912                                 while (stringlen > 0)
 913                                 {
 914                                         char            str_ch = *string;
 915
 916                                         for (i = 0; i < setlen; i++)
 917                                         {
 918                                                 if (str_ch == set[i])
 919                                                         break;
 920                                         }
 921                                         if (i >= setlen)
 922                                                 break;  /* no match here */
 923                                         string++;
 924                                         stringlen--;
 925                                 }
 926                         }
 927
 928                         if (dortrim)
 929                         {
 930                                 while (stringlen > 0)
 931                                 {
 932                                         char            str_ch = string[stringlen - 1];
 933
 934                                         for (i = 0; i < setlen; i++)
 935                                         {
 936                                                 if (str_ch == set[i])
 937                                                         break;
 938                                         }
 939                                         if (i >= setlen)
 940                                                 break;  /* no match here */
 941                                         stringlen--;
 942                                 }
 943                         }
 944                 }
 945         }
 946
 947         /* Return selected portion of string */
 948         result = (text *) palloc(VARHDRSZ + stringlen);
 949         SET_VARSIZE(result, VARHDRSZ + stringlen);
 950         memcpy(VARDATA(result), string, stringlen);
 951
 952         return result;
 953 }
 954
 955 /********************************************************************
 956  *
 957  * byteatrim
 958  *
 959  * Syntax:
 960  *
 961  *       bytea byteatrim(bytea string, bytea set)
 962  *
 963  * Purpose:
 964  *
 965  *       Returns string with characters removed from the front and back
 966  *       up to the first character not in set.
 967  *
 968  * Cloned from btrim and modified as required.
 969  ********************************************************************/
 970
 971 Datum
 972 byteatrim(PG_FUNCTION_ARGS)
 973 {
 974         bytea      *string = PG_GETARG_BYTEA_PP(0);
 975         bytea      *set = PG_GETARG_BYTEA_PP(1);
 976         bytea      *ret;
 977         char       *ptr,
 978                            *end,
 979                            *ptr2,
 980                            *ptr2start,
 981                            *end2;
 982         int                     m,
 983                                 stringlen,
 984                                 setlen;
 985
 986         stringlen = VARSIZE_ANY_EXHDR(string);
 987         setlen = VARSIZE_ANY_EXHDR(set);
 988
 989         if (stringlen <= 0 || setlen <= 0)
 990                 PG_RETURN_BYTEA_P(string);
 991
 992         m = stringlen;
 993         ptr = VARDATA_ANY(string);
 994         end = ptr + stringlen - 1;
 995         ptr2start = VARDATA_ANY(set);
 996         end2 = ptr2start + setlen - 1;
 997
 998         while (m > 0)
 999         {
1000                 ptr2 = ptr2start;
1001                 while (ptr2 <= end2)
1002                 {
1003                         if (*ptr == *ptr2)
1004                                 break;
1005                         ++ptr2;
1006                 }
1007                 if (ptr2 > end2)
1008                         break;
1009                 ptr++;
1010                 m--;
1011         }
1012
1013         while (m > 0)
1014         {
1015                 ptr2 = ptr2start;
1016                 while (ptr2 <= end2)
1017                 {
1018                         if (*end == *ptr2)
1019                                 break;
1020                         ++ptr2;
1021                 }
1022                 if (ptr2 > end2)
1023                         break;
1024                 end--;
1025                 m--;
1026         }
1027
1028         ret = (bytea *) palloc(VARHDRSZ + m);
1029         SET_VARSIZE(ret, VARHDRSZ + m);
1030         memcpy(VARDATA(ret), ptr, m);
1031
1032         PG_RETURN_BYTEA_P(ret);
1033 }
1034
1035 /********************************************************************
1036  *
1037  * ltrim
1038  *
1039  * Syntax:
1040  *
1041  *       text ltrim(text string, text set)
1042  *
1043  * Purpose:
1044  *
1045  *       Returns string with initial characters removed up to the first
1046  *       character not in set.
1047  *
1048  ********************************************************************/
1049
1050 Datum
1051 ltrim(PG_FUNCTION_ARGS)
1052 {
1053         text       *string = PG_GETARG_TEXT_PP(0);
1054         text       *set = PG_GETARG_TEXT_PP(1);
1055         text       *ret;
1056
1057         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1058                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1059                                  true, false);
1060
1061         PG_RETURN_TEXT_P(ret);
1062 }
1063
1064 /********************************************************************
1065  *
1066  * ltrim1 --- ltrim with set fixed as ' '
1067  *
1068  ********************************************************************/
1069
1070 Datum
1071 ltrim1(PG_FUNCTION_ARGS)
1072 {
1073         text       *string = PG_GETARG_TEXT_PP(0);
1074         text       *ret;
1075
1076         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1077                                  " ", 1,
1078                                  true, false);
1079
1080         PG_RETURN_TEXT_P(ret);
1081 }
1082
1083 /********************************************************************
1084  *
1085  * rtrim
1086  *
1087  * Syntax:
1088  *
1089  *       text rtrim(text string, text set)
1090  *
1091  * Purpose:
1092  *
1093  *       Returns string with final characters removed after the last
1094  *       character not in set.
1095  *
1096  ********************************************************************/
1097
1098 Datum
1099 rtrim(PG_FUNCTION_ARGS)
1100 {
1101         text       *string = PG_GETARG_TEXT_PP(0);
1102         text       *set = PG_GETARG_TEXT_PP(1);
1103         text       *ret;
1104
1105         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1106                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1107                                  false, true);
1108
1109         PG_RETURN_TEXT_P(ret);
1110 }
1111
1112 /********************************************************************
1113  *
1114  * rtrim1 --- rtrim with set fixed as ' '
1115  *
1116  ********************************************************************/
1117
1118 Datum
1119 rtrim1(PG_FUNCTION_ARGS)
1120 {
1121         text       *string = PG_GETARG_TEXT_PP(0);
1122         text       *ret;
1123
1124         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1125                                  " ", 1,
1126                                  false, true);
1127
1128         PG_RETURN_TEXT_P(ret);
1129 }
1130
1131
1132 /********************************************************************
1133  *
1134  * translate
1135  *
1136  * Syntax:
1137  *
1138  *       text translate(text string, text from, text to)
1139  *
1140  * Purpose:
1141  *
1142  *       Returns string after replacing all occurrences of characters in from
1143  *       with the corresponding character in to.  If from is longer than to,
1144  *       occurrences of the extra characters in from are deleted.
1145  *       Improved by Edwin Ramirez <ramirez@doc.mssm.edu>.
1146  *
1147  ********************************************************************/
1148
1149 Datum
1150 translate(PG_FUNCTION_ARGS)
1151 {
1152         text       *string = PG_GETARG_TEXT_PP(0);
1153         text       *from = PG_GETARG_TEXT_PP(1);
1154         text       *to = PG_GETARG_TEXT_PP(2);
1155         text       *result;
1156         char       *from_ptr,
1157                            *to_ptr;
1158         char       *source,
1159                            *target;
1160         int                     m,
1161                                 fromlen,
1162                                 tolen,
1163                                 retlen,
1164                                 i;
1165
1166         int                     str_len;
1167         int                     estimate_len;
1168         int                     len;
1169         int                     source_len;
1170         int                     from_index;
1171
1172         m = VARSIZE_ANY_EXHDR(string);
1173
1174         if (m <= 0)
1175                 PG_RETURN_TEXT_P(string);
1176
1177         fromlen = VARSIZE_ANY_EXHDR(from);
1178         from_ptr = VARDATA_ANY(from);
1179         tolen = VARSIZE_ANY_EXHDR(to);
1180         to_ptr = VARDATA_ANY(to);
1181
1182         str_len = VARSIZE_ANY_EXHDR(string);
1183         source = VARDATA_ANY(string);
1184
1185         estimate_len = (tolen * 1.0 / fromlen + 0.5) * str_len;
1186         estimate_len = estimate_len > str_len ? estimate_len : str_len;
1187
1188         result = (text *) palloc(estimate_len + VARHDRSZ);
1189         target = VARDATA(result);
1190         retlen = 0;
1191
1192         while (m > 0)
1193         {
1194                 source_len = pg_mblen(source);
1195                 from_index = 0;
1196
1197                 for (i = 0; i < fromlen; i += len)
1198                 {
1199                         len = pg_mblen(&from_ptr[i]);
1200                         if (len == source_len &&
1201                                 memcmp(source, &from_ptr[i], len) == 0)
1202                                 break;
1203
1204                         from_index++;
1205                 }
1206                 if (i < fromlen)
1207                 {
1208                         /* substitute */
1209                         char       *p = to_ptr;
1210
1211                         for (i = 0; i < from_index; i++)
1212                         {
1213                                 p += pg_mblen(p);
1214                                 if (p >= (to_ptr + tolen))
1215                                         break;
1216                         }
1217                         if (p < (to_ptr + tolen))
1218                         {
1219                                 len = pg_mblen(p);
1220                                 memcpy(target, p, len);
1221                                 target += len;
1222                                 retlen += len;
1223                         }
1224
1225                 }
1226                 else
1227                 {
1228                         /* no match, so copy */
1229                         memcpy(target, source, source_len);
1230                         target += source_len;
1231                         retlen += source_len;
1232                 }
1233
1234                 source += source_len;
1235                 m -= source_len;
1236         }
1237
1238         SET_VARSIZE(result, retlen + VARHDRSZ);
1239
1240         /*
1241          * There may be some wasted space in the result if deletions occurred, but
1242          * it's not worth reallocating it; the function result probably won't live
1243          * long anyway.
1244          */
1245
1246         PG_RETURN_TEXT_P(result);
1247 }
1248
1249 /********************************************************************
1250  *
1251  * ascii
1252  *
1253  * Syntax:
1254  *
1255  *       int ascii(text string)
1256  *
1257  * Purpose:
1258  *
1259  *       Returns the decimal representation of the first character from
1260  *       string.
1261  *   If the string is empty we return 0.
1262  *   If the database encoding is UTF8, we return the Unicode codepoint.
1263  *   If the database encoding is any other multi-byte encoding, we
1264  *   return the value of the first byte if it is an ASCII character
1265  *   (range 1 .. 127), or raise an error.
1266  *   For all other encodings we return the value of the first byte,
1267  *   (range 1..255).
1268  *
1269  ********************************************************************/
1270
1271 Datum
1272 ascii(PG_FUNCTION_ARGS)
1273 {
1274         text       *string = PG_GETARG_TEXT_PP(0);
1275         int encoding = GetDatabaseEncoding();
1276         unsigned char *data;
1277
1278         if (VARSIZE_ANY_EXHDR(string) <= 0)
1279                 PG_RETURN_INT32(0);
1280
1281         data = (unsigned char *) VARDATA_ANY(string);
1282
1283         if (encoding == PG_UTF8 && *data > 127)
1284         {
1285                 /* return the code point for Unicode */
1286
1287                 int result = 0, tbytes = 0, i;
1288
1289                 if (*data >= 0xF0)
1290                 {
1291                         result = *data & 0x07;
1292                         tbytes = 3;
1293                 }
1294                 else if (*data >= 0xE0)
1295                 {
1296                         result = *data & 0x0F;
1297                         tbytes = 2;
1298                 }
1299                 else
1300                 {
1301                         Assert (*data > 0xC0);
1302                         result = *data & 0x1f;
1303                         tbytes = 1;
1304                 }
1305
1306                 Assert (tbytes > 0);
1307
1308                 for (i = 1; i <= tbytes; i++)
1309                 {
1310                         Assert ((data[i] & 0xC0) == 0x80);
1311                         result = (result << 6) + (data[i] & 0x3f);
1312                 }
1313
1314                 PG_RETURN_INT32(result);
1315         }
1316         else
1317         {
1318                 if (pg_encoding_max_length(encoding) > 1 && *data > 127)
1319                         ereport(ERROR,
1320                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1321                                          errmsg("requested character too large")));
1322
1323
1324                 PG_RETURN_INT32((int32) *data);
1325         }
1326 }
1327
1328 /********************************************************************
1329  *
1330  * chr
1331  *
1332  * Syntax:
1333  *
1334  *       text chr(int val)
1335  *
1336  * Purpose:
1337  *
1338  *      Returns the character having the binary equivalent to val.
1339  *
1340  * For UTF8 we treat the argument as a Unicode code point.
1341  * For other multi-byte encodings we raise an error for arguments
1342  * outside the strict ASCII range (1..127).
1343  *
1344  * It's important that we don't ever return a value that is not valid
1345  * in the database encoding, so that this doesn't become a way for
1346  * invalid data to enter the database.
1347  *
1348  ********************************************************************/
1349
1350 Datum
1351 chr(PG_FUNCTION_ARGS)
1352 {
1353         uint32          cvalue = PG_GETARG_UINT32(0);
1354         text       *result;
1355         int encoding = GetDatabaseEncoding();
1356
1357         if (encoding == PG_UTF8 && cvalue > 127)
1358         {
1359                 /* for Unicode we treat the argument as a code point */
1360                 int bytes ;
1361                 char *wch;
1362
1363                 /* We only allow valid Unicode code points */
1364                 if (cvalue > 0x001fffff)
1365                         ereport(ERROR,
1366                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1367                                          errmsg("requested character too large for encoding: %d",
1368                                                         cvalue)));
1369
1370                 if (cvalue > 0xffff)
1371                         bytes = 4;
1372                 else if (cvalue > 0x07ff)
1373                         bytes = 3;
1374                 else
1375                         bytes = 2;
1376
1377                 result = (text *) palloc(VARHDRSZ + bytes);
1378                 SET_VARSIZE(result, VARHDRSZ + bytes);
1379                 wch = VARDATA(result);
1380
1381                 if (bytes == 2)
1382                 {
1383                         wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
1384                         wch[1] = 0x80 | (cvalue & 0x3F);;
1385                 }
1386                 else if (bytes == 3)
1387                 {
1388                         wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
1389                         wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
1390                         wch[2] = 0x80 | (cvalue & 0x3F);
1391                 }
1392                 else
1393                 {
1394                         wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
1395                         wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
1396                         wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
1397                         wch[3] = 0x80 | (cvalue & 0x3F);
1398                 }
1399
1400         }
1401
1402         else
1403         {
1404                 bool is_mb;
1405
1406                 /* Error out on arguments that make no sense or that we
1407                  * can't validly represent in the encoding.
1408                  */
1409
1410                 if (cvalue == 0)
1411                         ereport(ERROR,
1412                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1413                                          errmsg("null character not permitted")));
1414
1415                 is_mb = pg_encoding_max_length(encoding) > 1;
1416
1417                 if ((is_mb && (cvalue > 255)) || (! is_mb && (cvalue > 127)))
1418                         ereport(ERROR,
1419                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1420                                          errmsg("requested character too large for encoding: %d",
1421                                                         cvalue)));
1422
1423
1424                 result = (text *) palloc(VARHDRSZ + 1);
1425                 SET_VARSIZE(result, VARHDRSZ + 1);
1426                 *VARDATA(result) = (char) cvalue;
1427         }
1428
1429         PG_RETURN_TEXT_P(result);
1430 }
1431
1432 /********************************************************************
1433  *
1434  * repeat
1435  *
1436  * Syntax:
1437  *
1438  *       text repeat(text string, int val)
1439  *
1440  * Purpose:
1441  *
1442  *      Repeat string by val.
1443  *
1444  ********************************************************************/
1445
1446 Datum
1447 repeat(PG_FUNCTION_ARGS)
1448 {
1449         text       *string = PG_GETARG_TEXT_PP(0);
1450         int32           count = PG_GETARG_INT32(1);
1451         text       *result;
1452         int                     slen,
1453                                 tlen;
1454         int                     i;
1455         char       *cp,
1456                            *sp;
1457
1458         if (count < 0)
1459                 count = 0;
1460
1461         slen = VARSIZE_ANY_EXHDR(string);
1462         tlen = VARHDRSZ + (count * slen);
1463
1464         /* Check for integer overflow */
1465         if (slen != 0 && count != 0)
1466         {
1467                 int                     check = count * slen;
1468                 int                     check2 = check + VARHDRSZ;
1469
1470                 if ((check / slen) != count || check2 <= check)
1471                         ereport(ERROR,
1472                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1473                                          errmsg("requested length too large")));
1474         }
1475
1476         result = (text *) palloc(tlen);
1477
1478         SET_VARSIZE(result, tlen);
1479         cp = VARDATA(result);
1480         sp = VARDATA_ANY(string);
1481         for (i = 0; i < count; i++)
1482         {
1483                 memcpy(cp, sp, slen);
1484                 cp += slen;
1485         }
1486
1487         PG_RETURN_TEXT_P(result);
1488 }