]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/oracle_compat.c
Make to_char()'s localized month/day names depend on LC_TIME, not LC_MESSAGES.
[postgresql] / src / backend / utils / adt / oracle_compat.c
1 /*-------------------------------------------------------------------------
2  * oracle_compat.c
3  *      Oracle compatible functions.
4  *
5  * Copyright (c) 1996-2008, PostgreSQL Global Development Group
6  *
7  *      Author: Edmund Mergl <E.Mergl@bawue.de>
8  *      Multibyte enhancement: Tatsuo Ishii <ishii@postgresql.org>
9  *
10  *
11  * IDENTIFICATION
12  *      $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.79 2008/05/19 18:08:16 tgl Exp $
13  *
14  *-------------------------------------------------------------------------
15  */
16 #include "postgres.h"
17
18 #include <ctype.h>
19 #include <limits.h>
20 /*
21  * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
22  * declare them in <wchar.h>.
23  */
24 #ifdef HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27 #ifdef HAVE_WCTYPE_H
28 #include <wctype.h>
29 #endif
30
31 #include "utils/builtins.h"
32 #include "utils/pg_locale.h"
33 #include "mb/pg_wchar.h"
34
35
36 /*
37  * If the system provides the needed functions for wide-character manipulation
38  * (which are all standardized by C99), then we implement upper/lower/initcap
39  * using wide-character functions.      Otherwise we use the traditional <ctype.h>
40  * functions, which of course will not work as desired in multibyte character
41  * sets.  Note that in either case we are effectively assuming that the
42  * database character encoding matches the encoding implied by LC_CTYPE.
43  *
44  * We assume if we have these two functions, we have their friends too, and
45  * can use the wide-character method.
46  */
47 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
48 #define USE_WIDE_UPPER_LOWER
49 char       *wstring_lower(char *str);
50 char       *wstring_upper(char *str);
51 wchar_t    *texttowcs(const text *txt);
52 text       *wcstotext(const wchar_t *str, int ncodes);
53 #endif
54
55 static text *dotrim(const char *string, int stringlen,
56            const char *set, int setlen,
57            bool doltrim, bool dortrim);
58
59
60 #ifdef USE_WIDE_UPPER_LOWER
61
62 /*
63  * Convert a TEXT value into a palloc'd wchar string.
64  */
65 wchar_t *
66 texttowcs(const text *txt)
67 {
68         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
69         char       *workstr;
70         wchar_t    *result;
71         size_t          ncodes;
72
73         /* Overflow paranoia */
74         if (nbytes < 0 ||
75                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
76                 ereport(ERROR,
77                                 (errcode(ERRCODE_OUT_OF_MEMORY),
78                                  errmsg("out of memory")));
79
80         /* Need a null-terminated version of the input */
81         workstr = text_to_cstring(txt);
82
83         /* Output workspace cannot have more codes than input bytes */
84         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
85
86         /* Do the conversion */
87         ncodes = mbstowcs(result, workstr, nbytes + 1);
88
89         if (ncodes == (size_t) -1)
90         {
91                 /*
92                  * Invalid multibyte character encountered.  We try to give a useful
93                  * error message by letting pg_verifymbstr check the string.  But it's
94                  * possible that the string is OK to us, and not OK to mbstowcs ---
95                  * this suggests that the LC_CTYPE locale is different from the
96                  * database encoding.  Give a generic error message if verifymbstr
97                  * can't find anything wrong.
98                  */
99                 pg_verifymbstr(workstr, nbytes, false);
100                 ereport(ERROR,
101                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
102                                  errmsg("invalid multibyte character for locale"),
103                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
104         }
105
106         Assert(ncodes <= (size_t) nbytes);
107
108         return result;
109 }
110
111
112 /*
113  * Convert a wchar string into a palloc'd TEXT value.  The wchar string
114  * must be zero-terminated, but we also require the caller to pass the string
115  * length, since it will know it anyway in current uses.
116  */
117 text *
118 wcstotext(const wchar_t *str, int ncodes)
119 {
120         text       *result;
121         size_t          nbytes;
122
123         /* Overflow paranoia */
124         if (ncodes < 0 ||
125                 ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
126                 ereport(ERROR,
127                                 (errcode(ERRCODE_OUT_OF_MEMORY),
128                                  errmsg("out of memory")));
129
130         /* Make workspace certainly large enough for result */
131         result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
132
133         /* Do the conversion */
134         nbytes = wcstombs((char *) VARDATA(result), str,
135                                           (ncodes + 1) * MB_CUR_MAX);
136
137         if (nbytes == (size_t) -1)
138         {
139                 /* Invalid multibyte character encountered ... shouldn't happen */
140                 ereport(ERROR,
141                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
142                                  errmsg("invalid multibyte character for locale")));
143         }
144
145         Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
146
147         SET_VARSIZE(result, nbytes + VARHDRSZ);
148
149         return result;
150 }
151 #endif   /* USE_WIDE_UPPER_LOWER */
152
153
154 /*
155  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding.
156  * To make use of the upper/lower functionality, we need to map UTF8 to
157  * UTF16, which for some reason mbstowcs and wcstombs won't do for us.
158  * This conversion layer takes care of it.
159  */
160
161 #ifdef WIN32
162
163 /* texttowcs for the case of UTF8 to UTF16 */
164 static wchar_t *
165 win32_utf8_texttowcs(const text *txt)
166 {
167         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
168         wchar_t    *result;
169         int                     r;
170
171         /* Overflow paranoia */
172         if (nbytes < 0 ||
173                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
174                 ereport(ERROR,
175                                 (errcode(ERRCODE_OUT_OF_MEMORY),
176                                  errmsg("out of memory")));
177
178         /* Output workspace cannot have more codes than input bytes */
179         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
180
181         /* stupid Microsloth API does not work for zero-length input */
182         if (nbytes == 0)
183                 r = 0;
184         else
185         {
186                 /* Do the conversion */
187                 r = MultiByteToWideChar(CP_UTF8, 0, VARDATA_ANY(txt), nbytes,
188                                                                 result, nbytes);
189
190                 if (r <= 0)                             /* assume it's NO_UNICODE_TRANSLATION */
191                 {
192                         /* see notes above about error reporting */
193                         pg_verifymbstr(VARDATA_ANY(txt), nbytes, false);
194                         ereport(ERROR,
195                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
196                                          errmsg("invalid multibyte character for locale"),
197                                          errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
198                 }
199         }
200
201         /* Append trailing null wchar (MultiByteToWideChar won't have) */
202         Assert(r <= nbytes);
203         result[r] = 0;
204
205         return result;
206 }
207
208 /* wcstotext for the case of UTF16 to UTF8 */
209 static text *
210 win32_utf8_wcstotext(const wchar_t *str)
211 {
212         text       *result;
213         int                     nbytes;
214         int                     r;
215
216         /* Compute size of output string (this *will* include trailing null) */
217         nbytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
218         if (nbytes <= 0)                        /* shouldn't happen */
219                 ereport(ERROR,
220                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
221                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
222                                                 GetLastError())));
223
224         result = palloc(nbytes + VARHDRSZ);
225
226         r = WideCharToMultiByte(CP_UTF8, 0, str, -1, VARDATA(result), nbytes,
227                                                         NULL, NULL);
228         if (r != nbytes)                        /* shouldn't happen */
229                 ereport(ERROR,
230                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
231                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
232                                                 GetLastError())));
233
234         SET_VARSIZE(result, nbytes + VARHDRSZ - 1); /* -1 to ignore null */
235
236         return result;
237 }
238
239 /* interface layer to check which encoding is in use */
240
241 static wchar_t *
242 win32_texttowcs(const text *txt)
243 {
244         if (GetDatabaseEncoding() == PG_UTF8)
245                 return win32_utf8_texttowcs(txt);
246         else
247                 return texttowcs(txt);
248 }
249
250 static text *
251 win32_wcstotext(const wchar_t *str, int ncodes)
252 {
253         if (GetDatabaseEncoding() == PG_UTF8)
254                 return win32_utf8_wcstotext(str);
255         else
256                 return wcstotext(str, ncodes);
257 }
258
/*
 * From this point on, redirect texttowcs/wcstotext calls in the routines
 * below to the Windows interface layer.
 */

#define wcstotext	win32_wcstotext
#define texttowcs	win32_texttowcs
263 #endif   /* WIN32 */
264
265 #ifdef USE_WIDE_UPPER_LOWER
266 /*
267  * string_upper and string_lower are used for correct multibyte upper/lower
268  * transformations localized strings. Returns pointers to transformated
269  * string.
270  */
271 char *
272 wstring_upper(char *str)
273 {
274         wchar_t    *workspace;
275         text       *in_text;
276         text       *out_text;
277         char       *result;
278         int                     i;
279
280         in_text = cstring_to_text(str);
281         workspace = texttowcs(in_text);
282
283         for (i = 0; workspace[i] != 0; i++)
284                 workspace[i] = towupper(workspace[i]);
285
286         out_text = wcstotext(workspace, i);
287         result = text_to_cstring(out_text);
288
289         pfree(workspace);
290         pfree(in_text);
291         pfree(out_text);
292
293         return result;
294 }
295
296 char *
297 wstring_lower(char *str)
298 {
299         wchar_t    *workspace;
300         text       *in_text;
301         text       *out_text;
302         char       *result;
303         int                     i;
304
305         in_text = cstring_to_text(str);
306         workspace = texttowcs(in_text);
307
308         for (i = 0; workspace[i] != 0; i++)
309                 workspace[i] = towlower(workspace[i]);
310
311         out_text = wcstotext(workspace, i);
312         result = text_to_cstring(out_text);
313
314         pfree(workspace);
315         pfree(in_text);
316         pfree(out_text);
317
318         return result;
319 }
320 #endif   /* USE_WIDE_UPPER_LOWER */
321
322 /********************************************************************
323  *
324  * lower
325  *
326  * Syntax:
327  *
328  *       text lower(text string)
329  *
330  * Purpose:
331  *
332  *       Returns string, with all letters forced to lowercase.
333  *
334  ********************************************************************/
335
336 Datum
337 lower(PG_FUNCTION_ARGS)
338 {
339 #ifdef USE_WIDE_UPPER_LOWER
340
341         /*
342          * Use wide char code only when max encoding length > 1 and ctype != C.
343          * Some operating systems fail with multi-byte encodings and a C locale.
344          * Also, for a C locale there is no need to process as multibyte.
345          */
346         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
347         {
348                 text       *string = PG_GETARG_TEXT_PP(0);
349                 text       *result;
350                 wchar_t    *workspace;
351                 int                     i;
352
353                 workspace = texttowcs(string);
354
355                 for (i = 0; workspace[i] != 0; i++)
356                         workspace[i] = towlower(workspace[i]);
357
358                 result = wcstotext(workspace, i);
359
360                 pfree(workspace);
361
362                 PG_RETURN_TEXT_P(result);
363         }
364         else
365 #endif   /* USE_WIDE_UPPER_LOWER */
366         {
367                 text       *string = PG_GETARG_TEXT_P_COPY(0);
368                 char       *ptr;
369                 int                     m;
370
371                 /*
372                  * Since we copied the string, we can scribble directly on the value
373                  */
374                 ptr = VARDATA(string);
375                 m = VARSIZE(string) - VARHDRSZ;
376
377                 while (m-- > 0)
378                 {
379                         *ptr = tolower((unsigned char) *ptr);
380                         ptr++;
381                 }
382
383                 PG_RETURN_TEXT_P(string);
384         }
385 }
386
387
388 /********************************************************************
389  *
390  * upper
391  *
392  * Syntax:
393  *
394  *       text upper(text string)
395  *
396  * Purpose:
397  *
398  *       Returns string, with all letters forced to uppercase.
399  *
400  ********************************************************************/
401
402 Datum
403 upper(PG_FUNCTION_ARGS)
404 {
405 #ifdef USE_WIDE_UPPER_LOWER
406
407         /*
408          * Use wide char code only when max encoding length > 1 and ctype != C.
409          * Some operating systems fail with multi-byte encodings and a C locale.
410          * Also, for a C locale there is no need to process as multibyte.
411          */
412         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
413         {
414                 text       *string = PG_GETARG_TEXT_PP(0);
415                 text       *result;
416                 wchar_t    *workspace;
417                 int                     i;
418
419                 workspace = texttowcs(string);
420
421                 for (i = 0; workspace[i] != 0; i++)
422                         workspace[i] = towupper(workspace[i]);
423
424                 result = wcstotext(workspace, i);
425
426                 pfree(workspace);
427
428                 PG_RETURN_TEXT_P(result);
429         }
430         else
431 #endif   /* USE_WIDE_UPPER_LOWER */
432         {
433                 text       *string = PG_GETARG_TEXT_P_COPY(0);
434                 char       *ptr;
435                 int                     m;
436
437                 /*
438                  * Since we copied the string, we can scribble directly on the value
439                  */
440                 ptr = VARDATA(string);
441                 m = VARSIZE(string) - VARHDRSZ;
442
443                 while (m-- > 0)
444                 {
445                         *ptr = toupper((unsigned char) *ptr);
446                         ptr++;
447                 }
448
449                 PG_RETURN_TEXT_P(string);
450         }
451 }
452
453
454 /********************************************************************
455  *
456  * initcap
457  *
458  * Syntax:
459  *
460  *       text initcap(text string)
461  *
462  * Purpose:
463  *
464  *       Returns string, with first letter of each word in uppercase, all
465  *       other letters in lowercase. A word is defined as a sequence of
466  *       alphanumeric characters, delimited by non-alphanumeric
467  *       characters.
468  *
469  ********************************************************************/
470
471 Datum
472 initcap(PG_FUNCTION_ARGS)
473 {
474 #ifdef USE_WIDE_UPPER_LOWER
475
476         /*
477          * Use wide char code only when max encoding length > 1 and ctype != C.
478          * Some operating systems fail with multi-byte encodings and a C locale.
479          * Also, for a C locale there is no need to process as multibyte.
480          */
481         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
482         {
483                 text       *string = PG_GETARG_TEXT_PP(0);
484                 text       *result;
485                 wchar_t    *workspace;
486                 int                     wasalnum = 0;
487                 int                     i;
488
489                 workspace = texttowcs(string);
490
491                 for (i = 0; workspace[i] != 0; i++)
492                 {
493                         if (wasalnum)
494                                 workspace[i] = towlower(workspace[i]);
495                         else
496                                 workspace[i] = towupper(workspace[i]);
497                         wasalnum = iswalnum(workspace[i]);
498                 }
499
500                 result = wcstotext(workspace, i);
501
502                 pfree(workspace);
503
504                 PG_RETURN_TEXT_P(result);
505         }
506         else
507 #endif   /* USE_WIDE_UPPER_LOWER */
508         {
509                 text       *string = PG_GETARG_TEXT_P_COPY(0);
510                 int                     wasalnum = 0;
511                 char       *ptr;
512                 int                     m;
513
514                 /*
515                  * Since we copied the string, we can scribble directly on the value
516                  */
517                 ptr = VARDATA(string);
518                 m = VARSIZE(string) - VARHDRSZ;
519
520                 while (m-- > 0)
521                 {
522                         if (wasalnum)
523                                 *ptr = tolower((unsigned char) *ptr);
524                         else
525                                 *ptr = toupper((unsigned char) *ptr);
526                         wasalnum = isalnum((unsigned char) *ptr);
527                         ptr++;
528                 }
529
530                 PG_RETURN_TEXT_P(string);
531         }
532 }
533
534
535 /********************************************************************
536  *
537  * lpad
538  *
539  * Syntax:
540  *
541  *       text lpad(text string1, int4 len, text string2)
542  *
543  * Purpose:
544  *
545  *       Returns string1, left-padded to length len with the sequence of
546  *       characters in string2.  If len is less than the length of string1,
547  *       instead truncate (on the right) to len.
548  *
549  ********************************************************************/
550
551 Datum
552 lpad(PG_FUNCTION_ARGS)
553 {
554         text       *string1 = PG_GETARG_TEXT_PP(0);
555         int32           len = PG_GETARG_INT32(1);
556         text       *string2 = PG_GETARG_TEXT_PP(2);
557         text       *ret;
558         char       *ptr1,
559                            *ptr2,
560                            *ptr2start,
561                            *ptr2end,
562                            *ptr_ret;
563         int                     m,
564                                 s1len,
565                                 s2len;
566
567         int                     bytelen;
568
569         /* Negative len is silently taken as zero */
570         if (len < 0)
571                 len = 0;
572
573         s1len = VARSIZE_ANY_EXHDR(string1);
574         if (s1len < 0)
575                 s1len = 0;                              /* shouldn't happen */
576
577         s2len = VARSIZE_ANY_EXHDR(string2);
578         if (s2len < 0)
579                 s2len = 0;                              /* shouldn't happen */
580
581         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
582
583         if (s1len > len)
584                 s1len = len;                    /* truncate string1 to len chars */
585
586         if (s2len <= 0)
587                 len = s1len;                    /* nothing to pad with, so don't pad */
588
589         bytelen = pg_database_encoding_max_length() * len;
590
591         /* check for integer overflow */
592         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
593                 ereport(ERROR,
594                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
595                                  errmsg("requested length too large")));
596
597         ret = (text *) palloc(VARHDRSZ + bytelen);
598
599         m = len - s1len;
600
601         ptr2 = ptr2start = VARDATA_ANY(string2);
602         ptr2end = ptr2 + s2len;
603         ptr_ret = VARDATA(ret);
604
605         while (m--)
606         {
607                 int                     mlen = pg_mblen(ptr2);
608
609                 memcpy(ptr_ret, ptr2, mlen);
610                 ptr_ret += mlen;
611                 ptr2 += mlen;
612                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
613                         ptr2 = ptr2start;
614         }
615
616         ptr1 = VARDATA_ANY(string1);
617
618         while (s1len--)
619         {
620                 int                     mlen = pg_mblen(ptr1);
621
622                 memcpy(ptr_ret, ptr1, mlen);
623                 ptr_ret += mlen;
624                 ptr1 += mlen;
625         }
626
627         SET_VARSIZE(ret, ptr_ret - (char *) ret);
628
629         PG_RETURN_TEXT_P(ret);
630 }
631
632
633 /********************************************************************
634  *
635  * rpad
636  *
637  * Syntax:
638  *
639  *       text rpad(text string1, int4 len, text string2)
640  *
641  * Purpose:
642  *
643  *       Returns string1, right-padded to length len with the sequence of
644  *       characters in string2.  If len is less than the length of string1,
645  *       instead truncate (on the right) to len.
646  *
647  ********************************************************************/
648
649 Datum
650 rpad(PG_FUNCTION_ARGS)
651 {
652         text       *string1 = PG_GETARG_TEXT_PP(0);
653         int32           len = PG_GETARG_INT32(1);
654         text       *string2 = PG_GETARG_TEXT_PP(2);
655         text       *ret;
656         char       *ptr1,
657                            *ptr2,
658                            *ptr2start,
659                            *ptr2end,
660                            *ptr_ret;
661         int                     m,
662                                 s1len,
663                                 s2len;
664
665         int                     bytelen;
666
667         /* Negative len is silently taken as zero */
668         if (len < 0)
669                 len = 0;
670
671         s1len = VARSIZE_ANY_EXHDR(string1);
672         if (s1len < 0)
673                 s1len = 0;                              /* shouldn't happen */
674
675         s2len = VARSIZE_ANY_EXHDR(string2);
676         if (s2len < 0)
677                 s2len = 0;                              /* shouldn't happen */
678
679         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
680
681         if (s1len > len)
682                 s1len = len;                    /* truncate string1 to len chars */
683
684         if (s2len <= 0)
685                 len = s1len;                    /* nothing to pad with, so don't pad */
686
687         bytelen = pg_database_encoding_max_length() * len;
688
689         /* Check for integer overflow */
690         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
691                 ereport(ERROR,
692                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
693                                  errmsg("requested length too large")));
694
695         ret = (text *) palloc(VARHDRSZ + bytelen);
696         m = len - s1len;
697
698         ptr1 = VARDATA_ANY(string1);
699         ptr_ret = VARDATA(ret);
700
701         while (s1len--)
702         {
703                 int                     mlen = pg_mblen(ptr1);
704
705                 memcpy(ptr_ret, ptr1, mlen);
706                 ptr_ret += mlen;
707                 ptr1 += mlen;
708         }
709
710         ptr2 = ptr2start = VARDATA_ANY(string2);
711         ptr2end = ptr2 + s2len;
712
713         while (m--)
714         {
715                 int                     mlen = pg_mblen(ptr2);
716
717                 memcpy(ptr_ret, ptr2, mlen);
718                 ptr_ret += mlen;
719                 ptr2 += mlen;
720                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
721                         ptr2 = ptr2start;
722         }
723
724         SET_VARSIZE(ret, ptr_ret - (char *) ret);
725
726         PG_RETURN_TEXT_P(ret);
727 }
728
729
730 /********************************************************************
731  *
732  * btrim
733  *
734  * Syntax:
735  *
736  *       text btrim(text string, text set)
737  *
738  * Purpose:
739  *
740  *       Returns string with characters removed from the front and back
741  *       up to the first character not in set.
742  *
743  ********************************************************************/
744
745 Datum
746 btrim(PG_FUNCTION_ARGS)
747 {
748         text       *string = PG_GETARG_TEXT_PP(0);
749         text       *set = PG_GETARG_TEXT_PP(1);
750         text       *ret;
751
752         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
753                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
754                                  true, true);
755
756         PG_RETURN_TEXT_P(ret);
757 }
758
759 /********************************************************************
760  *
761  * btrim1 --- btrim with set fixed as ' '
762  *
763  ********************************************************************/
764
765 Datum
766 btrim1(PG_FUNCTION_ARGS)
767 {
768         text       *string = PG_GETARG_TEXT_PP(0);
769         text       *ret;
770
771         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
772                                  " ", 1,
773                                  true, true);
774
775         PG_RETURN_TEXT_P(ret);
776 }
777
778 /*
779  * Common implementation for btrim, ltrim, rtrim
780  */
781 static text *
782 dotrim(const char *string, int stringlen,
783            const char *set, int setlen,
784            bool doltrim, bool dortrim)
785 {
786         int                     i;
787
788         /* Nothing to do if either string or set is empty */
789         if (stringlen > 0 && setlen > 0)
790         {
791                 if (pg_database_encoding_max_length() > 1)
792                 {
793                         /*
794                          * In the multibyte-encoding case, build arrays of pointers to
795                          * character starts, so that we can avoid inefficient checks in
796                          * the inner loops.
797                          */
798                         const char **stringchars;
799                         const char **setchars;
800                         int                *stringmblen;
801                         int                *setmblen;
802                         int                     stringnchars;
803                         int                     setnchars;
804                         int                     resultndx;
805                         int                     resultnchars;
806                         const char *p;
807                         int                     len;
808                         int                     mblen;
809                         const char *str_pos;
810                         int                     str_len;
811
812                         stringchars = (const char **) palloc(stringlen * sizeof(char *));
813                         stringmblen = (int *) palloc(stringlen * sizeof(int));
814                         stringnchars = 0;
815                         p = string;
816                         len = stringlen;
817                         while (len > 0)
818                         {
819                                 stringchars[stringnchars] = p;
820                                 stringmblen[stringnchars] = mblen = pg_mblen(p);
821                                 stringnchars++;
822                                 p += mblen;
823                                 len -= mblen;
824                         }
825
826                         setchars = (const char **) palloc(setlen * sizeof(char *));
827                         setmblen = (int *) palloc(setlen * sizeof(int));
828                         setnchars = 0;
829                         p = set;
830                         len = setlen;
831                         while (len > 0)
832                         {
833                                 setchars[setnchars] = p;
834                                 setmblen[setnchars] = mblen = pg_mblen(p);
835                                 setnchars++;
836                                 p += mblen;
837                                 len -= mblen;
838                         }
839
840                         resultndx = 0;          /* index in stringchars[] */
841                         resultnchars = stringnchars;
842
843                         if (doltrim)
844                         {
845                                 while (resultnchars > 0)
846                                 {
847                                         str_pos = stringchars[resultndx];
848                                         str_len = stringmblen[resultndx];
849                                         for (i = 0; i < setnchars; i++)
850                                         {
851                                                 if (str_len == setmblen[i] &&
852                                                         memcmp(str_pos, setchars[i], str_len) == 0)
853                                                         break;
854                                         }
855                                         if (i >= setnchars)
856                                                 break;  /* no match here */
857                                         string += str_len;
858                                         stringlen -= str_len;
859                                         resultndx++;
860                                         resultnchars--;
861                                 }
862                         }
863
864                         if (dortrim)
865                         {
866                                 while (resultnchars > 0)
867                                 {
868                                         str_pos = stringchars[resultndx + resultnchars - 1];
869                                         str_len = stringmblen[resultndx + resultnchars - 1];
870                                         for (i = 0; i < setnchars; i++)
871                                         {
872                                                 if (str_len == setmblen[i] &&
873                                                         memcmp(str_pos, setchars[i], str_len) == 0)
874                                                         break;
875                                         }
876                                         if (i >= setnchars)
877                                                 break;  /* no match here */
878                                         stringlen -= str_len;
879                                         resultnchars--;
880                                 }
881                         }
882
883                         pfree(stringchars);
884                         pfree(stringmblen);
885                         pfree(setchars);
886                         pfree(setmblen);
887                 }
888                 else
889                 {
890                         /*
891                          * In the single-byte-encoding case, we don't need such overhead.
892                          */
893                         if (doltrim)
894                         {
895                                 while (stringlen > 0)
896                                 {
897                                         char            str_ch = *string;
898
899                                         for (i = 0; i < setlen; i++)
900                                         {
901                                                 if (str_ch == set[i])
902                                                         break;
903                                         }
904                                         if (i >= setlen)
905                                                 break;  /* no match here */
906                                         string++;
907                                         stringlen--;
908                                 }
909                         }
910
911                         if (dortrim)
912                         {
913                                 while (stringlen > 0)
914                                 {
915                                         char            str_ch = string[stringlen - 1];
916
917                                         for (i = 0; i < setlen; i++)
918                                         {
919                                                 if (str_ch == set[i])
920                                                         break;
921                                         }
922                                         if (i >= setlen)
923                                                 break;  /* no match here */
924                                         stringlen--;
925                                 }
926                         }
927                 }
928         }
929
930         /* Return selected portion of string */
931         return cstring_to_text_with_len(string, stringlen);
932 }
933
934 /********************************************************************
935  *
936  * byteatrim
937  *
938  * Syntax:
939  *
940  *       bytea byteatrim(bytea string, bytea set)
941  *
942  * Purpose:
943  *
944  *       Returns string with characters removed from the front and back
945  *       up to the first character not in set.
946  *
947  * Cloned from btrim and modified as required.
948  ********************************************************************/
949
950 Datum
951 byteatrim(PG_FUNCTION_ARGS)
952 {
953         bytea      *string = PG_GETARG_BYTEA_PP(0);
954         bytea      *set = PG_GETARG_BYTEA_PP(1);
955         bytea      *ret;
956         char       *ptr,
957                            *end,
958                            *ptr2,
959                            *ptr2start,
960                            *end2;
961         int                     m,
962                                 stringlen,
963                                 setlen;
964
965         stringlen = VARSIZE_ANY_EXHDR(string);
966         setlen = VARSIZE_ANY_EXHDR(set);
967
968         if (stringlen <= 0 || setlen <= 0)
969                 PG_RETURN_BYTEA_P(string);
970
971         m = stringlen;
972         ptr = VARDATA_ANY(string);
973         end = ptr + stringlen - 1;
974         ptr2start = VARDATA_ANY(set);
975         end2 = ptr2start + setlen - 1;
976
977         while (m > 0)
978         {
979                 ptr2 = ptr2start;
980                 while (ptr2 <= end2)
981                 {
982                         if (*ptr == *ptr2)
983                                 break;
984                         ++ptr2;
985                 }
986                 if (ptr2 > end2)
987                         break;
988                 ptr++;
989                 m--;
990         }
991
992         while (m > 0)
993         {
994                 ptr2 = ptr2start;
995                 while (ptr2 <= end2)
996                 {
997                         if (*end == *ptr2)
998                                 break;
999                         ++ptr2;
1000                 }
1001                 if (ptr2 > end2)
1002                         break;
1003                 end--;
1004                 m--;
1005         }
1006
1007         ret = (bytea *) palloc(VARHDRSZ + m);
1008         SET_VARSIZE(ret, VARHDRSZ + m);
1009         memcpy(VARDATA(ret), ptr, m);
1010
1011         PG_RETURN_BYTEA_P(ret);
1012 }
1013
1014 /********************************************************************
1015  *
1016  * ltrim
1017  *
1018  * Syntax:
1019  *
1020  *       text ltrim(text string, text set)
1021  *
1022  * Purpose:
1023  *
1024  *       Returns string with initial characters removed up to the first
1025  *       character not in set.
1026  *
1027  ********************************************************************/
1028
1029 Datum
1030 ltrim(PG_FUNCTION_ARGS)
1031 {
1032         text       *string = PG_GETARG_TEXT_PP(0);
1033         text       *set = PG_GETARG_TEXT_PP(1);
1034         text       *ret;
1035
1036         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1037                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1038                                  true, false);
1039
1040         PG_RETURN_TEXT_P(ret);
1041 }
1042
1043 /********************************************************************
1044  *
1045  * ltrim1 --- ltrim with set fixed as ' '
1046  *
1047  ********************************************************************/
1048
1049 Datum
1050 ltrim1(PG_FUNCTION_ARGS)
1051 {
1052         text       *string = PG_GETARG_TEXT_PP(0);
1053         text       *ret;
1054
1055         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1056                                  " ", 1,
1057                                  true, false);
1058
1059         PG_RETURN_TEXT_P(ret);
1060 }
1061
1062 /********************************************************************
1063  *
1064  * rtrim
1065  *
1066  * Syntax:
1067  *
1068  *       text rtrim(text string, text set)
1069  *
1070  * Purpose:
1071  *
1072  *       Returns string with final characters removed after the last
1073  *       character not in set.
1074  *
1075  ********************************************************************/
1076
1077 Datum
1078 rtrim(PG_FUNCTION_ARGS)
1079 {
1080         text       *string = PG_GETARG_TEXT_PP(0);
1081         text       *set = PG_GETARG_TEXT_PP(1);
1082         text       *ret;
1083
1084         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1085                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1086                                  false, true);
1087
1088         PG_RETURN_TEXT_P(ret);
1089 }
1090
1091 /********************************************************************
1092  *
1093  * rtrim1 --- rtrim with set fixed as ' '
1094  *
1095  ********************************************************************/
1096
1097 Datum
1098 rtrim1(PG_FUNCTION_ARGS)
1099 {
1100         text       *string = PG_GETARG_TEXT_PP(0);
1101         text       *ret;
1102
1103         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1104                                  " ", 1,
1105                                  false, true);
1106
1107         PG_RETURN_TEXT_P(ret);
1108 }
1109
1110
1111 /********************************************************************
1112  *
1113  * translate
1114  *
1115  * Syntax:
1116  *
1117  *       text translate(text string, text from, text to)
1118  *
1119  * Purpose:
1120  *
1121  *       Returns string after replacing all occurrences of characters in from
1122  *       with the corresponding character in to.  If from is longer than to,
1123  *       occurrences of the extra characters in from are deleted.
1124  *       Improved by Edwin Ramirez <ramirez@doc.mssm.edu>.
1125  *
1126  ********************************************************************/
1127
1128 Datum
1129 translate(PG_FUNCTION_ARGS)
1130 {
1131         text       *string = PG_GETARG_TEXT_PP(0);
1132         text       *from = PG_GETARG_TEXT_PP(1);
1133         text       *to = PG_GETARG_TEXT_PP(2);
1134         text       *result;
1135         char       *from_ptr,
1136                            *to_ptr;
1137         char       *source,
1138                            *target;
1139         int                     m,
1140                                 fromlen,
1141                                 tolen,
1142                                 retlen,
1143                                 i;
1144         int                     worst_len;
1145         int                     len;
1146         int                     source_len;
1147         int                     from_index;
1148
1149         m = VARSIZE_ANY_EXHDR(string);
1150         if (m <= 0)
1151                 PG_RETURN_TEXT_P(string);
1152         source = VARDATA_ANY(string);
1153
1154         fromlen = VARSIZE_ANY_EXHDR(from);
1155         from_ptr = VARDATA_ANY(from);
1156         tolen = VARSIZE_ANY_EXHDR(to);
1157         to_ptr = VARDATA_ANY(to);
1158
1159         /*
1160          * The worst-case expansion is to substitute a max-length character for a
1161          * single-byte character at each position of the string.
1162          */
1163         worst_len = pg_database_encoding_max_length() * m;
1164
1165         /* check for integer overflow */
1166         if (worst_len / pg_database_encoding_max_length() != m)
1167                 ereport(ERROR,
1168                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1169                                  errmsg("requested length too large")));
1170
1171         result = (text *) palloc(worst_len + VARHDRSZ);
1172         target = VARDATA(result);
1173         retlen = 0;
1174
1175         while (m > 0)
1176         {
1177                 source_len = pg_mblen(source);
1178                 from_index = 0;
1179
1180                 for (i = 0; i < fromlen; i += len)
1181                 {
1182                         len = pg_mblen(&from_ptr[i]);
1183                         if (len == source_len &&
1184                                 memcmp(source, &from_ptr[i], len) == 0)
1185                                 break;
1186
1187                         from_index++;
1188                 }
1189                 if (i < fromlen)
1190                 {
1191                         /* substitute */
1192                         char       *p = to_ptr;
1193
1194                         for (i = 0; i < from_index; i++)
1195                         {
1196                                 p += pg_mblen(p);
1197                                 if (p >= (to_ptr + tolen))
1198                                         break;
1199                         }
1200                         if (p < (to_ptr + tolen))
1201                         {
1202                                 len = pg_mblen(p);
1203                                 memcpy(target, p, len);
1204                                 target += len;
1205                                 retlen += len;
1206                         }
1207
1208                 }
1209                 else
1210                 {
1211                         /* no match, so copy */
1212                         memcpy(target, source, source_len);
1213                         target += source_len;
1214                         retlen += source_len;
1215                 }
1216
1217                 source += source_len;
1218                 m -= source_len;
1219         }
1220
1221         SET_VARSIZE(result, retlen + VARHDRSZ);
1222
1223         /*
1224          * The function result is probably much bigger than needed, if we're using
1225          * a multibyte encoding, but it's not worth reallocating it; the result
1226          * probably won't live long anyway.
1227          */
1228
1229         PG_RETURN_TEXT_P(result);
1230 }
1231
1232 /********************************************************************
1233  *
1234  * ascii
1235  *
1236  * Syntax:
1237  *
1238  *       int ascii(text string)
1239  *
1240  * Purpose:
1241  *
1242  *       Returns the decimal representation of the first character from
1243  *       string.
1244  *       If the string is empty we return 0.
1245  *       If the database encoding is UTF8, we return the Unicode codepoint.
1246  *       If the database encoding is any other multi-byte encoding, we
1247  *       return the value of the first byte if it is an ASCII character
1248  *       (range 1 .. 127), or raise an error.
1249  *       For all other encodings we return the value of the first byte,
1250  *       (range 1..255).
1251  *
1252  ********************************************************************/
1253
1254 Datum
1255 ascii(PG_FUNCTION_ARGS)
1256 {
1257         text       *string = PG_GETARG_TEXT_PP(0);
1258         int                     encoding = GetDatabaseEncoding();
1259         unsigned char *data;
1260
1261         if (VARSIZE_ANY_EXHDR(string) <= 0)
1262                 PG_RETURN_INT32(0);
1263
1264         data = (unsigned char *) VARDATA_ANY(string);
1265
1266         if (encoding == PG_UTF8 && *data > 127)
1267         {
1268                 /* return the code point for Unicode */
1269
1270                 int                     result = 0,
1271                                         tbytes = 0,
1272                                         i;
1273
1274                 if (*data >= 0xF0)
1275                 {
1276                         result = *data & 0x07;
1277                         tbytes = 3;
1278                 }
1279                 else if (*data >= 0xE0)
1280                 {
1281                         result = *data & 0x0F;
1282                         tbytes = 2;
1283                 }
1284                 else
1285                 {
1286                         Assert(*data > 0xC0);
1287                         result = *data & 0x1f;
1288                         tbytes = 1;
1289                 }
1290
1291                 Assert(tbytes > 0);
1292
1293                 for (i = 1; i <= tbytes; i++)
1294                 {
1295                         Assert((data[i] & 0xC0) == 0x80);
1296                         result = (result << 6) + (data[i] & 0x3f);
1297                 }
1298
1299                 PG_RETURN_INT32(result);
1300         }
1301         else
1302         {
1303                 if (pg_encoding_max_length(encoding) > 1 && *data > 127)
1304                         ereport(ERROR,
1305                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1306                                          errmsg("requested character too large")));
1307
1308
1309                 PG_RETURN_INT32((int32) *data);
1310         }
1311 }
1312
1313 /********************************************************************
1314  *
1315  * chr
1316  *
1317  * Syntax:
1318  *
1319  *       text chr(int val)
1320  *
1321  * Purpose:
1322  *
1323  *      Returns the character having the binary equivalent to val.
1324  *
1325  * For UTF8 we treat the argument as a Unicode code point.
1326  * For other multi-byte encodings we raise an error for arguments
1327  * outside the strict ASCII range (1..127).
1328  *
1329  * It's important that we don't ever return a value that is not valid
1330  * in the database encoding, so that this doesn't become a way for
1331  * invalid data to enter the database.
1332  *
1333  ********************************************************************/
1334
1335 Datum
1336 chr                     (PG_FUNCTION_ARGS)
1337 {
1338         uint32          cvalue = PG_GETARG_UINT32(0);
1339         text       *result;
1340         int                     encoding = GetDatabaseEncoding();
1341
1342         if (encoding == PG_UTF8 && cvalue > 127)
1343         {
1344                 /* for Unicode we treat the argument as a code point */
1345                 int                     bytes;
1346                 char       *wch;
1347
1348                 /* We only allow valid Unicode code points */
1349                 if (cvalue > 0x001fffff)
1350                         ereport(ERROR,
1351                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1352                                          errmsg("requested character too large for encoding: %d",
1353                                                         cvalue)));
1354
1355                 if (cvalue > 0xffff)
1356                         bytes = 4;
1357                 else if (cvalue > 0x07ff)
1358                         bytes = 3;
1359                 else
1360                         bytes = 2;
1361
1362                 result = (text *) palloc(VARHDRSZ + bytes);
1363                 SET_VARSIZE(result, VARHDRSZ + bytes);
1364                 wch = VARDATA(result);
1365
1366                 if (bytes == 2)
1367                 {
1368                         wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
1369                         wch[1] = 0x80 | (cvalue & 0x3F);;
1370                 }
1371                 else if (bytes == 3)
1372                 {
1373                         wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
1374                         wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
1375                         wch[2] = 0x80 | (cvalue & 0x3F);
1376                 }
1377                 else
1378                 {
1379                         wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
1380                         wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
1381                         wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
1382                         wch[3] = 0x80 | (cvalue & 0x3F);
1383                 }
1384
1385         }
1386
1387         else
1388         {
1389                 bool            is_mb;
1390
1391                 /*
1392                  * Error out on arguments that make no sense or that we can't validly
1393                  * represent in the encoding.
1394                  */
1395
1396                 if (cvalue == 0)
1397                         ereport(ERROR,
1398                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1399                                          errmsg("null character not permitted")));
1400
1401                 is_mb = pg_encoding_max_length(encoding) > 1;
1402
1403                 if ((is_mb && (cvalue > 127)) || (!is_mb && (cvalue > 255)))
1404                         ereport(ERROR,
1405                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1406                                          errmsg("requested character too large for encoding: %d",
1407                                                         cvalue)));
1408
1409
1410                 result = (text *) palloc(VARHDRSZ + 1);
1411                 SET_VARSIZE(result, VARHDRSZ + 1);
1412                 *VARDATA(result) = (char) cvalue;
1413         }
1414
1415         PG_RETURN_TEXT_P(result);
1416 }
1417
1418 /********************************************************************
1419  *
1420  * repeat
1421  *
1422  * Syntax:
1423  *
1424  *       text repeat(text string, int val)
1425  *
1426  * Purpose:
1427  *
1428  *      Repeat string by val.
1429  *
1430  ********************************************************************/
1431
1432 Datum
1433 repeat(PG_FUNCTION_ARGS)
1434 {
1435         text       *string = PG_GETARG_TEXT_PP(0);
1436         int32           count = PG_GETARG_INT32(1);
1437         text       *result;
1438         int                     slen,
1439                                 tlen;
1440         int                     i;
1441         char       *cp,
1442                            *sp;
1443
1444         if (count < 0)
1445                 count = 0;
1446
1447         slen = VARSIZE_ANY_EXHDR(string);
1448         tlen = VARHDRSZ + (count * slen);
1449
1450         /* Check for integer overflow */
1451         if (slen != 0 && count != 0)
1452         {
1453                 int                     check = count * slen;
1454                 int                     check2 = check + VARHDRSZ;
1455
1456                 if ((check / slen) != count || check2 <= check)
1457                         ereport(ERROR,
1458                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1459                                          errmsg("requested length too large")));
1460         }
1461
1462         result = (text *) palloc(tlen);
1463
1464         SET_VARSIZE(result, tlen);
1465         cp = VARDATA(result);
1466         sp = VARDATA_ANY(string);
1467         for (i = 0; i < count; i++)
1468         {
1469                 memcpy(cp, sp, slen);
1470                 cp += slen;
1471         }
1472
1473         PG_RETURN_TEXT_P(result);
1474 }