]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/oracle_compat.c
Fix regex, LIKE, and some other second-rank text-manipulation functions
[postgresql] / src / backend / utils / adt / oracle_compat.c
1 /*-------------------------------------------------------------------------
2  * oracle_compat.c
3  *      Oracle compatible functions.
4  *
5  * Copyright (c) 1996-2007, PostgreSQL Global Development Group
6  *
7  *      Author: Edmund Mergl <E.Mergl@bawue.de>
8  *      Multibyte enhancement: Tatsuo Ishii <ishii@postgresql.org>
9  *
10  *
11  * IDENTIFICATION
12  *      $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.72 2007/09/21 22:52:52 tgl Exp $
13  *
14  *-------------------------------------------------------------------------
15  */
16 #include "postgres.h"
17
18 #include <ctype.h>
19 #include <limits.h>
20 /*
21  * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
22  * declare them in <wchar.h>.
23  */
24 #ifdef HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27 #ifdef HAVE_WCTYPE_H
28 #include <wctype.h>
29 #endif
30
31 #include "utils/builtins.h"
32 #include "utils/pg_locale.h"
33 #include "mb/pg_wchar.h"
34
35
36 /*
37  * If the system provides the needed functions for wide-character manipulation
38  * (which are all standardized by C99), then we implement upper/lower/initcap
39  * using wide-character functions.      Otherwise we use the traditional <ctype.h>
40  * functions, which of course will not work as desired in multibyte character
41  * sets.  Note that in either case we are effectively assuming that the
42  * database character encoding matches the encoding implied by LC_CTYPE.
43  *
44  * We assume if we have these two functions, we have their friends too, and
45  * can use the wide-character method.
46  */
47 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
48 #define USE_WIDE_UPPER_LOWER
49 char *wstring_lower (char *str);
50 char *wstring_upper(char *str);
51 #endif
52
53 static text *dotrim(const char *string, int stringlen,
54            const char *set, int setlen,
55            bool doltrim, bool dortrim);
56
57
58 #ifdef USE_WIDE_UPPER_LOWER
59
60 /*
61  * Convert a TEXT value into a palloc'd wchar string.
62  */
63 static wchar_t *
64 texttowcs(const text *txt)
65 {
66         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
67         char       *workstr;
68         wchar_t    *result;
69         size_t          ncodes;
70
71         /* Overflow paranoia */
72         if (nbytes < 0 ||
73                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
74                 ereport(ERROR,
75                                 (errcode(ERRCODE_OUT_OF_MEMORY),
76                                  errmsg("out of memory")));
77
78         /* Need a null-terminated version of the input */
79         workstr = (char *) palloc(nbytes + 1);
80         memcpy(workstr, VARDATA_ANY(txt), nbytes);
81         workstr[nbytes] = '\0';
82
83         /* Output workspace cannot have more codes than input bytes */
84         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
85
86         /* Do the conversion */
87         ncodes = mbstowcs(result, workstr, nbytes + 1);
88
89         if (ncodes == (size_t) -1)
90         {
91                 /*
92                  * Invalid multibyte character encountered.  We try to give a useful
93                  * error message by letting pg_verifymbstr check the string.  But it's
94                  * possible that the string is OK to us, and not OK to mbstowcs ---
95                  * this suggests that the LC_CTYPE locale is different from the
96                  * database encoding.  Give a generic error message if verifymbstr
97                  * can't find anything wrong.
98                  */
99                 pg_verifymbstr(workstr, nbytes, false);
100                 ereport(ERROR,
101                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
102                                  errmsg("invalid multibyte character for locale"),
103                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
104         }
105
106         Assert(ncodes <= (size_t) nbytes);
107
108         return result;
109 }
110
111
112 /*
113  * Convert a wchar string into a palloc'd TEXT value.  The wchar string
114  * must be zero-terminated, but we also require the caller to pass the string
115  * length, since it will know it anyway in current uses.
116  */
117 static text *
118 wcstotext(const wchar_t *str, int ncodes)
119 {
120         text       *result;
121         size_t          nbytes;
122
123         /* Overflow paranoia */
124         if (ncodes < 0 ||
125                 ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
126                 ereport(ERROR,
127                                 (errcode(ERRCODE_OUT_OF_MEMORY),
128                                  errmsg("out of memory")));
129
130         /* Make workspace certainly large enough for result */
131         result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
132
133         /* Do the conversion */
134         nbytes = wcstombs((char *) VARDATA(result), str,
135                                           (ncodes + 1) * MB_CUR_MAX);
136
137         if (nbytes == (size_t) -1)
138         {
139                 /* Invalid multibyte character encountered ... shouldn't happen */
140                 ereport(ERROR,
141                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
142                                  errmsg("invalid multibyte character for locale")));
143         }
144
145         Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
146
147         SET_VARSIZE(result, nbytes + VARHDRSZ);
148
149         return result;
150 }
151 #endif   /* USE_WIDE_UPPER_LOWER */
152
153
154 /*
155  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding.
156  * To make use of the upper/lower functionality, we need to map UTF8 to
157  * UTF16, which for some reason mbstowcs and wcstombs won't do for us.
158  * This conversion layer takes care of it.
159  */
160
161 #ifdef WIN32
162
163 /* texttowcs for the case of UTF8 to UTF16 */
164 static wchar_t *
165 win32_utf8_texttowcs(const text *txt)
166 {
167         int                     nbytes = VARSIZE_ANY_EXHDR(txt);
168         wchar_t    *result;
169         int                     r;
170
171         /* Overflow paranoia */
172         if (nbytes < 0 ||
173                 nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
174                 ereport(ERROR,
175                                 (errcode(ERRCODE_OUT_OF_MEMORY),
176                                  errmsg("out of memory")));
177
178         /* Output workspace cannot have more codes than input bytes */
179         result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
180
181         /* stupid Microsloth API does not work for zero-length input */
182         if (nbytes == 0)
183                 r = 0;
184         else
185         {
186                 /* Do the conversion */
187                 r = MultiByteToWideChar(CP_UTF8, 0, VARDATA_ANY(txt), nbytes,
188                                                                 result, nbytes);
189
190                 if (!r)                                 /* assume it's NO_UNICODE_TRANSLATION */
191                 {
192                         /* see notes above about error reporting */
193                         pg_verifymbstr(VARDATA_ANY(txt), nbytes, false);
194                         ereport(ERROR,
195                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
196                                          errmsg("invalid multibyte character for locale"),
197                                          errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
198                 }
199         }
200
201         Assert(r <= nbytes);
202         result[r] = 0;
203
204         return result;
205 }
206
207 /* wcstotext for the case of UTF16 to UTF8 */
208 static text *
209 win32_utf8_wcstotext(const wchar_t *str)
210 {
211         text       *result;
212         int                     nbytes;
213         int                     r;
214
215         nbytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
216         if (nbytes == 0)                        /* shouldn't happen */
217                 ereport(ERROR,
218                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
219                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
220                                                 GetLastError())));
221
222         result = palloc(nbytes + VARHDRSZ);
223
224         r = WideCharToMultiByte(CP_UTF8, 0, str, -1, VARDATA(result), nbytes,
225                                                         NULL, NULL);
226         if (r == 0)                                     /* shouldn't happen */
227                 ereport(ERROR,
228                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
229                                  errmsg("UTF-16 to UTF-8 translation failed: %lu",
230                                                 GetLastError())));
231
232         SET_VARSIZE(result, nbytes + VARHDRSZ - 1);             /* -1 to ignore null */
233
234         return result;
235 }
236
237 /* interface layer to check which encoding is in use */
238
239 static wchar_t *
240 win32_texttowcs(const text *txt)
241 {
242         if (GetDatabaseEncoding() == PG_UTF8)
243                 return win32_utf8_texttowcs(txt);
244         else
245                 return texttowcs(txt);
246 }
247
248 static text *
249 win32_wcstotext(const wchar_t *str, int ncodes)
250 {
251         if (GetDatabaseEncoding() == PG_UTF8)
252                 return win32_utf8_wcstotext(str);
253         else
254                 return wcstotext(str, ncodes);
255 }
256
257 /* use macros to cause routines below to call interface layer */
258
259 #define texttowcs       win32_texttowcs
260 #define wcstotext       win32_wcstotext
261 #endif   /* WIN32 */
262
263 #ifdef USE_WIDE_UPPER_LOWER
/*
 * wstring_upper and wstring_lower perform multibyte-aware upper/lower case
 * conversion of null-terminated strings.  Each returns a pointer to a newly
 * palloc'd, null-terminated copy holding the converted string.
 */
269 char *
270 wstring_upper(char *str)
271 {
272         wchar_t         *workspace;
273         text            *in_text;
274         text            *out_text;
275         char            *result;    
276         int     nbytes = strlen(str);
277         int     i;
278         
279         in_text = palloc(nbytes + VARHDRSZ);
280         memcpy(VARDATA(in_text), str, nbytes);
281         SET_VARSIZE(in_text, nbytes + VARHDRSZ);
282
283         workspace = texttowcs(in_text);
284
285         for (i = 0; workspace[i] != 0; i++)
286                 workspace[i] = towupper(workspace[i]);
287
288         out_text = wcstotext(workspace, i);
289         
290         nbytes = VARSIZE(out_text) - VARHDRSZ;
291         result = palloc(nbytes + 1);
292         memcpy(result, VARDATA(out_text), nbytes);
293
294         result[nbytes] = '\0';
295
296         pfree(workspace);
297         pfree(in_text);
298         pfree(out_text);
299         
300         return result;
301 }
302
303 char *
304 wstring_lower(char *str)
305 {
306         wchar_t         *workspace;
307         text            *in_text;
308         text            *out_text;
309         char            *result;    
310         int     nbytes = strlen(str);
311         int     i;
312         
313         in_text = palloc(nbytes + VARHDRSZ);
314         memcpy(VARDATA(in_text), str, nbytes);
315         SET_VARSIZE(in_text, nbytes + VARHDRSZ);
316
317         workspace = texttowcs(in_text);
318
319         for (i = 0; workspace[i] != 0; i++)
320                 workspace[i] = towlower(workspace[i]);
321
322         out_text = wcstotext(workspace, i);
323         
324         nbytes = VARSIZE(out_text) - VARHDRSZ;
325         result = palloc(nbytes + 1);
326         memcpy(result, VARDATA(out_text), nbytes);
327
328         result[nbytes] = '\0';
329
330         pfree(workspace);
331         pfree(in_text);
332         pfree(out_text);
333         
334         return result;
335 }
336 #endif  /* USE_WIDE_UPPER_LOWER */
337
338 /********************************************************************
339  *
340  * lower
341  *
342  * Syntax:
343  *
344  *       text lower(text string)
345  *
346  * Purpose:
347  *
348  *       Returns string, with all letters forced to lowercase.
349  *
350  ********************************************************************/
351
352 Datum
353 lower(PG_FUNCTION_ARGS)
354 {
355 #ifdef USE_WIDE_UPPER_LOWER
356
357         /*
358          * Use wide char code only when max encoding length > 1 and ctype != C.
359          * Some operating systems fail with multi-byte encodings and a C locale.
360          * Also, for a C locale there is no need to process as multibyte.
361          */
362         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
363         {
364                 text       *string = PG_GETARG_TEXT_PP(0);
365                 text       *result;
366                 wchar_t    *workspace;
367                 int                     i;
368
369                 workspace = texttowcs(string);
370
371                 for (i = 0; workspace[i] != 0; i++)
372                         workspace[i] = towlower(workspace[i]);
373
374                 result = wcstotext(workspace, i);
375
376                 pfree(workspace);
377
378                 PG_RETURN_TEXT_P(result);
379         }
380         else
381 #endif   /* USE_WIDE_UPPER_LOWER */
382         {
383                 text       *string = PG_GETARG_TEXT_P_COPY(0);
384                 char       *ptr;
385                 int                     m;
386
387                 /*
388                  * Since we copied the string, we can scribble directly on the value
389                  */
390                 ptr = VARDATA(string);
391                 m = VARSIZE(string) - VARHDRSZ;
392
393                 while (m-- > 0)
394                 {
395                         *ptr = tolower((unsigned char) *ptr);
396                         ptr++;
397                 }
398
399                 PG_RETURN_TEXT_P(string);
400         }
401 }
402
403
404 /********************************************************************
405  *
406  * upper
407  *
408  * Syntax:
409  *
410  *       text upper(text string)
411  *
412  * Purpose:
413  *
414  *       Returns string, with all letters forced to uppercase.
415  *
416  ********************************************************************/
417
418 Datum
419 upper(PG_FUNCTION_ARGS)
420 {
421 #ifdef USE_WIDE_UPPER_LOWER
422
423         /*
424          * Use wide char code only when max encoding length > 1 and ctype != C.
425          * Some operating systems fail with multi-byte encodings and a C locale.
426          * Also, for a C locale there is no need to process as multibyte.
427          */
428         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
429         {
430                 text       *string = PG_GETARG_TEXT_PP(0);
431                 text       *result;
432                 wchar_t    *workspace;
433                 int                     i;
434
435                 workspace = texttowcs(string);
436
437                 for (i = 0; workspace[i] != 0; i++)
438                         workspace[i] = towupper(workspace[i]);
439
440                 result = wcstotext(workspace, i);
441
442                 pfree(workspace);
443
444                 PG_RETURN_TEXT_P(result);
445         }
446         else
447 #endif   /* USE_WIDE_UPPER_LOWER */
448         {
449                 text       *string = PG_GETARG_TEXT_P_COPY(0);
450                 char       *ptr;
451                 int                     m;
452
453                 /*
454                  * Since we copied the string, we can scribble directly on the value
455                  */
456                 ptr = VARDATA(string);
457                 m = VARSIZE(string) - VARHDRSZ;
458
459                 while (m-- > 0)
460                 {
461                         *ptr = toupper((unsigned char) *ptr);
462                         ptr++;
463                 }
464
465                 PG_RETURN_TEXT_P(string);
466         }
467 }
468
469
470 /********************************************************************
471  *
472  * initcap
473  *
474  * Syntax:
475  *
476  *       text initcap(text string)
477  *
478  * Purpose:
479  *
480  *       Returns string, with first letter of each word in uppercase, all
481  *       other letters in lowercase. A word is defined as a sequence of
482  *       alphanumeric characters, delimited by non-alphanumeric
483  *       characters.
484  *
485  ********************************************************************/
486
487 Datum
488 initcap(PG_FUNCTION_ARGS)
489 {
490 #ifdef USE_WIDE_UPPER_LOWER
491
492         /*
493          * Use wide char code only when max encoding length > 1 and ctype != C.
494          * Some operating systems fail with multi-byte encodings and a C locale.
495          * Also, for a C locale there is no need to process as multibyte.
496          */
497         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
498         {
499                 text       *string = PG_GETARG_TEXT_PP(0);
500                 text       *result;
501                 wchar_t    *workspace;
502                 int                     wasalnum = 0;
503                 int                     i;
504
505                 workspace = texttowcs(string);
506
507                 for (i = 0; workspace[i] != 0; i++)
508                 {
509                         if (wasalnum)
510                                 workspace[i] = towlower(workspace[i]);
511                         else
512                                 workspace[i] = towupper(workspace[i]);
513                         wasalnum = iswalnum(workspace[i]);
514                 }
515
516                 result = wcstotext(workspace, i);
517
518                 pfree(workspace);
519
520                 PG_RETURN_TEXT_P(result);
521         }
522         else
523 #endif   /* USE_WIDE_UPPER_LOWER */
524         {
525                 text       *string = PG_GETARG_TEXT_P_COPY(0);
526                 int                     wasalnum = 0;
527                 char       *ptr;
528                 int                     m;
529
530                 /*
531                  * Since we copied the string, we can scribble directly on the value
532                  */
533                 ptr = VARDATA(string);
534                 m = VARSIZE(string) - VARHDRSZ;
535
536                 while (m-- > 0)
537                 {
538                         if (wasalnum)
539                                 *ptr = tolower((unsigned char) *ptr);
540                         else
541                                 *ptr = toupper((unsigned char) *ptr);
542                         wasalnum = isalnum((unsigned char) *ptr);
543                         ptr++;
544                 }
545
546                 PG_RETURN_TEXT_P(string);
547         }
548 }
549
550
551 /********************************************************************
552  *
553  * lpad
554  *
555  * Syntax:
556  *
557  *       text lpad(text string1, int4 len, text string2)
558  *
559  * Purpose:
560  *
561  *       Returns string1, left-padded to length len with the sequence of
562  *       characters in string2.  If len is less than the length of string1,
563  *       instead truncate (on the right) to len.
564  *
565  ********************************************************************/
566
567 Datum
568 lpad(PG_FUNCTION_ARGS)
569 {
570         text       *string1 = PG_GETARG_TEXT_PP(0);
571         int32           len = PG_GETARG_INT32(1);
572         text       *string2 = PG_GETARG_TEXT_PP(2);
573         text       *ret;
574         char       *ptr1,
575                            *ptr2,
576                            *ptr2start,
577                            *ptr2end,
578                            *ptr_ret;
579         int                     m,
580                                 s1len,
581                                 s2len;
582
583         int                     bytelen;
584
585         /* Negative len is silently taken as zero */
586         if (len < 0)
587                 len = 0;
588
589         s1len = VARSIZE_ANY_EXHDR(string1);
590         if (s1len < 0)
591                 s1len = 0;                              /* shouldn't happen */
592
593         s2len = VARSIZE_ANY_EXHDR(string2);
594         if (s2len < 0)
595                 s2len = 0;                              /* shouldn't happen */
596
597         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
598
599         if (s1len > len)
600                 s1len = len;                    /* truncate string1 to len chars */
601
602         if (s2len <= 0)
603                 len = s1len;                    /* nothing to pad with, so don't pad */
604
605         bytelen = pg_database_encoding_max_length() * len;
606
607         /* check for integer overflow */
608         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
609                 ereport(ERROR,
610                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
611                                  errmsg("requested length too large")));
612
613         ret = (text *) palloc(VARHDRSZ + bytelen);
614
615         m = len - s1len;
616
617         ptr2 = ptr2start = VARDATA_ANY(string2);
618         ptr2end = ptr2 + s2len;
619         ptr_ret = VARDATA(ret);
620
621         while (m--)
622         {
623                 int                     mlen = pg_mblen(ptr2);
624
625                 memcpy(ptr_ret, ptr2, mlen);
626                 ptr_ret += mlen;
627                 ptr2 += mlen;
628                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
629                         ptr2 = ptr2start;
630         }
631
632         ptr1 = VARDATA_ANY(string1);
633
634         while (s1len--)
635         {
636                 int                     mlen = pg_mblen(ptr1);
637
638                 memcpy(ptr_ret, ptr1, mlen);
639                 ptr_ret += mlen;
640                 ptr1 += mlen;
641         }
642
643         SET_VARSIZE(ret, ptr_ret - (char *) ret);
644
645         PG_RETURN_TEXT_P(ret);
646 }
647
648
649 /********************************************************************
650  *
651  * rpad
652  *
653  * Syntax:
654  *
655  *       text rpad(text string1, int4 len, text string2)
656  *
657  * Purpose:
658  *
659  *       Returns string1, right-padded to length len with the sequence of
660  *       characters in string2.  If len is less than the length of string1,
661  *       instead truncate (on the right) to len.
662  *
663  ********************************************************************/
664
665 Datum
666 rpad(PG_FUNCTION_ARGS)
667 {
668         text       *string1 = PG_GETARG_TEXT_PP(0);
669         int32           len = PG_GETARG_INT32(1);
670         text       *string2 = PG_GETARG_TEXT_PP(2);
671         text       *ret;
672         char       *ptr1,
673                            *ptr2,
674                            *ptr2start,
675                            *ptr2end,
676                            *ptr_ret;
677         int                     m,
678                                 s1len,
679                                 s2len;
680
681         int                     bytelen;
682
683         /* Negative len is silently taken as zero */
684         if (len < 0)
685                 len = 0;
686
687         s1len = VARSIZE_ANY_EXHDR(string1);
688         if (s1len < 0)
689                 s1len = 0;                              /* shouldn't happen */
690
691         s2len = VARSIZE_ANY_EXHDR(string2);
692         if (s2len < 0)
693                 s2len = 0;                              /* shouldn't happen */
694
695         s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);
696
697         if (s1len > len)
698                 s1len = len;                    /* truncate string1 to len chars */
699
700         if (s2len <= 0)
701                 len = s1len;                    /* nothing to pad with, so don't pad */
702
703         bytelen = pg_database_encoding_max_length() * len;
704
705         /* Check for integer overflow */
706         if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
707                 ereport(ERROR,
708                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
709                                  errmsg("requested length too large")));
710
711         ret = (text *) palloc(VARHDRSZ + bytelen);
712         m = len - s1len;
713
714         ptr1 = VARDATA_ANY(string1);
715         ptr_ret = VARDATA(ret);
716
717         while (s1len--)
718         {
719                 int                     mlen = pg_mblen(ptr1);
720
721                 memcpy(ptr_ret, ptr1, mlen);
722                 ptr_ret += mlen;
723                 ptr1 += mlen;
724         }
725
726         ptr2 = ptr2start = VARDATA_ANY(string2);
727         ptr2end = ptr2 + s2len;
728
729         while (m--)
730         {
731                 int                     mlen = pg_mblen(ptr2);
732
733                 memcpy(ptr_ret, ptr2, mlen);
734                 ptr_ret += mlen;
735                 ptr2 += mlen;
736                 if (ptr2 == ptr2end)    /* wrap around at end of s2 */
737                         ptr2 = ptr2start;
738         }
739
740         SET_VARSIZE(ret, ptr_ret - (char *) ret);
741
742         PG_RETURN_TEXT_P(ret);
743 }
744
745
746 /********************************************************************
747  *
748  * btrim
749  *
750  * Syntax:
751  *
752  *       text btrim(text string, text set)
753  *
754  * Purpose:
755  *
756  *       Returns string with characters removed from the front and back
757  *       up to the first character not in set.
758  *
759  ********************************************************************/
760
761 Datum
762 btrim(PG_FUNCTION_ARGS)
763 {
764         text       *string = PG_GETARG_TEXT_PP(0);
765         text       *set = PG_GETARG_TEXT_PP(1);
766         text       *ret;
767
768         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
769                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
770                                  true, true);
771
772         PG_RETURN_TEXT_P(ret);
773 }
774
775 /********************************************************************
776  *
777  * btrim1 --- btrim with set fixed as ' '
778  *
779  ********************************************************************/
780
781 Datum
782 btrim1(PG_FUNCTION_ARGS)
783 {
784         text       *string = PG_GETARG_TEXT_PP(0);
785         text       *ret;
786
787         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
788                                  " ", 1,
789                                  true, true);
790
791         PG_RETURN_TEXT_P(ret);
792 }
793
794 /*
795  * Common implementation for btrim, ltrim, rtrim
796  */
797 static text *
798 dotrim(const char *string, int stringlen,
799            const char *set, int setlen,
800            bool doltrim, bool dortrim)
801 {
802         text       *result;
803         int                     i;
804
805         /* Nothing to do if either string or set is empty */
806         if (stringlen > 0 && setlen > 0)
807         {
808                 if (pg_database_encoding_max_length() > 1)
809                 {
810                         /*
811                          * In the multibyte-encoding case, build arrays of pointers to
812                          * character starts, so that we can avoid inefficient checks in
813                          * the inner loops.
814                          */
815                         const char **stringchars;
816                         const char **setchars;
817                         int                *stringmblen;
818                         int                *setmblen;
819                         int                     stringnchars;
820                         int                     setnchars;
821                         int                     resultndx;
822                         int                     resultnchars;
823                         const char *p;
824                         int                     len;
825                         int                     mblen;
826                         const char *str_pos;
827                         int                     str_len;
828
829                         stringchars = (const char **) palloc(stringlen * sizeof(char *));
830                         stringmblen = (int *) palloc(stringlen * sizeof(int));
831                         stringnchars = 0;
832                         p = string;
833                         len = stringlen;
834                         while (len > 0)
835                         {
836                                 stringchars[stringnchars] = p;
837                                 stringmblen[stringnchars] = mblen = pg_mblen(p);
838                                 stringnchars++;
839                                 p += mblen;
840                                 len -= mblen;
841                         }
842
843                         setchars = (const char **) palloc(setlen * sizeof(char *));
844                         setmblen = (int *) palloc(setlen * sizeof(int));
845                         setnchars = 0;
846                         p = set;
847                         len = setlen;
848                         while (len > 0)
849                         {
850                                 setchars[setnchars] = p;
851                                 setmblen[setnchars] = mblen = pg_mblen(p);
852                                 setnchars++;
853                                 p += mblen;
854                                 len -= mblen;
855                         }
856
857                         resultndx = 0;          /* index in stringchars[] */
858                         resultnchars = stringnchars;
859
860                         if (doltrim)
861                         {
862                                 while (resultnchars > 0)
863                                 {
864                                         str_pos = stringchars[resultndx];
865                                         str_len = stringmblen[resultndx];
866                                         for (i = 0; i < setnchars; i++)
867                                         {
868                                                 if (str_len == setmblen[i] &&
869                                                         memcmp(str_pos, setchars[i], str_len) == 0)
870                                                         break;
871                                         }
872                                         if (i >= setnchars)
873                                                 break;  /* no match here */
874                                         string += str_len;
875                                         stringlen -= str_len;
876                                         resultndx++;
877                                         resultnchars--;
878                                 }
879                         }
880
881                         if (dortrim)
882                         {
883                                 while (resultnchars > 0)
884                                 {
885                                         str_pos = stringchars[resultndx + resultnchars - 1];
886                                         str_len = stringmblen[resultndx + resultnchars - 1];
887                                         for (i = 0; i < setnchars; i++)
888                                         {
889                                                 if (str_len == setmblen[i] &&
890                                                         memcmp(str_pos, setchars[i], str_len) == 0)
891                                                         break;
892                                         }
893                                         if (i >= setnchars)
894                                                 break;  /* no match here */
895                                         stringlen -= str_len;
896                                         resultnchars--;
897                                 }
898                         }
899
900                         pfree(stringchars);
901                         pfree(stringmblen);
902                         pfree(setchars);
903                         pfree(setmblen);
904                 }
905                 else
906                 {
907                         /*
908                          * In the single-byte-encoding case, we don't need such overhead.
909                          */
910                         if (doltrim)
911                         {
912                                 while (stringlen > 0)
913                                 {
914                                         char            str_ch = *string;
915
916                                         for (i = 0; i < setlen; i++)
917                                         {
918                                                 if (str_ch == set[i])
919                                                         break;
920                                         }
921                                         if (i >= setlen)
922                                                 break;  /* no match here */
923                                         string++;
924                                         stringlen--;
925                                 }
926                         }
927
928                         if (dortrim)
929                         {
930                                 while (stringlen > 0)
931                                 {
932                                         char            str_ch = string[stringlen - 1];
933
934                                         for (i = 0; i < setlen; i++)
935                                         {
936                                                 if (str_ch == set[i])
937                                                         break;
938                                         }
939                                         if (i >= setlen)
940                                                 break;  /* no match here */
941                                         stringlen--;
942                                 }
943                         }
944                 }
945         }
946
947         /* Return selected portion of string */
948         result = (text *) palloc(VARHDRSZ + stringlen);
949         SET_VARSIZE(result, VARHDRSZ + stringlen);
950         memcpy(VARDATA(result), string, stringlen);
951
952         return result;
953 }
954
955 /********************************************************************
956  *
957  * byteatrim
958  *
959  * Syntax:
960  *
961  *       bytea byteatrim(byta string, bytea set)
962  *
963  * Purpose:
964  *
965  *       Returns string with characters removed from the front and back
966  *       up to the first character not in set.
967  *
968  * Cloned from btrim and modified as required.
969  ********************************************************************/
970
971 Datum
972 byteatrim(PG_FUNCTION_ARGS)
973 {
974         bytea      *string = PG_GETARG_BYTEA_PP(0);
975         bytea      *set = PG_GETARG_BYTEA_PP(1);
976         bytea      *ret;
977         char       *ptr,
978                            *end,
979                            *ptr2,
980                            *ptr2start,
981                            *end2;
982         int                     m, 
983                                 stringlen, 
984                                 setlen;
985
986         stringlen = VARSIZE_ANY_EXHDR(string);
987         setlen = VARSIZE_ANY_EXHDR(set);
988         
989         if (stringlen <= 0 || setlen <= 0)
990                 PG_RETURN_BYTEA_P(string);
991
992         m = stringlen;
993         ptr = VARDATA_ANY(string);
994         end = ptr + stringlen - 1;
995         ptr2start = VARDATA_ANY(set);
996         end2 = ptr2start + setlen - 1;
997
998         while (m > 0)
999         {
1000                 ptr2 = ptr2start;
1001                 while (ptr2 <= end2)
1002                 {
1003                         if (*ptr == *ptr2)
1004                                 break;
1005                         ++ptr2;
1006                 }
1007                 if (ptr2 > end2)
1008                         break;
1009                 ptr++;
1010                 m--;
1011         }
1012
1013         while (m > 0)
1014         {
1015                 ptr2 = ptr2start;
1016                 while (ptr2 <= end2)
1017                 {
1018                         if (*end == *ptr2)
1019                                 break;
1020                         ++ptr2;
1021                 }
1022                 if (ptr2 > end2)
1023                         break;
1024                 end--;
1025                 m--;
1026         }
1027
1028         ret = (bytea *) palloc(VARHDRSZ + m);
1029         SET_VARSIZE(ret, VARHDRSZ + m);
1030         memcpy(VARDATA(ret), ptr, m);
1031
1032         PG_RETURN_BYTEA_P(ret);
1033 }
1034
1035 /********************************************************************
1036  *
1037  * ltrim
1038  *
1039  * Syntax:
1040  *
1041  *       text ltrim(text string, text set)
1042  *
1043  * Purpose:
1044  *
1045  *       Returns string with initial characters removed up to the first
1046  *       character not in set.
1047  *
1048  ********************************************************************/
1049
1050 Datum
1051 ltrim(PG_FUNCTION_ARGS)
1052 {
1053         text       *string = PG_GETARG_TEXT_PP(0);
1054         text       *set = PG_GETARG_TEXT_PP(1);
1055         text       *ret;
1056
1057         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1058                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1059                                  true, false);
1060
1061         PG_RETURN_TEXT_P(ret);
1062 }
1063
1064 /********************************************************************
1065  *
1066  * ltrim1 --- ltrim with set fixed as ' '
1067  *
1068  ********************************************************************/
1069
1070 Datum
1071 ltrim1(PG_FUNCTION_ARGS)
1072 {
1073         text       *string = PG_GETARG_TEXT_PP(0);
1074         text       *ret;
1075
1076         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1077                                  " ", 1,
1078                                  true, false);
1079
1080         PG_RETURN_TEXT_P(ret);
1081 }
1082
1083 /********************************************************************
1084  *
1085  * rtrim
1086  *
1087  * Syntax:
1088  *
1089  *       text rtrim(text string, text set)
1090  *
1091  * Purpose:
1092  *
1093  *       Returns string with final characters removed after the last
1094  *       character not in set.
1095  *
1096  ********************************************************************/
1097
1098 Datum
1099 rtrim(PG_FUNCTION_ARGS)
1100 {
1101         text       *string = PG_GETARG_TEXT_PP(0);
1102         text       *set = PG_GETARG_TEXT_PP(1);
1103         text       *ret;
1104
1105         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1106                                  VARDATA_ANY(set), VARSIZE_ANY_EXHDR(set),
1107                                  false, true);
1108
1109         PG_RETURN_TEXT_P(ret);
1110 }
1111
1112 /********************************************************************
1113  *
1114  * rtrim1 --- rtrim with set fixed as ' '
1115  *
1116  ********************************************************************/
1117
1118 Datum
1119 rtrim1(PG_FUNCTION_ARGS)
1120 {
1121         text       *string = PG_GETARG_TEXT_PP(0);
1122         text       *ret;
1123
1124         ret = dotrim(VARDATA_ANY(string), VARSIZE_ANY_EXHDR(string),
1125                                  " ", 1,
1126                                  false, true);
1127
1128         PG_RETURN_TEXT_P(ret);
1129 }
1130
1131
1132 /********************************************************************
1133  *
1134  * translate
1135  *
1136  * Syntax:
1137  *
1138  *       text translate(text string, text from, text to)
1139  *
1140  * Purpose:
1141  *
1142  *       Returns string after replacing all occurrences of characters in from
1143  *       with the corresponding character in to.  If from is longer than to,
1144  *       occurrences of the extra characters in from are deleted.
1145  *       Improved by Edwin Ramirez <ramirez@doc.mssm.edu>.
1146  *
1147  ********************************************************************/
1148
1149 Datum
1150 translate(PG_FUNCTION_ARGS)
1151 {
1152         text       *string = PG_GETARG_TEXT_PP(0);
1153         text       *from = PG_GETARG_TEXT_PP(1);
1154         text       *to = PG_GETARG_TEXT_PP(2);
1155         text       *result;
1156         char       *from_ptr,
1157                            *to_ptr;
1158         char       *source,
1159                            *target;
1160         int                     m,
1161                                 fromlen,
1162                                 tolen,
1163                                 retlen,
1164                                 i;
1165
1166         int                     str_len;
1167         int                     estimate_len;
1168         int                     len;
1169         int                     source_len;
1170         int                     from_index;
1171
1172         m = VARSIZE_ANY_EXHDR(string);
1173         
1174         if (m <= 0)
1175                 PG_RETURN_TEXT_P(string);
1176
1177         fromlen = VARSIZE_ANY_EXHDR(from);
1178         from_ptr = VARDATA_ANY(from);
1179         tolen = VARSIZE_ANY_EXHDR(to);
1180         to_ptr = VARDATA_ANY(to);
1181
1182         str_len = VARSIZE_ANY_EXHDR(string);
1183         source = VARDATA_ANY(string);
1184
1185         estimate_len = (tolen * 1.0 / fromlen + 0.5) * str_len;
1186         estimate_len = estimate_len > str_len ? estimate_len : str_len;
1187
1188         result = (text *) palloc(estimate_len + VARHDRSZ);
1189         target = VARDATA(result);
1190         retlen = 0;
1191
1192         while (m > 0)
1193         {
1194                 source_len = pg_mblen(source);
1195                 from_index = 0;
1196
1197                 for (i = 0; i < fromlen; i += len)
1198                 {
1199                         len = pg_mblen(&from_ptr[i]);
1200                         if (len == source_len &&
1201                                 memcmp(source, &from_ptr[i], len) == 0)
1202                                 break;
1203
1204                         from_index++;
1205                 }
1206                 if (i < fromlen)
1207                 {
1208                         /* substitute */
1209                         char       *p = to_ptr;
1210
1211                         for (i = 0; i < from_index; i++)
1212                         {
1213                                 p += pg_mblen(p);
1214                                 if (p >= (to_ptr + tolen))
1215                                         break;
1216                         }
1217                         if (p < (to_ptr + tolen))
1218                         {
1219                                 len = pg_mblen(p);
1220                                 memcpy(target, p, len);
1221                                 target += len;
1222                                 retlen += len;
1223                         }
1224
1225                 }
1226                 else
1227                 {
1228                         /* no match, so copy */
1229                         memcpy(target, source, source_len);
1230                         target += source_len;
1231                         retlen += source_len;
1232                 }
1233
1234                 source += source_len;
1235                 m -= source_len;
1236         }
1237
1238         SET_VARSIZE(result, retlen + VARHDRSZ);
1239
1240         /*
1241          * There may be some wasted space in the result if deletions occurred, but
1242          * it's not worth reallocating it; the function result probably won't live
1243          * long anyway.
1244          */
1245
1246         PG_RETURN_TEXT_P(result);
1247 }
1248
1249 /********************************************************************
1250  *
1251  * ascii
1252  *
1253  * Syntax:
1254  *
1255  *       int ascii(text string)
1256  *
1257  * Purpose:
1258  *
1259  *       Returns the decimal representation of the first character from
1260  *       string.
1261  *   If the string is empty we return 0.
1262  *   If the database encoding is UTF8, we return the Unicode codepoint. 
1263  *   If the database encoding is any other multi-byte encoding, we
1264  *   return the value of the first byte if it is an ASCII character
1265  *   (range 1 .. 127), or raise an error.
1266  *   For all other encodings we return the value of the first byte,
1267  *   (range 1..255).
1268  *
1269  ********************************************************************/
1270
1271 Datum
1272 ascii(PG_FUNCTION_ARGS)
1273 {
1274         text       *string = PG_GETARG_TEXT_PP(0);
1275         int encoding = GetDatabaseEncoding();
1276         unsigned char *data;
1277
1278         if (VARSIZE_ANY_EXHDR(string) <= 0)
1279                 PG_RETURN_INT32(0);
1280
1281         data = (unsigned char *) VARDATA_ANY(string);
1282
1283         if (encoding == PG_UTF8 && *data > 127)
1284         {
1285                 /* return the code point for Unicode */
1286
1287                 int result = 0, tbytes = 0, i;
1288
1289                 if (*data >= 0xF0)
1290                 {
1291                         result = *data & 0x07;
1292                         tbytes = 3;
1293                 }
1294                 else if (*data >= 0xE0)
1295                 {
1296                         result = *data & 0x0F;
1297                         tbytes = 2;
1298                 }
1299                 else
1300                 {
1301                         Assert (*data > 0xC0);
1302                         result = *data & 0x1f;
1303                         tbytes = 1;
1304                 }
1305
1306                 Assert (tbytes > 0);
1307
1308                 for (i = 1; i <= tbytes; i++)
1309                 {
1310                         Assert ((data[i] & 0xC0) == 0x80);
1311                         result = (result << 6) + (data[i] & 0x3f);
1312                 }
1313
1314                 PG_RETURN_INT32(result);
1315         }
1316         else
1317         {
1318                 if (pg_encoding_max_length(encoding) > 1 && *data > 127)
1319                         ereport(ERROR,
1320                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1321                                          errmsg("requested character too large")));
1322
1323
1324                 PG_RETURN_INT32((int32) *data);
1325         }
1326 }
1327
1328 /********************************************************************
1329  *
1330  * chr
1331  *
1332  * Syntax:
1333  *
1334  *       text chr(int val)
1335  *
1336  * Purpose:
1337  *
1338  *      Returns the character having the binary equivalent to val.
1339  *
1340  * For UTF8 we treat the argumwent as a Unicode code point.
1341  * For other multi-byte encodings we raise an error for arguments
1342  * outside the strict ASCII range (1..127).
1343  *
1344  * It's important that we don't ever return a value that is not valid
1345  * in the database encoding, so that this doesn't become a way for
1346  * invalid data to enter the database.
1347  *
1348  ********************************************************************/
1349
1350 Datum
1351 chr(PG_FUNCTION_ARGS)
1352 {
1353         uint32          cvalue = PG_GETARG_UINT32(0);
1354         text       *result;
1355         int encoding = GetDatabaseEncoding();
1356
1357         if (encoding == PG_UTF8 && cvalue > 127)
1358         {
1359                 /* for Unicode we treat the argument as a code point */
1360                 int bytes ;
1361                 char *wch;
1362
1363                 /* We only allow valid Unicode code points */
1364                 if (cvalue > 0x001fffff)
1365                         ereport(ERROR,
1366                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1367                                          errmsg("requested character too large for encoding: %d", 
1368                                                         cvalue)));
1369
1370                 if (cvalue > 0xffff)
1371                         bytes = 4;
1372                 else if (cvalue > 0x07ff)
1373                         bytes = 3;
1374                 else
1375                         bytes = 2;
1376
1377                 result = (text *) palloc(VARHDRSZ + bytes);
1378                 SET_VARSIZE(result, VARHDRSZ + bytes);
1379                 wch = VARDATA(result);
1380
1381                 if (bytes == 2)
1382                 {
1383                         wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
1384                         wch[1] = 0x80 | (cvalue & 0x3F);;
1385                 }
1386                 else if (bytes == 3)
1387                 {
1388                         wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
1389                         wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
1390                         wch[2] = 0x80 | (cvalue & 0x3F);
1391                 }
1392                 else
1393                 {
1394                         wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
1395                         wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
1396                         wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
1397                         wch[3] = 0x80 | (cvalue & 0x3F);
1398                 }
1399                 
1400         }
1401
1402         else
1403         {
1404                 bool is_mb;
1405
1406                 /* Error out on arguments that make no sense or that we
1407                  * can't validly represent in the encoding.
1408                  */
1409
1410                 if (cvalue == 0)
1411                         ereport(ERROR,
1412                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1413                                          errmsg("null character not permitted")));
1414
1415                 is_mb = pg_encoding_max_length(encoding) > 1;
1416
1417                 if ((is_mb && (cvalue > 255)) || (! is_mb && (cvalue > 127)))
1418                         ereport(ERROR,
1419                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1420                                          errmsg("requested character too large for encoding: %d",
1421                                                         cvalue)));
1422                 
1423
1424                 result = (text *) palloc(VARHDRSZ + 1);
1425                 SET_VARSIZE(result, VARHDRSZ + 1);
1426                 *VARDATA(result) = (char) cvalue;
1427         }
1428
1429         PG_RETURN_TEXT_P(result);
1430 }
1431
1432 /********************************************************************
1433  *
1434  * repeat
1435  *
1436  * Syntax:
1437  *
1438  *       text repeat(text string, int val)
1439  *
1440  * Purpose:
1441  *
1442  *      Repeat string by val.
1443  *
1444  ********************************************************************/
1445
1446 Datum
1447 repeat(PG_FUNCTION_ARGS)
1448 {
1449         text       *string = PG_GETARG_TEXT_PP(0);
1450         int32           count = PG_GETARG_INT32(1);
1451         text       *result;
1452         int                     slen,
1453                                 tlen;
1454         int                     i;
1455         char       *cp,
1456                            *sp;
1457
1458         if (count < 0)
1459                 count = 0;
1460
1461         slen = VARSIZE_ANY_EXHDR(string);
1462         tlen = VARHDRSZ + (count * slen);
1463
1464         /* Check for integer overflow */
1465         if (slen != 0 && count != 0)
1466         {
1467                 int                     check = count * slen;
1468                 int                     check2 = check + VARHDRSZ;
1469
1470                 if ((check / slen) != count || check2 <= check)
1471                         ereport(ERROR,
1472                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1473                                          errmsg("requested length too large")));
1474         }
1475
1476         result = (text *) palloc(tlen);
1477
1478         SET_VARSIZE(result, tlen);
1479         cp = VARDATA(result);
1480         sp = VARDATA_ANY(string);
1481         for (i = 0; i < count; i++)
1482         {
1483                 memcpy(cp, sp, slen);
1484                 cp += slen;
1485         }
1486
1487         PG_RETURN_TEXT_P(result);
1488 }