/*-------------------------------------------------------------------------
 *
 * locale compatibility layer for tsearch
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 * src/backend/tsearch/ts_locale.c
 *
 *-------------------------------------------------------------------------
 */
16 #include "catalog/pg_collation.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
21 static void tsearch_readline_callback(void *arg);
25 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
26 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
27 * getting from char2wchar() is UTF16 not UTF32. A single input character
28 * may therefore produce a surrogate pair rather than just one wchar_t;
29 * we also need room for a trailing null. When we do get a surrogate pair,
30 * we pass just the first code to iswdigit() etc, so that these functions will
31 * always return false for characters outside the Basic Multilingual Plane.
36 t_isdigit(const char *ptr)
38 int clen = pg_mblen(ptr);
39 wchar_t character[WC_BUF_LEN];
40 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
41 pg_locale_t mylocale = 0; /* TODO */
43 if (clen == 1 || lc_ctype_is_c(collation))
44 return isdigit(TOUCHAR(ptr));
46 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
48 return iswdigit((wint_t) character[0]);
52 t_isspace(const char *ptr)
54 int clen = pg_mblen(ptr);
55 wchar_t character[WC_BUF_LEN];
56 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
57 pg_locale_t mylocale = 0; /* TODO */
59 if (clen == 1 || lc_ctype_is_c(collation))
60 return isspace(TOUCHAR(ptr));
62 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
64 return iswspace((wint_t) character[0]);
68 t_isalpha(const char *ptr)
70 int clen = pg_mblen(ptr);
71 wchar_t character[WC_BUF_LEN];
72 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
73 pg_locale_t mylocale = 0; /* TODO */
75 if (clen == 1 || lc_ctype_is_c(collation))
76 return isalpha(TOUCHAR(ptr));
78 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
80 return iswalpha((wint_t) character[0]);
84 t_isprint(const char *ptr)
86 int clen = pg_mblen(ptr);
87 wchar_t character[WC_BUF_LEN];
88 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
89 pg_locale_t mylocale = 0; /* TODO */
91 if (clen == 1 || lc_ctype_is_c(collation))
92 return isprint(TOUCHAR(ptr));
94 char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
96 return iswprint((wint_t) character[0]);
101 * Set up to read a file using tsearch_readline(). This facility is
102 * better than just reading the file directly because it provides error
103 * context pointing to the specific line where a problem is detected.
107 * tsearch_readline_state trst;
109 * if (!tsearch_readline_begin(&trst, filename))
111 * (errcode(ERRCODE_CONFIG_FILE_ERROR),
112 * errmsg("could not open stop-word file \"%s\": %m",
114 * while ((line = tsearch_readline(&trst)) != NULL)
116 * tsearch_readline_end(&trst);
118 * Note that the caller supplies the ereport() for file open failure;
119 * this is so that a custom message can be provided. The filename string
120 * passed to tsearch_readline_begin() must remain valid through
121 * tsearch_readline_end().
124 tsearch_readline_begin(tsearch_readline_state *stp,
125 const char *filename)
127 if ((stp->fp = AllocateFile(filename, "r")) == NULL)
129 stp->filename = filename;
132 /* Setup error traceback support for ereport() */
133 stp->cb.callback = tsearch_readline_callback;
134 stp->cb.arg = (void *) stp;
135 stp->cb.previous = error_context_stack;
136 error_context_stack = &stp->cb;
141 * Read the next line from a tsearch data file (expected to be in UTF-8), and
142 * convert it to database encoding if needed. The returned string is palloc'd.
143 * NULL return means EOF.
146 tsearch_readline(tsearch_readline_state *stp)
152 result = t_readline(stp->fp);
153 stp->curline = result;
158 * Close down after reading a file with tsearch_readline()
161 tsearch_readline_end(tsearch_readline_state *stp)
164 /* Pop the error context stack */
165 error_context_stack = stp->cb.previous;
169 * Error context callback for errors occurring while reading a tsearch
170 * configuration file.
173 tsearch_readline_callback(void *arg)
175 tsearch_readline_state *stp = (tsearch_readline_state *) arg;
178 * We can't include the text of the config line for errors that occur
179 * during t_readline() itself. This is only partly a consequence of our
180 * arms-length use of that routine: the major cause of such errors is
181 * encoding violations, and we daren't try to print error messages
182 * containing badly-encoded data.
185 errcontext("line %d of configuration file \"%s\": \"%s\"",
190 errcontext("line %d of configuration file \"%s\"",
197 * Read the next line from a tsearch data file (expected to be in UTF-8), and
198 * convert it to database encoding if needed. The returned string is palloc'd.
199 * NULL return means EOF.
201 * Note: direct use of this function is now deprecated. Go through
202 * tsearch_readline() to provide better error reporting.
209 char buf[4096]; /* lines must not be longer than this */
211 if (fgets(buf, sizeof(buf), fp) == NULL)
216 /* Make sure the input is valid UTF-8 */
217 (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
220 recoded = pg_any_to_server(buf, len, PG_UTF8);
224 * conversion didn't pstrdup, so we must. We can use the length of the
225 * original string, because no conversion was done.
227 recoded = pnstrdup(recoded, len);
/*
 * lowerstr --- fold null-terminated string to lower case
 *
 * Convenience wrapper that measures str with strlen() and defers to
 * lowerstr_with_len().
 *
 * Returned string is palloc'd
 */
char *
lowerstr(const char *str)
{
	return lowerstr_with_len(str, strlen(str));
}
245 * lowerstr_with_len --- fold string to lower case
247 * Input string need not be null-terminated.
249 * Returned string is palloc'd
252 lowerstr_with_len(const char *str, int len)
255 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
256 pg_locale_t mylocale = 0; /* TODO */
262 * Use wide char code only when max encoding length > 1 and ctype != C.
263 * Some operating systems fail with multi-byte encodings and a C locale.
264 * Also, for a C locale there is no need to process as multibyte. From
265 * backend/utils/adt/oracle_compat.c Teodor
267 if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
274 * alloc number of wchar_t for worst case, len contains number of
275 * bytes >= number of characters and alloc 1 wchar_t for 0, because
276 * wchar2char wants zero-terminated string
278 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
280 wlen = char2wchar(wstr, len + 1, str, len, mylocale);
285 *wptr = towlower((wint_t) *wptr);
290 * Alloc result string for worst case + '\0'
292 len = pg_database_encoding_max_length() * wlen + 1;
293 out = (char *) palloc(len);
295 wlen = wchar2char(out, wstr, len, mylocale);
301 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
302 errmsg("conversion from wchar_t to server encoding failed: %m")));
307 const char *ptr = str;
310 outptr = out = (char *) palloc(sizeof(char) * (len + 1));
311 while ((ptr - str) < len && *ptr)
313 *outptr++ = tolower(TOUCHAR(ptr));