]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/ts_locale.c
Create a selectivity estimation function for the text search @@ operator.
[postgresql] / src / backend / tsearch / ts_locale.c
1 /*-------------------------------------------------------------------------
2  *
3  * ts_locale.c
4  *              locale compatibility layer for tsearch
5  *
6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.10 2008/06/18 20:55:42 tgl Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include "storage/fd.h"
17 #include "tsearch/ts_locale.h"
18 #include "tsearch/ts_public.h"
19
20 static void tsearch_readline_callback(void *arg);
21
22
23 #ifdef USE_WIDE_UPPER_LOWER
24
/*
 * t_isdigit --- is the character starting at ptr a digit?
 *
 * A character that pg_mblen() reports as one byte long, or any character
 * when lc_ctype_is_c() is true, is classified with isdigit() on its first
 * byte.  Otherwise it is converted with char2wchar() and classified with
 * iswdigit().  The return value is whatever those functions return.
 */
int
t_isdigit(const char *ptr)
{
	wchar_t		wc[2];
	int			nbytes = pg_mblen(ptr);

	/* multibyte character and a non-C ctype: take the wide-character route */
	if (nbytes != 1 && !lc_ctype_is_c())
	{
		char2wchar(wc, 2, ptr, nbytes);
		return iswdigit((wint_t) wc[0]);
	}

	return isdigit(TOUCHAR(ptr));
}
38
/*
 * t_isspace --- is the character starting at ptr whitespace?
 *
 * A character that pg_mblen() reports as one byte long, or any character
 * when lc_ctype_is_c() is true, is classified with isspace() on its first
 * byte.  Otherwise it is converted with char2wchar() and classified with
 * iswspace().  The return value is whatever those functions return.
 */
int
t_isspace(const char *ptr)
{
	wchar_t		wc[2];
	int			nbytes = pg_mblen(ptr);

	/* multibyte character and a non-C ctype: take the wide-character route */
	if (nbytes != 1 && !lc_ctype_is_c())
	{
		char2wchar(wc, 2, ptr, nbytes);
		return iswspace((wint_t) wc[0]);
	}

	return isspace(TOUCHAR(ptr));
}
52
/*
 * t_isalpha --- is the character starting at ptr alphabetic?
 *
 * A character that pg_mblen() reports as one byte long, or any character
 * when lc_ctype_is_c() is true, is classified with isalpha() on its first
 * byte.  Otherwise it is converted with char2wchar() and classified with
 * iswalpha().  The return value is whatever those functions return.
 */
int
t_isalpha(const char *ptr)
{
	wchar_t		wc[2];
	int			nbytes = pg_mblen(ptr);

	/* multibyte character and a non-C ctype: take the wide-character route */
	if (nbytes != 1 && !lc_ctype_is_c())
	{
		char2wchar(wc, 2, ptr, nbytes);
		return iswalpha((wint_t) wc[0]);
	}

	return isalpha(TOUCHAR(ptr));
}
66
/*
 * t_isprint --- is the character starting at ptr printable?
 *
 * A character that pg_mblen() reports as one byte long, or any character
 * when lc_ctype_is_c() is true, is classified with isprint() on its first
 * byte.  Otherwise it is converted with char2wchar() and classified with
 * iswprint().  The return value is whatever those functions return.
 */
int
t_isprint(const char *ptr)
{
	wchar_t		wc[2];
	int			nbytes = pg_mblen(ptr);

	/* multibyte character and a non-C ctype: take the wide-character route */
	if (nbytes != 1 && !lc_ctype_is_c())
	{
		char2wchar(wc, 2, ptr, nbytes);
		return iswprint((wint_t) wc[0]);
	}

	return isprint(TOUCHAR(ptr));
}
80 #endif   /* USE_WIDE_UPPER_LOWER */
81
82
83 /*
84  * Set up to read a file using tsearch_readline().  This facility is
85  * better than just reading the file directly because it provides error
86  * context pointing to the specific line where a problem is detected.
87  *
88  * Expected usage is:
89  *
90  *              tsearch_readline_state trst;
91  *
92  *              if (!tsearch_readline_begin(&trst, filename))
93  *                      ereport(ERROR,
94  *                                      (errcode(ERRCODE_CONFIG_FILE_ERROR),
95  *                                       errmsg("could not open stop-word file \"%s\": %m",
96  *                                                      filename)));
97  *              while ((line = tsearch_readline(&trst)) != NULL)
98  *                      process line;
99  *              tsearch_readline_end(&trst);
100  *
101  * Note that the caller supplies the ereport() for file open failure;
102  * this is so that a custom message can be provided.  The filename string
103  * passed to tsearch_readline_begin() must remain valid through
104  * tsearch_readline_end().
105  */
106 bool
107 tsearch_readline_begin(tsearch_readline_state *stp,
108                                            const char *filename)
109 {
110         if ((stp->fp = AllocateFile(filename, "r")) == NULL)
111                 return false;
112         stp->filename = filename;
113         stp->lineno = 0;
114         stp->curline = NULL;
115         /* Setup error traceback support for ereport() */
116         stp->cb.callback = tsearch_readline_callback;
117         stp->cb.arg = (void *) stp;
118         stp->cb.previous = error_context_stack;
119         error_context_stack = &stp->cb;
120         return true;
121 }
122
123 /*
124  * Read the next line from a tsearch data file (expected to be in UTF-8), and
125  * convert it to database encoding if needed. The returned string is palloc'd.
126  * NULL return means EOF.
127  */
128 char *
129 tsearch_readline(tsearch_readline_state *stp)
130 {
131         char       *result;
132
133         stp->lineno++;
134         stp->curline = NULL;
135         result = t_readline(stp->fp);
136         stp->curline = result;
137         return result;
138 }
139
140 /*
141  * Close down after reading a file with tsearch_readline()
142  */
143 void
144 tsearch_readline_end(tsearch_readline_state *stp)
145 {
146         FreeFile(stp->fp);
147         /* Pop the error context stack */
148         error_context_stack = stp->cb.previous;
149 }
150
151 /*
152  * Error context callback for errors occurring while reading a tsearch
153  * configuration file.
154  */
155 static void
156 tsearch_readline_callback(void *arg)
157 {
158         tsearch_readline_state *stp = (tsearch_readline_state *) arg;
159
160         /*
161          * We can't include the text of the config line for errors that occur
162          * during t_readline() itself.  This is only partly a consequence of
163          * our arms-length use of that routine: the major cause of such
164          * errors is encoding violations, and we daren't try to print error
165          * messages containing badly-encoded data.
166          */
167         if (stp->curline)
168                 errcontext("line %d of configuration file \"%s\": \"%s\"",
169                                    stp->lineno,
170                                    stp->filename,
171                                    stp->curline);
172         else
173                 errcontext("line %d of configuration file \"%s\"",
174                                    stp->lineno,
175                                    stp->filename);
176 }
177
178
179 /*
180  * Read the next line from a tsearch data file (expected to be in UTF-8), and
181  * convert it to database encoding if needed. The returned string is palloc'd.
182  * NULL return means EOF.
183  *
184  * Note: direct use of this function is now deprecated.  Go through
185  * tsearch_readline() to provide better error reporting.
186  */
187 char *
188 t_readline(FILE *fp)
189 {
190         int                     len;
191         char       *recoded;
192         char            buf[4096];              /* lines must not be longer than this */
193
194         if (fgets(buf, sizeof(buf), fp) == NULL)
195                 return NULL;
196
197         len = strlen(buf);
198
199         /* Make sure the input is valid UTF-8 */
200         (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
201
202         /* And convert */
203         recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf,
204                                                                                                  len,
205                                                                                                  PG_UTF8,
206                                                                                                  GetDatabaseEncoding());
207
208         if (recoded == NULL)            /* should not happen */
209                 elog(ERROR, "encoding conversion failed");
210
211         if (recoded == buf)
212         {
213                 /*
214                  * conversion didn't pstrdup, so we must. We can use the length of the
215                  * original string, because no conversion was done.
216                  */
217                 recoded = pnstrdup(recoded, len);
218         }
219
220         return recoded;
221 }
222
/*
 * lowerstr --- produce a lower-cased copy of a null-terminated string
 *
 * Measures the string with strlen() and hands the work to
 * lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
233
/*
 * lowerstr_with_len --- produce a lower-cased copy of a string
 *
 * The input need not be null-terminated; len gives its length in bytes.
 * A null character inside the first len bytes ends the folding early.
 *
 * The result is a freshly palloc'd, null-terminated string.
 */
char *
lowerstr_with_len(const char *str, int len)
{
	char	   *out;

	/* Nothing to fold: return an empty palloc'd string */
	if (len == 0)
		return pstrdup("");

#ifdef USE_WIDE_UPPER_LOWER

	/*
	 * The wide-character path is taken only when the database encoding is
	 * multibyte and ctype is not C.  Some operating systems fail with
	 * multi-byte encodings and a C locale, and for a C locale there is no
	 * need to process as multibyte anyway.  (Approach taken from
	 * backend/utils/adt/oracle_compat.c --- Teodor)
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
	{
		wchar_t    *wide;
		int			nwide;
		int			outsize;
		int			nbytes;
		int			i;

		/*
		 * Worst case is one wchar_t per input byte, since the byte count
		 * is >= the character count; one more is needed for the
		 * terminating zero, because wchar2char wants a zero-terminated
		 * string.
		 */
		wide = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		nwide = char2wchar(wide, len + 1, str, len);
		Assert(nwide <= len);

		/* Fold in place, up to the terminating zero */
		for (i = 0; wide[i] != 0; i++)
			wide[i] = towlower((wint_t) wide[i]);

		/*
		 * Size the result for the worst case: every character at maximum
		 * encoded length, plus '\0'.
		 */
		outsize = pg_database_encoding_max_length() * nwide + 1;
		out = (char *) palloc(outsize);

		nbytes = wchar2char(out, wide, outsize);

		pfree(wide);

		if (nbytes < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("conversion from wchar_t to server encoding failed: %m")));
		Assert(nbytes < outsize);
	}
	else
#endif   /* USE_WIDE_UPPER_LOWER */
	{
		int			i;

		/* Byte-at-a-time folding with tolower() */
		out = (char *) palloc(sizeof(char) * (len + 1));

		for (i = 0; i < len && str[i] != '\0'; i++)
			out[i] = tolower(TOUCHAR(str + i));
		out[i] = '\0';
	}

	return out;
}