]> granicus.if.org Git - postgresql/blob - contrib/unaccent/unaccent.c
Update copyright notices for year 2012.
[postgresql] / contrib / unaccent / unaccent.c
1 /*-------------------------------------------------------------------------
2  *
3  * unaccent.c
4  *        Text search unaccent dictionary
5  *
6  * Copyright (c) 2009-2012, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *        contrib/unaccent/unaccent.c
10  *
11  *-------------------------------------------------------------------------
12  */
13
#include "postgres.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
22
23 PG_MODULE_MAGIC;
24
/*
 * The unaccent dictionary finds the character to replace by walking an
 * uncompressed tree keyed by the bytes of the source string, first byte at
 * the root.  (The identifiers say "suffix tree", but since the key is
 * consumed from its first byte onward the structure is a prefix tree,
 * i.e. a trie.)  Each node of the tree is an array of 256 SuffixChar
 * structs; the n-th element of the array corresponds to byte value n.
 */
typedef struct SuffixChar
{
	/* child node (array of 256 entries) for the next byte, or NULL */
	struct SuffixChar *nextChar;
	/* replacement bytes, not NUL-terminated; NULL if no rule ends here */
	char	   *replaceTo;
	/* number of bytes stored in replaceTo */
	int			replacelen;
} SuffixChar;
37
38 /*
39  * placeChar - put str into tree's structure, byte by byte.
40  */
41 static SuffixChar *
42 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
43 {
44         SuffixChar *curnode;
45
46         if (!node)
47         {
48                 node = palloc(sizeof(SuffixChar) * 256);
49                 memset(node, 0, sizeof(SuffixChar) * 256);
50         }
51
52         curnode = node + *str;
53
54         if (lenstr == 1)
55         {
56                 if (curnode->replaceTo)
57                         elog(WARNING, "duplicate TO argument, use first one");
58                 else
59                 {
60                         curnode->replacelen = replacelen;
61                         curnode->replaceTo = palloc(replacelen);
62                         memcpy(curnode->replaceTo, replaceTo, replacelen);
63                 }
64         }
65         else
66         {
67                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
68         }
69
70         return node;
71 }
72
73 /*
74  * initSuffixTree  - create suffix tree from file. Function converts
75  * UTF8-encoded file into current encoding.
76  */
77 static SuffixChar *
78 initSuffixTree(char *filename)
79 {
80         SuffixChar *volatile rootSuffixTree = NULL;
81         MemoryContext ccxt = CurrentMemoryContext;
82         tsearch_readline_state trst;
83         volatile bool skip;
84
85         filename = get_tsearch_config_filename(filename, "rules");
86         if (!tsearch_readline_begin(&trst, filename))
87                 ereport(ERROR,
88                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
89                                  errmsg("could not open unaccent file \"%s\": %m",
90                                                 filename)));
91
92         do
93         {
94                 /*
95                  * pg_do_encoding_conversion() (called by tsearch_readline()) will
96                  * emit exception if it finds untranslatable characters in current
97                  * locale. We just skip such lines, continuing with the next.
98                  */
99                 skip = true;
100
101                 PG_TRY();
102                 {
103                         char       *line;
104
105                         while ((line = tsearch_readline(&trst)) != NULL)
106                         {
107                                 /*
108                                  * The format of each line must be "src trg" where src and trg
109                                  * are sequences of one or more non-whitespace characters,
110                                  * separated by whitespace.  Whitespace at start or end of
111                                  * line is ignored.
112                                  */
113                                 int                     state;
114                                 char       *ptr;
115                                 char       *src = NULL;
116                                 char       *trg = NULL;
117                                 int                     ptrlen;
118                                 int                     srclen = 0;
119                                 int                     trglen = 0;
120
121                                 state = 0;
122                                 for (ptr = line; *ptr; ptr += ptrlen)
123                                 {
124                                         ptrlen = pg_mblen(ptr);
125                                         /* ignore whitespace, but end src or trg */
126                                         if (t_isspace(ptr))
127                                         {
128                                                 if (state == 1)
129                                                         state = 2;
130                                                 else if (state == 3)
131                                                         state = 4;
132                                                 continue;
133                                         }
134                                         switch (state)
135                                         {
136                                                 case 0:
137                                                         /* start of src */
138                                                         src = ptr;
139                                                         srclen = ptrlen;
140                                                         state = 1;
141                                                         break;
142                                                 case 1:
143                                                         /* continue src */
144                                                         srclen += ptrlen;
145                                                         break;
146                                                 case 2:
147                                                         /* start of trg */
148                                                         trg = ptr;
149                                                         trglen = ptrlen;
150                                                         state = 3;
151                                                         break;
152                                                 case 3:
153                                                         /* continue trg */
154                                                         trglen += ptrlen;
155                                                         break;
156                                                 default:
157                                                         /* bogus line format */
158                                                         state = -1;
159                                                         break;
160                                         }
161                                 }
162
163                                 if (state >= 3)
164                                         rootSuffixTree = placeChar(rootSuffixTree,
165                                                                                            (unsigned char *) src, srclen,
166                                                                                            trg, trglen);
167
168                                 pfree(line);
169                         }
170                         skip = false;
171                 }
172                 PG_CATCH();
173                 {
174                         ErrorData  *errdata;
175                         MemoryContext ecxt;
176
177                         ecxt = MemoryContextSwitchTo(ccxt);
178                         errdata = CopyErrorData();
179                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
180                         {
181                                 FlushErrorState();
182                         }
183                         else
184                         {
185                                 MemoryContextSwitchTo(ecxt);
186                                 PG_RE_THROW();
187                         }
188                 }
189                 PG_END_TRY();
190         }
191         while (skip);
192
193         tsearch_readline_end(&trst);
194
195         return rootSuffixTree;
196 }
197
198 /*
199  * findReplaceTo - find multibyte character in tree
200  */
201 static SuffixChar *
202 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
203 {
204         while (node)
205         {
206                 node = node + *src;
207                 if (srclen == 1)
208                         return node;
209
210                 src++;
211                 srclen--;
212                 node = node->nextChar;
213         }
214
215         return NULL;
216 }
217
218 PG_FUNCTION_INFO_V1(unaccent_init);
219 Datum           unaccent_init(PG_FUNCTION_ARGS);
220 Datum
221 unaccent_init(PG_FUNCTION_ARGS)
222 {
223         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
224         SuffixChar *rootSuffixTree = NULL;
225         bool            fileloaded = false;
226         ListCell   *l;
227
228         foreach(l, dictoptions)
229         {
230                 DefElem    *defel = (DefElem *) lfirst(l);
231
232                 if (pg_strcasecmp("Rules", defel->defname) == 0)
233                 {
234                         if (fileloaded)
235                                 ereport(ERROR,
236                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
237                                                  errmsg("multiple Rules parameters")));
238                         rootSuffixTree = initSuffixTree(defGetString(defel));
239                         fileloaded = true;
240                 }
241                 else
242                 {
243                         ereport(ERROR,
244                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
245                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
246                                                         defel->defname)));
247                 }
248         }
249
250         if (!fileloaded)
251         {
252                 ereport(ERROR,
253                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
254                                  errmsg("missing Rules parameter")));
255         }
256
257         PG_RETURN_POINTER(rootSuffixTree);
258 }
259
260 PG_FUNCTION_INFO_V1(unaccent_lexize);
261 Datum           unaccent_lexize(PG_FUNCTION_ARGS);
262 Datum
263 unaccent_lexize(PG_FUNCTION_ARGS)
264 {
265         SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
266         char       *srcchar = (char *) PG_GETARG_POINTER(1);
267         int32           len = PG_GETARG_INT32(2);
268         char       *srcstart,
269                            *trgchar = NULL;
270         int                     charlen;
271         TSLexeme   *res = NULL;
272         SuffixChar *node;
273
274         srcstart = srcchar;
275         while (srcchar - srcstart < len)
276         {
277                 charlen = pg_mblen(srcchar);
278
279                 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
280                 if (node && node->replaceTo)
281                 {
282                         if (!res)
283                         {
284                                 /* allocate res only it it's needed */
285                                 res = palloc0(sizeof(TSLexeme) * 2);
286                                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
287                                 res->flags = TSL_FILTER;
288                                 if (srcchar != srcstart)
289                                 {
290                                         memcpy(trgchar, srcstart, srcchar - srcstart);
291                                         trgchar += (srcchar - srcstart);
292                                 }
293                         }
294                         memcpy(trgchar, node->replaceTo, node->replacelen);
295                         trgchar += node->replacelen;
296                 }
297                 else if (res)
298                 {
299                         memcpy(trgchar, srcchar, charlen);
300                         trgchar += charlen;
301                 }
302
303                 srcchar += charlen;
304         }
305
306         if (res)
307                 *trgchar = '\0';
308
309         PG_RETURN_POINTER(res);
310 }
311
312 /*
313  * Function-like wrapper for dictionary
314  */
315 PG_FUNCTION_INFO_V1(unaccent_dict);
316 Datum           unaccent_dict(PG_FUNCTION_ARGS);
317 Datum
318 unaccent_dict(PG_FUNCTION_ARGS)
319 {
320         text       *str;
321         int                     strArg;
322         Oid                     dictOid;
323         TSDictionaryCacheEntry *dict;
324         TSLexeme   *res;
325
326         if (PG_NARGS() == 1)
327         {
328                 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
329                 strArg = 0;
330         }
331         else
332         {
333                 dictOid = PG_GETARG_OID(0);
334                 strArg = 1;
335         }
336         str = PG_GETARG_TEXT_P(strArg);
337
338         dict = lookup_ts_dictionary_cache(dictOid);
339
340         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
341                                                                                          PointerGetDatum(dict->dictData),
342                                                                                            PointerGetDatum(VARDATA(str)),
343                                                                           Int32GetDatum(VARSIZE(str) - VARHDRSZ),
344                                                                                                          PointerGetDatum(NULL)));
345
346         PG_FREE_IF_COPY(str, strArg);
347
348         if (res == NULL)
349         {
350                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
351         }
352         else if (res->lexeme == NULL)
353         {
354                 pfree(res);
355                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
356         }
357         else
358         {
359                 text       *txt = cstring_to_text(res->lexeme);
360
361                 pfree(res->lexeme);
362                 pfree(res);
363
364                 PG_RETURN_TEXT_P(txt);
365         }
366 }