]> granicus.if.org Git - postgresql/blob - contrib/unaccent/unaccent.c
Create function prototype as part of PG_FUNCTION_INFO_V1 macro
[postgresql] / contrib / unaccent / unaccent.c
1 /*-------------------------------------------------------------------------
2  *
3  * unaccent.c
4  *        Text search unaccent dictionary
5  *
6  * Copyright (c) 2009-2014, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *        contrib/unaccent/unaccent.c
10  *
11  *-------------------------------------------------------------------------
12  */
13
14 #include "postgres.h"
15
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "utils/builtins.h"
22
23 PG_MODULE_MAGIC;
24
/*
 * The unaccent dictionary looks up replacement strings in a byte-wise trie.
 * Every trie level is an array of 256 TrieChar cells; the cell at index n
 * of a level stands for byte value n at that position of the source string.
 */
typedef struct TrieChar TrieChar;

struct TrieChar
{
	/* next trie level (array of 256 cells), or NULL if no longer key exists */
	TrieChar   *nextChar;
	/* replacement bytes for the key ending at this cell, or NULL if none */
	char	   *replaceTo;
	/* number of bytes stored in replaceTo (no terminator is stored) */
	int			replacelen;
};
36
37 /*
38  * placeChar - put str into trie's structure, byte by byte.
39  */
40 static TrieChar *
41 placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
42 {
43         TrieChar   *curnode;
44
45         if (!node)
46         {
47                 node = palloc(sizeof(TrieChar) * 256);
48                 memset(node, 0, sizeof(TrieChar) * 256);
49         }
50
51         curnode = node + *str;
52
53         if (lenstr == 1)
54         {
55                 if (curnode->replaceTo)
56                         elog(WARNING, "duplicate TO argument, use first one");
57                 else
58                 {
59                         curnode->replacelen = replacelen;
60                         curnode->replaceTo = palloc(replacelen);
61                         memcpy(curnode->replaceTo, replaceTo, replacelen);
62                 }
63         }
64         else
65         {
66                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
67         }
68
69         return node;
70 }
71
72 /*
73  * initTrie  - create trie from file.
74  *
75  * Function converts UTF8-encoded file into current encoding.
76  */
77 static TrieChar *
78 initTrie(char *filename)
79 {
80         TrieChar   *volatile rootTrie = NULL;
81         MemoryContext ccxt = CurrentMemoryContext;
82         tsearch_readline_state trst;
83         volatile bool skip;
84
85         filename = get_tsearch_config_filename(filename, "rules");
86         if (!tsearch_readline_begin(&trst, filename))
87                 ereport(ERROR,
88                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
89                                  errmsg("could not open unaccent file \"%s\": %m",
90                                                 filename)));
91
92         do
93         {
94                 /*
95                  * pg_do_encoding_conversion() (called by tsearch_readline()) will
96                  * emit exception if it finds untranslatable characters in current
97                  * locale. We just skip such lines, continuing with the next.
98                  */
99                 skip = true;
100
101                 PG_TRY();
102                 {
103                         char       *line;
104
105                         while ((line = tsearch_readline(&trst)) != NULL)
106                         {
107                                 /*
108                                  * The format of each line must be "src trg" where src and trg
109                                  * are sequences of one or more non-whitespace characters,
110                                  * separated by whitespace.  Whitespace at start or end of
111                                  * line is ignored.
112                                  */
113                                 int                     state;
114                                 char       *ptr;
115                                 char       *src = NULL;
116                                 char       *trg = NULL;
117                                 int                     ptrlen;
118                                 int                     srclen = 0;
119                                 int                     trglen = 0;
120
121                                 state = 0;
122                                 for (ptr = line; *ptr; ptr += ptrlen)
123                                 {
124                                         ptrlen = pg_mblen(ptr);
125                                         /* ignore whitespace, but end src or trg */
126                                         if (t_isspace(ptr))
127                                         {
128                                                 if (state == 1)
129                                                         state = 2;
130                                                 else if (state == 3)
131                                                         state = 4;
132                                                 continue;
133                                         }
134                                         switch (state)
135                                         {
136                                                 case 0:
137                                                         /* start of src */
138                                                         src = ptr;
139                                                         srclen = ptrlen;
140                                                         state = 1;
141                                                         break;
142                                                 case 1:
143                                                         /* continue src */
144                                                         srclen += ptrlen;
145                                                         break;
146                                                 case 2:
147                                                         /* start of trg */
148                                                         trg = ptr;
149                                                         trglen = ptrlen;
150                                                         state = 3;
151                                                         break;
152                                                 case 3:
153                                                         /* continue trg */
154                                                         trglen += ptrlen;
155                                                         break;
156                                                 default:
157                                                         /* bogus line format */
158                                                         state = -1;
159                                                         break;
160                                         }
161                                 }
162
163                                 if (state >= 3)
164                                         rootTrie = placeChar(rootTrie,
165                                                                                  (unsigned char *) src, srclen,
166                                                                                  trg, trglen);
167
168                                 pfree(line);
169                         }
170                         skip = false;
171                 }
172                 PG_CATCH();
173                 {
174                         ErrorData  *errdata;
175                         MemoryContext ecxt;
176
177                         ecxt = MemoryContextSwitchTo(ccxt);
178                         errdata = CopyErrorData();
179                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
180                         {
181                                 FlushErrorState();
182                         }
183                         else
184                         {
185                                 MemoryContextSwitchTo(ecxt);
186                                 PG_RE_THROW();
187                         }
188                 }
189                 PG_END_TRY();
190         }
191         while (skip);
192
193         tsearch_readline_end(&trst);
194
195         return rootTrie;
196 }
197
198 /*
199  * findReplaceTo - find multibyte character in trie
200  */
201 static TrieChar *
202 findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
203 {
204         while (node)
205         {
206                 node = node + *src;
207                 if (srclen == 1)
208                         return node;
209
210                 src++;
211                 srclen--;
212                 node = node->nextChar;
213         }
214
215         return NULL;
216 }
217
218 PG_FUNCTION_INFO_V1(unaccent_init);
219 Datum
220 unaccent_init(PG_FUNCTION_ARGS)
221 {
222         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
223         TrieChar   *rootTrie = NULL;
224         bool            fileloaded = false;
225         ListCell   *l;
226
227         foreach(l, dictoptions)
228         {
229                 DefElem    *defel = (DefElem *) lfirst(l);
230
231                 if (pg_strcasecmp("Rules", defel->defname) == 0)
232                 {
233                         if (fileloaded)
234                                 ereport(ERROR,
235                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
236                                                  errmsg("multiple Rules parameters")));
237                         rootTrie = initTrie(defGetString(defel));
238                         fileloaded = true;
239                 }
240                 else
241                 {
242                         ereport(ERROR,
243                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
244                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
245                                                         defel->defname)));
246                 }
247         }
248
249         if (!fileloaded)
250         {
251                 ereport(ERROR,
252                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
253                                  errmsg("missing Rules parameter")));
254         }
255
256         PG_RETURN_POINTER(rootTrie);
257 }
258
259 PG_FUNCTION_INFO_V1(unaccent_lexize);
260 Datum
261 unaccent_lexize(PG_FUNCTION_ARGS)
262 {
263         TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
264         char       *srcchar = (char *) PG_GETARG_POINTER(1);
265         int32           len = PG_GETARG_INT32(2);
266         char       *srcstart,
267                            *trgchar = NULL;
268         int                     charlen;
269         TSLexeme   *res = NULL;
270         TrieChar   *node;
271
272         srcstart = srcchar;
273         while (srcchar - srcstart < len)
274         {
275                 charlen = pg_mblen(srcchar);
276
277                 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
278                 if (node && node->replaceTo)
279                 {
280                         if (!res)
281                         {
282                                 /* allocate res only if it's needed */
283                                 res = palloc0(sizeof(TSLexeme) * 2);
284                                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
285                                 res->flags = TSL_FILTER;
286                                 if (srcchar != srcstart)
287                                 {
288                                         memcpy(trgchar, srcstart, srcchar - srcstart);
289                                         trgchar += (srcchar - srcstart);
290                                 }
291                         }
292                         memcpy(trgchar, node->replaceTo, node->replacelen);
293                         trgchar += node->replacelen;
294                 }
295                 else if (res)
296                 {
297                         memcpy(trgchar, srcchar, charlen);
298                         trgchar += charlen;
299                 }
300
301                 srcchar += charlen;
302         }
303
304         if (res)
305                 *trgchar = '\0';
306
307         PG_RETURN_POINTER(res);
308 }
309
310 /*
311  * Function-like wrapper for dictionary
312  */
313 PG_FUNCTION_INFO_V1(unaccent_dict);
314 Datum
315 unaccent_dict(PG_FUNCTION_ARGS)
316 {
317         text       *str;
318         int                     strArg;
319         Oid                     dictOid;
320         TSDictionaryCacheEntry *dict;
321         TSLexeme   *res;
322
323         if (PG_NARGS() == 1)
324         {
325                 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
326                 strArg = 0;
327         }
328         else
329         {
330                 dictOid = PG_GETARG_OID(0);
331                 strArg = 1;
332         }
333         str = PG_GETARG_TEXT_P(strArg);
334
335         dict = lookup_ts_dictionary_cache(dictOid);
336
337         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
338                                                                                          PointerGetDatum(dict->dictData),
339                                                                                            PointerGetDatum(VARDATA(str)),
340                                                                           Int32GetDatum(VARSIZE(str) - VARHDRSZ),
341                                                                                                          PointerGetDatum(NULL)));
342
343         PG_FREE_IF_COPY(str, strArg);
344
345         if (res == NULL)
346         {
347                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
348         }
349         else if (res->lexeme == NULL)
350         {
351                 pfree(res);
352                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
353         }
354         else
355         {
356                 text       *txt = cstring_to_text(res->lexeme);
357
358                 pfree(res->lexeme);
359                 pfree(res);
360
361                 PG_RETURN_TEXT_P(txt);
362         }
363 }