]> granicus.if.org Git - postgresql/blob - contrib/unaccent/unaccent.c
Stamp copyrights for year 2011.
[postgresql] / contrib / unaccent / unaccent.c
1 /*-------------------------------------------------------------------------
2  *
3  * unaccent.c
4  *        Text search unaccent dictionary
5  *
6  * Copyright (c) 2009-2011, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *        contrib/unaccent/unaccent.c
10  *
11  *-------------------------------------------------------------------------
12  */
13
14 #include "postgres.h"
15
16 #include "fmgr.h"
17 #include "catalog/namespace.h"
18 #include "commands/defrem.h"
19 #include "mb/pg_wchar.h"
20 #include "tsearch/ts_cache.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "utils/builtins.h"
24
25 PG_MODULE_MAGIC;
26
/*
 * Unaccent dictionary uses an uncompressed tree to find a character to
 * replace.  The code calls it a "suffix tree"; note that placeChar() and
 * findReplaceTo() walk it starting from the first byte of the source string,
 * so it is keyed byte by byte from the front.
 *
 * Each node of the tree is an array of SuffixChar structs with
 * length = 256 (the n-th element of the array corresponds to byte value n).
 */
typedef struct SuffixChar
{
        /* child node (another 256-entry array) for the next byte, or NULL */
        struct SuffixChar *nextChar;
        /* replacement bytes for a rule ending at this byte; NULL if none.
         * Stored without a terminating NUL, see replacelen. */
        char       *replaceTo;
        /* number of bytes stored in replaceTo */
        int                     replacelen;
} SuffixChar;
39
40 /*
41  * placeChar - put str into tree's structure, byte by byte.
42  */
43 static SuffixChar *
44 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
45 {
46         SuffixChar *curnode;
47
48         if (!node)
49         {
50                 node = palloc(sizeof(SuffixChar) * 256);
51                 memset(node, 0, sizeof(SuffixChar) * 256);
52         }
53
54         curnode = node + *str;
55
56         if (lenstr == 1)
57         {
58                 if (curnode->replaceTo)
59                         elog(WARNING, "duplicate TO argument, use first one");
60                 else
61                 {
62                         curnode->replacelen = replacelen;
63                         curnode->replaceTo = palloc(replacelen);
64                         memcpy(curnode->replaceTo, replaceTo, replacelen);
65                 }
66         }
67         else
68         {
69                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
70         }
71
72         return node;
73 }
74
75 /*
76  * initSuffixTree  - create suffix tree from file. Function converts
77  * UTF8-encoded file into current encoding.
78  */
79 static SuffixChar *
80 initSuffixTree(char *filename)
81 {
82         SuffixChar *volatile rootSuffixTree = NULL;
83         MemoryContext ccxt = CurrentMemoryContext;
84         tsearch_readline_state trst;
85         volatile bool skip;
86
87         filename = get_tsearch_config_filename(filename, "rules");
88         if (!tsearch_readline_begin(&trst, filename))
89                 ereport(ERROR,
90                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
91                                  errmsg("could not open unaccent file \"%s\": %m",
92                                                 filename)));
93
94         do
95         {
96                 char            src[4096];
97                 char            trg[4096];
98                 int                     srclen;
99                 int                     trglen;
100                 char       *line = NULL;
101
102                 skip = true;
103
104                 PG_TRY();
105                 {
106                         /*
107                          * pg_do_encoding_conversion() (called by tsearch_readline()) will
108                          * emit exception if it finds untranslatable characters in current
109                          * locale. We just skip such characters.
110                          */
111                         while ((line = tsearch_readline(&trst)) != NULL)
112                         {
113                                 if (sscanf(line, "%s\t%s\n", src, trg) != 2)
114                                         continue;
115
116                                 srclen = strlen(src);
117                                 trglen = strlen(trg);
118
119                                 rootSuffixTree = placeChar(rootSuffixTree,
120                                                                                    (unsigned char *) src, srclen,
121                                                                                    trg, trglen);
122                                 skip = false;
123                                 pfree(line);
124                         }
125                 }
126                 PG_CATCH();
127                 {
128                         ErrorData  *errdata;
129                         MemoryContext ecxt;
130
131                         ecxt = MemoryContextSwitchTo(ccxt);
132                         errdata = CopyErrorData();
133                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
134                         {
135                                 FlushErrorState();
136                         }
137                         else
138                         {
139                                 MemoryContextSwitchTo(ecxt);
140                                 PG_RE_THROW();
141                         }
142                 }
143                 PG_END_TRY();
144         }
145         while (skip);
146
147         tsearch_readline_end(&trst);
148
149         return rootSuffixTree;
150 }
151
152 /*
153  * findReplaceTo - find multibyte character in tree
154  */
155 static SuffixChar *
156 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
157 {
158         while (node)
159         {
160                 node = node + *src;
161                 if (srclen == 1)
162                         return node;
163
164                 src++;
165                 srclen--;
166                 node = node->nextChar;
167         }
168
169         return NULL;
170 }
171
172 PG_FUNCTION_INFO_V1(unaccent_init);
173 Datum           unaccent_init(PG_FUNCTION_ARGS);
174 Datum
175 unaccent_init(PG_FUNCTION_ARGS)
176 {
177         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
178         SuffixChar *rootSuffixTree = NULL;
179         bool            fileloaded = false;
180         ListCell   *l;
181
182         foreach(l, dictoptions)
183         {
184                 DefElem    *defel = (DefElem *) lfirst(l);
185
186                 if (pg_strcasecmp("Rules", defel->defname) == 0)
187                 {
188                         if (fileloaded)
189                                 ereport(ERROR,
190                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
191                                                  errmsg("multiple Rules parameters")));
192                         rootSuffixTree = initSuffixTree(defGetString(defel));
193                         fileloaded = true;
194                 }
195                 else
196                 {
197                         ereport(ERROR,
198                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
199                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
200                                                         defel->defname)));
201                 }
202         }
203
204         if (!fileloaded)
205         {
206                 ereport(ERROR,
207                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
208                                  errmsg("missing Rules parameter")));
209         }
210
211         PG_RETURN_POINTER(rootSuffixTree);
212 }
213
214 PG_FUNCTION_INFO_V1(unaccent_lexize);
215 Datum           unaccent_lexize(PG_FUNCTION_ARGS);
216 Datum
217 unaccent_lexize(PG_FUNCTION_ARGS)
218 {
219         SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
220         char       *srcchar = (char *) PG_GETARG_POINTER(1);
221         int32           len = PG_GETARG_INT32(2);
222         char       *srcstart,
223                            *trgchar = NULL;
224         int                     charlen;
225         TSLexeme   *res = NULL;
226         SuffixChar *node;
227
228         srcstart = srcchar;
229         while (srcchar - srcstart < len)
230         {
231                 charlen = pg_mblen(srcchar);
232
233                 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
234                 if (node && node->replaceTo)
235                 {
236                         if (!res)
237                         {
238                                 /* allocate res only it it's needed */
239                                 res = palloc0(sizeof(TSLexeme) * 2);
240                                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
241                                 res->flags = TSL_FILTER;
242                                 if (srcchar != srcstart)
243                                 {
244                                         memcpy(trgchar, srcstart, srcchar - srcstart);
245                                         trgchar += (srcchar - srcstart);
246                                 }
247                         }
248                         memcpy(trgchar, node->replaceTo, node->replacelen);
249                         trgchar += node->replacelen;
250                 }
251                 else if (res)
252                 {
253                         memcpy(trgchar, srcchar, charlen);
254                         trgchar += charlen;
255                 }
256
257                 srcchar += charlen;
258         }
259
260         if (res)
261                 *trgchar = '\0';
262
263         PG_RETURN_POINTER(res);
264 }
265
266 /*
267  * Function-like wrapper for dictionary
268  */
269 PG_FUNCTION_INFO_V1(unaccent_dict);
270 Datum           unaccent_dict(PG_FUNCTION_ARGS);
271 Datum
272 unaccent_dict(PG_FUNCTION_ARGS)
273 {
274         text       *str;
275         int                     strArg;
276         Oid                     dictOid;
277         TSDictionaryCacheEntry *dict;
278         TSLexeme   *res;
279
280         if (PG_NARGS() == 1)
281         {
282                 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
283                 strArg = 0;
284         }
285         else
286         {
287                 dictOid = PG_GETARG_OID(0);
288                 strArg = 1;
289         }
290         str = PG_GETARG_TEXT_P(strArg);
291
292         dict = lookup_ts_dictionary_cache(dictOid);
293
294         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
295                                                                                          PointerGetDatum(dict->dictData),
296                                                                                            PointerGetDatum(VARDATA(str)),
297                                                                           Int32GetDatum(VARSIZE(str) - VARHDRSZ),
298                                                                                                          PointerGetDatum(NULL)));
299
300         PG_FREE_IF_COPY(str, strArg);
301
302         if (res == NULL)
303         {
304                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
305         }
306         else if (res->lexeme == NULL)
307         {
308                 pfree(res);
309                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
310         }
311         else
312         {
313                 text       *txt = cstring_to_text(res->lexeme);
314
315                 pfree(res->lexeme);
316                 pfree(res);
317
318                 PG_RETURN_TEXT_P(txt);
319         }
320 }