1 /*-------------------------------------------------------------------------
4 * Text search unaccent dictionary
6 * Copyright (c) 2009-2011, PostgreSQL Global Development Group
9 * contrib/unaccent/unaccent.c
11 *-------------------------------------------------------------------------
17 #include "catalog/namespace.h"
18 #include "commands/defrem.h"
19 #include "mb/pg_wchar.h"
20 #include "tsearch/ts_cache.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "utils/builtins.h"
28 * The unaccent dictionary uses an uncompressed suffix tree to find a
29 * character to replace. Each node of the tree is an array of
30 * SuffixChar structs with length = 256 (the n-th element of the array
31 * corresponds to byte value n)
33 typedef struct SuffixChar
/*
 * Link to the child node array reached after consuming this slot's byte.
 * placeChar() fills it in when a key has more bytes to store and
 * findReplaceTo() follows it during lookup.
 * NOTE(review): the remaining members (replaceTo and replacelen are used
 * elsewhere in the file) are not visible in this excerpt.
 */
35 struct SuffixChar *nextChar;
41 * placeChar - put str into tree's structure, byte by byte.
44 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
/*
 * Inserts the key bytes at str into the tree and attaches a replacement
 * string to the slot reached by the key.
 *
 *	node		- node array to insert into
 *	str, lenstr	- key bytes still to be consumed and their count
 *	replaceTo, replacelen - replacement bytes and their count; the bytes
 *				  are copied, so the caller keeps ownership of its buffer
 *
 * NOTE(review): the braces, branch conditions and return statement of this
 * function are not visible in this excerpt; the comments below describe
 * only the statements that are.
 */
/* Allocate a zero-filled node: 256 slots, one per possible byte value. */
50 node = palloc(sizeof(SuffixChar) * 256);
51 memset(node, 0, sizeof(SuffixChar) * 256);
/* Pick the slot indexed by the first byte of the remaining key. */
54 curnode = node + *str;
/*
 * A replacement is already stored in this slot: emit a warning. Per the
 * message text the earlier entry is kept.
 */
58 if (curnode->replaceTo)
59 elog(WARNING, "duplicate TO argument, use first one");
/*
 * Store a private copy of the replacement. Exactly replacelen bytes are
 * allocated and copied and no terminator is appended here, so the length
 * has to be kept alongside the pointer.
 */
62 curnode->replacelen = replacelen;
63 curnode->replaceTo = palloc(replacelen);
64 memcpy(curnode->replaceTo, replaceTo, replacelen);
/*
 * Recurse with the first key byte dropped, inserting the rest of the key
 * into this slot's child node array.
 */
69 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
76 * initSuffixTree - create suffix tree from file. Function converts
77 * UTF8-encoded file into current encoding.
80 initSuffixTree(char *filename)
/*
 * Reads the rules file identified by filename line by line, inserts each
 * source/target pair into the tree via placeChar(), and returns the root.
 *
 * NOTE(review): rootSuffixTree is declared volatile, presumably because it
 * is assigned inside an error-trapping block whose delimiters are not
 * visible in this excerpt - confirm.
 */
82 SuffixChar *volatile rootSuffixTree = NULL;
/* Saved so the error path below can switch back to this context. */
83 MemoryContext ccxt = CurrentMemoryContext;
84 tsearch_readline_state trst;
/* Map the configured name to a file name using the "rules" extension. */
87 filename = get_tsearch_config_filename(filename, "rules");
88 if (!tsearch_readline_begin(&trst, filename))
90 (errcode(ERRCODE_CONFIG_FILE_ERROR),
91 errmsg("could not open unaccent file \"%s\": %m",
107 * pg_do_encoding_conversion() (called by tsearch_readline()) will
108 * emit exception if it finds untranslatable characters in current
109 * locale. We just skip such characters.
111 while ((line = tsearch_readline(&trst)) != NULL)
/*
 * A rule line must yield exactly two whitespace-separated tokens: the
 * source sequence and its replacement.
 * NOTE(review): both %s conversions are unbounded and the declarations of
 * src and trg are not visible here - confirm the buffers can hold any
 * token that tsearch_readline() may return.
 */
113 if (sscanf(line, "%s\t%s\n", src, trg) != 2)
116 srclen = strlen(src);
117 trglen = strlen(trg);
119 rootSuffixTree = placeChar(rootSuffixTree,
120 (unsigned char *) src, srclen,
131 ecxt = MemoryContextSwitchTo(ccxt);
/*
 * The line above switched to the context saved at function entry and kept
 * the previous one in ecxt, so the error data is copied into the saved
 * context.
 */
132 errdata = CopyErrorData();
/*
 * Untranslatable-character errors are the ones this function intends to
 * skip; the body of this branch is not visible in this excerpt.
 */
133 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
/* Restore the context that was current before the switch above. */
139 MemoryContextSwitchTo(ecxt);
/* Finish with the line reader started by tsearch_readline_begin(). */
147 tsearch_readline_end(&trst);
149 return rootSuffixTree;
153 * findReplaceTo - find multibyte character in tree
156 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
/*
 * Looks up the srclen-byte sequence at src in the tree starting at node.
 * The caller in unaccent_lexize() tests the result for NULL and then for
 * a non-NULL replaceTo.
 * NOTE(review): the loop and the return statement are not visible in this
 * excerpt.
 */
/* Step down to the child node array for the next byte. */
166 node = node->nextChar;
/*
 * unaccent_init - build the replacement tree from the dictionary options.
 *
 * Argument 0 is the list of options (DefElem nodes). A "Rules" option
 * (matched with pg_strcasecmp) names the rules file to load; any other
 * option is reported as unrecognized. The function returns the root of
 * the tree as a pointer Datum.
 */
172 PG_FUNCTION_INFO_V1(unaccent_init);
173 Datum unaccent_init(PG_FUNCTION_ARGS);
175 unaccent_init(PG_FUNCTION_ARGS)
177 List *dictoptions = (List *) PG_GETARG_POINTER(0);
178 SuffixChar *rootSuffixTree = NULL;
/*
 * NOTE(review): presumably records that a Rules option has been handled,
 * driving the "multiple" and "missing" errors below; the statements that
 * set and test it are not visible in this excerpt.
 */
179 bool fileloaded = false;
182 foreach(l, dictoptions)
184 DefElem *defel = (DefElem *) lfirst(l);
186 if (pg_strcasecmp("Rules", defel->defname) == 0)
190 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
191 errmsg("multiple Rules parameters")));
/* Load the rules file named by the option value into a new tree. */
192 rootSuffixTree = initSuffixTree(defGetString(defel));
198 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
199 errmsg("unrecognized Unaccent parameter: \"%s\"",
207 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
208 errmsg("missing Rules parameter")));
211 PG_RETURN_POINTER(rootSuffixTree);
/*
 * unaccent_lexize - apply the replacement tree to one input string.
 *
 * Arguments: 0 = root of the tree, 1 = pointer to the input bytes,
 * 2 = input length in bytes. The input is scanned one multibyte character
 * at a time; the result array is allocated only once a replacement is
 * found, so res is still NULL when nothing was replaced.
 */
214 PG_FUNCTION_INFO_V1(unaccent_lexize);
215 Datum unaccent_lexize(PG_FUNCTION_ARGS);
217 unaccent_lexize(PG_FUNCTION_ARGS)
219 SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
220 char *srcchar = (char *) PG_GETARG_POINTER(1);
221 int32 len = PG_GETARG_INT32(2);
225 TSLexeme *res = NULL;
/* Continue until srcchar has advanced len bytes past srcstart. */
229 while (srcchar - srcstart < len)
/* Byte length of the character at the current position. */
231 charlen = pg_mblen(srcchar);
233 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
/* The tree has an entry with a replacement for this character. */
234 if (node && node->replaceTo)
238 /* allocate res only if it's needed */
/*
 * Two zero-filled entries are allocated and only the first is filled in.
 * NOTE(review): the zeroed second entry presumably terminates the array -
 * confirm.
 */
239 res = palloc0(sizeof(TSLexeme) * 2);
/*
 * NOTE(review): the buffer holds len * max-encoding-length + 1 bytes, but
 * node->replacelen comes from the rules file and is not checked against
 * that bound in the visible lines - confirm that a long replacement
 * cannot overrun the buffer.
 */
240 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
241 res->flags = TSL_FILTER;
/* Copy the part of the input that precedes the current position. */
242 if (srcchar != srcstart)
244 memcpy(trgchar, srcstart, srcchar - srcstart);
245 trgchar += (srcchar - srcstart);
/* Append the replacement in place of the matched character. */
248 memcpy(trgchar, node->replaceTo, node->replacelen);
249 trgchar += node->replacelen;
/*
 * Copy the character through unchanged. NOTE(review): the condition
 * guarding this copy is not visible in this excerpt.
 */
253 memcpy(trgchar, srcchar, charlen);
263 PG_RETURN_POINTER(res);
267 * Function-like wrapper for dictionary
/*
 * unaccent_dict - run a dictionary's lexize function over a text value and
 * return the outcome as text. When lexize produces no result or no lexeme,
 * a copy of the input argument is returned instead.
 */
269 PG_FUNCTION_INFO_V1(unaccent_dict);
270 Datum unaccent_dict(PG_FUNCTION_ARGS);
272 unaccent_dict(PG_FUNCTION_ARGS)
277 TSDictionaryCacheEntry *dict;
/*
 * Look up the dictionary named "unaccent". NOTE(review): the condition
 * choosing between this lookup and the explicit OID argument below is not
 * visible in this excerpt.
 */
282 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
/* Dictionary OID supplied as argument 0. */
287 dictOid = PG_GETARG_OID(0);
290 str = PG_GETARG_TEXT_P(strArg);
292 dict = lookup_ts_dictionary_cache(dictOid);
/*
 * Call the lexize function with the dictionary data, a pointer to the text
 * payload, the payload length (total size minus the varlena header) and a
 * NULL fourth argument.
 */
294 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
295 PointerGetDatum(dict->dictData),
296 PointerGetDatum(VARDATA(str)),
297 Int32GetDatum(VARSIZE(str) - VARHDRSZ),
298 PointerGetDatum(NULL)));
/* Release str if fetching the argument produced a separate copy. */
300 PG_FREE_IF_COPY(str, strArg);
/*
 * Return a copy of the input. NOTE(review): presumably taken when res is
 * NULL; the guarding condition is not visible in this excerpt.
 */
304 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
/* A result without a lexeme also returns a copy of the input. */
306 else if (res->lexeme == NULL)
309 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
/* Otherwise convert the C-string lexeme to a text value and return it. */
313 text *txt = cstring_to_text(res->lexeme);
318 PG_RETURN_TEXT_P(txt);