1 /*-------------------------------------------------------------------------
4 * Text search unaccent dictionary
6 * Copyright (c) 2009-2014, PostgreSQL Global Development Group
9 * contrib/unaccent/unaccent.c
11 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "utils/builtins.h"
26 * The unaccent dictionary uses a trie to find the replacement for a
27 * character. Each node of the trie is an array of 256 TrieChar structs (the
28 * n-th element of the array corresponds to byte value n)
/*
 * One slot of a trie node.
 * NOTE(review): only the nextChar member is visible in this excerpt; the
 * replaceTo and replacelen members that placeChar() assigns are presumably
 * declared on lines not shown here -- confirm against the full file.
 */
30 typedef struct TrieChar
/* Next trie level; placeChar() and findReplaceTo() follow this link. */
32 struct TrieChar *nextChar;
38 * placeChar - put str into trie's structure, byte by byte.
/*
 * Insert the key str (lenstr bytes) into the trie rooted at node and attach
 * the replacement replaceTo (replacelen bytes) to it.
 *
 * node may be NULL, in which case a zeroed 256-slot node is allocated.
 * NOTE(review): the guards around the statements below (the NULL check on
 * node, the end-of-key test, the else branches) and the return statement are
 * not visible in this excerpt; the comments describe only the visible lines.
 */
41 placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
/* A node is an array with one TrieChar slot per possible byte value. */
47 node = palloc(sizeof(TrieChar) * 256);
48 memset(node, 0, sizeof(TrieChar) * 256);
/* Select the slot indexed by the first byte of the remaining key. */
51 curnode = node + *str;
/*
 * A replacement is already stored for this key.  Going by the warning text,
 * the earlier one is kept -- TODO confirm that the assignments below sit in
 * the else branch.
 */
55 if (curnode->replaceTo)
56 elog(WARNING, "duplicate TO argument, use first one");
/*
 * Store a private copy of the replacement.  Exactly replacelen bytes are
 * copied and no terminator is appended here, so readers must use replacelen.
 */
59 curnode->replacelen = replacelen;
60 curnode->replaceTo = palloc(replacelen);
61 memcpy(curnode->replaceTo, replaceTo, replacelen);
/* Recurse on the rest of the key, one byte shorter, in the child node. */
66 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
73 * initTrie - create trie from file.
75 * The function converts the UTF8-encoded file into the current encoding.
/*
 * Build a trie from the rules file named by filename.
 *
 * Each line read through the tsearch_readline API is scanned for a source
 * and a target string, and the pair is inserted with placeChar().
 * NOTE(review): the per-line parser, the error-trapping scaffolding and the
 * return statement are not visible in this excerpt; the comments describe
 * only the visible lines.
 */
78 initTrie(char *filename)
/* NOTE(review): volatile presumably because the trie is updated inside an
 * error-trapping block that is not shown -- confirm. */
80 TrieChar *volatile rootTrie = NULL;
/* Remember the caller's memory context; it is re-entered below before the
 * error data is copied. */
81 MemoryContext ccxt = CurrentMemoryContext;
82 tsearch_readline_state trst;
/* Turn the option value into a file name; "rules" is presumably the file
 * extension -- verify against get_tsearch_config_filename(). */
85 filename = get_tsearch_config_filename(filename, "rules");
/* Failure to open the file is reported with ERRCODE_CONFIG_FILE_ERROR. */
86 if (!tsearch_readline_begin(&trst, filename))
88 (errcode(ERRCODE_CONFIG_FILE_ERROR),
89 errmsg("could not open unaccent file \"%s\": %m",
95 * pg_do_encoding_conversion() (called by tsearch_readline()) will
96 * emit exception if it finds untranslatable characters in current
97 * locale. We just skip such lines, continuing with the next.
/* Read lines until tsearch_readline() returns NULL. */
105 while ((line = tsearch_readline(&trst)) != NULL)
108 * The format of each line must be "src trg" where src and trg
109 * are sequences of one or more non-whitespace characters,
110 * separated by whitespace. Whitespace at start or end of
/* Step through the line one multibyte character at a time. */
122 for (ptr = line; *ptr; ptr += ptrlen)
124 ptrlen = pg_mblen(ptr);
125 /* ignore whitespace, but end src or trg */
157 /* bogus line format */
/* Add the parsed pair to the trie; the key is passed as raw bytes. */
164 rootTrie = placeChar(rootTrie,
165 (unsigned char *) src, srclen,
/*
 * Error path: switch back to the saved context before copying the error
 * data, then test the error code.  Per the comment above, lines that cannot
 * be translated are skipped -- the handling of other errors is not visible
 * here.
 */
177 ecxt = MemoryContextSwitchTo(ccxt);
178 errdata = CopyErrorData();
179 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
/* Return to the context that was current before the switch above. */
185 MemoryContextSwitchTo(ecxt);
/* Finish with the line reader. */
193 tsearch_readline_end(&trst);
199 * findReplaceTo - find multibyte character in trie
/*
 * Look up the srclen bytes at src in the trie rooted at node.
 * unaccent_lexize() tests the result for NULL and for a set replaceTo.
 * NOTE(review): the loop, its termination tests and the return statements
 * are not visible in this excerpt.
 */
202 findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
/* Descend one trie level. */
212 node = node->nextChar;
/*
 * Dictionary init function: scans the option list passed as argument 0,
 * loads the rules file named by the "Rules" option through initTrie() and
 * returns the resulting trie pointer.
 *
 * Errors visible here, all ERRCODE_INVALID_PARAMETER_VALUE: more than one
 * Rules option, an unrecognized option, and a missing Rules option.
 * NOTE(review): the conditions guarding those reports and the updates of
 * fileloaded are on lines not shown in this excerpt.
 */
218 PG_FUNCTION_INFO_V1(unaccent_init);
220 unaccent_init(PG_FUNCTION_ARGS)
222 List *dictoptions = (List *) PG_GETARG_POINTER(0);
223 TrieChar *rootTrie = NULL;
224 bool fileloaded = false;
/* Visit every option in the list. */
227 foreach(l, dictoptions)
229 DefElem *defel = (DefElem *) lfirst(l);
/* The option name is compared without regard to case. */
231 if (pg_strcasecmp("Rules", defel->defname) == 0)
235 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
236 errmsg("multiple Rules parameters")));
/* The option's string value names the rules file to load. */
237 rootTrie = initTrie(defGetString(defel));
243 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
244 errmsg("unrecognized Unaccent parameter: \"%s\"",
252 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
253 errmsg("missing Rules parameter")));
/* Hand the trie back as the function result. */
256 PG_RETURN_POINTER(rootTrie);
/*
 * Dictionary lexize function.
 *
 * Arguments: 0 = trie pointer, 1 = input bytes, 2 = input length in bytes.
 * The input is scanned one multibyte character at a time and each character
 * is looked up in the trie.  The result stays NULL until the first match;
 * from then on the output is built in a freshly allocated buffer.
 *
 * NOTE(review): the buffer is sized as
 * len * pg_database_encoding_max_length() + 1, but every match appends
 * node->replacelen bytes and nothing visible here bounds that length.  A
 * long enough target string in the rules file could overrun the buffer --
 * confirm, and consider a growable buffer.
 * NOTE(review): some declarations, the advance of srcchar and the
 * terminator write are on lines not shown in this excerpt.
 */
259 PG_FUNCTION_INFO_V1(unaccent_lexize);
261 unaccent_lexize(PG_FUNCTION_ARGS)
263 TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
264 char *srcchar = (char *) PG_GETARG_POINTER(1);
265 int32 len = PG_GETARG_INT32(2);
269 TSLexeme *res = NULL;
/* Continue until len input bytes have been consumed. */
273 while (srcchar - srcstart < len)
/* Byte length of the character at the current position. */
275 charlen = pg_mblen(srcchar);
277 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
/* A match needs both a trie slot and a stored replacement. */
278 if (node && node->replaceTo)
282 /* allocate res only if it's needed */
/* Two entries are allocated and zeroed; only the first is filled in. */
283 res = palloc0(sizeof(TSLexeme) * 2);
284 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
285 res->flags = TSL_FILTER;
/* Copy any input that precedes this first match into the new buffer. */
286 if (srcchar != srcstart)
288 memcpy(trgchar, srcstart, srcchar - srcstart);
289 trgchar += (srcchar - srcstart);
/* Append the replacement bytes in place of the source character. */
292 memcpy(trgchar, node->replaceTo, node->replacelen);
293 trgchar += node->replacelen;
/* Copy the source character itself to the output. */
297 memcpy(trgchar, srcchar, charlen);
/* res is still NULL here if no character matched. */
307 PG_RETURN_POINTER(res);
311 * Function-like wrapper for dictionary
/*
 * SQL-callable wrapper: runs a dictionary's lexize function on a text value
 * and returns the result as text.
 *
 * The dictionary is either looked up by the name "unaccent" or taken as an
 * OID from argument 0.
 * NOTE(review): the test that chooses between the two forms, the assignment
 * of strArg and the first result check are on lines not shown here.
 */
313 PG_FUNCTION_INFO_V1(unaccent_dict);
315 unaccent_dict(PG_FUNCTION_ARGS)
320 TSDictionaryCacheEntry *dict;
/* Look the dictionary up by name; the second argument to get_ts_dict_oid()
 * is false -- presumably "missing is an error", confirm. */
325 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
/* Alternatively, take the dictionary OID from argument 0. */
330 dictOid = PG_GETARG_OID(0);
333 str = PG_GETARG_TEXT_P(strArg);
335 dict = lookup_ts_dictionary_cache(dictOid);
/*
 * Call the dictionary's lexize function with its private data, the text
 * payload and the payload length in bytes (total size minus the header).
 * The fourth argument is passed as NULL.
 */
337 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
338 PointerGetDatum(dict->dictData),
339 PointerGetDatum(VARDATA(str)),
340 Int32GetDatum(VARSIZE(str) - VARHDRSZ),
341 PointerGetDatum(NULL)));
343 PG_FREE_IF_COPY(str, strArg);
/* Return a copy of the input unchanged -- presumably when res is NULL;
 * that test is on a line not shown. */
347 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
/* A result with no lexeme also returns a copy of the input. */
349 else if (res->lexeme == NULL)
352 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
/* Otherwise convert the lexeme string to a text value and return it. */
356 text *txt = cstring_to_text(res->lexeme);
361 PG_RETURN_TEXT_P(txt);