1 /*-------------------------------------------------------------------------
4 * Text search unaccent dictionary
6 * Copyright (c) 2009-2012, PostgreSQL Global Development Group
9 * contrib/unaccent/unaccent.c
11 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "utils/builtins.h"
26 * The unaccent dictionary uses an uncompressed suffix tree to find a
27 * character to replace. Each node of the tree is an array of 256
28 * SuffixChar structs (the n-th element of the array corresponds to
29 * byte value n).
31 typedef struct SuffixChar
33 struct SuffixChar *nextChar;
39 * placeChar - put str into tree's structure, byte by byte.
42 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
48 node = palloc(sizeof(SuffixChar) * 256);
49 memset(node, 0, sizeof(SuffixChar) * 256);
52 curnode = node + *str;
56 if (curnode->replaceTo)
57 elog(WARNING, "duplicate TO argument, use first one");
60 curnode->replacelen = replacelen;
61 curnode->replaceTo = palloc(replacelen);
62 memcpy(curnode->replaceTo, replaceTo, replacelen);
67 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
74 * initSuffixTree - create the suffix tree from a file. The function
75 * converts the UTF8-encoded file into the current encoding.
78 initSuffixTree(char *filename)
80 SuffixChar *volatile rootSuffixTree = NULL;
81 MemoryContext ccxt = CurrentMemoryContext;
82 tsearch_readline_state trst;
85 filename = get_tsearch_config_filename(filename, "rules");
86 if (!tsearch_readline_begin(&trst, filename))
88 (errcode(ERRCODE_CONFIG_FILE_ERROR),
89 errmsg("could not open unaccent file \"%s\": %m",
95 * pg_do_encoding_conversion() (called by tsearch_readline()) will
96 * emit an exception if it finds untranslatable characters in the
97 * current locale. We just skip such lines, continuing with the next.
105 while ((line = tsearch_readline(&trst)) != NULL)
108 * The format of each line must be "src trg" where src and trg
109 * are sequences of one or more non-whitespace characters,
110 * separated by whitespace. Whitespace at start or end of
122 for (ptr = line; *ptr; ptr += ptrlen)
124 ptrlen = pg_mblen(ptr);
125 /* ignore whitespace, but end src or trg */
157 /* bogus line format */
164 rootSuffixTree = placeChar(rootSuffixTree,
165 (unsigned char *) src, srclen,
177 ecxt = MemoryContextSwitchTo(ccxt);
178 errdata = CopyErrorData();
179 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
185 MemoryContextSwitchTo(ecxt);
193 tsearch_readline_end(&trst);
195 return rootSuffixTree;
199 * findReplaceTo - find multibyte character in tree
202 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
212 node = node->nextChar;
218 PG_FUNCTION_INFO_V1(unaccent_init);
219 Datum unaccent_init(PG_FUNCTION_ARGS);
221 unaccent_init(PG_FUNCTION_ARGS)
223 List *dictoptions = (List *) PG_GETARG_POINTER(0);
224 SuffixChar *rootSuffixTree = NULL;
225 bool fileloaded = false;
228 foreach(l, dictoptions)
230 DefElem *defel = (DefElem *) lfirst(l);
232 if (pg_strcasecmp("Rules", defel->defname) == 0)
236 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
237 errmsg("multiple Rules parameters")));
238 rootSuffixTree = initSuffixTree(defGetString(defel));
244 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
245 errmsg("unrecognized Unaccent parameter: \"%s\"",
253 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
254 errmsg("missing Rules parameter")));
257 PG_RETURN_POINTER(rootSuffixTree);
260 PG_FUNCTION_INFO_V1(unaccent_lexize);
261 Datum unaccent_lexize(PG_FUNCTION_ARGS);
263 unaccent_lexize(PG_FUNCTION_ARGS)
265 SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
266 char *srcchar = (char *) PG_GETARG_POINTER(1);
267 int32 len = PG_GETARG_INT32(2);
271 TSLexeme *res = NULL;
275 while (srcchar - srcstart < len)
277 charlen = pg_mblen(srcchar);
279 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
280 if (node && node->replaceTo)
284 /* allocate res only if it's needed */
285 res = palloc0(sizeof(TSLexeme) * 2);
286 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
287 res->flags = TSL_FILTER;
288 if (srcchar != srcstart)
290 memcpy(trgchar, srcstart, srcchar - srcstart);
291 trgchar += (srcchar - srcstart);
294 memcpy(trgchar, node->replaceTo, node->replacelen);
295 trgchar += node->replacelen;
299 memcpy(trgchar, srcchar, charlen);
309 PG_RETURN_POINTER(res);
313 * Function-like wrapper for dictionary
315 PG_FUNCTION_INFO_V1(unaccent_dict);
316 Datum unaccent_dict(PG_FUNCTION_ARGS);
318 unaccent_dict(PG_FUNCTION_ARGS)
323 TSDictionaryCacheEntry *dict;
328 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
333 dictOid = PG_GETARG_OID(0);
336 str = PG_GETARG_TEXT_P(strArg);
338 dict = lookup_ts_dictionary_cache(dictOid);
340 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
341 PointerGetDatum(dict->dictData),
342 PointerGetDatum(VARDATA(str)),
343 Int32GetDatum(VARSIZE(str) - VARHDRSZ),
344 PointerGetDatum(NULL)));
346 PG_FREE_IF_COPY(str, strArg);
350 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
352 else if (res->lexeme == NULL)
355 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
359 text *txt = cstring_to_text(res->lexeme);
364 PG_RETURN_TEXT_P(txt);