granicus.if.org Git - postgresql/blob - contrib/unaccent/unaccent.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * unaccent.c
   4  *        Text search unaccent dictionary
   5  *
   6  * Copyright (c) 2009-2011, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        contrib/unaccent/unaccent.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13
  14 #include "postgres.h"
  15
  16 #include "fmgr.h"
  17 #include "catalog/namespace.h"
  18 #include "commands/defrem.h"
  19 #include "mb/pg_wchar.h"
  20 #include "tsearch/ts_cache.h"
  21 #include "tsearch/ts_locale.h"
  22 #include "tsearch/ts_public.h"
  23 #include "utils/builtins.h"
  24
  25 PG_MODULE_MAGIC;
  26
  27 /*
  28  * Unaccent dictionary uses uncompressed suffix tree to find a
  29  * character to replace. Each node of tree is an array of
  30  * SuffixChar struct with length = 256 (n-th element of array
  31  * corresponds to byte)
  32  */
  33 typedef struct SuffixChar
  34 {
  35         struct SuffixChar *nextChar;
  36         char       *replaceTo;
  37         int                     replacelen;
  38 } SuffixChar;
  39
  40 /*
  41  * placeChar - put str into tree's structure, byte by byte.
  42  */
  43 static SuffixChar *
  44 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
  45 {
  46         SuffixChar *curnode;
  47
  48         if (!node)
  49         {
  50                 node = palloc(sizeof(SuffixChar) * 256);
  51                 memset(node, 0, sizeof(SuffixChar) * 256);
  52         }
  53
  54         curnode = node + *str;
  55
  56         if (lenstr == 1)
  57         {
  58                 if (curnode->replaceTo)
  59                         elog(WARNING, "duplicate TO argument, use first one");
  60                 else
  61                 {
  62                         curnode->replacelen = replacelen;
  63                         curnode->replaceTo = palloc(replacelen);
  64                         memcpy(curnode->replaceTo, replaceTo, replacelen);
  65                 }
  66         }
  67         else
  68         {
  69                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
  70         }
  71
  72         return node;
  73 }
  74
  75 /*
  76  * initSuffixTree  - create suffix tree from file. Function converts
  77  * UTF8-encoded file into current encoding.
  78  */
  79 static SuffixChar *
  80 initSuffixTree(char *filename)
  81 {
  82         SuffixChar *volatile rootSuffixTree = NULL;
  83         MemoryContext ccxt = CurrentMemoryContext;
  84         tsearch_readline_state trst;
  85         volatile bool skip;
  86
  87         filename = get_tsearch_config_filename(filename, "rules");
  88         if (!tsearch_readline_begin(&trst, filename))
  89                 ereport(ERROR,
  90                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
  91                                  errmsg("could not open unaccent file \"%s\": %m",
  92                                                 filename)));
  93
  94         do
  95         {
  96                 char            src[4096];
  97                 char            trg[4096];
  98                 int                     srclen;
  99                 int                     trglen;
 100                 char       *line = NULL;
 101
 102                 skip = true;
 103
 104                 PG_TRY();
 105                 {
 106                         /*
 107                          * pg_do_encoding_conversion() (called by tsearch_readline()) will
 108                          * emit exception if it finds untranslatable characters in current
 109                          * locale. We just skip such characters.
 110                          */
 111                         while ((line = tsearch_readline(&trst)) != NULL)
 112                         {
 113                                 if (sscanf(line, "%s\t%s\n", src, trg) != 2)
 114                                         continue;
 115
 116                                 srclen = strlen(src);
 117                                 trglen = strlen(trg);
 118
 119                                 rootSuffixTree = placeChar(rootSuffixTree,
 120                                                                                    (unsigned char *) src, srclen,
 121                                                                                    trg, trglen);
 122                                 skip = false;
 123                                 pfree(line);
 124                         }
 125                 }
 126                 PG_CATCH();
 127                 {
 128                         ErrorData  *errdata;
 129                         MemoryContext ecxt;
 130
 131                         ecxt = MemoryContextSwitchTo(ccxt);
 132                         errdata = CopyErrorData();
 133                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
 134                         {
 135                                 FlushErrorState();
 136                         }
 137                         else
 138                         {
 139                                 MemoryContextSwitchTo(ecxt);
 140                                 PG_RE_THROW();
 141                         }
 142                 }
 143                 PG_END_TRY();
 144         }
 145         while (skip);
 146
 147         tsearch_readline_end(&trst);
 148
 149         return rootSuffixTree;
 150 }
 151
 152 /*
 153  * findReplaceTo - find multibyte character in tree
 154  */
 155 static SuffixChar *
 156 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
 157 {
 158         while (node)
 159         {
 160                 node = node + *src;
 161                 if (srclen == 1)
 162                         return node;
 163
 164                 src++;
 165                 srclen--;
 166                 node = node->nextChar;
 167         }
 168
 169         return NULL;
 170 }
 171
 172 PG_FUNCTION_INFO_V1(unaccent_init);
 173 Datum           unaccent_init(PG_FUNCTION_ARGS);
 174 Datum
 175 unaccent_init(PG_FUNCTION_ARGS)
 176 {
 177         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 178         SuffixChar *rootSuffixTree = NULL;
 179         bool            fileloaded = false;
 180         ListCell   *l;
 181
 182         foreach(l, dictoptions)
 183         {
 184                 DefElem    *defel = (DefElem *) lfirst(l);
 185
 186                 if (pg_strcasecmp("Rules", defel->defname) == 0)
 187                 {
 188                         if (fileloaded)
 189                                 ereport(ERROR,
 190                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 191                                                  errmsg("multiple Rules parameters")));
 192                         rootSuffixTree = initSuffixTree(defGetString(defel));
 193                         fileloaded = true;
 194                 }
 195                 else
 196                 {
 197                         ereport(ERROR,
 198                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 199                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
 200                                                         defel->defname)));
 201                 }
 202         }
 203
 204         if (!fileloaded)
 205         {
 206                 ereport(ERROR,
 207                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 208                                  errmsg("missing Rules parameter")));
 209         }
 210
 211         PG_RETURN_POINTER(rootSuffixTree);
 212 }
 213
 214 PG_FUNCTION_INFO_V1(unaccent_lexize);
 215 Datum           unaccent_lexize(PG_FUNCTION_ARGS);
 216 Datum
 217 unaccent_lexize(PG_FUNCTION_ARGS)
 218 {
 219         SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
 220         char       *srcchar = (char *) PG_GETARG_POINTER(1);
 221         int32           len = PG_GETARG_INT32(2);
 222         char       *srcstart,
 223                            *trgchar = NULL;
 224         int                     charlen;
 225         TSLexeme   *res = NULL;
 226         SuffixChar *node;
 227
 228         srcstart = srcchar;
 229         while (srcchar - srcstart < len)
 230         {
 231                 charlen = pg_mblen(srcchar);
 232
 233                 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
 234                 if (node && node->replaceTo)
 235                 {
 236                         if (!res)
 237                         {
 238                                 /* allocate res only it it's needed */
 239                                 res = palloc0(sizeof(TSLexeme) * 2);
 240                                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
 241                                 res->flags = TSL_FILTER;
 242                                 if (srcchar != srcstart)
 243                                 {
 244                                         memcpy(trgchar, srcstart, srcchar - srcstart);
 245                                         trgchar += (srcchar - srcstart);
 246                                 }
 247                         }
 248                         memcpy(trgchar, node->replaceTo, node->replacelen);
 249                         trgchar += node->replacelen;
 250                 }
 251                 else if (res)
 252                 {
 253                         memcpy(trgchar, srcchar, charlen);
 254                         trgchar += charlen;
 255                 }
 256
 257                 srcchar += charlen;
 258         }
 259
 260         if (res)
 261                 *trgchar = '\0';
 262
 263         PG_RETURN_POINTER(res);
 264 }
 265
 266 /*
 267  * Function-like wrapper for dictionary
 268  */
 269 PG_FUNCTION_INFO_V1(unaccent_dict);
 270 Datum           unaccent_dict(PG_FUNCTION_ARGS);
 271 Datum
 272 unaccent_dict(PG_FUNCTION_ARGS)
 273 {
 274         text       *str;
 275         int                     strArg;
 276         Oid                     dictOid;
 277         TSDictionaryCacheEntry *dict;
 278         TSLexeme   *res;
 279
 280         if (PG_NARGS() == 1)
 281         {
 282                 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
 283                 strArg = 0;
 284         }
 285         else
 286         {
 287                 dictOid = PG_GETARG_OID(0);
 288                 strArg = 1;
 289         }
 290         str = PG_GETARG_TEXT_P(strArg);
 291
 292         dict = lookup_ts_dictionary_cache(dictOid);
 293
 294         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
 295                                                                                          PointerGetDatum(dict->dictData),
 296                                                                                            PointerGetDatum(VARDATA(str)),
 297                                                                           Int32GetDatum(VARSIZE(str) - VARHDRSZ),
 298                                                                                                          PointerGetDatum(NULL)));
 299
 300         PG_FREE_IF_COPY(str, strArg);
 301
 302         if (res == NULL)
 303         {
 304                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 305         }
 306         else if (res->lexeme == NULL)
 307         {
 308                 pfree(res);
 309                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 310         }
 311         else
 312         {
 313                 text       *txt = cstring_to_text(res->lexeme);
 314
 315                 pfree(res->lexeme);
 316                 pfree(res);
 317
 318                 PG_RETURN_TEXT_P(txt);
 319         }
 320 }