granicus.if.org Git - postgresql/blob - contrib/unaccent/unaccent.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * unaccent.c
   4  *        Text search unaccent dictionary
   5  *
   6  * Copyright (c) 2009-2014, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        contrib/unaccent/unaccent.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13
  14 #include "postgres.h"
  15
  16 #include "catalog/namespace.h"
  17 #include "commands/defrem.h"
  18 #include "tsearch/ts_cache.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "tsearch/ts_public.h"
  21 #include "utils/builtins.h"
  22
  23 PG_MODULE_MAGIC;
  24
  25 /*
  26  * Unaccent dictionary uses a trie to find a character to replace. Each node of
  27  * the trie is an array of 256 TrieChar structs (n-th element of array
  28  * corresponds to byte)
  29  */
  30 typedef struct TrieChar
  31 {
  32         struct TrieChar *nextChar;
  33         char       *replaceTo;
  34         int                     replacelen;
  35 } TrieChar;
  36
  37 /*
  38  * placeChar - put str into trie's structure, byte by byte.
  39  */
  40 static TrieChar *
  41 placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
  42 {
  43         TrieChar   *curnode;
  44
  45         if (!node)
  46         {
  47                 node = palloc(sizeof(TrieChar) * 256);
  48                 memset(node, 0, sizeof(TrieChar) * 256);
  49         }
  50
  51         curnode = node + *str;
  52
  53         if (lenstr == 1)
  54         {
  55                 if (curnode->replaceTo)
  56                         elog(WARNING, "duplicate TO argument, use first one");
  57                 else
  58                 {
  59                         curnode->replacelen = replacelen;
  60                         curnode->replaceTo = palloc(replacelen);
  61                         memcpy(curnode->replaceTo, replaceTo, replacelen);
  62                 }
  63         }
  64         else
  65         {
  66                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
  67         }
  68
  69         return node;
  70 }
  71
  72 /*
  73  * initTrie  - create trie from file.
  74  *
  75  * Function converts UTF8-encoded file into current encoding.
  76  */
  77 static TrieChar *
  78 initTrie(char *filename)
  79 {
  80         TrieChar   *volatile rootTrie = NULL;
  81         MemoryContext ccxt = CurrentMemoryContext;
  82         tsearch_readline_state trst;
  83         volatile bool skip;
  84
  85         filename = get_tsearch_config_filename(filename, "rules");
  86         if (!tsearch_readline_begin(&trst, filename))
  87                 ereport(ERROR,
  88                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
  89                                  errmsg("could not open unaccent file \"%s\": %m",
  90                                                 filename)));
  91
  92         do
  93         {
  94                 /*
  95                  * pg_do_encoding_conversion() (called by tsearch_readline()) will
  96                  * emit exception if it finds untranslatable characters in current
  97                  * locale. We just skip such lines, continuing with the next.
  98                  */
  99                 skip = true;
 100
 101                 PG_TRY();
 102                 {
 103                         char       *line;
 104
 105                         while ((line = tsearch_readline(&trst)) != NULL)
 106                         {
 107                                 /*
 108                                  * The format of each line must be "src trg" where src and trg
 109                                  * are sequences of one or more non-whitespace characters,
 110                                  * separated by whitespace.  Whitespace at start or end of
 111                                  * line is ignored.
 112                                  */
 113                                 int                     state;
 114                                 char       *ptr;
 115                                 char       *src = NULL;
 116                                 char       *trg = NULL;
 117                                 int                     ptrlen;
 118                                 int                     srclen = 0;
 119                                 int                     trglen = 0;
 120
 121                                 state = 0;
 122                                 for (ptr = line; *ptr; ptr += ptrlen)
 123                                 {
 124                                         ptrlen = pg_mblen(ptr);
 125                                         /* ignore whitespace, but end src or trg */
 126                                         if (t_isspace(ptr))
 127                                         {
 128                                                 if (state == 1)
 129                                                         state = 2;
 130                                                 else if (state == 3)
 131                                                         state = 4;
 132                                                 continue;
 133                                         }
 134                                         switch (state)
 135                                         {
 136                                                 case 0:
 137                                                         /* start of src */
 138                                                         src = ptr;
 139                                                         srclen = ptrlen;
 140                                                         state = 1;
 141                                                         break;
 142                                                 case 1:
 143                                                         /* continue src */
 144                                                         srclen += ptrlen;
 145                                                         break;
 146                                                 case 2:
 147                                                         /* start of trg */
 148                                                         trg = ptr;
 149                                                         trglen = ptrlen;
 150                                                         state = 3;
 151                                                         break;
 152                                                 case 3:
 153                                                         /* continue trg */
 154                                                         trglen += ptrlen;
 155                                                         break;
 156                                                 default:
 157                                                         /* bogus line format */
 158                                                         state = -1;
 159                                                         break;
 160                                         }
 161                                 }
 162
 163                                 if (state >= 3)
 164                                         rootTrie = placeChar(rootTrie,
 165                                                                                  (unsigned char *) src, srclen,
 166                                                                                  trg, trglen);
 167
 168                                 pfree(line);
 169                         }
 170                         skip = false;
 171                 }
 172                 PG_CATCH();
 173                 {
 174                         ErrorData  *errdata;
 175                         MemoryContext ecxt;
 176
 177                         ecxt = MemoryContextSwitchTo(ccxt);
 178                         errdata = CopyErrorData();
 179                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
 180                         {
 181                                 FlushErrorState();
 182                         }
 183                         else
 184                         {
 185                                 MemoryContextSwitchTo(ecxt);
 186                                 PG_RE_THROW();
 187                         }
 188                 }
 189                 PG_END_TRY();
 190         }
 191         while (skip);
 192
 193         tsearch_readline_end(&trst);
 194
 195         return rootTrie;
 196 }
 197
 198 /*
 199  * findReplaceTo - find multibyte character in trie
 200  */
 201 static TrieChar *
 202 findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
 203 {
 204         while (node)
 205         {
 206                 node = node + *src;
 207                 if (srclen == 1)
 208                         return node;
 209
 210                 src++;
 211                 srclen--;
 212                 node = node->nextChar;
 213         }
 214
 215         return NULL;
 216 }
 217
 218 PG_FUNCTION_INFO_V1(unaccent_init);
 219 Datum
 220 unaccent_init(PG_FUNCTION_ARGS)
 221 {
 222         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 223         TrieChar   *rootTrie = NULL;
 224         bool            fileloaded = false;
 225         ListCell   *l;
 226
 227         foreach(l, dictoptions)
 228         {
 229                 DefElem    *defel = (DefElem *) lfirst(l);
 230
 231                 if (pg_strcasecmp("Rules", defel->defname) == 0)
 232                 {
 233                         if (fileloaded)
 234                                 ereport(ERROR,
 235                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 236                                                  errmsg("multiple Rules parameters")));
 237                         rootTrie = initTrie(defGetString(defel));
 238                         fileloaded = true;
 239                 }
 240                 else
 241                 {
 242                         ereport(ERROR,
 243                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 244                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
 245                                                         defel->defname)));
 246                 }
 247         }
 248
 249         if (!fileloaded)
 250         {
 251                 ereport(ERROR,
 252                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 253                                  errmsg("missing Rules parameter")));
 254         }
 255
 256         PG_RETURN_POINTER(rootTrie);
 257 }
 258
 259 PG_FUNCTION_INFO_V1(unaccent_lexize);
 260 Datum
 261 unaccent_lexize(PG_FUNCTION_ARGS)
 262 {
 263         TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
 264         char       *srcchar = (char *) PG_GETARG_POINTER(1);
 265         int32           len = PG_GETARG_INT32(2);
 266         char       *srcstart,
 267                            *trgchar = NULL;
 268         int                     charlen;
 269         TSLexeme   *res = NULL;
 270         TrieChar   *node;
 271
 272         srcstart = srcchar;
 273         while (srcchar - srcstart < len)
 274         {
 275                 charlen = pg_mblen(srcchar);
 276
 277                 node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
 278                 if (node && node->replaceTo)
 279                 {
 280                         if (!res)
 281                         {
 282                                 /* allocate res only if it's needed */
 283                                 res = palloc0(sizeof(TSLexeme) * 2);
 284                                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
 285                                 res->flags = TSL_FILTER;
 286                                 if (srcchar != srcstart)
 287                                 {
 288                                         memcpy(trgchar, srcstart, srcchar - srcstart);
 289                                         trgchar += (srcchar - srcstart);
 290                                 }
 291                         }
 292                         memcpy(trgchar, node->replaceTo, node->replacelen);
 293                         trgchar += node->replacelen;
 294                 }
 295                 else if (res)
 296                 {
 297                         memcpy(trgchar, srcchar, charlen);
 298                         trgchar += charlen;
 299                 }
 300
 301                 srcchar += charlen;
 302         }
 303
 304         if (res)
 305                 *trgchar = '\0';
 306
 307         PG_RETURN_POINTER(res);
 308 }
 309
 310 /*
 311  * Function-like wrapper for dictionary
 312  */
 313 PG_FUNCTION_INFO_V1(unaccent_dict);
 314 Datum
 315 unaccent_dict(PG_FUNCTION_ARGS)
 316 {
 317         text       *str;
 318         int                     strArg;
 319         Oid                     dictOid;
 320         TSDictionaryCacheEntry *dict;
 321         TSLexeme   *res;
 322
 323         if (PG_NARGS() == 1)
 324         {
 325                 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
 326                 strArg = 0;
 327         }
 328         else
 329         {
 330                 dictOid = PG_GETARG_OID(0);
 331                 strArg = 1;
 332         }
 333         str = PG_GETARG_TEXT_P(strArg);
 334
 335         dict = lookup_ts_dictionary_cache(dictOid);
 336
 337         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
 338                                                                                          PointerGetDatum(dict->dictData),
 339                                                                                            PointerGetDatum(VARDATA(str)),
 340                                                                           Int32GetDatum(VARSIZE(str) - VARHDRSZ),
 341                                                                                                          PointerGetDatum(NULL)));
 342
 343         PG_FREE_IF_COPY(str, strArg);
 344
 345         if (res == NULL)
 346         {
 347                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 348         }
 349         else if (res->lexeme == NULL)
 350         {
 351                 pfree(res);
 352                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 353         }
 354         else
 355         {
 356                 text       *txt = cstring_to_text(res->lexeme);
 357
 358                 pfree(res->lexeme);
 359                 pfree(res);
 360
 361                 PG_RETURN_TEXT_P(txt);
 362         }
 363 }