granicus.if.org Git - postgresql/blob - contrib/unaccent/unaccent.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * unaccent.c
   4  *        Text search unaccent dictionary
   5  *
   6  * Copyright (c) 2009-2012, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        contrib/unaccent/unaccent.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13
  14 #include "postgres.h"
  15
  16 #include "catalog/namespace.h"
  17 #include "commands/defrem.h"
  18 #include "tsearch/ts_cache.h"
  19 #include "tsearch/ts_locale.h"
  20 #include "tsearch/ts_public.h"
  21 #include "utils/builtins.h"
  22
  23 PG_MODULE_MAGIC;
  24
  25 /*
  26  * Unaccent dictionary uses uncompressed suffix tree to find a
  27  * character to replace. Each node of tree is an array of
  28  * SuffixChar struct with length = 256 (n-th element of array
  29  * corresponds to byte)
  30  */
  31 typedef struct SuffixChar
  32 {
  33         struct SuffixChar *nextChar;
  34         char       *replaceTo;
  35         int                     replacelen;
  36 } SuffixChar;
  37
  38 /*
  39  * placeChar - put str into tree's structure, byte by byte.
  40  */
  41 static SuffixChar *
  42 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
  43 {
  44         SuffixChar *curnode;
  45
  46         if (!node)
  47         {
  48                 node = palloc(sizeof(SuffixChar) * 256);
  49                 memset(node, 0, sizeof(SuffixChar) * 256);
  50         }
  51
  52         curnode = node + *str;
  53
  54         if (lenstr == 1)
  55         {
  56                 if (curnode->replaceTo)
  57                         elog(WARNING, "duplicate TO argument, use first one");
  58                 else
  59                 {
  60                         curnode->replacelen = replacelen;
  61                         curnode->replaceTo = palloc(replacelen);
  62                         memcpy(curnode->replaceTo, replaceTo, replacelen);
  63                 }
  64         }
  65         else
  66         {
  67                 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
  68         }
  69
  70         return node;
  71 }
  72
  73 /*
  74  * initSuffixTree  - create suffix tree from file. Function converts
  75  * UTF8-encoded file into current encoding.
  76  */
  77 static SuffixChar *
  78 initSuffixTree(char *filename)
  79 {
  80         SuffixChar *volatile rootSuffixTree = NULL;
  81         MemoryContext ccxt = CurrentMemoryContext;
  82         tsearch_readline_state trst;
  83         volatile bool skip;
  84
  85         filename = get_tsearch_config_filename(filename, "rules");
  86         if (!tsearch_readline_begin(&trst, filename))
  87                 ereport(ERROR,
  88                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
  89                                  errmsg("could not open unaccent file \"%s\": %m",
  90                                                 filename)));
  91
  92         do
  93         {
  94                 /*
  95                  * pg_do_encoding_conversion() (called by tsearch_readline()) will
  96                  * emit exception if it finds untranslatable characters in current
  97                  * locale. We just skip such lines, continuing with the next.
  98                  */
  99                 skip = true;
 100
 101                 PG_TRY();
 102                 {
 103                         char       *line;
 104
 105                         while ((line = tsearch_readline(&trst)) != NULL)
 106                         {
 107                                 /*
 108                                  * The format of each line must be "src trg" where src and trg
 109                                  * are sequences of one or more non-whitespace characters,
 110                                  * separated by whitespace.  Whitespace at start or end of
 111                                  * line is ignored.
 112                                  */
 113                                 int                     state;
 114                                 char       *ptr;
 115                                 char       *src = NULL;
 116                                 char       *trg = NULL;
 117                                 int                     ptrlen;
 118                                 int                     srclen = 0;
 119                                 int                     trglen = 0;
 120
 121                                 state = 0;
 122                                 for (ptr = line; *ptr; ptr += ptrlen)
 123                                 {
 124                                         ptrlen = pg_mblen(ptr);
 125                                         /* ignore whitespace, but end src or trg */
 126                                         if (t_isspace(ptr))
 127                                         {
 128                                                 if (state == 1)
 129                                                         state = 2;
 130                                                 else if (state == 3)
 131                                                         state = 4;
 132                                                 continue;
 133                                         }
 134                                         switch (state)
 135                                         {
 136                                                 case 0:
 137                                                         /* start of src */
 138                                                         src = ptr;
 139                                                         srclen = ptrlen;
 140                                                         state = 1;
 141                                                         break;
 142                                                 case 1:
 143                                                         /* continue src */
 144                                                         srclen += ptrlen;
 145                                                         break;
 146                                                 case 2:
 147                                                         /* start of trg */
 148                                                         trg = ptr;
 149                                                         trglen = ptrlen;
 150                                                         state = 3;
 151                                                         break;
 152                                                 case 3:
 153                                                         /* continue trg */
 154                                                         trglen += ptrlen;
 155                                                         break;
 156                                                 default:
 157                                                         /* bogus line format */
 158                                                         state = -1;
 159                                                         break;
 160                                         }
 161                                 }
 162
 163                                 if (state >= 3)
 164                                         rootSuffixTree = placeChar(rootSuffixTree,
 165                                                                                            (unsigned char *) src, srclen,
 166                                                                                            trg, trglen);
 167
 168                                 pfree(line);
 169                         }
 170                         skip = false;
 171                 }
 172                 PG_CATCH();
 173                 {
 174                         ErrorData  *errdata;
 175                         MemoryContext ecxt;
 176
 177                         ecxt = MemoryContextSwitchTo(ccxt);
 178                         errdata = CopyErrorData();
 179                         if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
 180                         {
 181                                 FlushErrorState();
 182                         }
 183                         else
 184                         {
 185                                 MemoryContextSwitchTo(ecxt);
 186                                 PG_RE_THROW();
 187                         }
 188                 }
 189                 PG_END_TRY();
 190         }
 191         while (skip);
 192
 193         tsearch_readline_end(&trst);
 194
 195         return rootSuffixTree;
 196 }
 197
 198 /*
 199  * findReplaceTo - find multibyte character in tree
 200  */
 201 static SuffixChar *
 202 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
 203 {
 204         while (node)
 205         {
 206                 node = node + *src;
 207                 if (srclen == 1)
 208                         return node;
 209
 210                 src++;
 211                 srclen--;
 212                 node = node->nextChar;
 213         }
 214
 215         return NULL;
 216 }
 217
 218 PG_FUNCTION_INFO_V1(unaccent_init);
 219 Datum           unaccent_init(PG_FUNCTION_ARGS);
 220 Datum
 221 unaccent_init(PG_FUNCTION_ARGS)
 222 {
 223         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 224         SuffixChar *rootSuffixTree = NULL;
 225         bool            fileloaded = false;
 226         ListCell   *l;
 227
 228         foreach(l, dictoptions)
 229         {
 230                 DefElem    *defel = (DefElem *) lfirst(l);
 231
 232                 if (pg_strcasecmp("Rules", defel->defname) == 0)
 233                 {
 234                         if (fileloaded)
 235                                 ereport(ERROR,
 236                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 237                                                  errmsg("multiple Rules parameters")));
 238                         rootSuffixTree = initSuffixTree(defGetString(defel));
 239                         fileloaded = true;
 240                 }
 241                 else
 242                 {
 243                         ereport(ERROR,
 244                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 245                                          errmsg("unrecognized Unaccent parameter: \"%s\"",
 246                                                         defel->defname)));
 247                 }
 248         }
 249
 250         if (!fileloaded)
 251         {
 252                 ereport(ERROR,
 253                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 254                                  errmsg("missing Rules parameter")));
 255         }
 256
 257         PG_RETURN_POINTER(rootSuffixTree);
 258 }
 259
 260 PG_FUNCTION_INFO_V1(unaccent_lexize);
 261 Datum           unaccent_lexize(PG_FUNCTION_ARGS);
 262 Datum
 263 unaccent_lexize(PG_FUNCTION_ARGS)
 264 {
 265         SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
 266         char       *srcchar = (char *) PG_GETARG_POINTER(1);
 267         int32           len = PG_GETARG_INT32(2);
 268         char       *srcstart,
 269                            *trgchar = NULL;
 270         int                     charlen;
 271         TSLexeme   *res = NULL;
 272         SuffixChar *node;
 273
 274         srcstart = srcchar;
 275         while (srcchar - srcstart < len)
 276         {
 277                 charlen = pg_mblen(srcchar);
 278
 279                 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
 280                 if (node && node->replaceTo)
 281                 {
 282                         if (!res)
 283                         {
 284                                 /* allocate res only it it's needed */
 285                                 res = palloc0(sizeof(TSLexeme) * 2);
 286                                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
 287                                 res->flags = TSL_FILTER;
 288                                 if (srcchar != srcstart)
 289                                 {
 290                                         memcpy(trgchar, srcstart, srcchar - srcstart);
 291                                         trgchar += (srcchar - srcstart);
 292                                 }
 293                         }
 294                         memcpy(trgchar, node->replaceTo, node->replacelen);
 295                         trgchar += node->replacelen;
 296                 }
 297                 else if (res)
 298                 {
 299                         memcpy(trgchar, srcchar, charlen);
 300                         trgchar += charlen;
 301                 }
 302
 303                 srcchar += charlen;
 304         }
 305
 306         if (res)
 307                 *trgchar = '\0';
 308
 309         PG_RETURN_POINTER(res);
 310 }
 311
 312 /*
 313  * Function-like wrapper for dictionary
 314  */
 315 PG_FUNCTION_INFO_V1(unaccent_dict);
 316 Datum           unaccent_dict(PG_FUNCTION_ARGS);
 317 Datum
 318 unaccent_dict(PG_FUNCTION_ARGS)
 319 {
 320         text       *str;
 321         int                     strArg;
 322         Oid                     dictOid;
 323         TSDictionaryCacheEntry *dict;
 324         TSLexeme   *res;
 325
 326         if (PG_NARGS() == 1)
 327         {
 328                 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
 329                 strArg = 0;
 330         }
 331         else
 332         {
 333                 dictOid = PG_GETARG_OID(0);
 334                 strArg = 1;
 335         }
 336         str = PG_GETARG_TEXT_P(strArg);
 337
 338         dict = lookup_ts_dictionary_cache(dictOid);
 339
 340         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
 341                                                                                          PointerGetDatum(dict->dictData),
 342                                                                                            PointerGetDatum(VARDATA(str)),
 343                                                                           Int32GetDatum(VARSIZE(str) - VARHDRSZ),
 344                                                                                                          PointerGetDatum(NULL)));
 345
 346         PG_FREE_IF_COPY(str, strArg);
 347
 348         if (res == NULL)
 349         {
 350                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 351         }
 352         else if (res->lexeme == NULL)
 353         {
 354                 pfree(res);
 355                 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 356         }
 357         else
 358         {
 359                 text       *txt = cstring_to_text(res->lexeme);
 360
 361                 pfree(res->lexeme);
 362                 pfree(res);
 363
 364                 PG_RETURN_TEXT_P(txt);
 365         }
 366 }