1 /*-------------------------------------------------------------------------
4 * Thesaurus dictionary: phrase to phrase substitution
6 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
10 * src/backend/tsearch/dict_thesaurus.c
12 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/builtins.h"
25 * Temporarily, we use TSLexeme.flags for internal use...
27 #define DT_USEASIS 0x1000
/*
 * One occurrence of a sample-phrase lexeme: identifies the substitution
 * rule it belongs to and its position inside that rule's sample phrase.
 */
29 typedef struct LexemeInfo
31 uint16 idsubst; /* entry's number in DictThesaurus->subst */
32 uint16 posinsubst; /* pos info in entry */
33 uint16 tnvariant; /* total num lexemes in one variant */
/* next occurrence record of the same lexeme (linked in compileTheLexeme, walked by findVariant) */
34 struct LexemeInfo *nextentry;
/* chain of candidate matches (linked in findVariant, walked by matchIdSubst and checkMatch) */
35 struct LexemeInfo *nextvariant;
46 uint16 lastlexeme; /* number lexemes to substitute */
48 TSLexeme *res; /* prepared substituted result */
53 /* subdictionary to normalize lexemes */
55 TSDictionaryCacheEntry *subdict;
57 /* Array to search lexeme by exact match */
59 int nwrds; /* current number of words */
60 int ntwrds; /* allocated array length */
63 * Storage of substituted result, n-th element is for n-th expression
/*
 * newLexeme: append one sample-phrase word to d->wrds.
 *
 * b, e        - start and one-past-end of the word's bytes (not NUL-terminated)
 * idsubst     - number of the substitution rule the word belongs to
 * posinsubst  - position of the word inside that rule's sample phrase
 *
 * The word text is copied into freshly palloc'd storage, so the caller's
 * buffer does not need to outlive the call.
 */
71 newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
/* grow (or first allocate) the word array once it is full */
75 if (d->nwrds >= d->ntwrds)
80 d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
85 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
/* ptr addresses the first unused slot */
89 ptr = d->wrds + d->nwrds;
/* copy the [b, e) byte range and NUL-terminate it */
92 ptr->lexeme = palloc(e - b + 1);
94 memcpy(ptr->lexeme, b, e - b);
95 ptr->lexeme[e - b] = '\0';
/* a new word starts with a single occurrence record */
97 ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
99 ptr->entries->nextentry = NULL;
100 ptr->entries->idsubst = idsubst;
101 ptr->entries->posinsubst = posinsubst;
/*
 * addWrd: append one word of a rule's substitute phrase to
 * d->subst[idsubst].res.
 *
 * b, e        - start and one-past-end of the word's bytes
 * idsubst     - number of the substitution rule being filled
 * nwrd        - stored as the word's nvariant
 * posinsubst  - number of sample-phrase words read for this rule
 * useasis     - presumably selects the DT_USEASIS flag below; the test
 *               itself is not among the visible lines
 *
 * The result array is always kept terminated by an entry whose lexeme
 * is NULL.
 */
105 addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
/*
 * NOTE(review): ntres is function-static, so the capacity it tracks
 * survives across calls and across rules; confirm where it is reset.
 */
108 static int ntres = 0;
/* first word of a not-yet-seen rule: make room in d->subst */
115 if (idsubst >= d->nsubst)
120 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
125 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
130 ptr = d->subst + idsubst;
/* index of the last sample-phrase word of this rule */
132 ptr->lastlexeme = posinsubst - 1;
/* keep one spare slot for the NULL terminator */
134 if (nres + 1 >= ntres)
139 ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
144 ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
/* copy the [b, e) byte range and NUL-terminate it */
149 ptr->res[nres].lexeme = palloc(e - b + 1);
150 memcpy(ptr->res[nres].lexeme, b, e - b);
151 ptr->res[nres].lexeme[e - b] = '\0';
153 ptr->res[nres].nvariant = nwrd;
/* DT_USEASIS marks words that compileTheSubstitute must not lexize */
155 ptr->res[nres].flags = DT_USEASIS;
157 ptr->res[nres].flags = 0;
/* advance and re-terminate the array */
159 ptr->res[++nres].lexeme = NULL;
164 #define TR_WAITSUBS 3
/*
 * thesaurusRead: load a thesaurus file into d.
 *
 * filename is resolved with get_tsearch_config_filename() using the "ths"
 * extension.  Every non-comment line is scanned character by character
 * (advancing by pg_mblen, so multibyte characters are handled) with a small
 * state machine:
 *
 *   TR_WAITLEX  - between sample words
 *   TR_INLEX    - inside a sample word; ended by whitespace or ':'
 *   TR_WAITSUBS - between substitute words
 *   TR_INSUBS   - inside a substitute word
 *
 * Sample words are recorded with newLexeme(), substitute words with
 * addWrd().  Malformed lines raise ERRCODE_CONFIG_FILE_ERROR.
 */
168 thesaurusRead(char *filename, DictThesaurus *d)
170 tsearch_readline_state trst;
172 bool useasis = false;
175 filename = get_tsearch_config_filename(filename, "ths");
176 if (!tsearch_readline_begin(&trst, filename))
178 (errcode(ERRCODE_CONFIG_FILE_ERROR),
179 errmsg("could not open thesaurus file \"%s\": %m",
/* one iteration per line of the file */
182 while ((line = tsearch_readline(&trst)) != NULL)
/* per-line parser state: every line starts out expecting a sample word */
185 int state = TR_WAITLEX;
186 char *beginwrd = NULL;
187 uint16 posinsubst = 0;
192 /* is it a comment? */
193 while (*ptr && t_isspace(ptr))
194 ptr += pg_mblen(ptr);
/* lines that are empty or start with '#' are not parsed */
196 if (t_iseq(ptr, '#') || *ptr == '\0' ||
197 t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
205 if (state == TR_WAITLEX)
/* NOTE(review): the guard on this error is not visible here; presumably ':' before any sample word */
207 if (t_iseq(ptr, ':'))
211 (errcode(ERRCODE_CONFIG_FILE_ERROR),
212 errmsg("unexpected delimiter")));
215 else if (!t_isspace(ptr))
221 else if (state == TR_INLEX)
/* ':' and whitespace both end the current sample word */
223 if (t_iseq(ptr, ':'))
225 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
228 else if (t_isspace(ptr))
230 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
234 else if (state == TR_WAITSUBS)
/* a leading '*' or '\' is skipped: the word starts on the following character */
236 if (t_iseq(ptr, '*'))
240 beginwrd = ptr + pg_mblen(ptr);
242 else if (t_iseq(ptr, '\\'))
246 beginwrd = ptr + pg_mblen(ptr);
248 else if (!t_isspace(ptr))
255 else if (state == TR_INSUBS)
261 (errcode(ERRCODE_CONFIG_FILE_ERROR),
262 errmsg("unexpected end of line or lexeme")));
263 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
/* defensive: state holds none of the TR_* values handled above */
268 elog(ERROR, "unrecognized thesaurus state: %d", state);
/* step over one (possibly multibyte) character */
270 ptr += pg_mblen(ptr);
/* the line ended inside a substitute word: flush that last word */
273 if (state == TR_INSUBS)
277 (errcode(ERRCODE_CONFIG_FILE_ERROR),
278 errmsg("unexpected end of line or lexeme")));
279 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
/* a rule needs at least one sample word and one substitute word */
284 if (!(nwrd && posinsubst))
286 (errcode(ERRCODE_CONFIG_FILE_ERROR),
287 errmsg("unexpected end of line")));
294 tsearch_readline_end(&trst);
/*
 * addCompiledLexeme: append one entry to the newwrds array.
 *
 * newwrds   - array being built; may be moved by repalloc, and callers
 *             assign the function's result back to their own pointer
 * nnw, tnm  - entry index and allocated length of newwrds, by reference
 * lexeme    - normalized lexeme to store; NULL (or NULL text) stores a
 *             NULL-lexeme entry
 * src       - occurrence record whose idsubst/posinsubst are copied
 * tnvariant - stored only for non-NULL lexemes; NULL-lexeme entries get 1
 */
298 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
303 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
306 newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
308 if (lexeme && lexeme->lexeme)
/* private copy: the new entry does not alias lexeme->lexeme */
310 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
311 newwrds[*nnw].entries->tnvariant = tnvariant;
315 newwrds[*nnw].lexeme = NULL;
316 newwrds[*nnw].entries->tnvariant = 1;
/* rule number and position are inherited from the source occurrence */
319 newwrds[*nnw].entries->idsubst = src->idsubst;
320 newwrds[*nnw].entries->posinsubst = src->posinsubst;
322 newwrds[*nnw].entries->nextentry = NULL;
/*
 * cmpLexemeInfo: three-way comparison of two occurrence records.
 *
 * Keys, in order of significance: idsubst, posinsubst, tnvariant.
 * A NULL argument is handled up front (the value returned for that case
 * is not among the visible lines).
 */
329 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
331 if (a == NULL || b == NULL)
334 if (a->idsubst == b->idsubst)
336 if (a->posinsubst == b->posinsubst)
338 if (a->tnvariant == b->tnvariant)
/* each key is consulted only when all more significant keys are equal */
341 return (a->tnvariant > b->tnvariant) ? 1 : -1;
344 return (a->posinsubst > b->posinsubst) ? 1 : -1;
347 return (a->idsubst > b->idsubst) ? 1 : -1;
/*
 * cmpLexeme: compare two words by their text.
 *
 * NULL lexemes are special-cased before strcmp() is reached, so strcmp()
 * only ever sees two non-NULL strings.  The ordering of a NULL lexeme
 * relative to a non-NULL one is decided on lines not visible here.
 */
351 cmpLexeme(const TheLexeme *a, const TheLexeme *b)
353 if (a->lexeme == NULL)
355 if (b->lexeme == NULL)
360 else if (b->lexeme == NULL)
/* plain byte-wise comparison */
363 return strcmp(a->lexeme, b->lexeme);
/*
 * cmpLexemeQ: void-pointer adapter around cmpLexeme(); this is the
 * comparator findTheLexeme() passes to bsearch().
 */
367 cmpLexemeQ(const void *a, const void *b)
369 return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
/*
 * cmpTheLexeme: comparator passed to qsort() in compileTheLexeme().
 *
 * Orders by lexeme text first; entries with equal text are ordered by the
 * negated cmpLexemeInfo() result of their first occurrence record.
 */
373 cmpTheLexeme(const void *a, const void *b)
375 const TheLexeme *la = (const TheLexeme *) a;
376 const TheLexeme *lb = (const TheLexeme *) b;
379 if ((res = cmpLexeme(la, lb)) != 0)
/* note the sign flip: ties on text sort by descending cmpLexemeInfo order */
382 return -cmpLexemeInfo(la->entries, lb->entries);
/*
 * compileTheLexeme: normalize the sample words collected by newLexeme().
 *
 * Each word in d->wrds is either the stop-word marker "?" (stored as a
 * NULL-lexeme entry) or is passed through the subdictionary's lexize
 * function; every lexeme that comes back is added to a new array via
 * addCompiledLexeme().  The array is then sorted with cmpTheLexeme, entries
 * with equal text are merged into one entry with a chain of occurrence
 * records, and the array is shrunk to its final size.
 *
 * Raises ERRCODE_CONFIG_FILE_ERROR when a sample word is unknown to the
 * subdictionary or is a stop word.
 */
386 compileTheLexeme(DictThesaurus *d)
391 TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
394 for (i = 0; i < d->nwrds; i++)
398 if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
399 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
/* otherwise ask the subdictionary to normalize the word */
402 ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
403 PointerGetDatum(d->subdict->dictData),
404 PointerGetDatum(d->wrds[i].lexeme),
405 Int32GetDatum(strlen(d->wrds[i].lexeme)),
406 PointerGetDatum(NULL)));
/* rule numbers in messages are 1-based, hence idsubst + 1 */
410 (errcode(ERRCODE_CONFIG_FILE_ERROR),
411 errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
413 d->wrds[i].entries->idsubst + 1)));
/* a result whose first lexeme is NULL is reported as a stop word */
414 else if (!(ptr->lexeme))
416 (errcode(ERRCODE_CONFIG_FILE_ERROR),
417 errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
419 d->wrds[i].entries->idsubst + 1),
420 errhint("Use \"?\" to represent a stop word within a sample phrase.")));
/* walk the result variant by variant (consecutive entries sharing nvariant) */
425 TSLexeme *remptr = ptr + 1;
427 int curvar = ptr->nvariant;
429 /* compute n words in one variant */
430 while (remptr->lexeme)
432 if (remptr->nvariant != (remptr - 1)->nvariant)
439 while (remptr->lexeme && remptr->nvariant == curvar)
441 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
/* the raw word and its record are released here */
450 pfree(d->wrds[i].lexeme);
451 pfree(d->wrds[i].entries);
/* NOTE(review): d->wrds is presumably repointed to the new array before this sort; that line is not visible */
462 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
/* merge pass: ptrwrds scans forward, newwrds marks the last kept entry */
466 ptrwrds = d->wrds + 1;
467 while (ptrwrds - d->wrds < d->nwrds)
469 if (cmpLexeme(ptrwrds, newwrds) == 0)
471 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
/* same text, different occurrence: push it on the front of the kept entry's chain */
473 ptrwrds->entries->nextentry = newwrds->entries;
474 newwrds->entries = ptrwrds->entries;
/* exact duplicate occurrence: drop it */
477 pfree(ptrwrds->entries);
480 pfree(ptrwrds->lexeme);
/* newwrds points at the last kept entry, so the count is its index + 1 */
491 d->nwrds = newwrds - d->wrds + 1;
492 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
/*
 * compileTheSubstitute: normalize every rule's substitute phrase.
 *
 * For each rule the raw word list built by addWrd() is replaced by a new
 * array.  Words flagged DT_USEASIS are not lexized; all others go through
 * the subdictionary's lexize function.  The normalized lexemes are copied
 * into the new array and the rule's reslen is set to the number of entries.
 *
 * Raises ERRCODE_CONFIG_FILE_ERROR when a substitute word is a stop word,
 * is unknown to the subdictionary, or when the phrase ends up empty.
 */
497 compileTheSubstitute(DictThesaurus *d)
501 for (i = 0; i < d->nsubst; i++)
/* remember the raw list, then start a fresh output array in its place */
503 TSLexeme *rem = d->subst[i].res,
508 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
509 outptr->lexeme = NULL;
512 while (inptr && inptr->lexeme)
517 if (inptr->flags & DT_USEASIS)
518 { /* do not lexize */
521 tmplex[1].lexeme = NULL;
526 lexized = (TSLexeme *) DatumGetPointer(
528 &(d->subdict->lexize),
529 PointerGetDatum(d->subdict->dictData),
530 PointerGetDatum(inptr->lexeme),
531 Int32GetDatum(strlen(inptr->lexeme)),
532 PointerGetDatum(NULL)
537 if (lexized && lexized->lexeme)
/* toset: index of this word's first output entry, or -1 when it starts the output array */
539 int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
541 while (lexized->lexeme)
/* grow the output; outptr is rebuilt from its offset because repalloc may move the array */
543 if (outptr - d->subst[i].res + 1 >= n)
545 int diff = outptr - d->subst[i].res;
548 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
549 outptr = d->subst[i].res + diff;
553 outptr->lexeme = pstrdup(lexized->lexeme);
/* NOTE(review): presumably guarded by toset >= 0; the guard is not visible */
560 d->subst[i].res[toset].flags |= TSL_ADDPOS;
/* rule numbers in messages are 1-based, hence i + 1 */
565 (errcode(ERRCODE_CONFIG_FILE_ERROR),
566 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
567 inptr->lexeme, i + 1)));
572 (errcode(ERRCODE_CONFIG_FILE_ERROR),
573 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
574 inptr->lexeme, i + 1)));
/* the raw word text is released here */
578 pfree(inptr->lexeme);
/* nothing was emitted for this rule */
582 if (outptr == d->subst[i].res)
584 (errcode(ERRCODE_CONFIG_FILE_ERROR),
585 errmsg("thesaurus substitute phrase is empty (rule %d)",
588 d->subst[i].reslen = outptr - d->subst[i].res;
/*
 * thesaurus_init: build a DictThesaurus from the dictionary options.
 *
 * Argument 0 is a List of DefElem options.  Recognized names, compared
 * case-insensitively:
 *   DictFile   - thesaurus file, loaded with thesaurusRead()
 *   Dictionary - name of the subdictionary used to normalize words
 * Both are required and each may appear only once; any other option name
 * is rejected.  All option errors use ERRCODE_INVALID_PARAMETER_VALUE.
 *
 * Returns the initialized DictThesaurus as a pointer.
 */
595 thesaurus_init(PG_FUNCTION_ARGS)
597 List *dictoptions = (List *) PG_GETARG_POINTER(0);
599 char *subdictname = NULL;
600 bool fileloaded = false;
/* zero-filled, so counters and pointers in d start at 0 / NULL */
603 d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
605 foreach(l, dictoptions)
607 DefElem *defel = (DefElem *) lfirst(l);
609 if (pg_strcasecmp("DictFile", defel->defname) == 0)
613 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
614 errmsg("multiple DictFile parameters")));
615 thesaurusRead(defGetString(defel), d);
618 else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
622 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
623 errmsg("multiple Dictionary parameters")));
624 subdictname = pstrdup(defGetString(defel));
629 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
630 errmsg("unrecognized Thesaurus parameter: \"%s\"",
/* both options are mandatory */
637 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
638 errmsg("missing DictFile parameter")));
641 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
642 errmsg("missing Dictionary parameter")));
/* resolve the subdictionary by (possibly qualified) name and fetch its cache entry */
644 d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
645 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
648 compileTheSubstitute(d);
650 PG_RETURN_POINTER(d);
/*
 * findTheLexeme: look up a word in d->wrds with bsearch(), comparing by
 * lexeme text only (cmpLexemeQ).  The array must already be in the order
 * produced by compileTheLexeme().  thesaurus_lexize() also calls this with
 * lexeme == NULL.
 */
654 findTheLexeme(DictThesaurus *d, char *lexeme)
665 res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
/*
 * matchIdSubst: scan the nextvariant chain starting at 'stored' for a
 * record belonging to rule 'idsubst'.  findVariant() uses the result as a
 * boolean.
 */
673 matchIdSubst(LexemeInfo *stored, uint16 idsubst)
681 for (; stored; stored = stored->nextvariant)
682 if (stored->idsubst == idsubst)
/*
 * findVariant: find rules in which every lexeme of the current variant
 * occurs at position curpos.
 *
 * in      - matches accumulated so far for this word (may be NULL)
 * stored  - matches carried over from the previous word
 * curpos  - position in the sample phrase being tested
 * newin   - one occurrence chain per lexeme of the variant; the chain
 *           heads are advanced in place while searching
 * newn    - number of chains in newin
 *
 * A rule is accepted when all newn chains hold a record for it with
 * posinsubst == curpos and tnvariant == newn, matchIdSubst(stored, ...)
 * succeeds for it, and it is not already present in 'in'.  Accepted
 * records are pushed onto the front of the 'in' chain via nextvariant.
 *
 * NOTE(review): the loop structure and return statements are only partly
 * visible here; the chains appear to be expected in ascending idsubst
 * order - confirm against the sort in compileTheLexeme().
 */
693 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
/* candidate rule: whatever the first chain currently points at */
698 LexemeInfo *ptr = newin[0];
700 for (i = 0; i < newn; i++)
/* skip records of rules numbered below the candidate */
702 while (newin[i] && newin[i]->idsubst < ptr->idsubst)
703 newin[i] = newin[i]->nextentry;
/* chain exhausted */
705 if (newin[i] == NULL)
/* this chain has no record for the candidate rule */
708 if (newin[i]->idsubst > ptr->idsubst)
/* within the candidate rule, look for the right position and variant size */
715 while (newin[i]->idsubst == ptr->idsubst)
717 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
723 newin[i] = newin[i]->nextentry;
724 if (newin[i] == NULL)
728 if (newin[i]->idsubst != ptr->idsubst)
/* i == newn means no chain rejected the candidate */
736 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
739 ptr->nextvariant = in;
/* step every chain forward before the next candidate */
744 for (i = 0; i < newn; i++)
745 newin[i] = newin[i]->nextentry;
/*
 * copyTSLexeme: make a palloc'd copy of a rule's prepared result.
 *
 * The copy has ts->reslen entries plus a terminating entry whose lexeme is
 * NULL, and each lexeme string is duplicated with pstrdup.
 */
752 copyTSLexeme(TheSubstitute *ts)
/* one extra slot for the NULL terminator */
757 res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
758 for (i = 0; i < ts->reslen; i++)
761 res[i].lexeme = pstrdup(ts->res[i].lexeme);
764 res[ts->reslen].lexeme = NULL;
/*
 * checkMatch: test whether any candidate in the 'info' chain is a rule
 * whose sample phrase ends at curpos.
 *
 * Returns a fresh copy of that rule's substitute (copyTSLexeme) for the
 * first such candidate.  *moreres is an output flag; the statement that
 * sets it is not among the visible lines (presumably under the nextvariant
 * test below).
 */
770 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
775 Assert(info->idsubst < d->nsubst);
776 if (info->nextvariant)
/* lastlexeme is the index of the rule's final sample word */
778 if (d->subst[info->idsubst].lastlexeme == curpos)
779 return copyTSLexeme(d->subst + info->idsubst);
780 info = info->nextvariant;
/*
 * thesaurus_lexize: lexize entry point of the thesaurus dictionary.
 *
 * Argument 0 is the DictThesaurus and argument 3 a DictSubState that
 * carries the matching state between calls: private_state holds the
 * candidate chain left by the previous word, and getnext is set on return
 * to tell the caller whether a further word is wanted.
 *
 * The input word is normalized by the subdictionary, the resulting lexemes
 * are looked up among the compiled sample words, and findVariant() narrows
 * the candidate rules.  When a rule's sample phrase is complete, its
 * substitute is returned; otherwise NULL is returned.
 */
787 thesaurus_lexize(PG_FUNCTION_ARGS)
789 DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
790 DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
791 TSLexeme *res = NULL;
795 bool moreres = false;
/* this function cannot run without the state argument */
797 if (PG_NARGS() != 4 || dstate == NULL)
798 elog(ERROR, "forbidden call of thesaurus or nested call");
/* early exit; its guard is not among the visible lines */
801 PG_RETURN_POINTER(NULL);
/* candidates surviving from the previous word, if any */
802 stored = (LexemeInfo *) dstate->private_state;
/* the current word is tested at the position after the stored match */
805 curpos = stored->posinsubst + 1;
/* re-fetch the subdictionary if its cache entry was invalidated */
807 if (!d->subdict->isvalid)
808 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
810 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
811 PointerGetDatum(d->subdict->dictData),
814 PointerGetDatum(NULL)));
/* subdictionary produced at least one lexeme */
816 if (res && res->lexeme)
/* process one variant (run of entries with equal nvariant) at a time */
823 uint16 nv = ptr->nvariant;
829 while (ptr->lexeme && nv == ptr->nvariant)
/* look up every lexeme of the variant among the compiled sample words */
835 infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
836 for (i = 0; i < nlex; i++)
837 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
842 /* no chance to find */
847 info = findVariant(info, stored, curpos, infos, nlex);
/* NULL-lexeme lookup: matches entries created for the "?" stop-word marker */
852 LexemeInfo *infos = findTheLexeme(d, NULL);
854 info = findVariant(NULL, stored, curpos, &infos, 1);
858 info = NULL; /* word isn't recognized */
/* remember the surviving candidates for the next call */
861 dstate->private_state = (void *) info;
/* NOTE(review): presumably taken when info is NULL; the guard is not visible */
865 dstate->getnext = false;
866 PG_RETURN_POINTER(NULL);
/* a rule is complete at curpos: return its substitute */
869 if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
871 dstate->getnext = moreres;
872 PG_RETURN_POINTER(res);
/* candidates remain but none is complete: ask for the next word */
875 dstate->getnext = true;
877 PG_RETURN_POINTER(NULL);