1 /*-------------------------------------------------------------------------
4 * Thesaurus dictionary: phrase to phrase substitution
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.12 2008/06/18 20:55:42 tgl Exp $
12 *-------------------------------------------------------------------------
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
26 * Temporarily we use TSLexeme.flags for internal use...
/* Bit kept in TSLexeme.flags: addWrd stores it on a substitute word and compileTheSubstitute tests it to skip lexizing that word. */
28 #define DT_USEASIS 0x1000
/*
 * One record attached to a sample lexeme (TheLexeme.entries): it names the
 * substitution rule the lexeme occurs in and its position inside that rule.
 */
30 typedef struct LexemeInfo
32 uint16 idsubst; /* entry's number in DictThesaurus->subst */
33 uint16 posinsubst; /* pos info in entry */
34 uint16 tnvariant; /* total num lexemes in one variant */
/* next record in the chain hanging off the same lexeme; findVariant advances along it */
35 struct LexemeInfo *nextentry;
/* next record in a list of candidate matches; matchIdSubst and checkMatch walk it */
36 struct LexemeInfo *nextvariant;
/*
 * NOTE(review): the enclosing struct header is not visible here; addWrd
 * writes both fields through a TheSubstitute pointer, so they presumably
 * belong to TheSubstitute.
 */
/* addWrd stores posinsubst - 1 here; checkMatch compares it with the current position */
47 uint16 lastlexeme; /* number lexemes to substitute */
/* array of TSLexeme; addWrd and copyTSLexeme write a NULL lexeme after the last element */
49 TSLexeme *res; /* prepared substituted result */
54 /* subdictionary to normalize lexemes */
/* cache entry; thesaurus_lexize looks it up again by subdictOid when isvalid is false */
56 TSDictionaryCacheEntry *subdict;
58 /* Array to search lexeme by exact match */
64 * Storage of substituted result, n-th element is for n-th expression
/*
 * newLexeme
 *		Store one sample word in the d->wrds array.
 *
 * d          - dictionary being built; d->wrds is (re)allocated when
 *              d->nwrds has reached d->ntwrds.
 * b, e       - the word is the byte range [b, e); it is copied and
 *              NUL-terminated, so the caller's buffer is not retained.
 * idsubst    - recorded in the new entry's idsubst.
 * posinsubst - recorded in the new entry's posinsubst.
 */
72 newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
/* no free slot left: allocate or enlarge the array */
76 if (d->nwrds >= d->ntwrds)
81 d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
86 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
/* slot that receives the new word */
90 ptr = d->wrds + d->nwrds;
/* private copy of [b, e) plus the terminating NUL */
93 ptr->lexeme = palloc(e - b + 1);
95 memcpy(ptr->lexeme, b, e - b);
96 ptr->lexeme[e - b] = '\0';
/* the word starts with a single LexemeInfo that has no successor */
98 ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
100 ptr->entries->nextentry = NULL;
101 ptr->entries->idsubst = idsubst;
102 ptr->entries->posinsubst = posinsubst;
/*
 * addWrd
 *		Append one substitute word to d->subst[idsubst].res.
 *
 * d          - dictionary being built; d->subst is (re)allocated when
 *              idsubst is not below d->nsubst.
 * b, e       - the word is the byte range [b, e); it is copied and
 *              NUL-terminated.
 * idsubst    - index of the substitution entry to extend.
 * nwrd       - stored as the word's nvariant.
 * posinsubst - posinsubst - 1 is stored as the entry's lastlexeme.
 * useasis    - presumably selects between the DT_USEASIS and 0 flag
 *              values below; the test itself is not visible here.
 *
 * NOTE(review): ntres (and the nres counter used below) keep their values
 * between calls because they are function-static; where they are reset is
 * not visible in this excerpt -- confirm before reusing this function.
 */
106 addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
109 static int ntres = 0;
/* make sure d->subst has a slot for idsubst */
116 if (idsubst >= d->nsubst)
121 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
126 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
131 ptr = d->subst + idsubst;
133 ptr->lastlexeme = posinsubst - 1;
/* the + 1 leaves room for the NULL-lexeme terminator written at the end */
135 if (nres + 1 >= ntres)
140 ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
145 ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
/* private copy of [b, e) plus the terminating NUL */
150 ptr->res[nres].lexeme = palloc(e - b + 1);
151 memcpy(ptr->res[nres].lexeme, b, e - b);
152 ptr->res[nres].lexeme[e - b] = '\0';
154 ptr->res[nres].nvariant = nwrd;
156 ptr->res[nres].flags = DT_USEASIS;
158 ptr->res[nres].flags = 0;
/* advance the counter and mark the end of the array with a NULL lexeme */
160 ptr->res[++nres].lexeme = NULL;
/* State value of the thesaurusRead line parser; in this state it tests for '*', a backslash, or the first non-space character. */
165 #define TR_WAITSUBS 3
/*
 * thesaurusRead
 *		Read a thesaurus file line by line and load it into d.
 *
 * filename - passed to get_tsearch_config_filename with the "ths"
 *            extension to obtain the file to open.
 * d        - dictionary to fill; sample words go through newLexeme and
 *            substitute words through addWrd.
 *
 * Each line is scanned one character at a time (stepping by pg_mblen) by
 * a small state machine whose states are TR_WAITLEX, TR_INLEX,
 * TR_WAITSUBS and TR_INSUBS.  Malformed input is reported with
 * ERRCODE_CONFIG_FILE_ERROR.
 */
169 thesaurusRead(char *filename, DictThesaurus *d)
171 tsearch_readline_state trst;
173 bool useasis = false;
/* resolve the configured name to a file and open it for line-wise reading */
176 filename = get_tsearch_config_filename(filename, "ths");
177 if (!tsearch_readline_begin(&trst, filename))
179 (errcode(ERRCODE_CONFIG_FILE_ERROR),
180 errmsg("could not open thesaurus file \"%s\": %m",
/* one iteration per line of the file */
183 while ((line = tsearch_readline(&trst)) != NULL)
/* the parser state below is re-initialized for every line */
186 int state = TR_WAITLEX;
187 char *beginwrd = NULL;
188 uint16 posinsubst = 0;
193 /* is it a comment? */
194 while (*ptr && t_isspace(ptr))
195 ptr += pg_mblen(ptr);
/* first non-space character is '#', end of string, or a line terminator */
197 if (t_iseq(ptr, '#') || *ptr == '\0' ||
198 t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
/* TR_WAITLEX: a ':' seen in this state is reported as an unexpected delimiter */
206 if (state == TR_WAITLEX)
208 if (t_iseq(ptr, ':'))
212 (errcode(ERRCODE_CONFIG_FILE_ERROR),
213 errmsg("unexpected delimiter")));
216 else if (!t_isspace(ptr))
/* TR_INLEX: ':' or whitespace ends the sample word that began at beginwrd */
222 else if (state == TR_INLEX)
224 if (t_iseq(ptr, ':'))
226 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
229 else if (t_isspace(ptr))
231 newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
/* TR_WAITSUBS: after a '*' or backslash prefix the word starts one character later */
235 else if (state == TR_WAITSUBS)
237 if (t_iseq(ptr, '*'))
241 beginwrd = ptr + pg_mblen(ptr);
243 else if (t_iseq(ptr, '\\'))
247 beginwrd = ptr + pg_mblen(ptr);
249 else if (!t_isspace(ptr))
/* TR_INSUBS: a finished substitute word is handed to addWrd */
256 else if (state == TR_INSUBS)
262 (errcode(ERRCODE_CONFIG_FILE_ERROR),
263 errmsg("unexpected end of line or lexeme")));
264 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
/* any other state value is an internal error */
269 elog(ERROR, "unrecognized thesaurus state: %d", state);
/* step over one character, which may occupy several bytes */
271 ptr += pg_mblen(ptr);
/* the line ended while a substitute word was still open: store it now */
274 if (state == TR_INSUBS)
278 (errcode(ERRCODE_CONFIG_FILE_ERROR),
279 errmsg("unexpected end of line or lexeme")));
280 addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
/* both counters must be non-zero: at least one sample word and one substitute word */
285 if (!(nwrd && posinsubst))
287 (errcode(ERRCODE_CONFIG_FILE_ERROR),
288 errmsg("unexpected end of line")));
295 tsearch_readline_end(&trst);
/*
 * addCompiledLexeme
 *		Fill slot *nnw of newwrds with a copy of one normalized lexeme.
 *
 * newwrds   - array being built; it may be moved by repalloc, and
 *             compileTheLexeme assigns the function result back to its
 *             own pointer.
 * nnw, tnm  - *nnw indexes the slot written here; *tnm is the element
 *             count used when the array is resized.
 * lexeme    - lexeme to copy; when it is NULL or has a NULL lexeme field
 *             the slot's lexeme is NULL and tnvariant is forced to 1.
 * src       - supplies idsubst and posinsubst for the new entry.
 * tnvariant - stored in the entry when a real lexeme is copied.
 */
299 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
305 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
/* every slot gets its own freshly allocated LexemeInfo */
308 newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
310 if (lexeme && lexeme->lexeme)
312 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
313 newwrds[*nnw].entries->tnvariant = tnvariant;
/* no lexeme text available: store NULL and count it as a one-word variant */
317 newwrds[*nnw].lexeme = NULL;
318 newwrds[*nnw].entries->tnvariant = 1;
/* rule number and position are inherited from the source entry */
321 newwrds[*nnw].entries->idsubst = src->idsubst;
322 newwrds[*nnw].entries->posinsubst = src->posinsubst;
324 newwrds[*nnw].entries->nextentry = NULL;
/*
 * cmpLexemeInfo
 *		Order two LexemeInfo records by idsubst, then posinsubst, then
 *		tnvariant.
 *
 * Returns 1 or -1 at the first field that differs.  The results for a
 * NULL argument and for fully equal records are not visible in this
 * excerpt (presumably 0 for equal records -- confirm).
 */
331 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
333 if (a == NULL || b == NULL)
336 if (a->idsubst == b->idsubst)
338 if (a->posinsubst == b->posinsubst)
340 if (a->tnvariant == b->tnvariant)
/* same rule and position: the variant length decides */
343 return (a->tnvariant > b->tnvariant) ? 1 : -1;
/* same rule: the position decides */
346 return (a->posinsubst > b->posinsubst) ? 1 : -1;
/* different rules: the rule number decides */
349 return (a->idsubst > b->idsubst) ? 1 : -1;
/*
 * cmpLexeme
 *		Compare two TheLexeme items by their lexeme text.
 *
 * A NULL lexeme on either side is handled before strcmp is reached; the
 * values returned in those cases are not visible in this excerpt.
 * Otherwise the result is that of strcmp on the two strings.
 */
353 cmpLexeme(TheLexeme *a, TheLexeme *b)
355 if (a->lexeme == NULL)
357 if (b->lexeme == NULL)
362 else if (b->lexeme == NULL)
/* both strings are present */
365 return strcmp(a->lexeme, b->lexeme);
/*
 * cmpLexemeQ
 *		Adapter giving cmpLexeme the (const void *, const void *) signature;
 *		findTheLexeme passes it to bsearch.
 */
369 cmpLexemeQ(const void *a, const void *b)
371 return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
/*
 * cmpTheLexeme
 *		Comparator passed to qsort by compileTheLexeme.
 *
 * Items are compared by lexeme text first (cmpLexeme).  When the texts are
 * equal the negated cmpLexemeInfo result is returned, i.e. entries of the
 * same lexeme sort in the reverse of cmpLexemeInfo order.
 */
375 cmpTheLexeme(const void *a, const void *b)
377 TheLexeme *la = (TheLexeme *) a;
378 TheLexeme *lb = (TheLexeme *) b;
381 if ((res = cmpLexeme(la, lb)) != 0)
/* texts are equal: fall back to the entry data, with the sign flipped */
384 return -cmpLexemeInfo(la->entries, lb->entries);
/*
 * compileTheLexeme
 *		Normalize every sample word through the subdictionary, then sort
 *		the result and merge items that carry the same lexeme.
 *
 * For each word in d->wrds:
 *   - "?" is the stop-word marker and is added with a NULL lexeme;
 *   - otherwise the subdictionary's lexize function is called; the two
 *     failure cases are reported with ERRCODE_CONFIG_FILE_ERROR
 *     ("isn't recognized by subdictionary" / "is a stop word");
 *   - lexemes returned by the subdictionary are added via
 *     addCompiledLexeme, grouped by their nvariant value.
 * The original word text and entry are freed afterwards.
 */
388 compileTheLexeme(DictThesaurus *d)
393 TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
396 for (i = 0; i < d->nwrds; i++)
400 if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
401 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
/* ask the subdictionary to normalize the sample word */
404 ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
405 PointerGetDatum(d->subdict->dictData),
406 PointerGetDatum(d->wrds[i].lexeme),
407 Int32GetDatum(strlen(d->wrds[i].lexeme)),
408 PointerGetDatum(NULL)));
412 (errcode(ERRCODE_CONFIG_FILE_ERROR),
413 errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
415 d->wrds[i].entries->idsubst + 1)));
/* a result whose first lexeme is NULL is treated as a stop word */
416 else if (!(ptr->lexeme))
418 (errcode(ERRCODE_CONFIG_FILE_ERROR),
419 errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
421 d->wrds[i].entries->idsubst + 1),
422 errhint("Use \"?\" to represent a stop word within a sample phrase.")));
427 TSLexeme *remptr = ptr + 1;
429 int curvar = ptr->nvariant;
431 /* compute n words in one variant */
432 while (remptr->lexeme)
/* a change of nvariant between neighbours marks the end of a variant */
434 if (remptr->nvariant != (remptr - 1)->nvariant)
/* add every lexeme that belongs to the current variant */
441 while (remptr->lexeme && remptr->nvariant == curvar)
443 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
/* the un-normalized word is no longer needed */
452 pfree(d->wrds[i].lexeme);
453 pfree(d->wrds[i].entries);
/* order by lexeme text, then by entry data (see cmpTheLexeme) */
463 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
/*
 * Merge pass: ptrwrds scans the array from its second item while newwrds
 * is compared against it; the final count is computed from
 * newwrds - d->wrds, so newwrds points into d->wrds at this stage.
 */
467 ptrwrds = d->wrds + 1;
468 while (ptrwrds - d->wrds < d->nwrds)
470 if (cmpLexeme(ptrwrds, newwrds) == 0)
/* same text but different entry data: put the entry at the head of the kept item's chain */
472 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
474 ptrwrds->entries->nextentry = newwrds->entries;
475 newwrds->entries = ptrwrds->entries;
478 pfree(ptrwrds->entries);
481 pfree(ptrwrds->lexeme);
/* shrink the array to the number of items kept */
492 d->nwrds = newwrds - d->wrds + 1;
493 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
/*
 * compileTheSubstitute
 *		Rebuild each d->subst[i].res array from its raw substitute words.
 *
 * For every rule a new output array is allocated and the old words are
 * walked in order.  A word flagged DT_USEASIS is not lexized; any other
 * word is passed to the subdictionary's lexize function.  Lexemes that
 * come back are duplicated into the output array, which is enlarged with
 * repalloc when it fills up.  A word that yields nothing usable is
 * reported with ERRCODE_CONFIG_FILE_ERROR ("is a stop word" / "isn't
 * recognized by subdictionary"), and so is a rule whose output ends up
 * empty.  The number of output lexemes is stored in reslen.
 */
498 compileTheSubstitute(DictThesaurus *d)
502 for (i = 0; i < d->nsubst; i++)
/* rem keeps the old array while res is replaced by a new one */
504 TSLexeme *rem = d->subst[i].res,
509 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
510 outptr->lexeme = NULL;
513 while (inptr && inptr->lexeme)
518 if (inptr->flags & DT_USEASIS)
519 { /* do not lexize */
522 tmplex[1].lexeme = NULL;
/* normal case: normalize the word through the subdictionary */
527 lexized = (TSLexeme *) DatumGetPointer(
529 &(d->subdict->lexize),
530 PointerGetDatum(d->subdict->dictData),
531 PointerGetDatum(inptr->lexeme),
532 Int32GetDatum(strlen(inptr->lexeme)),
533 PointerGetDatum(NULL)
538 if (lexized && lexized->lexeme)
/* index of the first lexeme written for this word, or -1 when it is the very first output slot */
540 int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
542 while (lexized->lexeme)
/* keep one spare slot; repalloc may move the array, so outptr is recomputed from its offset */
544 if (outptr - d->subst[i].res + 1 >= n)
546 int diff = outptr - d->subst[i].res;
549 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
550 outptr = d->subst[i].res + diff;
554 outptr->lexeme = pstrdup(lexized->lexeme);
/* flag the slot recorded in toset with TSL_ADDPOS */
561 d->subst[i].res[toset].flags |= TSL_ADDPOS;
566 (errcode(ERRCODE_CONFIG_FILE_ERROR),
567 errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568 inptr->lexeme, i + 1)));
573 (errcode(ERRCODE_CONFIG_FILE_ERROR),
574 errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575 inptr->lexeme, i + 1)));
/* the raw word text has been consumed */
579 pfree(inptr->lexeme);
/* nothing was written for this rule */
583 if (outptr == d->subst[i].res)
585 (errcode(ERRCODE_CONFIG_FILE_ERROR),
586 errmsg("thesaurus substitute phrase is empty (rule %d)",
589 d->subst[i].reslen = outptr - d->subst[i].res;
/*
 * thesaurus_init
 *		Build a DictThesaurus from the dictionary options list.
 *
 * Argument 0 is a List of DefElem options.  Two option names are
 * recognized, compared case-insensitively:
 *   DictFile   - thesaurus file, loaded with thesaurusRead;
 *   Dictionary - name of the subdictionary, copied with pstrdup.
 * A repeated, missing or unknown option is reported with
 * ERRCODE_INVALID_PARAMETER_VALUE.  The subdictionary is then resolved to
 * an OID and fetched from the dictionary cache, the substitutes are
 * compiled, and the new DictThesaurus is returned.
 */
596 thesaurus_init(PG_FUNCTION_ARGS)
598 List *dictoptions = (List *) PG_GETARG_POINTER(0);
600 char *subdictname = NULL;
601 bool fileloaded = false;
/* palloc0 starts every field of the new dictionary at zero */
604 d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
606 foreach(l, dictoptions)
608 DefElem *defel = (DefElem *) lfirst(l);
610 if (pg_strcasecmp("DictFile", defel->defname) == 0)
614 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
615 errmsg("multiple DictFile parameters")));
616 thesaurusRead(defGetString(defel), d);
619 else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
623 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
624 errmsg("multiple Dictionary parameters")));
625 subdictname = pstrdup(defGetString(defel));
630 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
631 errmsg("unrecognized Thesaurus parameter: \"%s\"",
638 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
639 errmsg("missing DictFile parameter")));
642 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
643 errmsg("missing Dictionary parameter")));
/* resolve the subdictionary name to an OID, then fetch its cache entry */
645 d->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
646 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
649 compileTheSubstitute(d);
651 PG_RETURN_POINTER(d);
/*
 * findTheLexeme
 *		Look a lexeme up in d->wrds by binary search, comparing with
 *		cmpLexemeQ.
 *
 * thesaurus_lexize also calls this with lexeme == NULL and stores the
 * result in LexemeInfo pointers, so the function presumably returns the
 * matching item's entries -- the return statement is not visible here.
 */
655 findTheLexeme(DictThesaurus *d, char *lexeme)
666 res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
/*
 * matchIdSubst
 *		Walk the nextvariant list starting at stored, testing each record's
 *		idsubst against the given value.  findVariant uses the result as a
 *		boolean; the return statements are not visible in this excerpt.
 */
674 matchIdSubst(LexemeInfo *stored, uint16 idsubst)
682 for (; stored; stored = stored->nextvariant)
683 if (stored->idsubst == idsubst)
/*
 * findVariant
 *		Search newn entry chains in parallel for records that share one
 *		rule (idsubst) and satisfy the position test below.
 *
 * in     - list (linked through nextvariant) of matches collected so far;
 *          a newly accepted record is linked in front of it.
 * stored - list checked with matchIdSubst; a rule is accepted only when
 *          it is found there.
 * curpos - position that a record's posinsubst must equal.
 * newin  - array of newn chain cursors (linked through nextentry); the
 *          cursors are advanced in place.
 * newn   - number of chains; a record's tnvariant must equal it.
 *
 * NOTE(review): the enclosing loop and the return statements are not
 * visible in this excerpt; thesaurus_lexize assigns the result to its
 * match list.
 */
694 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
/* the first chain's current record supplies the rule number to look for */
699 LexemeInfo *ptr = newin[0];
701 for (i = 0; i < newn; i++)
/* skip records of chain i that belong to lower-numbered rules */
703 while (newin[i] && newin[i]->idsubst < ptr->idsubst)
704 newin[i] = newin[i]->nextentry;
706 if (newin[i] == NULL)
709 if (newin[i]->idsubst > ptr->idsubst)
/* scan the records of this rule for one at curpos with the right variant length */
716 while (newin[i]->idsubst == ptr->idsubst)
718 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
724 newin[i] = newin[i]->nextentry;
725 if (newin[i] == NULL)
729 if (newin[i]->idsubst != ptr->idsubst)
/* all chains agreed, the rule is present in stored, and it is not already listed in in */
737 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
740 ptr->nextvariant = in;
/* move every cursor one record forward */
745 for (i = 0; i < newn; i++)
746 newin[i] = newin[i]->nextentry;
/*
 * copyTSLexeme
 *		Make a freshly allocated copy of a rule's result array.
 *
 * The copy has ts->reslen elements, each with its lexeme string
 * duplicated by pstrdup, followed by one element whose lexeme is NULL.
 * checkMatch returns the value of this function as a TSLexeme array.
 */
753 copyTSLexeme(TheSubstitute *ts)
/* one extra element for the NULL-lexeme terminator */
758 res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
759 for (i = 0; i < ts->reslen; i++)
762 res[i].lexeme = pstrdup(ts->res[i].lexeme);
765 res[ts->reslen].lexeme = NULL;
/*
 * checkMatch
 *		Walk the nextvariant list starting at info and return a copy of the
 *		substitute of the first rule whose lastlexeme equals curpos.
 *
 * d       - dictionary holding the subst array.
 * info    - head of the candidate list.
 * curpos  - position compared with each rule's lastlexeme.
 * moreres - output flag; thesaurus_lexize copies it into dstate->getnext.
 *           Presumably set when a record has a successor -- the
 *           assignment is not visible in this excerpt.
 */
771 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
776 Assert(info->idsubst < d->nsubst);
777 if (info->nextvariant)
/* this rule ends exactly at curpos: hand back a copy of its substitute */
779 if (d->subst[info->idsubst].lastlexeme == curpos)
780 return copyTSLexeme(d->subst + info->idsubst);
781 info = info->nextvariant;
/*
 * thesaurus_lexize
 *		Lexize entry point of the thesaurus dictionary.
 *
 * Argument 0 is the DictThesaurus and argument 3 a DictSubState.  The
 * call is rejected with an error unless exactly four arguments are
 * passed and the DictSubState is non-NULL.
 *
 * dstate->private carries the list of candidate matches from one call to
 * the next.  The input is first normalized by the subdictionary, the
 * resulting lexemes are looked up with findTheLexeme and narrowed with
 * findVariant, and the new candidate list is stored back into
 * dstate->private.  When checkMatch finds a finished rule its substitute
 * is returned; otherwise NULL is returned, and dstate->getnext is set to
 * true or false as shown in the closing statements below.
 */
788 thesaurus_lexize(PG_FUNCTION_ARGS)
790 DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
791 DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
792 TSLexeme *res = NULL;
796 bool moreres = false;
798 if (PG_NARGS() != 4 || dstate == NULL)
799 elog(ERROR, "forbidden call of thesaurus or nested call");
802 PG_RETURN_POINTER(NULL);
/* candidate list left behind by the previous call */
803 stored = (LexemeInfo *) dstate->private;
/* position to match is one past the stored record's position */
806 curpos = stored->posinsubst + 1;
/* refresh the subdictionary cache entry if it has been invalidated */
808 if (!d->subdict->isvalid)
809 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
/* normalize the input through the subdictionary */
811 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
812 PointerGetDatum(d->subdict->dictData),
815 PointerGetDatum(NULL)));
/* the subdictionary produced at least one lexeme */
817 if (res && res->lexeme)
824 uint16 nv = ptr->nvariant;
/* lexemes that share one nvariant value are processed as a group */
830 while (ptr->lexeme && nv == ptr->nvariant)
836 infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
837 for (i = 0; i < nlex; i++)
838 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
843 /* no chance to find */
848 info = findVariant(info, stored, curpos, infos, nlex);
/* separate path that looks up the NULL lexeme (the form compileTheLexeme stores for "?") */
853 LexemeInfo *infos = findTheLexeme(d, NULL);
855 info = findVariant(NULL, stored, curpos, &infos, 1);
859 info = NULL; /* word isn't recognized */
/* remember the new candidate list for the next call */
862 dstate->private = (void *) info;
866 dstate->getnext = false;
867 PG_RETURN_POINTER(NULL);
/* a rule is complete at this position: return its substitute */
870 if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
872 dstate->getnext = moreres;
873 PG_RETURN_POINTER(res);
/* no result yet; getnext is set to true */
876 dstate->getnext = true;
878 PG_RETURN_POINTER(NULL);