]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/dict_thesaurus.c
e0d21c2ad61fbd075570c0238f59d5978d95dba0
[postgresql] / src / backend / tsearch / dict_thesaurus.c
1 /*-------------------------------------------------------------------------
2  *
3  * dict_thesaurus.c
4  *              Thesaurus dictionary: phrase to phrase substitution
5  *
6  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        src/backend/tsearch/dict_thesaurus.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/builtins.h"
22
23
/*
 * Temporarily, we use a bit of TSLexeme.flags for internal purposes:
 * DT_USEASIS marks a substitute word that must not be passed to the
 * subdictionary.
 */
27 #define DT_USEASIS              0x1000
28
29 typedef struct LexemeInfo
30 {
31         uint16          idsubst;                /* entry's number in DictThesaurus->subst */
32         uint16          posinsubst;             /* pos info in entry */
33         uint16          tnvariant;              /* total num lexemes in one variant */
34         struct LexemeInfo *nextentry;
35         struct LexemeInfo *nextvariant;
36 } LexemeInfo;
37
38 typedef struct
39 {
40         char       *lexeme;
41         LexemeInfo *entries;
42 } TheLexeme;
43
44 typedef struct
45 {
46         uint16          lastlexeme;             /* number lexemes to substitute */
47         uint16          reslen;
48         TSLexeme   *res;                        /* prepared substituted result */
49 } TheSubstitute;
50
51 typedef struct
52 {
53         /* subdictionary to normalize lexemes */
54         Oid                     subdictOid;
55         TSDictionaryCacheEntry *subdict;
56
57         /* Array to search lexeme by exact match */
58         TheLexeme  *wrds;
59         int                     nwrds;                  /* current number of words */
60         int                     ntwrds;                 /* allocated array length */
61
62         /*
63          * Storage of substituted result, n-th element is for n-th expression
64          */
65         TheSubstitute *subst;
66         int                     nsubst;
67 } DictThesaurus;
68
69
70 static void
71 newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
72 {
73         TheLexeme  *ptr;
74
75         if (d->nwrds >= d->ntwrds)
76         {
77                 if (d->ntwrds == 0)
78                 {
79                         d->ntwrds = 16;
80                         d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
81                 }
82                 else
83                 {
84                         d->ntwrds *= 2;
85                         d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
86                 }
87         }
88
89         ptr = d->wrds + d->nwrds;
90         d->nwrds++;
91
92         ptr->lexeme = palloc(e - b + 1);
93
94         memcpy(ptr->lexeme, b, e - b);
95         ptr->lexeme[e - b] = '\0';
96
97         ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
98
99         ptr->entries->nextentry = NULL;
100         ptr->entries->idsubst = idsubst;
101         ptr->entries->posinsubst = posinsubst;
102 }
103
104 static void
105 addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
106 {
107         static int      nres = 0;
108         static int      ntres = 0;
109         TheSubstitute *ptr;
110
111         if (nwrd == 0)
112         {
113                 nres = ntres = 0;
114
115                 if (idsubst >= d->nsubst)
116                 {
117                         if (d->nsubst == 0)
118                         {
119                                 d->nsubst = 16;
120                                 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
121                         }
122                         else
123                         {
124                                 d->nsubst *= 2;
125                                 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
126                         }
127                 }
128         }
129
130         ptr = d->subst + idsubst;
131
132         ptr->lastlexeme = posinsubst - 1;
133
134         if (nres + 1 >= ntres)
135         {
136                 if (ntres == 0)
137                 {
138                         ntres = 2;
139                         ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
140                 }
141                 else
142                 {
143                         ntres *= 2;
144                         ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
145                 }
146
147         }
148
149         ptr->res[nres].lexeme = palloc(e - b + 1);
150         memcpy(ptr->res[nres].lexeme, b, e - b);
151         ptr->res[nres].lexeme[e - b] = '\0';
152
153         ptr->res[nres].nvariant = nwrd;
154         if (useasis)
155                 ptr->res[nres].flags = DT_USEASIS;
156         else
157                 ptr->res[nres].flags = 0;
158
159         ptr->res[++nres].lexeme = NULL;
160 }
161
/*
 * States of the line parser in thesaurusRead()
 */
enum
{
	TR_WAITLEX = 1,				/* before the ':', between sample words */
	TR_INLEX = 2,				/* inside a sample word */
	TR_WAITSUBS = 3,			/* after the ':', between substitute words */
	TR_INSUBS = 4				/* inside a substitute word */
};
166
167 static void
168 thesaurusRead(char *filename, DictThesaurus *d)
169 {
170         tsearch_readline_state trst;
171         uint16          idsubst = 0;
172         bool            useasis = false;
173         char       *line;
174
175         filename = get_tsearch_config_filename(filename, "ths");
176         if (!tsearch_readline_begin(&trst, filename))
177                 ereport(ERROR,
178                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
179                                  errmsg("could not open thesaurus file \"%s\": %m",
180                                                 filename)));
181
182         while ((line = tsearch_readline(&trst)) != NULL)
183         {
184                 char       *ptr;
185                 int                     state = TR_WAITLEX;
186                 char       *beginwrd = NULL;
187                 uint16          posinsubst = 0;
188                 uint16          nwrd = 0;
189
190                 ptr = line;
191
192                 /* is it a comment? */
193                 while (*ptr && t_isspace(ptr))
194                         ptr += pg_mblen(ptr);
195
196                 if (t_iseq(ptr, '#') || *ptr == '\0' ||
197                         t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
198                 {
199                         pfree(line);
200                         continue;
201                 }
202
203                 while (*ptr)
204                 {
205                         if (state == TR_WAITLEX)
206                         {
207                                 if (t_iseq(ptr, ':'))
208                                 {
209                                         if (posinsubst == 0)
210                                                 ereport(ERROR,
211                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
212                                                                  errmsg("unexpected delimiter")));
213                                         state = TR_WAITSUBS;
214                                 }
215                                 else if (!t_isspace(ptr))
216                                 {
217                                         beginwrd = ptr;
218                                         state = TR_INLEX;
219                                 }
220                         }
221                         else if (state == TR_INLEX)
222                         {
223                                 if (t_iseq(ptr, ':'))
224                                 {
225                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
226                                         state = TR_WAITSUBS;
227                                 }
228                                 else if (t_isspace(ptr))
229                                 {
230                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
231                                         state = TR_WAITLEX;
232                                 }
233                         }
234                         else if (state == TR_WAITSUBS)
235                         {
236                                 if (t_iseq(ptr, '*'))
237                                 {
238                                         useasis = true;
239                                         state = TR_INSUBS;
240                                         beginwrd = ptr + pg_mblen(ptr);
241                                 }
242                                 else if (t_iseq(ptr, '\\'))
243                                 {
244                                         useasis = false;
245                                         state = TR_INSUBS;
246                                         beginwrd = ptr + pg_mblen(ptr);
247                                 }
248                                 else if (!t_isspace(ptr))
249                                 {
250                                         useasis = false;
251                                         beginwrd = ptr;
252                                         state = TR_INSUBS;
253                                 }
254                         }
255                         else if (state == TR_INSUBS)
256                         {
257                                 if (t_isspace(ptr))
258                                 {
259                                         if (ptr == beginwrd)
260                                                 ereport(ERROR,
261                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
262                                                                  errmsg("unexpected end of line or lexeme")));
263                                         addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
264                                         state = TR_WAITSUBS;
265                                 }
266                         }
267                         else
268                                 elog(ERROR, "unrecognized thesaurus state: %d", state);
269
270                         ptr += pg_mblen(ptr);
271                 }
272
273                 if (state == TR_INSUBS)
274                 {
275                         if (ptr == beginwrd)
276                                 ereport(ERROR,
277                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
278                                                  errmsg("unexpected end of line or lexeme")));
279                         addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
280                 }
281
282                 idsubst++;
283
284                 if (!(nwrd && posinsubst))
285                         ereport(ERROR,
286                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
287                                          errmsg("unexpected end of line")));
288
289                 pfree(line);
290         }
291
292         d->nsubst = idsubst;
293
294         tsearch_readline_end(&trst);
295 }
296
297 static TheLexeme *
298 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
299 {
300         if (*nnw >= *tnm)
301         {
302                 *tnm *= 2;
303                 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
304         }
305
306         newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
307
308         if (lexeme && lexeme->lexeme)
309         {
310                 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
311                 newwrds[*nnw].entries->tnvariant = tnvariant;
312         }
313         else
314         {
315                 newwrds[*nnw].lexeme = NULL;
316                 newwrds[*nnw].entries->tnvariant = 1;
317         }
318
319         newwrds[*nnw].entries->idsubst = src->idsubst;
320         newwrds[*nnw].entries->posinsubst = src->posinsubst;
321
322         newwrds[*nnw].entries->nextentry = NULL;
323
324         (*nnw)++;
325         return newwrds;
326 }
327
328 static int
329 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
330 {
331         if (a == NULL || b == NULL)
332                 return 0;
333
334         if (a->idsubst == b->idsubst)
335         {
336                 if (a->posinsubst == b->posinsubst)
337                 {
338                         if (a->tnvariant == b->tnvariant)
339                                 return 0;
340
341                         return (a->tnvariant > b->tnvariant) ? 1 : -1;
342                 }
343
344                 return (a->posinsubst > b->posinsubst) ? 1 : -1;
345         }
346
347         return (a->idsubst > b->idsubst) ? 1 : -1;
348 }
349
350 static int
351 cmpLexeme(const TheLexeme *a, const TheLexeme *b)
352 {
353         if (a->lexeme == NULL)
354         {
355                 if (b->lexeme == NULL)
356                         return 0;
357                 else
358                         return 1;
359         }
360         else if (b->lexeme == NULL)
361                 return -1;
362
363         return strcmp(a->lexeme, b->lexeme);
364 }
365
366 static int
367 cmpLexemeQ(const void *a, const void *b)
368 {
369         return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
370 }
371
372 static int
373 cmpTheLexeme(const void *a, const void *b)
374 {
375         const TheLexeme  *la = (const TheLexeme *) a;
376         const TheLexeme  *lb = (const TheLexeme *) b;
377         int                     res;
378
379         if ((res = cmpLexeme(la, lb)) != 0)
380                 return res;
381
382         return -cmpLexemeInfo(la->entries, lb->entries);
383 }
384
385 static void
386 compileTheLexeme(DictThesaurus *d)
387 {
388         int                     i,
389                                 nnw = 0,
390                                 tnm = 16;
391         TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
392                            *ptrwrds;
393
394         for (i = 0; i < d->nwrds; i++)
395         {
396                 TSLexeme   *ptr;
397
398                 if (strcmp(d->wrds[i].lexeme, "?") == 0)                /* Is stop word marker? */
399                         newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
400                 else
401                 {
402                         ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
403                                                                            PointerGetDatum(d->subdict->dictData),
404                                                                                   PointerGetDatum(d->wrds[i].lexeme),
405                                                                         Int32GetDatum(strlen(d->wrds[i].lexeme)),
406                                                                                                          PointerGetDatum(NULL)));
407
408                         if (!ptr)
409                                 ereport(ERROR,
410                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
411                                                  errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
412                                                                 d->wrds[i].lexeme,
413                                                                 d->wrds[i].entries->idsubst + 1)));
414                         else if (!(ptr->lexeme))
415                                 ereport(ERROR,
416                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
417                                                  errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
418                                                                 d->wrds[i].lexeme,
419                                                                 d->wrds[i].entries->idsubst + 1),
420                                                  errhint("Use \"?\" to represent a stop word within a sample phrase.")));
421                         else
422                         {
423                                 while (ptr->lexeme)
424                                 {
425                                         TSLexeme   *remptr = ptr + 1;
426                                         int                     tnvar = 1;
427                                         int                     curvar = ptr->nvariant;
428
429                                         /* compute n words in one variant */
430                                         while (remptr->lexeme)
431                                         {
432                                                 if (remptr->nvariant != (remptr - 1)->nvariant)
433                                                         break;
434                                                 tnvar++;
435                                                 remptr++;
436                                         }
437
438                                         remptr = ptr;
439                                         while (remptr->lexeme && remptr->nvariant == curvar)
440                                         {
441                                                 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
442                                                 remptr++;
443                                         }
444
445                                         ptr = remptr;
446                                 }
447                         }
448                 }
449
450                 pfree(d->wrds[i].lexeme);
451                 pfree(d->wrds[i].entries);
452         }
453
454         if (d->wrds)
455                 pfree(d->wrds);
456         d->wrds = newwrds;
457         d->nwrds = nnw;
458         d->ntwrds = tnm;
459
460         if (d->nwrds > 1)
461         {
462                 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
463
464                 /* uniq */
465                 newwrds = d->wrds;
466                 ptrwrds = d->wrds + 1;
467                 while (ptrwrds - d->wrds < d->nwrds)
468                 {
469                         if (cmpLexeme(ptrwrds, newwrds) == 0)
470                         {
471                                 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
472                                 {
473                                         ptrwrds->entries->nextentry = newwrds->entries;
474                                         newwrds->entries = ptrwrds->entries;
475                                 }
476                                 else
477                                         pfree(ptrwrds->entries);
478
479                                 if (ptrwrds->lexeme)
480                                         pfree(ptrwrds->lexeme);
481                         }
482                         else
483                         {
484                                 newwrds++;
485                                 *newwrds = *ptrwrds;
486                         }
487
488                         ptrwrds++;
489                 }
490
491                 d->nwrds = newwrds - d->wrds + 1;
492                 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
493         }
494 }
495
496 static void
497 compileTheSubstitute(DictThesaurus *d)
498 {
499         int                     i;
500
501         for (i = 0; i < d->nsubst; i++)
502         {
503                 TSLexeme   *rem = d->subst[i].res,
504                                    *outptr,
505                                    *inptr;
506                 int                     n = 2;
507
508                 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
509                 outptr->lexeme = NULL;
510                 inptr = rem;
511
512                 while (inptr && inptr->lexeme)
513                 {
514                         TSLexeme   *lexized,
515                                                 tmplex[2];
516
517                         if (inptr->flags & DT_USEASIS)
518                         {                                       /* do not lexize */
519                                 tmplex[0] = *inptr;
520                                 tmplex[0].flags = 0;
521                                 tmplex[1].lexeme = NULL;
522                                 lexized = tmplex;
523                         }
524                         else
525                         {
526                                 lexized = (TSLexeme *) DatumGetPointer(
527                                                                                                            FunctionCall4(
528                                                                                                            &(d->subdict->lexize),
529                                                                            PointerGetDatum(d->subdict->dictData),
530                                                                                           PointerGetDatum(inptr->lexeme),
531                                                                                 Int32GetDatum(strlen(inptr->lexeme)),
532                                                                                                                 PointerGetDatum(NULL)
533                                                                                                                                          )
534                                         );
535                         }
536
537                         if (lexized && lexized->lexeme)
538                         {
539                                 int                     toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
540
541                                 while (lexized->lexeme)
542                                 {
543                                         if (outptr - d->subst[i].res + 1 >= n)
544                                         {
545                                                 int                     diff = outptr - d->subst[i].res;
546
547                                                 n *= 2;
548                                                 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
549                                                 outptr = d->subst[i].res + diff;
550                                         }
551
552                                         *outptr = *lexized;
553                                         outptr->lexeme = pstrdup(lexized->lexeme);
554
555                                         outptr++;
556                                         lexized++;
557                                 }
558
559                                 if (toset > 0)
560                                         d->subst[i].res[toset].flags |= TSL_ADDPOS;
561                         }
562                         else if (lexized)
563                         {
564                                 ereport(ERROR,
565                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
566                                                  errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
567                                                                 inptr->lexeme, i + 1)));
568                         }
569                         else
570                         {
571                                 ereport(ERROR,
572                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
573                                                  errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
574                                                                 inptr->lexeme, i + 1)));
575                         }
576
577                         if (inptr->lexeme)
578                                 pfree(inptr->lexeme);
579                         inptr++;
580                 }
581
582                 if (outptr == d->subst[i].res)
583                         ereport(ERROR,
584                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
585                                          errmsg("thesaurus substitute phrase is empty (rule %d)",
586                                                         i + 1)));
587
588                 d->subst[i].reslen = outptr - d->subst[i].res;
589
590                 pfree(rem);
591         }
592 }
593
594 Datum
595 thesaurus_init(PG_FUNCTION_ARGS)
596 {
597         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
598         DictThesaurus *d;
599         char       *subdictname = NULL;
600         bool            fileloaded = false;
601         ListCell   *l;
602
603         d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
604
605         foreach(l, dictoptions)
606         {
607                 DefElem    *defel = (DefElem *) lfirst(l);
608
609                 if (pg_strcasecmp("DictFile", defel->defname) == 0)
610                 {
611                         if (fileloaded)
612                                 ereport(ERROR,
613                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
614                                                  errmsg("multiple DictFile parameters")));
615                         thesaurusRead(defGetString(defel), d);
616                         fileloaded = true;
617                 }
618                 else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
619                 {
620                         if (subdictname)
621                                 ereport(ERROR,
622                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
623                                                  errmsg("multiple Dictionary parameters")));
624                         subdictname = pstrdup(defGetString(defel));
625                 }
626                 else
627                 {
628                         ereport(ERROR,
629                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
630                                          errmsg("unrecognized Thesaurus parameter: \"%s\"",
631                                                         defel->defname)));
632                 }
633         }
634
635         if (!fileloaded)
636                 ereport(ERROR,
637                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
638                                  errmsg("missing DictFile parameter")));
639         if (!subdictname)
640                 ereport(ERROR,
641                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
642                                  errmsg("missing Dictionary parameter")));
643
644         d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
645         d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
646
647         compileTheLexeme(d);
648         compileTheSubstitute(d);
649
650         PG_RETURN_POINTER(d);
651 }
652
653 static LexemeInfo *
654 findTheLexeme(DictThesaurus *d, char *lexeme)
655 {
656         TheLexeme       key,
657                            *res;
658
659         if (d->nwrds == 0)
660                 return NULL;
661
662         key.lexeme = lexeme;
663         key.entries = NULL;
664
665         res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
666
667         if (res == NULL)
668                 return NULL;
669         return res->entries;
670 }
671
672 static bool
673 matchIdSubst(LexemeInfo *stored, uint16 idsubst)
674 {
675         bool            res = true;
676
677         if (stored)
678         {
679                 res = false;
680
681                 for (; stored; stored = stored->nextvariant)
682                         if (stored->idsubst == idsubst)
683                         {
684                                 res = true;
685                                 break;
686                         }
687         }
688
689         return res;
690 }
691
692 static LexemeInfo *
693 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
694 {
695         for (;;)
696         {
697                 int                     i;
698                 LexemeInfo *ptr = newin[0];
699
700                 for (i = 0; i < newn; i++)
701                 {
702                         while (newin[i] && newin[i]->idsubst < ptr->idsubst)
703                                 newin[i] = newin[i]->nextentry;
704
705                         if (newin[i] == NULL)
706                                 return in;
707
708                         if (newin[i]->idsubst > ptr->idsubst)
709                         {
710                                 ptr = newin[i];
711                                 i = -1;
712                                 continue;
713                         }
714
715                         while (newin[i]->idsubst == ptr->idsubst)
716                         {
717                                 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
718                                 {
719                                         ptr = newin[i];
720                                         break;
721                                 }
722
723                                 newin[i] = newin[i]->nextentry;
724                                 if (newin[i] == NULL)
725                                         return in;
726                         }
727
728                         if (newin[i]->idsubst != ptr->idsubst)
729                         {
730                                 ptr = newin[i];
731                                 i = -1;
732                                 continue;
733                         }
734                 }
735
736                 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
737                 {                                               /* found */
738
739                         ptr->nextvariant = in;
740                         in = ptr;
741                 }
742
743                 /* step forward */
744                 for (i = 0; i < newn; i++)
745                         newin[i] = newin[i]->nextentry;
746         }
747
748         return NULL;
749 }
750
751 static TSLexeme *
752 copyTSLexeme(TheSubstitute *ts)
753 {
754         TSLexeme   *res;
755         uint16          i;
756
757         res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
758         for (i = 0; i < ts->reslen; i++)
759         {
760                 res[i] = ts->res[i];
761                 res[i].lexeme = pstrdup(ts->res[i].lexeme);
762         }
763
764         res[ts->reslen].lexeme = NULL;
765
766         return res;
767 }
768
769 static TSLexeme *
770 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
771 {
772         *moreres = false;
773         while (info)
774         {
775                 Assert(info->idsubst < d->nsubst);
776                 if (info->nextvariant)
777                         *moreres = true;
778                 if (d->subst[info->idsubst].lastlexeme == curpos)
779                         return copyTSLexeme(d->subst + info->idsubst);
780                 info = info->nextvariant;
781         }
782
783         return NULL;
784 }
785
786 Datum
787 thesaurus_lexize(PG_FUNCTION_ARGS)
788 {
789         DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
790         DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
791         TSLexeme   *res = NULL;
792         LexemeInfo *stored,
793                            *info = NULL;
794         uint16          curpos = 0;
795         bool            moreres = false;
796
797         if (PG_NARGS() != 4 || dstate == NULL)
798                 elog(ERROR, "forbidden call of thesaurus or nested call");
799
800         if (dstate->isend)
801                 PG_RETURN_POINTER(NULL);
802         stored = (LexemeInfo *) dstate->private_state;
803
804         if (stored)
805                 curpos = stored->posinsubst + 1;
806
807         if (!d->subdict->isvalid)
808                 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
809
810         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
811                                                                            PointerGetDatum(d->subdict->dictData),
812                                                                                                          PG_GETARG_DATUM(1),
813                                                                                                          PG_GETARG_DATUM(2),
814                                                                                                          PointerGetDatum(NULL)));
815
816         if (res && res->lexeme)
817         {
818                 TSLexeme   *ptr = res,
819                                    *basevar;
820
821                 while (ptr->lexeme)
822                 {
823                         uint16          nv = ptr->nvariant;
824                         uint16          i,
825                                                 nlex = 0;
826                         LexemeInfo **infos;
827
828                         basevar = ptr;
829                         while (ptr->lexeme && nv == ptr->nvariant)
830                         {
831                                 nlex++;
832                                 ptr++;
833                         }
834
835                         infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
836                         for (i = 0; i < nlex; i++)
837                                 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
838                                         break;
839
840                         if (i < nlex)
841                         {
842                                 /* no chance to find */
843                                 pfree(infos);
844                                 continue;
845                         }
846
847                         info = findVariant(info, stored, curpos, infos, nlex);
848                 }
849         }
850         else if (res)
851         {                                                       /* stop-word */
852                 LexemeInfo *infos = findTheLexeme(d, NULL);
853
854                 info = findVariant(NULL, stored, curpos, &infos, 1);
855         }
856         else
857         {
858                 info = NULL;                    /* word isn't recognized */
859         }
860
861         dstate->private_state = (void *) info;
862
863         if (!info)
864         {
865                 dstate->getnext = false;
866                 PG_RETURN_POINTER(NULL);
867         }
868
869         if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
870         {
871                 dstate->getnext = moreres;
872                 PG_RETURN_POINTER(res);
873         }
874
875         dstate->getnext = true;
876
877         PG_RETURN_POINTER(NULL);
878 }