]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/dict_thesaurus.c
Implement SEMI and ANTI joins in the planner and executor. (Semijoins replace
[postgresql] / src / backend / tsearch / dict_thesaurus.c
1 /*-------------------------------------------------------------------------
2  *
3  * dict_thesaurus.c
4  *              Thesaurus dictionary: phrase to phrase substitution
5  *
6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.12 2008/06/18 20:55:42 tgl Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15
16 #include "catalog/namespace.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_cache.h"
19 #include "tsearch/ts_locale.h"
20 #include "tsearch/ts_public.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
23
24
/*
 * Temporarily, TSLexeme.flags is reused for internal bookkeeping while the
 * thesaurus is being loaded: DT_USEASIS marks a substitute word that must be
 * stored as written instead of being passed through the subdictionary.
 */
#define DT_USEASIS		0x1000
29
30 typedef struct LexemeInfo
31 {
32         uint16          idsubst;                /* entry's number in DictThesaurus->subst */
33         uint16          posinsubst;             /* pos info in entry */
34         uint16          tnvariant;              /* total num lexemes in one variant */
35         struct LexemeInfo *nextentry;
36         struct LexemeInfo *nextvariant;
37 } LexemeInfo;
38
39 typedef struct
40 {
41         char       *lexeme;
42         LexemeInfo *entries;
43 } TheLexeme;
44
45 typedef struct
46 {
47         uint16          lastlexeme;             /* number lexemes to substitute */
48         uint16          reslen;
49         TSLexeme   *res;                        /* prepared substituted result */
50 } TheSubstitute;
51
52 typedef struct
53 {
54         /* subdictionary to normalize lexemes */
55         Oid                     subdictOid;
56         TSDictionaryCacheEntry *subdict;
57
58         /* Array to search lexeme by exact match */
59         TheLexeme  *wrds;
60         int                     nwrds;
61         int                     ntwrds;
62
63         /*
64          * Storage of substituted result, n-th element is for n-th expression
65          */
66         TheSubstitute *subst;
67         int                     nsubst;
68 } DictThesaurus;
69
70
71 static void
72 newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
73 {
74         TheLexeme  *ptr;
75
76         if (d->nwrds >= d->ntwrds)
77         {
78                 if (d->ntwrds == 0)
79                 {
80                         d->ntwrds = 16;
81                         d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
82                 }
83                 else
84                 {
85                         d->ntwrds *= 2;
86                         d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
87                 }
88         }
89
90         ptr = d->wrds + d->nwrds;
91         d->nwrds++;
92
93         ptr->lexeme = palloc(e - b + 1);
94
95         memcpy(ptr->lexeme, b, e - b);
96         ptr->lexeme[e - b] = '\0';
97
98         ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
99
100         ptr->entries->nextentry = NULL;
101         ptr->entries->idsubst = idsubst;
102         ptr->entries->posinsubst = posinsubst;
103 }
104
105 static void
106 addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
107 {
108         static int      nres = 0;
109         static int      ntres = 0;
110         TheSubstitute *ptr;
111
112         if (nwrd == 0)
113         {
114                 nres = ntres = 0;
115
116                 if (idsubst >= d->nsubst)
117                 {
118                         if (d->nsubst == 0)
119                         {
120                                 d->nsubst = 16;
121                                 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
122                         }
123                         else
124                         {
125                                 d->nsubst *= 2;
126                                 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
127                         }
128                 }
129         }
130
131         ptr = d->subst + idsubst;
132
133         ptr->lastlexeme = posinsubst - 1;
134
135         if (nres + 1 >= ntres)
136         {
137                 if (ntres == 0)
138                 {
139                         ntres = 2;
140                         ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
141                 }
142                 else
143                 {
144                         ntres *= 2;
145                         ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
146                 }
147
148         }
149
150         ptr->res[nres].lexeme = palloc(e - b + 1);
151         memcpy(ptr->res[nres].lexeme, b, e - b);
152         ptr->res[nres].lexeme[e - b] = '\0';
153
154         ptr->res[nres].nvariant = nwrd;
155         if (useasis)
156                 ptr->res[nres].flags = DT_USEASIS;
157         else
158                 ptr->res[nres].flags = 0;
159
160         ptr->res[++nres].lexeme = NULL;
161 }
162
/* States of the line parser in thesaurusRead() */
#define TR_WAITLEX		1		/* between sample words, before the ':' */
#define TR_INLEX		2		/* inside a sample word */
#define TR_WAITSUBS		3		/* between substitute words, after the ':' */
#define TR_INSUBS		4		/* inside a substitute word */
167
168 static void
169 thesaurusRead(char *filename, DictThesaurus *d)
170 {
171         tsearch_readline_state trst;
172         uint16          idsubst = 0;
173         bool            useasis = false;
174         char       *line;
175
176         filename = get_tsearch_config_filename(filename, "ths");
177         if (!tsearch_readline_begin(&trst, filename))
178                 ereport(ERROR,
179                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
180                                  errmsg("could not open thesaurus file \"%s\": %m",
181                                                 filename)));
182
183         while ((line = tsearch_readline(&trst)) != NULL)
184         {
185                 char       *ptr;
186                 int                     state = TR_WAITLEX;
187                 char       *beginwrd = NULL;
188                 uint16          posinsubst = 0;
189                 uint16          nwrd = 0;
190
191                 ptr = line;
192
193                 /* is it a comment? */
194                 while (*ptr && t_isspace(ptr))
195                         ptr += pg_mblen(ptr);
196
197                 if (t_iseq(ptr, '#') || *ptr == '\0' ||
198                         t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
199                 {
200                         pfree(line);
201                         continue;
202                 }
203
204                 while (*ptr)
205                 {
206                         if (state == TR_WAITLEX)
207                         {
208                                 if (t_iseq(ptr, ':'))
209                                 {
210                                         if (posinsubst == 0)
211                                                 ereport(ERROR,
212                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
213                                                                  errmsg("unexpected delimiter")));
214                                         state = TR_WAITSUBS;
215                                 }
216                                 else if (!t_isspace(ptr))
217                                 {
218                                         beginwrd = ptr;
219                                         state = TR_INLEX;
220                                 }
221                         }
222                         else if (state == TR_INLEX)
223                         {
224                                 if (t_iseq(ptr, ':'))
225                                 {
226                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
227                                         state = TR_WAITSUBS;
228                                 }
229                                 else if (t_isspace(ptr))
230                                 {
231                                         newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
232                                         state = TR_WAITLEX;
233                                 }
234                         }
235                         else if (state == TR_WAITSUBS)
236                         {
237                                 if (t_iseq(ptr, '*'))
238                                 {
239                                         useasis = true;
240                                         state = TR_INSUBS;
241                                         beginwrd = ptr + pg_mblen(ptr);
242                                 }
243                                 else if (t_iseq(ptr, '\\'))
244                                 {
245                                         useasis = false;
246                                         state = TR_INSUBS;
247                                         beginwrd = ptr + pg_mblen(ptr);
248                                 }
249                                 else if (!t_isspace(ptr))
250                                 {
251                                         useasis = false;
252                                         beginwrd = ptr;
253                                         state = TR_INSUBS;
254                                 }
255                         }
256                         else if (state == TR_INSUBS)
257                         {
258                                 if (t_isspace(ptr))
259                                 {
260                                         if (ptr == beginwrd)
261                                                 ereport(ERROR,
262                                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
263                                                                  errmsg("unexpected end of line or lexeme")));
264                                         addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
265                                         state = TR_WAITSUBS;
266                                 }
267                         }
268                         else
269                                 elog(ERROR, "unrecognized thesaurus state: %d", state);
270
271                         ptr += pg_mblen(ptr);
272                 }
273
274                 if (state == TR_INSUBS)
275                 {
276                         if (ptr == beginwrd)
277                                 ereport(ERROR,
278                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
279                                                  errmsg("unexpected end of line or lexeme")));
280                         addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
281                 }
282
283                 idsubst++;
284
285                 if (!(nwrd && posinsubst))
286                         ereport(ERROR,
287                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
288                                          errmsg("unexpected end of line")));
289
290                 pfree(line);
291         }
292
293         d->nsubst = idsubst;
294
295         tsearch_readline_end(&trst);
296 }
297
298 static TheLexeme *
299 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
300 {
301
302         if (*nnw >= *tnm)
303         {
304                 *tnm *= 2;
305                 newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
306         }
307
308         newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
309
310         if (lexeme && lexeme->lexeme)
311         {
312                 newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
313                 newwrds[*nnw].entries->tnvariant = tnvariant;
314         }
315         else
316         {
317                 newwrds[*nnw].lexeme = NULL;
318                 newwrds[*nnw].entries->tnvariant = 1;
319         }
320
321         newwrds[*nnw].entries->idsubst = src->idsubst;
322         newwrds[*nnw].entries->posinsubst = src->posinsubst;
323
324         newwrds[*nnw].entries->nextentry = NULL;
325
326         (*nnw)++;
327         return newwrds;
328 }
329
330 static int
331 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
332 {
333         if (a == NULL || b == NULL)
334                 return 0;
335
336         if (a->idsubst == b->idsubst)
337         {
338                 if (a->posinsubst == b->posinsubst)
339                 {
340                         if (a->tnvariant == b->tnvariant)
341                                 return 0;
342
343                         return (a->tnvariant > b->tnvariant) ? 1 : -1;
344                 }
345
346                 return (a->posinsubst > b->posinsubst) ? 1 : -1;
347         }
348
349         return (a->idsubst > b->idsubst) ? 1 : -1;
350 }
351
352 static int
353 cmpLexeme(TheLexeme *a, TheLexeme *b)
354 {
355         if (a->lexeme == NULL)
356         {
357                 if (b->lexeme == NULL)
358                         return 0;
359                 else
360                         return 1;
361         }
362         else if (b->lexeme == NULL)
363                 return -1;
364
365         return strcmp(a->lexeme, b->lexeme);
366 }
367
368 static int
369 cmpLexemeQ(const void *a, const void *b)
370 {
371         return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
372 }
373
374 static int
375 cmpTheLexeme(const void *a, const void *b)
376 {
377         TheLexeme  *la = (TheLexeme *) a;
378         TheLexeme  *lb = (TheLexeme *) b;
379         int                     res;
380
381         if ((res = cmpLexeme(la, lb)) != 0)
382                 return res;
383
384         return -cmpLexemeInfo(la->entries, lb->entries);
385 }
386
387 static void
388 compileTheLexeme(DictThesaurus *d)
389 {
390         int                     i,
391                                 nnw = 0,
392                                 tnm = 16;
393         TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
394                            *ptrwrds;
395
396         for (i = 0; i < d->nwrds; i++)
397         {
398                 TSLexeme   *ptr;
399
400                 if (strcmp(d->wrds[i].lexeme, "?") == 0)                /* Is stop word marker? */
401                         newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
402                 else
403                 {
404                         ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
405                                                                            PointerGetDatum(d->subdict->dictData),
406                                                                                   PointerGetDatum(d->wrds[i].lexeme),
407                                                                         Int32GetDatum(strlen(d->wrds[i].lexeme)),
408                                                                                                          PointerGetDatum(NULL)));
409
410                         if (!ptr)
411                                 ereport(ERROR,
412                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
413                                                  errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
414                                                                 d->wrds[i].lexeme,
415                                                                 d->wrds[i].entries->idsubst + 1)));
416                         else if (!(ptr->lexeme))
417                                 ereport(ERROR,
418                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
419                                                  errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
420                                                                 d->wrds[i].lexeme,
421                                                                 d->wrds[i].entries->idsubst + 1),
422                                                  errhint("Use \"?\" to represent a stop word within a sample phrase.")));
423                         else
424                         {
425                                 while (ptr->lexeme)
426                                 {
427                                         TSLexeme   *remptr = ptr + 1;
428                                         int                     tnvar = 1;
429                                         int                     curvar = ptr->nvariant;
430
431                                         /* compute n words in one variant */
432                                         while (remptr->lexeme)
433                                         {
434                                                 if (remptr->nvariant != (remptr - 1)->nvariant)
435                                                         break;
436                                                 tnvar++;
437                                                 remptr++;
438                                         }
439
440                                         remptr = ptr;
441                                         while (remptr->lexeme && remptr->nvariant == curvar)
442                                         {
443                                                 newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
444                                                 remptr++;
445                                         }
446
447                                         ptr = remptr;
448                                 }
449                         }
450                 }
451
452                 pfree(d->wrds[i].lexeme);
453                 pfree(d->wrds[i].entries);
454         }
455
456         pfree(d->wrds);
457         d->wrds = newwrds;
458         d->nwrds = nnw;
459         d->ntwrds = tnm;
460
461         if (d->nwrds > 1)
462         {
463                 qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
464
465                 /* uniq */
466                 newwrds = d->wrds;
467                 ptrwrds = d->wrds + 1;
468                 while (ptrwrds - d->wrds < d->nwrds)
469                 {
470                         if (cmpLexeme(ptrwrds, newwrds) == 0)
471                         {
472                                 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
473                                 {
474                                         ptrwrds->entries->nextentry = newwrds->entries;
475                                         newwrds->entries = ptrwrds->entries;
476                                 }
477                                 else
478                                         pfree(ptrwrds->entries);
479
480                                 if (ptrwrds->lexeme)
481                                         pfree(ptrwrds->lexeme);
482                         }
483                         else
484                         {
485                                 newwrds++;
486                                 *newwrds = *ptrwrds;
487                         }
488
489                         ptrwrds++;
490                 }
491
492                 d->nwrds = newwrds - d->wrds + 1;
493                 d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
494         }
495 }
496
497 static void
498 compileTheSubstitute(DictThesaurus *d)
499 {
500         int                     i;
501
502         for (i = 0; i < d->nsubst; i++)
503         {
504                 TSLexeme   *rem = d->subst[i].res,
505                                    *outptr,
506                                    *inptr;
507                 int                     n = 2;
508
509                 outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
510                 outptr->lexeme = NULL;
511                 inptr = rem;
512
513                 while (inptr && inptr->lexeme)
514                 {
515                         TSLexeme   *lexized,
516                                                 tmplex[2];
517
518                         if (inptr->flags & DT_USEASIS)
519                         {                                       /* do not lexize */
520                                 tmplex[0] = *inptr;
521                                 tmplex[0].flags = 0;
522                                 tmplex[1].lexeme = NULL;
523                                 lexized = tmplex;
524                         }
525                         else
526                         {
527                                 lexized = (TSLexeme *) DatumGetPointer(
528                                                                                                            FunctionCall4(
529                                                                                                            &(d->subdict->lexize),
530                                                                            PointerGetDatum(d->subdict->dictData),
531                                                                                           PointerGetDatum(inptr->lexeme),
532                                                                                 Int32GetDatum(strlen(inptr->lexeme)),
533                                                                                                                 PointerGetDatum(NULL)
534                                                                                                                                          )
535                                         );
536                         }
537
538                         if (lexized && lexized->lexeme)
539                         {
540                                 int                     toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
541
542                                 while (lexized->lexeme)
543                                 {
544                                         if (outptr - d->subst[i].res + 1 >= n)
545                                         {
546                                                 int                     diff = outptr - d->subst[i].res;
547
548                                                 n *= 2;
549                                                 d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
550                                                 outptr = d->subst[i].res + diff;
551                                         }
552
553                                         *outptr = *lexized;
554                                         outptr->lexeme = pstrdup(lexized->lexeme);
555
556                                         outptr++;
557                                         lexized++;
558                                 }
559
560                                 if (toset > 0)
561                                         d->subst[i].res[toset].flags |= TSL_ADDPOS;
562                         }
563                         else if (lexized)
564                         {
565                                 ereport(ERROR,
566                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
567                                                  errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568                                                                 inptr->lexeme, i + 1)));
569                         }
570                         else
571                         {
572                                 ereport(ERROR,
573                                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
574                                                  errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575                                                                 inptr->lexeme, i + 1)));
576                         }
577
578                         if (inptr->lexeme)
579                                 pfree(inptr->lexeme);
580                         inptr++;
581                 }
582
583                 if (outptr == d->subst[i].res)
584                         ereport(ERROR,
585                                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
586                                          errmsg("thesaurus substitute phrase is empty (rule %d)",
587                                                         i + 1)));
588
589                 d->subst[i].reslen = outptr - d->subst[i].res;
590
591                 pfree(rem);
592         }
593 }
594
595 Datum
596 thesaurus_init(PG_FUNCTION_ARGS)
597 {
598         List       *dictoptions = (List *) PG_GETARG_POINTER(0);
599         DictThesaurus *d;
600         char       *subdictname = NULL;
601         bool            fileloaded = false;
602         ListCell   *l;
603
604         d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
605
606         foreach(l, dictoptions)
607         {
608                 DefElem    *defel = (DefElem *) lfirst(l);
609
610                 if (pg_strcasecmp("DictFile", defel->defname) == 0)
611                 {
612                         if (fileloaded)
613                                 ereport(ERROR,
614                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
615                                                  errmsg("multiple DictFile parameters")));
616                         thesaurusRead(defGetString(defel), d);
617                         fileloaded = true;
618                 }
619                 else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
620                 {
621                         if (subdictname)
622                                 ereport(ERROR,
623                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
624                                                  errmsg("multiple Dictionary parameters")));
625                         subdictname = pstrdup(defGetString(defel));
626                 }
627                 else
628                 {
629                         ereport(ERROR,
630                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
631                                          errmsg("unrecognized Thesaurus parameter: \"%s\"",
632                                                         defel->defname)));
633                 }
634         }
635
636         if (!fileloaded)
637                 ereport(ERROR,
638                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
639                                  errmsg("missing DictFile parameter")));
640         if (!subdictname)
641                 ereport(ERROR,
642                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
643                                  errmsg("missing Dictionary parameter")));
644
645         d->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
646         d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
647
648         compileTheLexeme(d);
649         compileTheSubstitute(d);
650
651         PG_RETURN_POINTER(d);
652 }
653
654 static LexemeInfo *
655 findTheLexeme(DictThesaurus *d, char *lexeme)
656 {
657         TheLexeme       key,
658                            *res;
659
660         if (d->nwrds == 0)
661                 return NULL;
662
663         key.lexeme = lexeme;
664         key.entries = NULL;
665
666         res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
667
668         if (res == NULL)
669                 return NULL;
670         return res->entries;
671 }
672
673 static bool
674 matchIdSubst(LexemeInfo *stored, uint16 idsubst)
675 {
676         bool            res = true;
677
678         if (stored)
679         {
680                 res = false;
681
682                 for (; stored; stored = stored->nextvariant)
683                         if (stored->idsubst == idsubst)
684                         {
685                                 res = true;
686                                 break;
687                         }
688         }
689
690         return res;
691 }
692
693 static LexemeInfo *
694 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
695 {
696         for (;;)
697         {
698                 int                     i;
699                 LexemeInfo *ptr = newin[0];
700
701                 for (i = 0; i < newn; i++)
702                 {
703                         while (newin[i] && newin[i]->idsubst < ptr->idsubst)
704                                 newin[i] = newin[i]->nextentry;
705
706                         if (newin[i] == NULL)
707                                 return in;
708
709                         if (newin[i]->idsubst > ptr->idsubst)
710                         {
711                                 ptr = newin[i];
712                                 i = -1;
713                                 continue;
714                         }
715
716                         while (newin[i]->idsubst == ptr->idsubst)
717                         {
718                                 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
719                                 {
720                                         ptr = newin[i];
721                                         break;
722                                 }
723
724                                 newin[i] = newin[i]->nextentry;
725                                 if (newin[i] == NULL)
726                                         return in;
727                         }
728
729                         if (newin[i]->idsubst != ptr->idsubst)
730                         {
731                                 ptr = newin[i];
732                                 i = -1;
733                                 continue;
734                         }
735                 }
736
737                 if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
738                 {                                               /* found */
739
740                         ptr->nextvariant = in;
741                         in = ptr;
742                 }
743
744                 /* step forward */
745                 for (i = 0; i < newn; i++)
746                         newin[i] = newin[i]->nextentry;
747         }
748
749         return NULL;
750 }
751
752 static TSLexeme *
753 copyTSLexeme(TheSubstitute *ts)
754 {
755         TSLexeme   *res;
756         uint16          i;
757
758         res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
759         for (i = 0; i < ts->reslen; i++)
760         {
761                 res[i] = ts->res[i];
762                 res[i].lexeme = pstrdup(ts->res[i].lexeme);
763         }
764
765         res[ts->reslen].lexeme = NULL;
766
767         return res;
768 }
769
770 static TSLexeme *
771 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
772 {
773         *moreres = false;
774         while (info)
775         {
776                 Assert(info->idsubst < d->nsubst);
777                 if (info->nextvariant)
778                         *moreres = true;
779                 if (d->subst[info->idsubst].lastlexeme == curpos)
780                         return copyTSLexeme(d->subst + info->idsubst);
781                 info = info->nextvariant;
782         }
783
784         return NULL;
785 }
786
787 Datum
788 thesaurus_lexize(PG_FUNCTION_ARGS)
789 {
790         DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
791         DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
792         TSLexeme   *res = NULL;
793         LexemeInfo *stored,
794                            *info = NULL;
795         uint16          curpos = 0;
796         bool            moreres = false;
797
798         if (PG_NARGS() != 4 || dstate == NULL)
799                 elog(ERROR, "forbidden call of thesaurus or nested call");
800
801         if (dstate->isend)
802                 PG_RETURN_POINTER(NULL);
803         stored = (LexemeInfo *) dstate->private;
804
805         if (stored)
806                 curpos = stored->posinsubst + 1;
807
808         if (!d->subdict->isvalid)
809                 d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
810
811         res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
812                                                                            PointerGetDatum(d->subdict->dictData),
813                                                                                                          PG_GETARG_DATUM(1),
814                                                                                                          PG_GETARG_DATUM(2),
815                                                                                                          PointerGetDatum(NULL)));
816
817         if (res && res->lexeme)
818         {
819                 TSLexeme   *ptr = res,
820                                    *basevar;
821
822                 while (ptr->lexeme)
823                 {
824                         uint16          nv = ptr->nvariant;
825                         uint16          i,
826                                                 nlex = 0;
827                         LexemeInfo **infos;
828
829                         basevar = ptr;
830                         while (ptr->lexeme && nv == ptr->nvariant)
831                         {
832                                 nlex++;
833                                 ptr++;
834                         }
835
836                         infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
837                         for (i = 0; i < nlex; i++)
838                                 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
839                                         break;
840
841                         if (i < nlex)
842                         {
843                                 /* no chance to find */
844                                 pfree(infos);
845                                 continue;
846                         }
847
848                         info = findVariant(info, stored, curpos, infos, nlex);
849                 }
850         }
851         else if (res)
852         {                                                       /* stop-word */
853                 LexemeInfo *infos = findTheLexeme(d, NULL);
854
855                 info = findVariant(NULL, stored, curpos, &infos, 1);
856         }
857         else
858         {
859                 info = NULL;                    /* word isn't recognized */
860         }
861
862         dstate->private = (void *) info;
863
864         if (!info)
865         {
866                 dstate->getnext = false;
867                 PG_RETURN_POINTER(NULL);
868         }
869
870         if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
871         {
872                 dstate->getnext = moreres;
873                 PG_RETURN_POINTER(res);
874         }
875
876         dstate->getnext = true;
877
878         PG_RETURN_POINTER(NULL);
879 }