]> granicus.if.org Git - postgresql/blob - src/backend/tsearch/ts_parse.c
Create a selectivity estimation function for the text search @@ operator.
[postgresql] / src / backend / tsearch / ts_parse.c
1 /*-------------------------------------------------------------------------
2  *
3  * ts_parse.c
4  *              main parse functions for tsearch
5  *
6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *        $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_public.h"
19 #include "tsearch/ts_utils.h"
20
/*
 * When this macro is defined, parsetext() and hlparsetext() report a word
 * whose length is >= MAXSTRLEN with a NOTICE and skip it.  When it is not
 * defined, the same condition is reported with ereport(ERROR) instead.
 */
#define IGNORE_LONGLEXEME       1
22
23 /*
24  * Lexize subsystem
25  */
26
/*
 * One token produced by the parser, stored as a node of a singly linked
 * list (see ListParsedLex).
 */
typedef struct ParsedLex
{
        int                     type;           /* token type returned by the parser */
        char       *lemm;               /* token text; the pointer is stored
                                                                 * as given, the text is not copied */
        int                     lenlemm;        /* length of lemm */
        bool            resfollow;      /* not referenced by the code visible
                                                                 * in this file */
        struct ParsedLex *next;         /* following node, NULL for the last one */
} ParsedLex;
35
36 typedef struct ListParsedLex
37 {
38         ParsedLex  *head;
39         ParsedLex  *tail;
40 } ListParsedLex;
41
42 typedef struct
43 {
44         TSConfigCacheEntry *cfg;
45         Oid                     curDictId;
46         int                     posDict;
47         DictSubState dictState;
48         ParsedLex  *curSub;
49         ListParsedLex towork;           /* current list to work */
50         ListParsedLex waste;            /* list of lexemes that already lexized */
51
52         /*
53          * fields to store last variant to lexize (basically, thesaurus or similar
54          * to, which wants      several lexemes
55          */
56
57         ParsedLex  *lastRes;
58         TSLexeme   *tmpRes;
59 } LexizeData;
60
61 static void
62 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
63 {
64         ld->cfg = cfg;
65         ld->curDictId = InvalidOid;
66         ld->posDict = 0;
67         ld->towork.head = ld->towork.tail = ld->curSub = NULL;
68         ld->waste.head = ld->waste.tail = NULL;
69         ld->lastRes = NULL;
70         ld->tmpRes = NULL;
71 }
72
73 static void
74 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
75 {
76         if (list->tail)
77         {
78                 list->tail->next = newpl;
79                 list->tail = newpl;
80         }
81         else
82                 list->head = list->tail = newpl;
83         newpl->next = NULL;
84 }
85
86 static ParsedLex *
87 LPLRemoveHead(ListParsedLex *list)
88 {
89         ParsedLex  *res = list->head;
90
91         if (list->head)
92                 list->head = list->head->next;
93
94         if (list->head == NULL)
95                 list->tail = NULL;
96
97         return res;
98 }
99
100 static void
101 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
102 {
103         ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
104
105         newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
106         newpl->type = type;
107         newpl->lemm = lemm;
108         newpl->lenlemm = lenlemm;
109         LPLAddTail(&ld->towork, newpl);
110         ld->curSub = ld->towork.tail;
111 }
112
113 static void
114 RemoveHead(LexizeData *ld)
115 {
116         LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
117
118         ld->posDict = 0;
119 }
120
121 static void
122 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
123 {
124         if (correspondLexem)
125         {
126                 *correspondLexem = ld->waste.head;
127         }
128         else
129         {
130                 ParsedLex  *tmp,
131                                    *ptr = ld->waste.head;
132
133                 while (ptr)
134                 {
135                         tmp = ptr->next;
136                         pfree(ptr);
137                         ptr = tmp;
138                 }
139         }
140         ld->waste.head = ld->waste.tail = NULL;
141 }
142
143 static void
144 moveToWaste(LexizeData *ld, ParsedLex *stop)
145 {
146         bool            go = true;
147
148         while (ld->towork.head && go)
149         {
150                 if (ld->towork.head == stop)
151                 {
152                         ld->curSub = stop->next;
153                         go = false;
154                 }
155                 RemoveHead(ld);
156         }
157 }
158
159 static void
160 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
161 {
162         if (ld->tmpRes)
163         {
164                 TSLexeme   *ptr;
165
166                 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
167                         pfree(ptr->lexeme);
168                 pfree(ld->tmpRes);
169         }
170         ld->tmpRes = res;
171         ld->lastRes = lex;
172 }
173
174 static TSLexeme *
175 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
176 {
177         int                     i;
178         ListDictionary *map;
179         TSDictionaryCacheEntry *dict;
180         TSLexeme   *res;
181
182         if (ld->curDictId == InvalidOid)
183         {
184                 /*
185                  * usial mode: dictionary wants only one word, but we should keep in
186                  * mind that we should go through all stack
187                  */
188
189                 while (ld->towork.head)
190                 {
191                         ParsedLex  *curVal = ld->towork.head;
192
193                         map = ld->cfg->map + curVal->type;
194
195                         if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
196                         {
197                                 /* skip this type of lexeme */
198                                 RemoveHead(ld);
199                                 continue;
200                         }
201
202                         for (i = ld->posDict; i < map->len; i++)
203                         {
204                                 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
205
206                                 ld->dictState.isend = ld->dictState.getnext = false;
207                                 ld->dictState.private = NULL;
208                                 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
209                                                                                                                          &(dict->lexize),
210                                                                                          PointerGetDatum(dict->dictData),
211                                                                                            PointerGetDatum(curVal->lemm),
212                                                                                           Int32GetDatum(curVal->lenlemm),
213                                                                                           PointerGetDatum(&ld->dictState)
214                                                                                                                                  ));
215
216                                 if (ld->dictState.getnext)
217                                 {
218                                         /*
219                                          * dictionary wants next word, so setup and store current
220                                          * position and go to multiword mode
221                                          */
222
223                                         ld->curDictId = DatumGetObjectId(map->dictIds[i]);
224                                         ld->posDict = i + 1;
225                                         ld->curSub = curVal->next;
226                                         if (res)
227                                                 setNewTmpRes(ld, curVal, res);
228                                         return LexizeExec(ld, correspondLexem);
229                                 }
230
231                                 if (!res)               /* dictionary doesn't know this lexeme */
232                                         continue;
233
234                                 RemoveHead(ld);
235                                 setCorrLex(ld, correspondLexem);
236                                 return res;
237                         }
238
239                         RemoveHead(ld);
240                 }
241         }
242         else
243         {                                                       /* curDictId is valid */
244                 dict = lookup_ts_dictionary_cache(ld->curDictId);
245
246                 /*
247                  * Dictionary ld->curDictId asks  us about following words
248                  */
249
250                 while (ld->curSub)
251                 {
252                         ParsedLex  *curVal = ld->curSub;
253
254                         map = ld->cfg->map + curVal->type;
255
256                         if (curVal->type != 0)
257                         {
258                                 bool            dictExists = false;
259
260                                 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
261                                 {
262                                         /* skip this type of lexeme */
263                                         ld->curSub = curVal->next;
264                                         continue;
265                                 }
266
267                                 /*
268                                  * We should be sure that current type of lexeme is recognized
269                                  * by our dictinonary: we just check is it exist in list of
270                                  * dictionaries ?
271                                  */
272                                 for (i = 0; i < map->len && !dictExists; i++)
273                                         if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
274                                                 dictExists = true;
275
276                                 if (!dictExists)
277                                 {
278                                         /*
279                                          * Dictionary can't work with current tpe of lexeme,
280                                          * return to basic mode and redo all stored lexemes
281                                          */
282                                         ld->curDictId = InvalidOid;
283                                         return LexizeExec(ld, correspondLexem);
284                                 }
285                         }
286
287                         ld->dictState.isend = (curVal->type == 0) ? true : false;
288                         ld->dictState.getnext = false;
289
290                         res = (TSLexeme *) DatumGetPointer(FunctionCall4(
291                                                                                                                          &(dict->lexize),
292                                                                                          PointerGetDatum(dict->dictData),
293                                                                                            PointerGetDatum(curVal->lemm),
294                                                                                           Int32GetDatum(curVal->lenlemm),
295                                                                                           PointerGetDatum(&ld->dictState)
296                                                                                                                          ));
297
298                         if (ld->dictState.getnext)
299                         {
300                                 /* Dictionary wants one more */
301                                 ld->curSub = curVal->next;
302                                 if (res)
303                                         setNewTmpRes(ld, curVal, res);
304                                 continue;
305                         }
306
307                         if (res || ld->tmpRes)
308                         {
309                                 /*
310                                  * Dictionary normalizes lexemes, so we remove from stack all
311                                  * used lexemes, return to basic mode and redo end of stack
312                                  * (if it exists)
313                                  */
314                                 if (res)
315                                 {
316                                         moveToWaste(ld, ld->curSub);
317                                 }
318                                 else
319                                 {
320                                         res = ld->tmpRes;
321                                         moveToWaste(ld, ld->lastRes);
322                                 }
323
324                                 /* reset to initial state */
325                                 ld->curDictId = InvalidOid;
326                                 ld->posDict = 0;
327                                 ld->lastRes = NULL;
328                                 ld->tmpRes = NULL;
329                                 setCorrLex(ld, correspondLexem);
330                                 return res;
331                         }
332
333                         /*
334                          * Dict don't want next lexem and didn't recognize anything, redo
335                          * from ld->towork.head
336                          */
337                         ld->curDictId = InvalidOid;
338                         return LexizeExec(ld, correspondLexem);
339                 }
340         }
341
342         setCorrLex(ld, correspondLexem);
343         return NULL;
344 }
345
346 /*
347  * Parse string and lexize words.
348  *
349  * prs will be filled in.
350  */
351 void
352 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
353 {
354         int                     type,
355                                 lenlemm;
356         char       *lemm = NULL;
357         LexizeData      ldata;
358         TSLexeme   *norms;
359         TSConfigCacheEntry *cfg;
360         TSParserCacheEntry *prsobj;
361         void       *prsdata;
362
363         cfg = lookup_ts_config_cache(cfgId);
364         prsobj = lookup_ts_parser_cache(cfg->prsId);
365
366         prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
367                                                                                                          PointerGetDatum(buf),
368                                                                                                          Int32GetDatum(buflen)));
369
370         LexizeInit(&ldata, cfg);
371
372         do
373         {
374                 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
375                                                                                    PointerGetDatum(prsdata),
376                                                                                    PointerGetDatum(&lemm),
377                                                                                    PointerGetDatum(&lenlemm)));
378
379                 if (type > 0 && lenlemm >= MAXSTRLEN)
380                 {
381 #ifdef IGNORE_LONGLEXEME
382                         ereport(NOTICE,
383                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
384                                          errmsg("word is too long to be indexed"),
385                                          errdetail("Words longer than %d characters are ignored.",
386                                                            MAXSTRLEN)));
387                         continue;
388 #else
389                         ereport(ERROR,
390                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
391                                          errmsg("word is too long to be indexed"),
392                                          errdetail("Words longer than %d characters are ignored.",
393                                                            MAXSTRLEN)));
394 #endif
395                 }
396
397                 LexizeAddLemm(&ldata, type, lemm, lenlemm);
398
399                 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
400                 {
401                         TSLexeme   *ptr = norms;
402
403                         prs->pos++;                     /* set pos */
404
405                         while (ptr->lexeme)
406                         {
407                                 if (prs->curwords == prs->lenwords)
408                                 {
409                                         prs->lenwords *= 2;
410                                         prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
411                                 }
412
413                                 if (ptr->flags & TSL_ADDPOS)
414                                         prs->pos++;
415                                 prs->words[prs->curwords].len = strlen(ptr->lexeme);
416                                 prs->words[prs->curwords].word = ptr->lexeme;
417                                 prs->words[prs->curwords].nvariant = ptr->nvariant;
418                                 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
419                                 prs->words[prs->curwords].alen = 0;
420                                 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
421                                 ptr++;
422                                 prs->curwords++;
423                         }
424                         pfree(norms);
425                 }
426         } while (type > 0);
427
428         FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
429 }
430
431 /*
432  * Headline framework
433  */
434 static void
435 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
436 {
437         while (prs->curwords >= prs->lenwords)
438         {
439                 prs->lenwords *= 2;
440                 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
441         }
442         memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
443         prs->words[prs->curwords].type = (uint8) type;
444         prs->words[prs->curwords].len = buflen;
445         prs->words[prs->curwords].word = palloc(buflen);
446         memcpy(prs->words[prs->curwords].word, buf, buflen);
447         prs->curwords++;
448 }
449
450 static void
451 hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
452 {
453         int                     i;
454         QueryItem  *item = GETQUERY(query);
455         HeadlineWordEntry *word;
456
457         while (prs->curwords + query->size >= prs->lenwords)
458         {
459                 prs->lenwords *= 2;
460                 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
461         }
462
463         word = &(prs->words[prs->curwords - 1]);
464         for (i = 0; i < query->size; i++)
465         {
466                 if (item->type == QI_VAL &&
467                         tsCompareString( GETOPERAND(query) + item->operand.distance, item->operand.length,
468                                                          buf, buflen, item->operand.prefix ) == 0 )
469                 {
470                         if (word->item)
471                         {
472                                 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
473                                 prs->words[prs->curwords].item = &item->operand;
474                                 prs->words[prs->curwords].repeated = 1;
475                                 prs->curwords++;
476                         }
477                         else
478                                 word->item = &item->operand;
479                 }
480                 item++;
481         }
482 }
483
484 static void
485 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
486 {
487         ParsedLex  *tmplexs;
488         TSLexeme   *ptr;
489
490         while (lexs)
491         {
492
493                 if (lexs->type > 0)
494                         hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
495
496                 ptr = norms;
497                 while (ptr && ptr->lexeme)
498                 {
499                         hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
500                         ptr++;
501                 }
502
503                 tmplexs = lexs->next;
504                 pfree(lexs);
505                 lexs = tmplexs;
506         }
507
508         if (norms)
509         {
510                 ptr = norms;
511                 while (ptr->lexeme)
512                 {
513                         pfree(ptr->lexeme);
514                         ptr++;
515                 }
516                 pfree(norms);
517         }
518 }
519
520 void
521 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
522 {
523         int                     type,
524                                 lenlemm;
525         char       *lemm = NULL;
526         LexizeData      ldata;
527         TSLexeme   *norms;
528         ParsedLex  *lexs;
529         TSConfigCacheEntry *cfg;
530         TSParserCacheEntry *prsobj;
531         void       *prsdata;
532
533         cfg = lookup_ts_config_cache(cfgId);
534         prsobj = lookup_ts_parser_cache(cfg->prsId);
535
536         prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
537                                                                                                          PointerGetDatum(buf),
538                                                                                                          Int32GetDatum(buflen)));
539
540         LexizeInit(&ldata, cfg);
541
542         do
543         {
544                 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
545                                                                                    PointerGetDatum(prsdata),
546                                                                                    PointerGetDatum(&lemm),
547                                                                                    PointerGetDatum(&lenlemm)));
548
549                 if (type > 0 && lenlemm >= MAXSTRLEN)
550                 {
551 #ifdef IGNORE_LONGLEXEME
552                         ereport(NOTICE,
553                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
554                                          errmsg("word is too long to be indexed"),
555                                          errdetail("Words longer than %d characters are ignored.",
556                                                            MAXSTRLEN)));
557                         continue;
558 #else
559                         ereport(ERROR,
560                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
561                                          errmsg("word is too long to be indexed"),
562                                          errdetail("Words longer than %d characters are ignored.",
563                                                            MAXSTRLEN)));
564 #endif
565                 }
566
567                 LexizeAddLemm(&ldata, type, lemm, lenlemm);
568
569                 do
570                 {
571                         if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
572                                 addHLParsedLex(prs, query, lexs, norms);
573                         else
574                                 addHLParsedLex(prs, query, lexs, NULL);
575                 } while (norms);
576
577         } while (type > 0);
578
579         FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
580 }
581
582 text *
583 generateHeadline(HeadlineParsedText *prs)
584 {
585         text       *out;
586         int                     len = 128;
587         char       *ptr;
588         HeadlineWordEntry *wrd = prs->words;
589
590         out = (text *) palloc(len);
591         ptr = ((char *) out) + VARHDRSZ;
592
593         while (wrd - prs->words < prs->curwords)
594         {
595                 while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
596                 {
597                         int                     dist = ptr - ((char *) out);
598
599                         len *= 2;
600                         out = (text *) repalloc(out, len);
601                         ptr = ((char *) out) + dist;
602                 }
603
604                 if (wrd->in && !wrd->repeated)
605                 {
606                         if (wrd->replace)
607                         {
608                                 *ptr = ' ';
609                                 ptr++;
610                         }
611                         else
612                         {
613                                 if (wrd->selected)
614                                 {
615                                         memcpy(ptr, prs->startsel, prs->startsellen);
616                                         ptr += prs->startsellen;
617                                 }
618                                 memcpy(ptr, wrd->word, wrd->len);
619                                 ptr += wrd->len;
620                                 if (wrd->selected)
621                                 {
622                                         memcpy(ptr, prs->stopsel, prs->stopsellen);
623                                         ptr += prs->stopsellen;
624                                 }
625                         }
626                 }
627                 else if (!wrd->repeated)
628                         pfree(wrd->word);
629
630                 wrd++;
631         }
632
633         SET_VARSIZE(out, ptr - ((char *) out));
634         return out;
635 }