1 /*-------------------------------------------------------------------------
4 * main parse functions for tsearch
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
10 * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $
12 *-------------------------------------------------------------------------
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_public.h"
19 #include "tsearch/ts_utils.h"
21 #define IGNORE_LONGLEXEME 1
27 typedef struct ParsedLex
33 struct ParsedLex *next;
36 typedef struct ListParsedLex
44 TSConfigCacheEntry *cfg;
47 DictSubState dictState;
49 ListParsedLex towork; /* current list to work */
50 ListParsedLex waste; /* list of lexemes that already lexized */
53 * fields to store the last variant to lexize (basically, for a thesaurus or
54 * similar dictionary, which wants several lexemes)
/*
 * LexizeInit: bring a LexizeData back to its idle starting state.
 *
 * curDictId is set to InvalidOid, the value LexizeExec tests for before
 * running its one-word-at-a-time branch, and the towork list, the waste
 * list and the curSub cursor are all emptied.
 * NOTE(review): cfg is not used by any statement visible here; presumably
 * it is saved in ld->cfg (LexizeExec reads ld->cfg) -- confirm.
 */
62 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
65 ld->curDictId = InvalidOid;
67 ld->towork.head = ld->towork.tail = ld->curSub = NULL;
68 ld->waste.head = ld->waste.tail = NULL;
/*
 * LPLAddTail: append newpl to list.
 *
 * Two outcomes are visible: newpl is linked after the current tail, or,
 * in the other case, newpl becomes both head and tail.
 * NOTE(review): the test choosing between them (presumably "the list
 * already has a tail") is not part of this excerpt -- confirm.
 */
74 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
78 list->tail->next = newpl;
82 list->head = list->tail = newpl;
/*
 * LPLRemoveHead: unlink the first element of list.
 *
 * res remembers the old head before head is advanced to head->next; the
 * head == NULL test catches the list having become empty.
 * NOTE(review): presumably tail is cleared in that case and res is the
 * return value (RemoveHead passes this function's result straight to
 * LPLAddTail) -- confirm against the full body.
 */
87 LPLRemoveHead(ListParsedLex *list)
89 ParsedLex *res = list->head;
92 list->head = list->head->next;
94 if (list->head == NULL)
/*
 * LexizeAddLemm: queue one parsed token (type, lemm, lenlemm) for lexizing.
 *
 * Exactly one ParsedLex node is allocated per token: the initializer on
 * the declaration is the only palloc() needed.  A second palloc() that
 * overwrote newpl right after its declaration has been removed, because
 * it orphaned the first node.
 *
 * The node is appended to ld->towork and curSub is pointed at the new
 * tail of that list.
 */
101 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
103 ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
108 newpl->lenlemm = lenlemm;
109 LPLAddTail(&ld->towork, newpl);
110 ld->curSub = ld->towork.tail;
/*
 * RemoveHead: take the first element off ld->towork and append it to
 * ld->waste.
 */
114 RemoveHead(LexizeData *ld)
116 LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
/*
 * setCorrLex: dispose of the accumulated waste list.
 *
 * Visible effects: *correspondLexem can be set to the head of ld->waste,
 * which passes the whole chain to the caller, and the waste list is then
 * reset to empty.
 * NOTE(review): the branch beginning at "ptr = ld->waste.head" presumably
 * walks and frees the chain when the caller passed NULL for
 * correspondLexem (parsetext does pass NULL) -- the condition and loop
 * are not visible here, confirm.
 */
122 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
126 *correspondLexem = ld->waste.head;
131 *ptr = ld->waste.head;
140 ld->waste.head = ld->waste.tail = NULL;
/*
 * moveToWaste: consume ld->towork from its head up to the element stop.
 *
 * The loop runs while towork is non-empty and the go flag is set.  When
 * the head is stop itself, curSub is moved on to stop->next.
 * NOTE(review): presumably go is cleared at that point and every visited
 * head is shifted to the waste list via RemoveHead -- those statements
 * are not visible in this excerpt, confirm.
 */
144 moveToWaste(LexizeData *ld, ParsedLex *stop)
148 while (ld->towork.head && go)
150 if (ld->towork.head == stop)
152 ld->curSub = stop->next;
/*
 * setNewTmpRes: replace the temporary result kept in ld->tmpRes.
 *
 * The visible loop steps through the existing tmpRes array until it
 * reaches the entry whose lexeme pointer is NULL, i.e. the array is
 * treated as NULL-lexeme terminated.
 * NOTE(review): presumably the old entries are released there and then
 * res / lex are stored as the new tmpRes / lastRes (LexizeExec reads
 * both fields) -- not visible here, confirm.
 */
160 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
166 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
175 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
179 TSDictionaryCacheEntry *dict;
182 if (ld->curDictId == InvalidOid)
185 * usual mode: dictionary wants only one word, but we should keep in
186 * mind that we should go through the whole stack
189 while (ld->towork.head)
191 ParsedLex *curVal = ld->towork.head;
193 map = ld->cfg->map + curVal->type;
195 if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
197 /* skip this type of lexeme */
202 for (i = ld->posDict; i < map->len; i++)
204 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
206 ld->dictState.isend = ld->dictState.getnext = false;
207 ld->dictState.private = NULL;
208 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
210 PointerGetDatum(dict->dictData),
211 PointerGetDatum(curVal->lemm),
212 Int32GetDatum(curVal->lenlemm),
213 PointerGetDatum(&ld->dictState)
216 if (ld->dictState.getnext)
219 * dictionary wants next word, so setup and store current
220 * position and go to multiword mode
223 ld->curDictId = DatumGetObjectId(map->dictIds[i]);
225 ld->curSub = curVal->next;
227 setNewTmpRes(ld, curVal, res);
228 return LexizeExec(ld, correspondLexem);
231 if (!res) /* dictionary doesn't know this lexeme */
235 setCorrLex(ld, correspondLexem);
243 { /* curDictId is valid */
244 dict = lookup_ts_dictionary_cache(ld->curDictId);
247 * Dictionary ld->curDictId asks us about following words
252 ParsedLex *curVal = ld->curSub;
254 map = ld->cfg->map + curVal->type;
256 if (curVal->type != 0)
258 bool dictExists = false;
260 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
262 /* skip this type of lexeme */
263 ld->curSub = curVal->next;
268 * We should be sure that the current type of lexeme is recognized
269 * by our dictionary: we just check whether it exists in the list of
272 for (i = 0; i < map->len && !dictExists; i++)
273 if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
279 * Dictionary can't work with the current type of lexeme,
280 * return to basic mode and redo all stored lexemes
282 ld->curDictId = InvalidOid;
283 return LexizeExec(ld, correspondLexem);
287 ld->dictState.isend = (curVal->type == 0) ? true : false;
288 ld->dictState.getnext = false;
290 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
292 PointerGetDatum(dict->dictData),
293 PointerGetDatum(curVal->lemm),
294 Int32GetDatum(curVal->lenlemm),
295 PointerGetDatum(&ld->dictState)
298 if (ld->dictState.getnext)
300 /* Dictionary wants one more */
301 ld->curSub = curVal->next;
303 setNewTmpRes(ld, curVal, res);
307 if (res || ld->tmpRes)
310 * Dictionary normalizes lexemes, so we remove from stack all
311 * used lexemes, return to basic mode and redo end of stack
316 moveToWaste(ld, ld->curSub);
321 moveToWaste(ld, ld->lastRes);
324 /* reset to initial state */
325 ld->curDictId = InvalidOid;
329 setCorrLex(ld, correspondLexem);
334 * Dictionary doesn't want the next lexeme and didn't recognize anything,
335 * redo from ld->towork.head
337 ld->curDictId = InvalidOid;
338 return LexizeExec(ld, correspondLexem);
342 setCorrLex(ld, correspondLexem);
347 * Parse string and lexize words.
349 * prs will be filled in.
/*
 * parsetext: run the parser belonging to text search configuration cfgId
 * over buf/buflen and collect the normalized lexemes in prs.
 *
 * The parser is driven through its prsstart / prstoken / prsend support
 * functions.  Each token is queued with LexizeAddLemm, after which
 * LexizeExec is called (with NULL for correspondLexem) until it returns
 * NULL.  Every returned TSLexeme is recorded in prs->words, which is
 * enlarged with repalloc once curwords has reached lenwords.  The lexeme
 * string is stored by pointer (ptr->lexeme), not copied.
 *
 * A token with type > 0 and a length of MAXSTRLEN or more triggers the
 * "word is too long to be indexed" report.  IGNORE_LONGLEXEME is defined
 * at the top of this file, so the first of the two #ifdef variants is the
 * one compiled.
 * NOTE(review): the severity of each variant, and whether such a token is
 * skipped afterwards, is decided on lines not visible in this excerpt.
 */
352 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
359 TSConfigCacheEntry *cfg;
360 TSParserCacheEntry *prsobj;
363 cfg = lookup_ts_config_cache(cfgId);
364 prsobj = lookup_ts_parser_cache(cfg->prsId);
/* start the parser; prsdata is its opaque state for the calls below */
366 prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
367 PointerGetDatum(buf),
368 Int32GetDatum(buflen)));
370 LexizeInit(&ldata, cfg);
/* fetch the next token: its type is returned, lemm/lenlemm are outputs */
374 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
375 PointerGetDatum(prsdata),
376 PointerGetDatum(&lemm),
377 PointerGetDatum(&lenlemm)));
379 if (type > 0 && lenlemm >= MAXSTRLEN)
381 #ifdef IGNORE_LONGLEXEME
383 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
384 errmsg("word is too long to be indexed"),
385 errdetail("Words longer than %d characters are ignored.",
390 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
391 errmsg("word is too long to be indexed"),
392 errdetail("Words longer than %d characters are ignored.",
397 LexizeAddLemm(&ldata, type, lemm, lenlemm);
399 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
401 TSLexeme *ptr = norms;
403 prs->pos++; /* set pos */
407 if (prs->curwords == prs->lenwords)
410 prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
413 if (ptr->flags & TSL_ADDPOS)
415 prs->words[prs->curwords].len = strlen(ptr->lexeme);
416 prs->words[prs->curwords].word = ptr->lexeme;
417 prs->words[prs->curwords].nvariant = ptr->nvariant;
418 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
419 prs->words[prs->curwords].alen = 0;
420 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
/* tell the parser we are done with prsdata */
428 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
/*
 * hladdword: record one source token in the headline word array.
 *
 * prs->words is enlarged with repalloc while curwords has reached
 * lenwords.  The slot at curwords is zeroed, then filled with the token
 * type (narrowed to uint8), its length, and a freshly palloc'd copy of
 * its text.
 * NOTE(review): the statements that raise lenwords and advance curwords
 * are not visible in this excerpt.
 */
435 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
437 while (prs->curwords >= prs->lenwords)
440 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
442 memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
443 prs->words[prs->curwords].type = (uint8) type;
444 prs->words[prs->curwords].len = buflen;
/* private copy of exactly buflen bytes; no terminator is appended */
445 prs->words[prs->curwords].word = palloc(buflen);
446 memcpy(prs->words[prs->curwords].word, buf, buflen);
/*
 * hlfinditem: connect the most recently added headline word with the
 * query operands that match the normalized form buf/buflen.
 *
 * All query->size items are examined.  An item matches when it is a
 * QI_VAL whose operand text compares equal to buf under tsCompareString,
 * with the item's prefix flag passed through.  Two outcomes are visible
 * for a match: the word's own item pointer is set, or a copy of the word
 * is written at words[curwords] with its item set and repeated = 1.
 * NOTE(review): the test choosing between the two outcomes, and the
 * advance of item / curwords, are not visible here -- confirm.
 *
 * The array is grown up front until there is room for more than
 * query->size further entries.
 */
451 hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
454 QueryItem *item = GETQUERY(query);
455 HeadlineWordEntry *word;
457 while (prs->curwords + query->size >= prs->lenwords)
460 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
/* taken only after the growth above, so it points into the current array */
463 word = &(prs->words[prs->curwords - 1]);
464 for (i = 0; i < query->size; i++)
466 if (item->type == QI_VAL &&
467 tsCompareString( GETOPERAND(query) + item->operand.distance, item->operand.length,
468 buf, buflen, item->operand.prefix ) == 0 )
472 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
473 prs->words[prs->curwords].item = &item->operand;
474 prs->words[prs->curwords].repeated = 1;
478 word->item = &item->operand;
/*
 * addHLParsedLex: add source tokens to the headline and match their
 * normalized forms against the query.
 *
 * The token in lexs is added with hladdword, and each entry of norms up
 * to the one with a NULL lexeme is handed to hlfinditem.  norms may be
 * NULL (hlparsetext passes NULL when LexizeExec produced nothing); the
 * "ptr &&" guard covers that case.
 * NOTE(review): tmplexs saves lexs->next, presumably so that lexs can be
 * released before stepping to the next token in the chain -- the
 * enclosing loop is not visible here, confirm.
 */
485 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
494 hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
497 while (ptr && ptr->lexeme)
499 hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
503 tmplexs = lexs->next;
/*
 * hlparsetext: parse buf/buflen with the parser of configuration cfgId
 * and fill prs with headline words, marking those that match query.
 *
 * It uses the same prsstart / prstoken / prsend sequence as parsetext.
 * The difference is that LexizeExec is given &lexs, so the source tokens
 * behind each result come back to the caller.  Those tokens go to
 * addHLParsedLex together with the normalized lexemes, or with NULL when
 * LexizeExec returned nothing.
 *
 * A token with type > 0 and a length of MAXSTRLEN or more triggers the
 * "word is too long to be indexed" report.  IGNORE_LONGLEXEME is defined
 * at the top of this file, so the first #ifdef variant is the one
 * compiled.
 * NOTE(review): the severity of each variant and the surrounding loop
 * structure are on lines not visible in this excerpt.
 */
521 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
529 TSConfigCacheEntry *cfg;
530 TSParserCacheEntry *prsobj;
533 cfg = lookup_ts_config_cache(cfgId);
534 prsobj = lookup_ts_parser_cache(cfg->prsId);
/* start the parser; prsdata is its opaque state for the calls below */
536 prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
537 PointerGetDatum(buf),
538 Int32GetDatum(buflen)));
540 LexizeInit(&ldata, cfg);
/* fetch the next token: its type is returned, lemm/lenlemm are outputs */
544 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
545 PointerGetDatum(prsdata),
546 PointerGetDatum(&lemm),
547 PointerGetDatum(&lenlemm)));
549 if (type > 0 && lenlemm >= MAXSTRLEN)
551 #ifdef IGNORE_LONGLEXEME
553 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
554 errmsg("word is too long to be indexed"),
555 errdetail("Words longer than %d characters are ignored.",
560 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
561 errmsg("word is too long to be indexed"),
562 errdetail("Words longer than %d characters are ignored.",
567 LexizeAddLemm(&ldata, type, lemm, lenlemm);
571 if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
572 addHLParsedLex(prs, query, lexs, norms);
574 addHLParsedLex(prs, query, lexs, NULL);
/* tell the parser we are done with prsdata */
579 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
/*
 * generateHeadline: assemble the headline text from the words in prs.
 *
 * The output is a palloc'd text value built just past its VARHDRSZ
 * header.  Before each word is handled, the buffer is enlarged until the
 * word plus both selection markers (startsel and stopsel) fit.  Words
 * flagged "in" and not "repeated" can be emitted, optionally wrapped in
 * startsel / stopsel; a separate branch handles other non-repeated words.
 * Finally the varlena length is set from the number of bytes written.
 * NOTE(review): the conditions deciding when the markers are emitted, the
 * body of the "else if" branch and the return statement are not visible
 * in this excerpt.
 */
583 generateHeadline(HeadlineParsedText *prs)
588 HeadlineWordEntry *wrd = prs->words;
590 out = (text *) palloc(len);
591 ptr = ((char *) out) + VARHDRSZ;
593 while (wrd - prs->words < prs->curwords)
595 while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
/* keep the write offset: repalloc may move the buffer, so ptr is rebuilt */
597 int dist = ptr - ((char *) out);
600 out = (text *) repalloc(out, len);
601 ptr = ((char *) out) + dist;
604 if (wrd->in && !wrd->repeated)
615 memcpy(ptr, prs->startsel, prs->startsellen);
616 ptr += prs->startsellen;
618 memcpy(ptr, wrd->word, wrd->len);
622 memcpy(ptr, prs->stopsel, prs->stopsellen);
623 ptr += prs->stopsellen;
627 else if (!wrd->repeated)
633 SET_VARSIZE(out, ptr - ((char *) out));