1 /*-------------------------------------------------------------------------
4 * rank tsvector by tsquery
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10 * src/backend/utils/adt/tsrank.c
12 *-------------------------------------------------------------------------
19 #include "tsearch/ts_utils.h"
20 #include "utils/array.h"
21 #include "utils/builtins.h"
22 #include "miscadmin.h"
25 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
27 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
29 #define RANK_NO_NORM 0x00
30 #define RANK_NORM_LOGLENGTH 0x01
31 #define RANK_NORM_LENGTH 0x02
32 #define RANK_NORM_EXTDIST 0x04
33 #define RANK_NORM_UNIQ 0x08
34 #define RANK_NORM_LOGUNIQ 0x10
35 #define RANK_NORM_RDIVRPLUS1 0x20
36 #define DEF_NORM_METHOD RANK_NO_NORM
38 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
39 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
42 * Returns a weight of a word collocation
45 word_distance(int32 w)
50 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
54 cnt_length(TSVector t)
56 WordEntry *ptr = ARRPTR(t),
57 *end = (WordEntry *) STRPTR(t);
62 int clen = POSDATALEN(t, ptr);
76 #define WordECompareQueryItem(e,q,p,i,m) \
77 tsCompareString((q) + (i)->distance, (i)->length, \
78 (e) + (p)->pos, (p)->len, (m))
82 * Returns a pointer to a WordEntry's array corresponding to 'item' from
83 * tsvector 't'. 'q' is the TSQuery containing 'item'.
84 * Returns NULL if not found.
87 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
89 WordEntry *StopLow = ARRPTR(t);
90 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
91 WordEntry *StopMiddle = StopHigh;
96 /* Loop invariant: StopLow <= item < StopHigh */
97 while (StopLow < StopHigh)
99 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
100 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
103 StopHigh = StopMiddle;
107 else if (difference > 0)
108 StopLow = StopMiddle + 1;
110 StopHigh = StopMiddle;
115 if (StopLow >= StopHigh)
116 StopMiddle = StopHigh;
120 while (StopMiddle < (WordEntry *) STRPTR(t) &&
121 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
128 return (*nitem > 0) ? StopHigh : NULL;
133 * sort QueryOperands by (length, word)
136 compareQueryOperand(const void *a, const void *b, void *arg)
138 char *operand = (char *) arg;
139 QueryOperand *qa = (*(QueryOperand *const *) a);
140 QueryOperand *qb = (*(QueryOperand *const *) b);
142 return tsCompareString(operand + qa->distance, qa->length,
143 operand + qb->distance, qb->length,
148 * Returns a sorted, de-duplicated array of QueryOperands in a query.
149 * The returned QueryOperands are pointers to the original QueryOperands
152 * Length of the returned array is stored in *size
154 static QueryOperand **
155 SortAndUniqItems(TSQuery q, int *size)
157 char *operand = GETOPERAND(q);
158 QueryItem *item = GETQUERY(q);
163 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
165 /* Collect all operands from the tree to res */
168 if (item->type == QI_VAL)
170 *ptr = (QueryOperand *) item;
180 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
185 /* remove duplicates */
186 while (ptr - res < *size)
188 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
196 *size = prevptr + 1 - res;
201 calc_rank_and(const float *w, TSVector t, TSQuery q)
203 WordEntryPosVector **pos;
204 WordEntryPosVector1 posnull;
205 WordEntryPosVector *POSNULL;
222 item = SortAndUniqItems(q, &size);
226 return calc_rank_or(w, t, q);
228 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
230 /* A dummy WordEntryPos array to use when haspos is false */
233 WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
234 POSNULL = (WordEntryPosVector *) &posnull;
236 for (i = 0; i < size; i++)
238 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
242 while (entry - firstentry < nitem)
245 pos[i] = _POSVECPTR(t, entry);
251 for (k = 0; k < i; k++)
255 lenct = pos[k]->npos;
257 for (l = 0; l < dimt; l++)
259 for (p = 0; p < lenct; p++)
261 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
262 if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
268 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
269 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
284 calc_rank_or(const float *w, TSVector t, TSQuery q)
288 WordEntryPosVector1 posnull;
298 /* A dummy WordEntryPos array to use when haspos is false */
302 item = SortAndUniqItems(q, &size);
304 for (i = 0; i < size; i++)
310 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
314 while (entry - firstentry < nitem)
318 dimt = POSDATALEN(t, entry);
319 post = POSDATAPTR(t, entry);
330 for (j = 0; j < dimt; j++)
332 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
333 if (wpos(post[j]) > wjm)
340 limit (sum(i/i^2),i->inf) = pi^2/6
341 resj = sum(wi/i^2),i=1,noccurence,
342 wi - should be sorted desc,
343 don't sort for now, just choose maximum weight. This should be corrected
346 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
358 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
360 QueryItem *item = GETQUERY(q);
364 if (!t->size || !q->size)
367 /* XXX: What about NOT? */
368 res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
369 item->qoperator.oper == OP_PHRASE)) ?
370 calc_rank_and(w, t, q) :
371 calc_rank_or(w, t, q);
376 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
377 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
379 if (method & RANK_NORM_LENGTH)
386 /* RANK_NORM_EXTDIST not applicable */
388 if ((method & RANK_NORM_UNIQ) && t->size > 0)
389 res /= (float) (t->size);
391 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
392 res /= log((double) (t->size + 1)) / log(2.0);
394 if (method & RANK_NORM_RDIVRPLUS1)
401 getWeights(ArrayType *win)
403 static float ws[lengthof(weights)];
410 if (ARR_NDIM(win) != 1)
412 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
413 errmsg("array of weight must be one-dimensional")));
415 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
417 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
418 errmsg("array of weight is too short")));
420 if (array_contains_nulls(win))
422 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
423 errmsg("array of weight must not contain nulls")));
425 arrdata = (float4 *) ARR_DATA_PTR(win);
426 for (i = 0; i < lengthof(weights); i++)
428 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
431 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
432 errmsg("weight out of range")));
439 ts_rank_wttf(PG_FUNCTION_ARGS)
441 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
442 TSVector txt = PG_GETARG_TSVECTOR(1);
443 TSQuery query = PG_GETARG_TSQUERY(2);
444 int method = PG_GETARG_INT32(3);
447 res = calc_rank(getWeights(win), txt, query, method);
449 PG_FREE_IF_COPY(win, 0);
450 PG_FREE_IF_COPY(txt, 1);
451 PG_FREE_IF_COPY(query, 2);
452 PG_RETURN_FLOAT4(res);
456 ts_rank_wtt(PG_FUNCTION_ARGS)
458 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
459 TSVector txt = PG_GETARG_TSVECTOR(1);
460 TSQuery query = PG_GETARG_TSQUERY(2);
463 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
465 PG_FREE_IF_COPY(win, 0);
466 PG_FREE_IF_COPY(txt, 1);
467 PG_FREE_IF_COPY(query, 2);
468 PG_RETURN_FLOAT4(res);
472 ts_rank_ttf(PG_FUNCTION_ARGS)
474 TSVector txt = PG_GETARG_TSVECTOR(0);
475 TSQuery query = PG_GETARG_TSQUERY(1);
476 int method = PG_GETARG_INT32(2);
479 res = calc_rank(getWeights(NULL), txt, query, method);
481 PG_FREE_IF_COPY(txt, 0);
482 PG_FREE_IF_COPY(query, 1);
483 PG_RETURN_FLOAT4(res);
487 ts_rank_tt(PG_FUNCTION_ARGS)
489 TSVector txt = PG_GETARG_TSVECTOR(0);
490 TSQuery query = PG_GETARG_TSQUERY(1);
493 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
495 PG_FREE_IF_COPY(txt, 0);
496 PG_FREE_IF_COPY(query, 1);
497 PG_RETURN_FLOAT4(res);
505 { /* compiled doc representation */
510 { /* struct is used for preparing doc
520 compareDocR(const void *va, const void *vb)
522 const DocRepresentation *a = (const DocRepresentation *) va;
523 const DocRepresentation *b = (const DocRepresentation *) vb;
525 if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
527 if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
529 if (a->data.map.entry == b->data.map.entry)
532 return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
535 return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
538 return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
541 #define MAXQROPOS MAXENTRYPOS
545 bool reverseinsert; /* indicates insert order, true means
546 * descending order */
548 WordEntryPos pos[MAXQROPOS];
549 } QueryRepresentationOperand;
554 QueryRepresentationOperand *operandData;
555 } QueryRepresentation;
557 #define QR_GET_OPERAND_DATA(q, v) \
558 ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
561 checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
563 QueryRepresentation *qr = (QueryRepresentation *) checkval;
564 QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
566 if (!opData->operandexists)
571 data->npos = opData->npos;
572 data->pos = opData->pos;
573 if (opData->reverseinsert)
574 data->pos += MAXQROPOS - opData->npos;
585 DocRepresentation *begin;
586 DocRepresentation *end;
590 resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
594 for (i = 0; i < qr->query->size; i++)
596 qr->operandData[i].operandexists = false;
597 qr->operandData[i].reverseinsert = reverseinsert;
598 qr->operandData[i].npos = 0;
603 fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
607 QueryRepresentationOperand *opData;
609 for (i = 0; i < entry->data.query.nitem; i++)
611 if (entry->data.query.items[i]->type != QI_VAL)
614 opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
616 opData->operandexists = true;
618 if (opData->npos == 0)
620 lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
621 opData->pos[lastPos] = entry->pos;
626 lastPos = opData->reverseinsert ?
627 (MAXQROPOS - opData->npos) :
630 if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
632 lastPos = opData->reverseinsert ?
633 (MAXQROPOS - 1 - opData->npos) :
636 opData->pos[lastPos] = entry->pos;
643 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
645 DocRepresentation *ptr;
646 int lastpos = ext->pos;
650 * since this function recurses, it could be driven to stack overflow.
651 * (though any decent compiler will optimize away the tail-recursion.
655 resetQueryRepresentation(qr, false);
659 ptr = doc + ext->pos;
661 /* find upper bound of cover from current position, move up */
662 while (ptr - doc < len)
664 fillQueryRepresentationData(qr, ptr);
666 if (TS_execute(GETQUERY(qr->query), (void *) qr,
667 TS_EXEC_EMPTY, checkcondition_QueryOperand))
669 if (WEP_GETPOS(ptr->pos) > ext->q)
671 ext->q = WEP_GETPOS(ptr->pos);
684 resetQueryRepresentation(qr, true);
688 /* find lower bound of cover from found upper bound, move down */
689 while (ptr >= doc + ext->pos)
692 * we scan doc from right to left, so pos info in reverse order!
694 fillQueryRepresentationData(qr, ptr);
696 if (TS_execute(GETQUERY(qr->query), (void *) qr,
697 TS_EXEC_CALC_NOT, checkcondition_QueryOperand))
699 if (WEP_GETPOS(ptr->pos) < ext->p)
702 ext->p = WEP_GETPOS(ptr->pos);
709 if (ext->p <= ext->q)
712 * set position for next try to next lexeme after beginning of found
715 ext->pos = (ptr - doc) + 1;
720 return Cover(doc, len, qr, ext);
723 static DocRepresentation *
724 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
726 QueryItem *item = GETQUERY(qr->query);
730 int32 dimt, /* number of 'post' items */
734 int len = qr->query->size * 4,
736 DocRepresentation *doc;
738 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
741 * Iterate through query to make DocRepresentation for words and it's
742 * entries satisfied by query
744 for (i = 0; i < qr->query->size; i++)
746 QueryOperand *curoperand;
748 if (item[i].type != QI_VAL)
751 curoperand = &item[i].qoperand;
753 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
757 /* iterations over entries in tsvector */
758 while (entry - firstentry < nitem)
762 dimt = POSDATALEN(txt, entry);
763 post = POSDATAPTR(txt, entry);
767 /* ignore words without positions */
772 while (cur + dimt >= len)
775 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
778 /* iterations over entry's positions */
779 for (j = 0; j < dimt; j++)
781 if (curoperand->weight == 0 ||
782 curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
784 doc[cur].pos = post[j];
785 doc[cur].data.map.entry = entry;
786 doc[cur].data.map.item = (QueryItem *) curoperand;
797 DocRepresentation *rptr = doc + 1,
802 * Sort representation in ascending order by pos and entry
804 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
807 * Join QueryItem per WordEntry and it's position
809 storage.pos = doc->pos;
810 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
811 storage.data.query.items[0] = doc->data.map.item;
812 storage.data.query.nitem = 1;
814 while (rptr - doc < cur)
816 if (rptr->pos == (rptr - 1)->pos &&
817 rptr->data.map.entry == (rptr - 1)->data.map.entry)
819 storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
820 storage.data.query.nitem++;
826 storage.pos = rptr->pos;
827 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
828 storage.data.query.items[0] = rptr->data.map.item;
829 storage.data.query.nitem = 1;
838 *doclen = wptr - doc;
847 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
849 DocRepresentation *doc;
855 double invws[lengthof(weights)];
856 double SumDist = 0.0,
860 QueryRepresentation qr;
863 for (i = 0; i < lengthof(weights); i++)
865 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
868 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
869 errmsg("weight out of range")));
870 invws[i] = 1.0 / invws[i];
874 qr.operandData = (QueryRepresentationOperand *)
875 palloc0(sizeof(QueryRepresentationOperand) * query->size);
877 doc = get_docrep(txt, &qr, &doclen);
880 pfree(qr.operandData);
884 MemSet(&ext, 0, sizeof(CoverExt));
885 while (Cover(doc, doclen, &qr, &ext))
890 DocRepresentation *ptr = ext.begin;
892 while (ptr <= ext.end)
894 InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
898 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
901 * if doc are big enough then ext.q may be equal to ext.p due to limit
902 * of positional information. In this case we approximate number of
903 * noise word as half cover's length
905 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
907 nNoise = (ext.end - ext.begin) / 2;
908 Wdoc += Cpos / ((double) (1 + nNoise));
910 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
911 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by
913 * multiple lexize */ )
914 SumDist += 1.0 / (CurExtPos - PrevExtPos);
916 PrevExtPos = CurExtPos;
920 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
921 Wdoc /= log((double) (cnt_length(txt) + 1));
923 if (method & RANK_NORM_LENGTH)
925 len = cnt_length(txt);
927 Wdoc /= (double) len;
930 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
931 Wdoc /= ((double) NExtent) / SumDist;
933 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
934 Wdoc /= (double) (txt->size);
936 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
937 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
939 if (method & RANK_NORM_RDIVRPLUS1)
944 pfree(qr.operandData);
946 return (float4) Wdoc;
950 ts_rankcd_wttf(PG_FUNCTION_ARGS)
952 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
953 TSVector txt = PG_GETARG_TSVECTOR(1);
954 TSQuery query = PG_GETARG_TSQUERY(2);
955 int method = PG_GETARG_INT32(3);
958 res = calc_rank_cd(getWeights(win), txt, query, method);
960 PG_FREE_IF_COPY(win, 0);
961 PG_FREE_IF_COPY(txt, 1);
962 PG_FREE_IF_COPY(query, 2);
963 PG_RETURN_FLOAT4(res);
967 ts_rankcd_wtt(PG_FUNCTION_ARGS)
969 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
970 TSVector txt = PG_GETARG_TSVECTOR(1);
971 TSQuery query = PG_GETARG_TSQUERY(2);
974 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
976 PG_FREE_IF_COPY(win, 0);
977 PG_FREE_IF_COPY(txt, 1);
978 PG_FREE_IF_COPY(query, 2);
979 PG_RETURN_FLOAT4(res);
983 ts_rankcd_ttf(PG_FUNCTION_ARGS)
985 TSVector txt = PG_GETARG_TSVECTOR(0);
986 TSQuery query = PG_GETARG_TSQUERY(1);
987 int method = PG_GETARG_INT32(2);
990 res = calc_rank_cd(getWeights(NULL), txt, query, method);
992 PG_FREE_IF_COPY(txt, 0);
993 PG_FREE_IF_COPY(query, 1);
994 PG_RETURN_FLOAT4(res);
998 ts_rankcd_tt(PG_FUNCTION_ARGS)
1000 TSVector txt = PG_GETARG_TSVECTOR(0);
1001 TSQuery query = PG_GETARG_TSQUERY(1);
1004 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
1006 PG_FREE_IF_COPY(txt, 0);
1007 PG_FREE_IF_COPY(query, 1);
1008 PG_RETURN_FLOAT4(res);