1 /*-------------------------------------------------------------------------
4 * rank tsvector by tsquery
6 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
10 * src/backend/utils/adt/tsrank.c
12 *-------------------------------------------------------------------------
18 #include "tsearch/ts_utils.h"
19 #include "utils/array.h"
20 #include "miscadmin.h"
23 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
25 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
27 #define RANK_NO_NORM 0x00
28 #define RANK_NORM_LOGLENGTH 0x01
29 #define RANK_NORM_LENGTH 0x02
30 #define RANK_NORM_EXTDIST 0x04
31 #define RANK_NORM_UNIQ 0x08
32 #define RANK_NORM_LOGUNIQ 0x10
33 #define RANK_NORM_RDIVRPLUS1 0x20
34 #define DEF_NORM_METHOD RANK_NO_NORM
36 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
37 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
40 * Returns a weight of a word collocation
43 word_distance(int32 w)
48 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
52 cnt_length(TSVector t)
54 WordEntry *ptr = ARRPTR(t),
55 *end = (WordEntry *) STRPTR(t);
60 int clen = POSDATALEN(t, ptr);
74 #define WordECompareQueryItem(e,q,p,i,m) \
75 tsCompareString((q) + (i)->distance, (i)->length, \
76 (e) + (p)->pos, (p)->len, (m))
80 * Returns a pointer to a WordEntry's array corresponding to 'item' from
81 * tsvector 't'. 'q' is the TSQuery containing 'item'.
82 * Returns NULL if not found.
85 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
87 WordEntry *StopLow = ARRPTR(t);
88 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
89 WordEntry *StopMiddle = StopHigh;
94 /* Loop invariant: StopLow <= item < StopHigh */
95 while (StopLow < StopHigh)
97 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
98 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
101 StopHigh = StopMiddle;
105 else if (difference > 0)
106 StopLow = StopMiddle + 1;
108 StopHigh = StopMiddle;
113 if (StopLow >= StopHigh)
114 StopMiddle = StopHigh;
118 while (StopMiddle < (WordEntry *) STRPTR(t) &&
119 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
126 return (*nitem > 0) ? StopHigh : NULL;
131 * sort QueryOperands by (length, word)
134 compareQueryOperand(const void *a, const void *b, void *arg)
136 char *operand = (char *) arg;
137 QueryOperand *qa = (*(QueryOperand *const *) a);
138 QueryOperand *qb = (*(QueryOperand *const *) b);
140 return tsCompareString(operand + qa->distance, qa->length,
141 operand + qb->distance, qb->length,
146 * Returns a sorted, de-duplicated array of QueryOperands in a query.
147 * The returned QueryOperands are pointers to the original QueryOperands
150 * Length of the returned array is stored in *size
152 static QueryOperand **
153 SortAndUniqItems(TSQuery q, int *size)
155 char *operand = GETOPERAND(q);
156 QueryItem *item = GETQUERY(q);
161 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
163 /* Collect all operands from the tree to res */
166 if (item->type == QI_VAL)
168 *ptr = (QueryOperand *) item;
178 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
183 /* remove duplicates */
184 while (ptr - res < *size)
186 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
194 *size = prevptr + 1 - res;
198 /* A dummy WordEntryPos array to use when haspos is false */
199 static WordEntryPosVector POSNULL = {
200 1, /* Number of elements that follow */
205 calc_rank_and(const float *w, TSVector t, TSQuery q)
207 WordEntryPosVector **pos;
224 item = SortAndUniqItems(q, &size);
228 return calc_rank_or(w, t, q);
230 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
231 WEP_SETPOS(POSNULL.pos[0], MAXENTRYPOS - 1);
233 for (i = 0; i < size; i++)
235 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
239 while (entry - firstentry < nitem)
242 pos[i] = _POSVECPTR(t, entry);
248 for (k = 0; k < i; k++)
252 lenct = pos[k]->npos;
254 for (l = 0; l < dimt; l++)
256 for (p = 0; p < lenct; p++)
258 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
259 if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL)))
265 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
266 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
281 calc_rank_or(const float *w, TSVector t, TSQuery q)
294 item = SortAndUniqItems(q, &size);
296 for (i = 0; i < size; i++)
302 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
306 while (entry - firstentry < nitem)
310 dimt = POSDATALEN(t, entry);
311 post = POSDATAPTR(t, entry);
322 for (j = 0; j < dimt; j++)
324 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
325 if (wpos(post[j]) > wjm)
332 limit (sum(i/i^2),i->inf) = pi^2/6
333 resj = sum(wi/i^2),i=1,noccurence,
334 wi - should be sorted desc,
335 don't sort for now, just choose maximum weight. This should be corrected
338 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
350 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
352 QueryItem *item = GETQUERY(q);
356 if (!t->size || !q->size)
359 /* XXX: What about NOT? */
360 res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ?
361 calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
366 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
367 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
369 if (method & RANK_NORM_LENGTH)
376 /* RANK_NORM_EXTDIST not applicable */
378 if ((method & RANK_NORM_UNIQ) && t->size > 0)
379 res /= (float) (t->size);
381 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
382 res /= log((double) (t->size + 1)) / log(2.0);
384 if (method & RANK_NORM_RDIVRPLUS1)
391 getWeights(ArrayType *win)
393 static float ws[lengthof(weights)];
400 if (ARR_NDIM(win) != 1)
402 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
403 errmsg("array of weight must be one-dimensional")));
405 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
407 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
408 errmsg("array of weight is too short")));
410 if (array_contains_nulls(win))
412 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
413 errmsg("array of weight must not contain nulls")));
415 arrdata = (float4 *) ARR_DATA_PTR(win);
416 for (i = 0; i < lengthof(weights); i++)
418 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
421 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
422 errmsg("weight out of range")));
429 ts_rank_wttf(PG_FUNCTION_ARGS)
431 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
432 TSVector txt = PG_GETARG_TSVECTOR(1);
433 TSQuery query = PG_GETARG_TSQUERY(2);
434 int method = PG_GETARG_INT32(3);
437 res = calc_rank(getWeights(win), txt, query, method);
439 PG_FREE_IF_COPY(win, 0);
440 PG_FREE_IF_COPY(txt, 1);
441 PG_FREE_IF_COPY(query, 2);
442 PG_RETURN_FLOAT4(res);
446 ts_rank_wtt(PG_FUNCTION_ARGS)
448 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
449 TSVector txt = PG_GETARG_TSVECTOR(1);
450 TSQuery query = PG_GETARG_TSQUERY(2);
453 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
455 PG_FREE_IF_COPY(win, 0);
456 PG_FREE_IF_COPY(txt, 1);
457 PG_FREE_IF_COPY(query, 2);
458 PG_RETURN_FLOAT4(res);
462 ts_rank_ttf(PG_FUNCTION_ARGS)
464 TSVector txt = PG_GETARG_TSVECTOR(0);
465 TSQuery query = PG_GETARG_TSQUERY(1);
466 int method = PG_GETARG_INT32(2);
469 res = calc_rank(getWeights(NULL), txt, query, method);
471 PG_FREE_IF_COPY(txt, 0);
472 PG_FREE_IF_COPY(query, 1);
473 PG_RETURN_FLOAT4(res);
477 ts_rank_tt(PG_FUNCTION_ARGS)
479 TSVector txt = PG_GETARG_TSVECTOR(0);
480 TSQuery query = PG_GETARG_TSQUERY(1);
483 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
485 PG_FREE_IF_COPY(txt, 0);
486 PG_FREE_IF_COPY(query, 1);
487 PG_RETURN_FLOAT4(res);
499 compareDocR(const void *va, const void *vb)
501 const DocRepresentation *a = (const DocRepresentation *) va;
502 const DocRepresentation *b = (const DocRepresentation *) vb;
504 if (a->pos == b->pos)
506 return (a->pos > b->pos) ? 1 : -1;
513 } QueryRepresentation;
515 #define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
516 #define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true
519 checkcondition_QueryOperand(void *checkval, QueryOperand *val)
521 QueryRepresentation *qr = (QueryRepresentation *) checkval;
523 return QR_GET_OPERAND_EXISTS(qr, val);
531 DocRepresentation *begin;
532 DocRepresentation *end;
537 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
539 DocRepresentation *ptr;
540 int lastpos = ext->pos;
545 * since this function recurses, it could be driven to stack overflow.
546 * (though any decent compiler will optimize away the tail-recursion.
550 memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
554 ptr = doc + ext->pos;
556 /* find upper bound of cover from current position, move up */
557 while (ptr - doc < len)
559 for (i = 0; i < ptr->nitem; i++)
561 if (ptr->item[i]->type == QI_VAL)
562 QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
564 if (TS_execute(GETQUERY(qr->query), (void *) qr, false, checkcondition_QueryOperand))
566 if (ptr->pos > ext->q)
581 memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
585 /* find lower bound of cover from found upper bound, move down */
586 while (ptr >= doc + ext->pos)
588 for (i = 0; i < ptr->nitem; i++)
589 if (ptr->item[i]->type == QI_VAL)
590 QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
591 if (TS_execute(GETQUERY(qr->query), (void *) qr, true, checkcondition_QueryOperand))
593 if (ptr->pos < ext->p)
603 if (ext->p <= ext->q)
606 * set position for next try to next lexeme after beginning of found
609 ext->pos = (ptr - doc) + 1;
614 return Cover(doc, len, qr, ext);
617 static DocRepresentation *
618 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
620 QueryItem *item = GETQUERY(qr->query);
628 int len = qr->query->size * 4,
630 DocRepresentation *doc;
633 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
634 operand = GETOPERAND(qr->query);
636 for (i = 0; i < qr->query->size; i++)
638 QueryOperand *curoperand;
640 if (item[i].type != QI_VAL)
643 curoperand = &item[i].qoperand;
645 if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
648 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
652 while (entry - firstentry < nitem)
656 dimt = POSDATALEN(txt, entry);
657 post = POSDATAPTR(txt, entry);
665 while (cur + dimt >= len)
668 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
671 for (j = 0; j < dimt; j++)
678 doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
680 for (k = 0; k < qr->query->size; k++)
682 QueryOperand *kptr = &item[k].qoperand;
683 QueryOperand *iptr = &item[i].qoperand;
686 (item[k].type == QI_VAL &&
687 compareQueryOperand(&kptr, &iptr, operand) == 0))
690 * if k == i, we've already checked above that
693 doc[cur].item[doc[cur].nitem] = item + k;
695 QR_SET_OPERAND_EXISTS(qr, item + k);
701 doc[cur].nitem = doc[cur - 1].nitem;
702 doc[cur].item = doc[cur - 1].item;
704 doc[cur].pos = WEP_GETPOS(post[j]);
705 doc[cur].wclass = WEP_GETWEIGHT(post[j]);
717 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
726 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
728 DocRepresentation *doc;
734 double invws[lengthof(weights)];
735 double SumDist = 0.0,
739 QueryRepresentation qr;
742 for (i = 0; i < lengthof(weights); i++)
744 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
747 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
748 errmsg("weight out of range")));
749 invws[i] = 1.0 / invws[i];
753 qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
755 doc = get_docrep(txt, &qr, &doclen);
758 pfree(qr.operandexist);
762 MemSet(&ext, 0, sizeof(CoverExt));
763 while (Cover(doc, doclen, &qr, &ext))
768 DocRepresentation *ptr = ext.begin;
770 while (ptr <= ext.end)
772 InvSum += invws[ptr->wclass];
776 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
779 * if doc are big enough then ext.q may be equal to ext.p due to limit
780 * of posional information. In this case we approximate number of
781 * noise word as half cover's length
783 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
785 nNoise = (ext.end - ext.begin) / 2;
786 Wdoc += Cpos / ((double) (1 + nNoise));
788 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
789 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by
792 SumDist += 1.0 / (CurExtPos - PrevExtPos);
794 PrevExtPos = CurExtPos;
798 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
799 Wdoc /= log((double) (cnt_length(txt) + 1));
801 if (method & RANK_NORM_LENGTH)
803 len = cnt_length(txt);
805 Wdoc /= (double) len;
808 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
809 Wdoc /= ((double) NExtent) / SumDist;
811 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
812 Wdoc /= (double) (txt->size);
814 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
815 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
817 if (method & RANK_NORM_RDIVRPLUS1)
822 pfree(qr.operandexist);
824 return (float4) Wdoc;
828 ts_rankcd_wttf(PG_FUNCTION_ARGS)
830 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
831 TSVector txt = PG_GETARG_TSVECTOR(1);
832 TSQuery query = PG_GETARG_TSQUERY(2);
833 int method = PG_GETARG_INT32(3);
836 res = calc_rank_cd(getWeights(win), txt, query, method);
838 PG_FREE_IF_COPY(win, 0);
839 PG_FREE_IF_COPY(txt, 1);
840 PG_FREE_IF_COPY(query, 2);
841 PG_RETURN_FLOAT4(res);
845 ts_rankcd_wtt(PG_FUNCTION_ARGS)
847 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
848 TSVector txt = PG_GETARG_TSVECTOR(1);
849 TSQuery query = PG_GETARG_TSQUERY(2);
852 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
854 PG_FREE_IF_COPY(win, 0);
855 PG_FREE_IF_COPY(txt, 1);
856 PG_FREE_IF_COPY(query, 2);
857 PG_RETURN_FLOAT4(res);
861 ts_rankcd_ttf(PG_FUNCTION_ARGS)
863 TSVector txt = PG_GETARG_TSVECTOR(0);
864 TSQuery query = PG_GETARG_TSQUERY(1);
865 int method = PG_GETARG_INT32(2);
868 res = calc_rank_cd(getWeights(NULL), txt, query, method);
870 PG_FREE_IF_COPY(txt, 0);
871 PG_FREE_IF_COPY(query, 1);
872 PG_RETURN_FLOAT4(res);
876 ts_rankcd_tt(PG_FUNCTION_ARGS)
878 TSVector txt = PG_GETARG_TSVECTOR(0);
879 TSQuery query = PG_GETARG_TSQUERY(1);
882 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
884 PG_FREE_IF_COPY(txt, 0);
885 PG_FREE_IF_COPY(query, 1);
886 PG_RETURN_FLOAT4(res);