1 /*-------------------------------------------------------------------------
4 * rank tsvector by tsquery
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
10 * src/backend/utils/adt/tsrank.c
12 *-------------------------------------------------------------------------
19 #include "tsearch/ts_utils.h"
20 #include "utils/array.h"
21 #include "miscadmin.h"
24 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
26 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
28 #define RANK_NO_NORM 0x00
29 #define RANK_NORM_LOGLENGTH 0x01
30 #define RANK_NORM_LENGTH 0x02
31 #define RANK_NORM_EXTDIST 0x04
32 #define RANK_NORM_UNIQ 0x08
33 #define RANK_NORM_LOGUNIQ 0x10
34 #define RANK_NORM_RDIVRPLUS1 0x20
35 #define DEF_NORM_METHOD RANK_NO_NORM
37 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
38 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
41 * Returns a weight of a word collocation
44 word_distance(int32 w)
49 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
53 cnt_length(TSVector t)
55 WordEntry *ptr = ARRPTR(t),
56 *end = (WordEntry *) STRPTR(t);
61 int clen = POSDATALEN(t, ptr);
75 #define WordECompareQueryItem(e,q,p,i,m) \
76 tsCompareString((q) + (i)->distance, (i)->length, \
77 (e) + (p)->pos, (p)->len, (m))
81 * Returns a pointer to a WordEntry's array corresponding to 'item' from
82 * tsvector 't'. 'q' is the TSQuery containing 'item'.
83 * Returns NULL if not found.
86 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
88 WordEntry *StopLow = ARRPTR(t);
89 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
90 WordEntry *StopMiddle = StopHigh;
95 /* Loop invariant: StopLow <= item < StopHigh */
96 while (StopLow < StopHigh)
98 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
99 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
102 StopHigh = StopMiddle;
106 else if (difference > 0)
107 StopLow = StopMiddle + 1;
109 StopHigh = StopMiddle;
114 if (StopLow >= StopHigh)
115 StopMiddle = StopHigh;
119 while (StopMiddle < (WordEntry *) STRPTR(t) &&
120 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
127 return (*nitem > 0) ? StopHigh : NULL;
132 * sort QueryOperands by (length, word)
135 compareQueryOperand(const void *a, const void *b, void *arg)
137 char *operand = (char *) arg;
138 QueryOperand *qa = (*(QueryOperand *const *) a);
139 QueryOperand *qb = (*(QueryOperand *const *) b);
141 return tsCompareString(operand + qa->distance, qa->length,
142 operand + qb->distance, qb->length,
147 * Returns a sorted, de-duplicated array of QueryOperands in a query.
148 * The returned QueryOperands are pointers to the original QueryOperands
151 * Length of the returned array is stored in *size
153 static QueryOperand **
154 SortAndUniqItems(TSQuery q, int *size)
156 char *operand = GETOPERAND(q);
157 QueryItem *item = GETQUERY(q);
162 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
164 /* Collect all operands from the tree to res */
167 if (item->type == QI_VAL)
169 *ptr = (QueryOperand *) item;
179 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
184 /* remove duplicates */
185 while (ptr - res < *size)
187 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
195 *size = prevptr + 1 - res;
200 calc_rank_and(const float *w, TSVector t, TSQuery q)
202 WordEntryPosVector **pos;
203 WordEntryPosVector1 posnull;
204 WordEntryPosVector *POSNULL;
221 item = SortAndUniqItems(q, &size);
225 return calc_rank_or(w, t, q);
227 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
229 /* A dummy WordEntryPos array to use when haspos is false */
232 WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
233 POSNULL = (WordEntryPosVector *) &posnull;
235 for (i = 0; i < size; i++)
237 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
241 while (entry - firstentry < nitem)
244 pos[i] = _POSVECPTR(t, entry);
250 for (k = 0; k < i; k++)
254 lenct = pos[k]->npos;
256 for (l = 0; l < dimt; l++)
258 for (p = 0; p < lenct; p++)
260 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
261 if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
267 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
268 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
283 calc_rank_or(const float *w, TSVector t, TSQuery q)
287 WordEntryPosVector1 posnull;
297 /* A dummy WordEntryPos array to use when haspos is false */
301 item = SortAndUniqItems(q, &size);
303 for (i = 0; i < size; i++)
309 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
313 while (entry - firstentry < nitem)
317 dimt = POSDATALEN(t, entry);
318 post = POSDATAPTR(t, entry);
329 for (j = 0; j < dimt; j++)
331 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
332 if (wpos(post[j]) > wjm)
339 limit (sum(i/i^2),i->inf) = pi^2/6
340 resj = sum(wi/i^2),i=1,noccurence,
341 wi - should be sorted desc,
342 don't sort for now, just choose maximum weight. This should be corrected
345 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
357 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
359 QueryItem *item = GETQUERY(q);
363 if (!t->size || !q->size)
366 /* XXX: What about NOT? */
367 res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ?
368 calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
373 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
374 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
376 if (method & RANK_NORM_LENGTH)
383 /* RANK_NORM_EXTDIST not applicable */
385 if ((method & RANK_NORM_UNIQ) && t->size > 0)
386 res /= (float) (t->size);
388 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
389 res /= log((double) (t->size + 1)) / log(2.0);
391 if (method & RANK_NORM_RDIVRPLUS1)
398 getWeights(ArrayType *win)
400 static float ws[lengthof(weights)];
407 if (ARR_NDIM(win) != 1)
409 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
410 errmsg("array of weight must be one-dimensional")));
412 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
414 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
415 errmsg("array of weight is too short")));
417 if (array_contains_nulls(win))
419 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
420 errmsg("array of weight must not contain nulls")));
422 arrdata = (float4 *) ARR_DATA_PTR(win);
423 for (i = 0; i < lengthof(weights); i++)
425 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
428 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
429 errmsg("weight out of range")));
436 ts_rank_wttf(PG_FUNCTION_ARGS)
438 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
439 TSVector txt = PG_GETARG_TSVECTOR(1);
440 TSQuery query = PG_GETARG_TSQUERY(2);
441 int method = PG_GETARG_INT32(3);
444 res = calc_rank(getWeights(win), txt, query, method);
446 PG_FREE_IF_COPY(win, 0);
447 PG_FREE_IF_COPY(txt, 1);
448 PG_FREE_IF_COPY(query, 2);
449 PG_RETURN_FLOAT4(res);
453 ts_rank_wtt(PG_FUNCTION_ARGS)
455 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
456 TSVector txt = PG_GETARG_TSVECTOR(1);
457 TSQuery query = PG_GETARG_TSQUERY(2);
460 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
462 PG_FREE_IF_COPY(win, 0);
463 PG_FREE_IF_COPY(txt, 1);
464 PG_FREE_IF_COPY(query, 2);
465 PG_RETURN_FLOAT4(res);
469 ts_rank_ttf(PG_FUNCTION_ARGS)
471 TSVector txt = PG_GETARG_TSVECTOR(0);
472 TSQuery query = PG_GETARG_TSQUERY(1);
473 int method = PG_GETARG_INT32(2);
476 res = calc_rank(getWeights(NULL), txt, query, method);
478 PG_FREE_IF_COPY(txt, 0);
479 PG_FREE_IF_COPY(query, 1);
480 PG_RETURN_FLOAT4(res);
484 ts_rank_tt(PG_FUNCTION_ARGS)
486 TSVector txt = PG_GETARG_TSVECTOR(0);
487 TSQuery query = PG_GETARG_TSQUERY(1);
490 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
492 PG_FREE_IF_COPY(txt, 0);
493 PG_FREE_IF_COPY(query, 1);
494 PG_RETURN_FLOAT4(res);
506 compareDocR(const void *va, const void *vb)
508 const DocRepresentation *a = (const DocRepresentation *) va;
509 const DocRepresentation *b = (const DocRepresentation *) vb;
511 if (a->pos == b->pos)
513 return (a->pos > b->pos) ? 1 : -1;
520 } QueryRepresentation;
522 #define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
523 #define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true
526 checkcondition_QueryOperand(void *checkval, QueryOperand *val)
528 QueryRepresentation *qr = (QueryRepresentation *) checkval;
530 return QR_GET_OPERAND_EXISTS(qr, val);
538 DocRepresentation *begin;
539 DocRepresentation *end;
544 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
546 DocRepresentation *ptr;
547 int lastpos = ext->pos;
552 * since this function recurses, it could be driven to stack overflow.
553 * (though any decent compiler will optimize away the tail-recursion.
557 memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
561 ptr = doc + ext->pos;
563 /* find upper bound of cover from current position, move up */
564 while (ptr - doc < len)
566 for (i = 0; i < ptr->nitem; i++)
568 if (ptr->item[i]->type == QI_VAL)
569 QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
571 if (TS_execute(GETQUERY(qr->query), (void *) qr, false, checkcondition_QueryOperand))
573 if (ptr->pos > ext->q)
588 memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
592 /* find lower bound of cover from found upper bound, move down */
593 while (ptr >= doc + ext->pos)
595 for (i = 0; i < ptr->nitem; i++)
596 if (ptr->item[i]->type == QI_VAL)
597 QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
598 if (TS_execute(GETQUERY(qr->query), (void *) qr, true, checkcondition_QueryOperand))
600 if (ptr->pos < ext->p)
610 if (ext->p <= ext->q)
613 * set position for next try to next lexeme after beginning of found
616 ext->pos = (ptr - doc) + 1;
621 return Cover(doc, len, qr, ext);
624 static DocRepresentation *
625 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
627 QueryItem *item = GETQUERY(qr->query);
635 int len = qr->query->size * 4,
637 DocRepresentation *doc;
640 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
641 operand = GETOPERAND(qr->query);
643 for (i = 0; i < qr->query->size; i++)
645 QueryOperand *curoperand;
647 if (item[i].type != QI_VAL)
650 curoperand = &item[i].qoperand;
652 if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
655 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
659 while (entry - firstentry < nitem)
663 dimt = POSDATALEN(txt, entry);
664 post = POSDATAPTR(txt, entry);
668 /* ignore words without positions */
673 while (cur + dimt >= len)
676 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
679 for (j = 0; j < dimt; j++)
686 doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
688 for (k = 0; k < qr->query->size; k++)
690 QueryOperand *kptr = &item[k].qoperand;
691 QueryOperand *iptr = &item[i].qoperand;
694 (item[k].type == QI_VAL &&
695 compareQueryOperand(&kptr, &iptr, operand) == 0))
698 * if k == i, we've already checked above that
701 doc[cur].item[doc[cur].nitem] = item + k;
703 QR_SET_OPERAND_EXISTS(qr, item + k);
709 doc[cur].nitem = doc[cur - 1].nitem;
710 doc[cur].item = doc[cur - 1].item;
712 doc[cur].pos = WEP_GETPOS(post[j]);
713 doc[cur].wclass = WEP_GETWEIGHT(post[j]);
725 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
734 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
736 DocRepresentation *doc;
742 double invws[lengthof(weights)];
743 double SumDist = 0.0,
747 QueryRepresentation qr;
750 for (i = 0; i < lengthof(weights); i++)
752 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
755 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
756 errmsg("weight out of range")));
757 invws[i] = 1.0 / invws[i];
761 qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
763 doc = get_docrep(txt, &qr, &doclen);
766 pfree(qr.operandexist);
770 MemSet(&ext, 0, sizeof(CoverExt));
771 while (Cover(doc, doclen, &qr, &ext))
776 DocRepresentation *ptr = ext.begin;
778 while (ptr <= ext.end)
780 InvSum += invws[ptr->wclass];
784 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
787 * if doc are big enough then ext.q may be equal to ext.p due to limit
788 * of posional information. In this case we approximate number of
789 * noise word as half cover's length
791 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
793 nNoise = (ext.end - ext.begin) / 2;
794 Wdoc += Cpos / ((double) (1 + nNoise));
796 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
797 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by
800 SumDist += 1.0 / (CurExtPos - PrevExtPos);
802 PrevExtPos = CurExtPos;
806 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
807 Wdoc /= log((double) (cnt_length(txt) + 1));
809 if (method & RANK_NORM_LENGTH)
811 len = cnt_length(txt);
813 Wdoc /= (double) len;
816 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
817 Wdoc /= ((double) NExtent) / SumDist;
819 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
820 Wdoc /= (double) (txt->size);
822 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
823 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
825 if (method & RANK_NORM_RDIVRPLUS1)
830 pfree(qr.operandexist);
832 return (float4) Wdoc;
836 ts_rankcd_wttf(PG_FUNCTION_ARGS)
838 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
839 TSVector txt = PG_GETARG_TSVECTOR(1);
840 TSQuery query = PG_GETARG_TSQUERY(2);
841 int method = PG_GETARG_INT32(3);
844 res = calc_rank_cd(getWeights(win), txt, query, method);
846 PG_FREE_IF_COPY(win, 0);
847 PG_FREE_IF_COPY(txt, 1);
848 PG_FREE_IF_COPY(query, 2);
849 PG_RETURN_FLOAT4(res);
853 ts_rankcd_wtt(PG_FUNCTION_ARGS)
855 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
856 TSVector txt = PG_GETARG_TSVECTOR(1);
857 TSQuery query = PG_GETARG_TSQUERY(2);
860 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
862 PG_FREE_IF_COPY(win, 0);
863 PG_FREE_IF_COPY(txt, 1);
864 PG_FREE_IF_COPY(query, 2);
865 PG_RETURN_FLOAT4(res);
869 ts_rankcd_ttf(PG_FUNCTION_ARGS)
871 TSVector txt = PG_GETARG_TSVECTOR(0);
872 TSQuery query = PG_GETARG_TSQUERY(1);
873 int method = PG_GETARG_INT32(2);
876 res = calc_rank_cd(getWeights(NULL), txt, query, method);
878 PG_FREE_IF_COPY(txt, 0);
879 PG_FREE_IF_COPY(query, 1);
880 PG_RETURN_FLOAT4(res);
884 ts_rankcd_tt(PG_FUNCTION_ARGS)
886 TSVector txt = PG_GETARG_TSVECTOR(0);
887 TSQuery query = PG_GETARG_TSQUERY(1);
890 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
892 PG_FREE_IF_COPY(txt, 0);
893 PG_FREE_IF_COPY(query, 1);
894 PG_RETURN_FLOAT4(res);