3 * Teodor Sigaev <teodor@sigaev.ru>
9 #include "access/gist.h"
10 #include "access/itup.h"
11 #include "catalog/namespace.h"
12 #include "commands/trigger.h"
13 #include "executor/spi.h"
16 #include "nodes/pg_list.h"
17 #include "storage/bufpage.h"
18 #include "utils/array.h"
19 #include "utils/builtins.h"
25 PG_FUNCTION_INFO_V1(rank);
26 Datum rank(PG_FUNCTION_ARGS);
28 PG_FUNCTION_INFO_V1(rank_def);
29 Datum rank_def(PG_FUNCTION_ARGS);
31 PG_FUNCTION_INFO_V1(rank_cd);
32 Datum rank_cd(PG_FUNCTION_ARGS);
34 PG_FUNCTION_INFO_V1(rank_cd_def);
35 Datum rank_cd_def(PG_FUNCTION_ARGS);
37 PG_FUNCTION_INFO_V1(get_covers);
38 Datum get_covers(PG_FUNCTION_ARGS);
40 static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
42 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
44 #define RANK_NO_NORM 0x00
45 #define RANK_NORM_LOGLENGTH 0x01
46 #define RANK_NORM_LENGTH 0x02
47 #define RANK_NORM_EXTDIST 0x04
48 #define RANK_NORM_UNIQ 0x08
49 #define RANK_NORM_LOGUNIQ 0x10
50 #define DEF_NORM_METHOD RANK_NO_NORM
52 static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
53 static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);
56 * Returns a weight of a word collocation
64 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
68 cnt_length(tsvector * t)
70 WordEntry *ptr = ARRPTR(t),
71 *end = (WordEntry *) STRPTR(t);
77 if ((clen = POSDATALEN(t, ptr)) == 0)
88 WordECompareITEM(char *eval, char *qval, WordEntry * ptr, ITEM * item)
90 if (ptr->len == item->length)
93 qval + item->distance,
96 return (ptr->len > item->length) ? 1 : -1;
100 find_wordentry(tsvector * t, QUERYTYPE * q, ITEM * item)
102 WordEntry *StopLow = ARRPTR(t);
103 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
104 WordEntry *StopMiddle;
107 /* Loop invariant: StopLow <= item < StopHigh */
109 while (StopLow < StopHigh)
111 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
112 difference = WordECompareITEM(STRPTR(t), GETOPERAND(q), StopMiddle, item);
115 else if (difference < 0)
116 StopLow = StopMiddle + 1;
118 StopHigh = StopMiddle;
126 compareITEM(const void *a, const void *b, void *arg)
128 char *operand = (char *) arg;
130 if ((*(ITEM **) a)->length == (*(ITEM **) b)->length)
131 return strncmp(operand + (*(ITEM **) a)->distance,
132 operand + (*(ITEM **) b)->distance,
133 (*(ITEM **) b)->length);
135 return ((*(ITEM **) a)->length > (*(ITEM **) b)->length) ? 1 : -1;
139 SortAndUniqItems(char *operand, ITEM * item, int *size)
145 ptr = res = (ITEM **) palloc(sizeof(ITEM *) * *size);
149 if (item->type == VAL)
161 qsort_arg(res, *size, sizeof(ITEM **), compareITEM, (void *) operand);
166 while (ptr - res < *size)
168 if (compareITEM((void *) ptr, (void *) prevptr, (void *) operand) != 0)
176 *size = prevptr + 1 - res;
180 static WordEntryPos POSNULL[] = {
186 calc_rank_and(float *w, tsvector * t, QUERYTYPE * q)
203 item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
207 return calc_rank_or(w, t, q);
209 pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
210 memset(pos, 0, sizeof(uint16 *) * q->size);
211 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
212 WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
214 for (i = 0; i < size; i++)
216 entry = find_wordentry(t, q, item[i]);
221 pos[i] = (uint16 *) _POSDATAPTR(t, entry);
223 pos[i] = (uint16 *) POSNULL;
226 dimt = *(uint16 *) (pos[i]);
227 post = (WordEntryPos *) (pos[i] + 1);
228 for (k = 0; k < i; k++)
232 lenct = *(uint16 *) (pos[k]);
233 ct = (WordEntryPos *) (pos[k] + 1);
234 for (l = 0; l < dimt; l++)
236 for (p = 0; p < lenct; p++)
238 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
239 if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL)))
245 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
246 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
258 calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
269 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
270 item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
272 for (i = 0; i < size; i++)
278 entry = find_wordentry(t, q, item[i]);
284 dimt = POSDATALEN(t, entry);
285 post = POSDATAPTR(t, entry);
289 dimt = *(uint16 *) POSNULL;
296 for (j = 0; j < dimt; j++)
298 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
299 if (wpos(post[j]) > wjm)
306 limit (sum(i/i^2),i->inf) = pi^2/6
307 resj = sum(wi/i^2),i=1,noccurence,
308 wi - should be sorted desc,
309 don't sort for now, just choose maximum weight. This should be corrected
312 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
321 calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
323 ITEM *item = GETQUERY(q);
327 if (!t->size || !q->size)
330 res = (item->type != VAL && item->val == (int4) '&') ?
331 calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
336 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
337 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
339 if (method & RANK_NORM_LENGTH)
346 if ((method & RANK_NORM_UNIQ) && t->size > 0)
347 res /= (float) (t->size);
349 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
350 res /= log((double) (t->size + 1)) / log(2.0);
356 rank(PG_FUNCTION_ARGS)
358 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
359 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
360 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
361 int method = DEF_NORM_METHOD;
363 float ws[lengthof(weights)];
367 if (ARR_NDIM(win) != 1)
369 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
370 errmsg("array of weight must be one-dimensional")));
372 if (ARRNELEMS(win) < lengthof(weights))
374 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
375 errmsg("array of weight is too short")));
377 if (ARR_HASNULL(win))
379 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
380 errmsg("array of weight must not contain nulls")));
382 arrdata = (float4 *) ARR_DATA_PTR(win);
383 for (i = 0; i < lengthof(weights); i++)
385 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
388 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
389 errmsg("weight out of range")));
393 method = PG_GETARG_INT32(3);
395 res = calc_rank(ws, txt, query, method);
397 PG_FREE_IF_COPY(win, 0);
398 PG_FREE_IF_COPY(txt, 1);
399 PG_FREE_IF_COPY(query, 2);
400 PG_RETURN_FLOAT4(res);
404 rank_def(PG_FUNCTION_ARGS)
406 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
407 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
409 int method = DEF_NORM_METHOD;
412 method = PG_GETARG_INT32(2);
414 res = calc_rank(weights, txt, query, method);
416 PG_FREE_IF_COPY(txt, 0);
417 PG_FREE_IF_COPY(query, 1);
418 PG_RETURN_FLOAT4(res);
432 compareDocR(const void *a, const void *b)
434 if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos)
436 return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1;
440 checkcondition_ITEM(void *checkval, ITEM * val)
442 return (bool) (val->istrue);
446 reset_istrue_flag(QUERYTYPE * query)
448 ITEM *item = GETQUERY(query);
451 /* reset istrue flag */
452 for (i = 0; i < query->size; i++)
454 if (item->type == VAL)
465 DocRepresentation *begin;
466 DocRepresentation *end;
471 Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention * ext)
473 DocRepresentation *ptr;
474 int lastpos = ext->pos;
478 reset_istrue_flag(query);
482 ptr = doc + ext->pos;
484 /* find upper bound of cover from current position, move up */
485 while (ptr - doc < len)
487 for (i = 0; i < ptr->nitem; i++)
488 ptr->item[i]->istrue = 1;
489 if (TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM))
491 if (ptr->pos > ext->q)
506 reset_istrue_flag(query);
510 /* find lower bound of cover from founded upper bound, move down */
513 for (i = 0; i < ptr->nitem; i++)
514 ptr->item[i]->istrue = 1;
515 if (TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM))
517 if (ptr->pos < ext->p)
527 if (ext->p <= ext->q)
530 * set position for next try to next lexeme after begining of founded
533 ext->pos = (ptr - doc) + 1;
538 return Cover(doc, len, query, ext);
541 static DocRepresentation *
542 get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
544 ITEM *item = GETQUERY(query);
550 int len = query->size * 4,
552 DocRepresentation *doc;
555 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
556 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
557 operand = GETOPERAND(query);
558 reset_istrue_flag(query);
560 for (i = 0; i < query->size; i++)
562 if (item[i].type != VAL || item[i].istrue)
565 entry = find_wordentry(txt, query, &(item[i]));
571 dimt = POSDATALEN(txt, entry);
572 post = POSDATAPTR(txt, entry);
576 dimt = *(uint16 *) POSNULL;
580 while (cur + dimt >= len)
583 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
586 for (j = 0; j < dimt; j++)
594 doc[cur].needfree = false;
596 doc[cur].item = (ITEM **) palloc(sizeof(ITEM *) * query->size);
598 for (k = 0; k < query->size; k++)
602 (item[k].type == VAL &&
603 compareITEM(&kptr, &iptr, operand) == 0))
605 doc[cur].item[doc[cur].nitem] = item + k;
613 doc[cur].needfree = false;
614 doc[cur].nitem = doc[cur - 1].nitem;
615 doc[cur].item = doc[cur - 1].item;
617 doc[cur].pos = WEP_GETPOS(post[j]);
618 doc[cur].wclass = WEP_GETWEIGHT(post[j]);
628 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
637 calc_rank_cd(float4 *arrdata, tsvector * txt, QUERYTYPE * query, int method)
639 DocRepresentation *doc;
645 double invws[lengthof(weights)];
646 double SumDist = 0.0,
651 for (i = 0; i < lengthof(weights); i++)
653 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
656 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
657 errmsg("weight out of range")));
658 invws[i] = 1.0 / invws[i];
661 doc = get_docrep(txt, query, &doclen);
665 MemSet(&ext, 0, sizeof(Extention));
666 while (Cover(doc, doclen, query, &ext))
671 DocRepresentation *ptr = ext.begin;
673 while (ptr <= ext.end)
675 InvSum += invws[ptr->wclass];
679 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
681 * if doc are big enough then ext.q may be equal to ext.p
682 * due to limit of posional information. In this case we
683 * approximate number of noise word as half cover's
686 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
688 nNoise = (ext.end - ext.begin) / 2;
689 Wdoc += Cpos / ((double) (1 + nNoise));
691 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
692 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by
695 SumDist += 1.0 / (CurExtPos - PrevExtPos);
697 PrevExtPos = CurExtPos;
701 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
702 Wdoc /= log((double) (cnt_length(txt) + 1));
704 if (method & RANK_NORM_LENGTH)
706 len = cnt_length(txt);
708 Wdoc /= (double) len;
711 if ((method & RANK_NORM_EXTDIST) && SumDist > 0)
712 Wdoc /= ((double) NExtent) / SumDist;
714 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
715 Wdoc /= (double) (txt->size);
717 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
718 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
720 for (i = 0; i < doclen; i++)
725 return (float4) Wdoc;
729 rank_cd(PG_FUNCTION_ARGS)
732 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
733 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
734 int method = DEF_NORM_METHOD;
738 * Pre-8.2, rank_cd took just a plain int as its first argument.
739 * It was a mistake to keep the same C function name while changing the
740 * signature, but it's too late to fix that. Instead, do a runtime test
741 * to make sure the expected datatype has been passed. This is needed
742 * to prevent core dumps if tsearch2 function definitions from an old
743 * database are loaded into an 8.2 server.
745 if (get_fn_expr_argtype(fcinfo->flinfo, 0) != FLOAT4ARRAYOID)
747 (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
748 errmsg("rank_cd() now takes real[] as its first argument, not integer")));
750 /* now safe to dereference the first arg */
751 win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
753 if (ARR_NDIM(win) != 1)
755 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
756 errmsg("array of weight must be one-dimensional")));
758 if (ARRNELEMS(win) < lengthof(weights))
760 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
761 errmsg("array of weight is too short")));
763 if (ARR_HASNULL(win))
765 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
766 errmsg("array of weight must not contain nulls")));
769 method = PG_GETARG_INT32(3);
771 res = calc_rank_cd((float4 *) ARR_DATA_PTR(win), txt, query, method);
773 PG_FREE_IF_COPY(win, 0);
774 PG_FREE_IF_COPY(txt, 1);
775 PG_FREE_IF_COPY(query, 2);
777 PG_RETURN_FLOAT4(res);
782 rank_cd_def(PG_FUNCTION_ARGS)
784 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
785 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
788 res = calc_rank_cd(weights, txt, query, (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : DEF_NORM_METHOD);
790 PG_FREE_IF_COPY(txt, 0);
791 PG_FREE_IF_COPY(query, 1);
793 PG_RETURN_FLOAT4(res);
796 /**************debug*************/
808 compareDocWord(const void *a, const void *b)
810 if (((DocWord *) a)->pos == ((DocWord *) b)->pos)
812 return (((DocWord *) a)->pos > ((DocWord *) b)->pos) ? 1 : -1;
817 get_covers(PG_FUNCTION_ARGS)
819 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
820 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
821 WordEntry *pptr = ARRPTR(txt);
832 DocRepresentation *doc;
837 doc = get_docrep(txt, query, &rlen);
841 out = palloc(VARHDRSZ);
842 VARATT_SIZEP(out) = VARHDRSZ;
843 PG_FREE_IF_COPY(txt, 0);
844 PG_FREE_IF_COPY(query, 1);
845 PG_RETURN_POINTER(out);
848 for (i = 0; i < txt->size; i++)
852 (errcode(ERRCODE_SYNTAX_ERROR),
853 errmsg("no pos info")));
854 dlen += POSDATALEN(txt, &(pptr[i]));
857 dwptr = dw = palloc(sizeof(DocWord) * dlen);
858 memset(dw, 0, sizeof(DocWord) * dlen);
860 for (i = 0; i < txt->size; i++)
862 WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i]));
864 for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++)
866 dw[cur].w = STRPTR(txt) + pptr[i].pos;
867 dw[cur].len = pptr[i].len;
868 dw[cur].pos = WEP_GETPOS(posdata[j]);
871 len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i]));
873 qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
875 MemSet(&ext, 0, sizeof(Extention));
876 while (Cover(doc, rlen, query, &ext))
878 dwptr = dw + olddwpos;
879 while (dwptr->pos < ext.p && dwptr - dw < dlen)
881 olddwpos = dwptr - dw;
882 dwptr->start = ncover;
883 while (dwptr->pos < ext.q + 1 && dwptr - dw < dlen)
885 (dwptr - 1)->finish = ncover;
886 len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
890 out = palloc(VARHDRSZ + len);
891 cptr = ((char *) out) + VARHDRSZ;
894 while (dwptr - dw < dlen)
898 sprintf(cptr, "{%d ", dwptr->start);
899 cptr = strchr(cptr, '\0');
901 memcpy(cptr, dwptr->w, dwptr->len);
907 sprintf(cptr, "}%d ", dwptr->finish);
908 cptr = strchr(cptr, '\0');
913 VARATT_SIZEP(out) = cptr - ((char *) out);
916 for (i = 0; i < rlen; i++)
921 PG_FREE_IF_COPY(txt, 0);
922 PG_FREE_IF_COPY(query, 1);
923 PG_RETURN_POINTER(out);