3 * Teodor Sigaev <teodor@sigaev.ru>
9 #include "access/gist.h"
10 #include "access/itup.h"
11 #include "catalog/namespace.h"
12 #include "commands/trigger.h"
13 #include "executor/spi.h"
16 #include "nodes/pg_list.h"
17 #include "storage/bufpage.h"
18 #include "utils/array.h"
19 #include "utils/builtins.h"
25 PG_FUNCTION_INFO_V1(rank);
26 Datum rank(PG_FUNCTION_ARGS);
28 PG_FUNCTION_INFO_V1(rank_def);
29 Datum rank_def(PG_FUNCTION_ARGS);
31 PG_FUNCTION_INFO_V1(rank_cd);
32 Datum rank_cd(PG_FUNCTION_ARGS);
34 PG_FUNCTION_INFO_V1(rank_cd_def);
35 Datum rank_cd_def(PG_FUNCTION_ARGS);
37 PG_FUNCTION_INFO_V1(get_covers);
38 Datum get_covers(PG_FUNCTION_ARGS);
40 static float weights[] = {0.1, 0.2, 0.4, 1.0};
42 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
44 #define DEF_NORM_METHOD 0
46 static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
47 static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);
50 * Returns a weight of a word collocation
58 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
62 cnt_length(tsvector * t)
64 WordEntry *ptr = ARRPTR(t),
65 *end = (WordEntry *) STRPTR(t);
71 if ((clen = POSDATALEN(t, ptr)) == 0)
82 WordECompareITEM(char *eval, char *qval, WordEntry * ptr, ITEM * item)
84 if (ptr->len == item->length)
87 qval + item->distance,
90 return (ptr->len > item->length) ? 1 : -1;
94 find_wordentry(tsvector * t, QUERYTYPE * q, ITEM * item)
96 WordEntry *StopLow = ARRPTR(t);
97 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
98 WordEntry *StopMiddle;
101 /* Loop invariant: StopLow <= item < StopHigh */
103 while (StopLow < StopHigh)
105 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
106 difference = WordECompareITEM(STRPTR(t), GETOPERAND(q), StopMiddle, item);
109 else if (difference < 0)
110 StopLow = StopMiddle + 1;
112 StopHigh = StopMiddle;
119 static char *SortAndUniqOperand = NULL;
122 compareITEM(const void *a, const void *b)
124 if ((*(ITEM **) a)->length == (*(ITEM **) b)->length)
125 return strncmp(SortAndUniqOperand + (*(ITEM **) a)->distance,
126 SortAndUniqOperand + (*(ITEM **) b)->distance,
127 (*(ITEM **) b)->length);
129 return ((*(ITEM **) a)->length > (*(ITEM **) b)->length) ? 1 : -1;
133 SortAndUniqItems(char *operand, ITEM * item, int *size)
139 ptr = res = (ITEM **) palloc(sizeof(ITEM *) * *size);
143 if (item->type == VAL)
155 SortAndUniqOperand = operand;
156 qsort(res, *size, sizeof(ITEM **), compareITEM);
161 while (ptr - res < *size)
163 if (compareITEM((void *) ptr, (void *) prevptr) != 0)
171 *size = prevptr + 1 - res;
175 static WordEntryPos POSNULL[] = {
181 calc_rank_and(float *w, tsvector * t, QUERYTYPE * q)
198 item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
202 return calc_rank_or(w, t, q);
204 pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
205 memset(pos, 0, sizeof(uint16 *) * q->size);
206 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
207 WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
209 for (i = 0; i < size; i++)
211 entry = find_wordentry(t, q, item[i]);
216 pos[i] = (uint16 *) _POSDATAPTR(t, entry);
218 pos[i] = (uint16 *) POSNULL;
221 dimt = *(uint16 *) (pos[i]);
222 post = (WordEntryPos *) (pos[i] + 1);
223 for (k = 0; k < i; k++)
227 lenct = *(uint16 *) (pos[k]);
228 ct = (WordEntryPos *) (pos[k] + 1);
229 for (l = 0; l < dimt; l++)
231 for (p = 0; p < lenct; p++)
233 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
234 if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL)))
240 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
241 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
253 calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
264 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
265 item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
267 for (i = 0; i < size; i++)
271 entry = find_wordentry(t, q, item[i]);
277 dimt = POSDATALEN(t, entry);
278 post = POSDATAPTR(t, entry);
282 dimt = *(uint16 *) POSNULL;
289 for (j = 0; j < dimt; j++)
291 resj = resj + wpos(post[j])/((j+1)*(j+1));
292 if ( wpos(post[j]) > wjm ) {
298 limit (sum(i/i^2),i->inf) = pi^2/6
299 resj = sum(wi/i^2),i=1,noccurence,
300 wi - should be sorted desc,
301 don't sort for now, just choose maximum weight. This should be corrected
304 res = res + ( wjm + resj - wjm/((jm+1)*(jm+1)))/1.64493406685;
313 calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
315 ITEM *item = GETQUERY(q);
319 if (!t->size || !q->size)
322 res = (item->type != VAL && item->val == (int4) '&') ?
323 calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
333 res /= log((float) (cnt_length(t) + 1)) / log(2.0);
342 elog(ERROR, "unrecognized normalization method: %d", method);
349 rank(PG_FUNCTION_ARGS)
351 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
352 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
353 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
354 int method = DEF_NORM_METHOD;
356 float ws[lengthof(weights)];
360 if (ARR_NDIM(win) != 1)
362 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
363 errmsg("array of weight must be one-dimensional")));
365 if (ARRNELEMS(win) < lengthof(weights))
367 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
368 errmsg("array of weight is too short")));
370 if (ARR_HASNULL(win))
372 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
373 errmsg("array of weight must not contain nulls")));
375 arrdata = (float4 *) ARR_DATA_PTR(win);
376 for (i = 0; i < lengthof(weights); i++)
378 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
381 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
382 errmsg("weight out of range")));
386 method = PG_GETARG_INT32(3);
388 res = calc_rank(ws, txt, query, method);
390 PG_FREE_IF_COPY(win, 0);
391 PG_FREE_IF_COPY(txt, 1);
392 PG_FREE_IF_COPY(query, 2);
393 PG_RETURN_FLOAT4(res);
397 rank_def(PG_FUNCTION_ARGS)
399 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
400 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
402 int method = DEF_NORM_METHOD;
405 method = PG_GETARG_INT32(2);
407 res = calc_rank(weights, txt, query, method);
409 PG_FREE_IF_COPY(txt, 0);
410 PG_FREE_IF_COPY(query, 1);
411 PG_RETURN_FLOAT4(res);
424 compareDocR(const void *a, const void *b)
426 if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos)
428 return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1;
432 checkcondition_ITEM(void *checkval, ITEM * val) {
433 return (bool)(val->istrue);
437 reset_istrue_flag(QUERYTYPE *query) {
438 ITEM *item = GETQUERY(query);
441 /* reset istrue flag */
442 for(i = 0; i < query->size; i++) {
443 if ( item->type == VAL )
450 Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
452 DocRepresentation *ptr;
457 reset_istrue_flag(query);
463 /* find upper bound of cover from current position, move up */
464 while (ptr - doc < len) {
465 for(i=0;i<ptr->nitem;i++)
466 ptr->item[i]->istrue = 1;
467 if ( TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM) ) {
481 reset_istrue_flag(query);
485 /* find lower bound of cover from founded upper bound, move down */
486 while (ptr >= doc ) {
487 for(i=0;i<ptr->nitem;i++)
488 ptr->item[i]->istrue = 1;
489 if ( TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM) ) {
498 /* set position for next try to next lexeme after begining of founded cover */
504 return Cover( doc, len, query, pos, p, q );
507 static DocRepresentation *
508 get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
510 ITEM *item = GETQUERY(query);
516 int len = query->size * 4,
518 DocRepresentation *doc;
520 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
521 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
522 SortAndUniqOperand = GETOPERAND(query);
523 reset_istrue_flag(query);
525 for (i = 0; i < query->size; i++)
527 if (item[i].type != VAL || item[i].istrue)
530 entry = find_wordentry(txt, query, &(item[i]));
536 dimt = POSDATALEN(txt, entry);
537 post = POSDATAPTR(txt, entry);
541 dimt = *(uint16 *) POSNULL;
545 while (cur + dimt >= len)
548 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
551 for (j = 0; j < dimt; j++)
554 ITEM *kptr, *iptr = item+i;
557 doc[cur].needfree = false;
559 doc[cur].item = (ITEM**)palloc( sizeof(ITEM*) * query->size );
561 for(k=0; k < query->size; k++) {
563 if ( k==i || ( item[k].type == VAL && compareITEM( &kptr, &iptr ) == 0 ) ) {
564 doc[cur].item[ doc[cur].nitem ] = item+k;
570 doc[cur].needfree = false;
571 doc[cur].nitem = doc[cur-1].nitem;
572 doc[cur].item = doc[cur-1].item;
574 doc[cur].pos = WEP_GETPOS(post[j]);
584 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
594 rank_cd(PG_FUNCTION_ARGS)
596 int K = PG_GETARG_INT32(0);
597 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
598 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
599 int method = DEF_NORM_METHOD;
600 DocRepresentation *doc;
609 doc = get_docrep(txt, query, &doclen);
612 PG_FREE_IF_COPY(txt, 1);
613 PG_FREE_IF_COPY(query, 2);
614 PG_RETURN_FLOAT4(0.0);
620 while (Cover(doc, doclen, query, &cur, &p, &q))
621 res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;
624 method = PG_GETARG_INT32(3);
631 res /= log((float) (cnt_length(txt) + 1));
634 len = cnt_length(txt);
640 elog(ERROR, "unrecognized normalization method: %d", method);
643 for(i=0;i<doclen;i++)
644 if ( doc[i].needfree )
645 pfree( doc[i].item );
647 PG_FREE_IF_COPY(txt, 1);
648 PG_FREE_IF_COPY(query, 2);
650 PG_RETURN_FLOAT4(res);
655 rank_cd_def(PG_FUNCTION_ARGS)
657 PG_RETURN_DATUM(DirectFunctionCall4(
662 (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
666 /**************debug*************/
678 compareDocWord(const void *a, const void *b)
680 if (((DocWord *) a)->pos == ((DocWord *) b)->pos)
682 return (((DocWord *) a)->pos > ((DocWord *) b)->pos) ? 1 : -1;
687 get_covers(PG_FUNCTION_ARGS)
689 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
690 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
691 WordEntry *pptr = ARRPTR(txt);
702 DocRepresentation *doc;
709 doc = get_docrep(txt, query, &rlen);
713 out = palloc(VARHDRSZ);
714 VARATT_SIZEP(out) = VARHDRSZ;
715 PG_FREE_IF_COPY(txt, 0);
716 PG_FREE_IF_COPY(query, 1);
717 PG_RETURN_POINTER(out);
720 for (i = 0; i < txt->size; i++)
724 (errcode(ERRCODE_SYNTAX_ERROR),
725 errmsg("no pos info")));
726 dlen += POSDATALEN(txt, &(pptr[i]));
729 dwptr = dw = palloc(sizeof(DocWord) * dlen);
730 memset(dw, 0, sizeof(DocWord) * dlen);
732 for (i = 0; i < txt->size; i++)
734 WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i]));
736 for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++)
738 dw[cur].w = STRPTR(txt) + pptr[i].pos;
739 dw[cur].len = pptr[i].len;
740 dw[cur].pos = WEP_GETPOS(posdata[j]);
743 len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i]));
745 qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
747 while (Cover(doc, rlen, query, &pos, &p, &q))
749 dwptr = dw + olddwpos;
750 while (dwptr->pos < p && dwptr - dw < dlen)
752 olddwpos = dwptr - dw;
753 dwptr->start = ncover;
754 while (dwptr->pos < q + 1 && dwptr - dw < dlen)
756 (dwptr - 1)->finish = ncover;
757 len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
761 out = palloc(VARHDRSZ + len);
762 cptr = ((char *) out) + VARHDRSZ;
765 while (dwptr - dw < dlen)
769 sprintf(cptr, "{%d ", dwptr->start);
770 cptr = strchr(cptr, '\0');
772 memcpy(cptr, dwptr->w, dwptr->len);
778 sprintf(cptr, "}%d ", dwptr->finish);
779 cptr = strchr(cptr, '\0');
784 VARATT_SIZEP(out) = cptr - ((char *) out);
788 if ( doc[i].needfree )
789 pfree( doc[i].item );
792 PG_FREE_IF_COPY(txt, 0);
793 PG_FREE_IF_COPY(query, 1);
794 PG_RETURN_POINTER(out);