3 * Teodor Sigaev <teodor@sigaev.ru>
8 #include "access/gist.h"
9 #include "access/itup.h"
10 #include "utils/builtins.h"
13 #include "storage/bufpage.h"
14 #include "executor/spi.h"
15 #include "commands/trigger.h"
16 #include "nodes/pg_list.h"
17 #include "catalog/namespace.h"
19 #include "utils/array.h"
25 PG_FUNCTION_INFO_V1(rank);
26 Datum rank(PG_FUNCTION_ARGS);
28 PG_FUNCTION_INFO_V1(rank_def);
29 Datum rank_def(PG_FUNCTION_ARGS);
31 PG_FUNCTION_INFO_V1(rank_cd);
32 Datum rank_cd(PG_FUNCTION_ARGS);
34 PG_FUNCTION_INFO_V1(rank_cd_def);
35 Datum rank_cd_def(PG_FUNCTION_ARGS);
37 PG_FUNCTION_INFO_V1(get_covers);
38 Datum get_covers(PG_FUNCTION_ARGS);
40 static float weights[] = {0.1, 0.2, 0.4, 1.0};
42 #define wpos(wep) ( w[ ((WordEntryPos*)(wep))->weight ] )
44 #define DEF_NORM_METHOD 0
47 * Returns a weight of a word collocation
55 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
59 cnt_length(tsvector * t)
61 WordEntry *ptr = ARRPTR(t),
62 *end = (WordEntry *) STRPTR(t);
68 if ((clen = POSDATALEN(t, ptr)) == 0)
79 WordECompareITEM(char *eval, char *qval, WordEntry * ptr, ITEM * item)
81 if (ptr->len == item->length)
84 qval + item->distance,
87 return (ptr->len > item->length) ? 1 : -1;
91 find_wordentry(tsvector * t, QUERYTYPE * q, ITEM * item)
93 WordEntry *StopLow = ARRPTR(t);
94 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
95 WordEntry *StopMiddle;
98 /* Loop invariant: StopLow <= item < StopHigh */
100 while (StopLow < StopHigh)
102 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
103 difference = WordECompareITEM(STRPTR(t), GETOPERAND(q), StopMiddle, item);
106 else if (difference < 0)
107 StopLow = StopMiddle + 1;
109 StopHigh = StopMiddle;
115 static WordEntryPos POSNULL[] = {
121 calc_rank_and(float *w, tsvector * t, QUERYTYPE * q)
123 uint16 **pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
135 ITEM *item = GETQUERY(q);
137 memset(pos, 0, sizeof(uint16 **) * q->size);
138 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
140 for (i = 0; i < q->size; i++)
143 if (item[i].type != VAL)
146 entry = find_wordentry(t, q, &(item[i]));
151 pos[i] = (uint16 *) _POSDATAPTR(t, entry);
153 pos[i] = (uint16 *) POSNULL;
156 dimt = *(uint16 *) (pos[i]);
157 post = (WordEntryPos *) (pos[i] + 1);
158 for (k = 0; k < i; k++)
162 lenct = *(uint16 *) (pos[k]);
163 ct = (WordEntryPos *) (pos[k] + 1);
164 for (l = 0; l < dimt; l++)
166 for (p = 0; p < lenct; p++)
168 dist = abs(post[l].pos - ct[p].pos);
169 if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL)))
175 curw = sqrt(wpos(&(post[l])) * wpos(&(ct[p])) * word_distance(dist));
176 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
187 calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
195 ITEM *item = GETQUERY(q);
197 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
199 for (i = 0; i < q->size; i++)
201 if (item[i].type != VAL)
204 entry = find_wordentry(t, q, &(item[i]));
210 dimt = POSDATALEN(t, entry);
211 post = POSDATAPTR(t, entry);
215 dimt = *(uint16 *) POSNULL;
219 for (j = 0; j < dimt; j++)
222 res = wpos(&(post[j]));
224 res = 1.0 - (1.0 - res) * (1.0 - wpos(&(post[j])));
231 calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
233 ITEM *item = GETQUERY(q);
236 if (!t->size || !q->size)
239 res = (item->type != VAL && item->val == (int4) '&') ?
240 calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
250 res /= log((float) cnt_length(t));
253 res /= (float) cnt_length(t);
257 elog(ERROR, "unrecognized normalization method: %d", method);
264 rank(PG_FUNCTION_ARGS)
266 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
267 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
268 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
269 int method = DEF_NORM_METHOD;
271 float ws[lengthof(weights)];
274 if (ARR_NDIM(win) != 1)
276 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
277 errmsg("array of weight must be one-dimensional")));
279 if (ARRNELEMS(win) < lengthof(weights))
281 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
282 errmsg("array of weight is too short")));
284 for (i = 0; i < lengthof(weights); i++)
286 ws[i] = (((float4 *) ARR_DATA_PTR(win))[i] >= 0) ? ((float4 *) ARR_DATA_PTR(win))[i] : weights[i];
289 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
290 errmsg("weight out of range")));
294 method = PG_GETARG_INT32(3);
296 res = calc_rank(ws, txt, query, method);
298 PG_FREE_IF_COPY(win, 0);
299 PG_FREE_IF_COPY(txt, 1);
300 PG_FREE_IF_COPY(query, 2);
301 PG_RETURN_FLOAT4(res);
305 rank_def(PG_FUNCTION_ARGS)
307 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
308 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
310 int method = DEF_NORM_METHOD;
313 method = PG_GETARG_INT32(2);
315 res = calc_rank(weights, txt, query, method);
317 PG_FREE_IF_COPY(txt, 0);
318 PG_FREE_IF_COPY(query, 1);
319 PG_RETURN_FLOAT4(res);
330 compareDocR(const void *a, const void *b)
332 if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos)
334 return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1;
340 DocRepresentation *doc;
345 checkcondition_DR(void *checkval, ITEM * val)
347 DocRepresentation *ptr = ((ChkDocR *) checkval)->doc;
349 while (ptr - ((ChkDocR *) checkval)->doc < ((ChkDocR *) checkval)->len)
351 if (val == ptr->item)
361 Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
364 DocRepresentation *ptr,
365 *f = (DocRepresentation *) 0xffffffff;
366 ITEM *item = GETQUERY(query);
373 for (i = 0; i < query->size; i++)
375 if (item->type != VAL)
382 while (ptr - doc < len)
384 if (ptr->item == item)
403 { /* already check this pos */
405 return Cover(doc, len, query, pos, p, q);
408 item = GETQUERY(query);
409 for (i = 0; i < query->size; i++)
411 if (item->type != VAL)
418 while (ptr >= doc + *pos)
420 if (ptr->item == item)
439 ch.len = (doc + lastpos) - f + 1;
441 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_DR))
444 * elog(NOTICE,"OP:%d NP:%d P:%d Q:%d", *pos, lastpos, *p,
450 return Cover(doc, len, query, pos, p, q);
456 static DocRepresentation *
457 get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
459 ITEM *item = GETQUERY(query);
465 int len = query->size * 4,
467 DocRepresentation *doc;
469 *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
470 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
471 for (i = 0; i < query->size; i++)
473 if (item[i].type != VAL)
476 entry = find_wordentry(txt, query, &(item[i]));
482 dimt = POSDATALEN(txt, entry);
483 post = POSDATAPTR(txt, entry);
487 dimt = *(uint16 *) POSNULL;
491 while (cur + dimt >= len)
494 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
497 for (j = 0; j < dimt; j++)
499 doc[cur].item = &(item[i]);
500 doc[cur].pos = post[j].pos;
510 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
520 rank_cd(PG_FUNCTION_ARGS)
522 int K = PG_GETARG_INT32(0);
523 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
524 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
525 int method = DEF_NORM_METHOD;
526 DocRepresentation *doc;
533 doc = get_docrep(txt, query, &len);
536 PG_FREE_IF_COPY(txt, 1);
537 PG_FREE_IF_COPY(query, 2);
538 PG_RETURN_FLOAT4(0.0);
544 while (Cover(doc, len, query, &cur, &p, &q))
545 res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;
548 method = PG_GETARG_INT32(3);
555 res /= log((float) cnt_length(txt));
558 res /= (float) cnt_length(txt);
562 elog(ERROR, "unrecognized normalization method: %d", method);
566 PG_FREE_IF_COPY(txt, 1);
567 PG_FREE_IF_COPY(query, 2);
569 PG_RETURN_FLOAT4(res);
574 rank_cd_def(PG_FUNCTION_ARGS)
576 PG_RETURN_DATUM(DirectFunctionCall4(
581 (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
585 /**************debug*************/
597 compareDocWord(const void *a, const void *b)
599 if (((DocWord *) a)->pos == ((DocWord *) b)->pos)
601 return (((DocWord *) a)->pos > ((DocWord *) b)->pos) ? 1 : -1;
606 get_covers(PG_FUNCTION_ARGS)
608 tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
609 QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
610 WordEntry *pptr = ARRPTR(txt);
621 DocRepresentation *doc;
628 doc = get_docrep(txt, query, &rlen);
632 out = palloc(VARHDRSZ);
633 VARATT_SIZEP(out) = VARHDRSZ;
634 PG_FREE_IF_COPY(txt, 0);
635 PG_FREE_IF_COPY(query, 1);
636 PG_RETURN_POINTER(out);
639 for (i = 0; i < txt->size; i++)
643 (errcode(ERRCODE_SYNTAX_ERROR),
644 errmsg("no pos info")));
645 dlen += POSDATALEN(txt, &(pptr[i]));
648 dwptr = dw = palloc(sizeof(DocWord) * dlen);
649 memset(dw, 0, sizeof(DocWord) * dlen);
651 for (i = 0; i < txt->size; i++)
653 WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i]));
655 for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++)
657 dw[cur].w = STRPTR(txt) + pptr[i].pos;
658 dw[cur].len = pptr[i].len;
659 dw[cur].pos = posdata[j].pos;
662 len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i]));
664 qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
666 while (Cover(doc, rlen, query, &pos, &p, &q))
668 dwptr = dw + olddwpos;
669 while (dwptr->pos < p && dwptr - dw < dlen)
671 olddwpos = dwptr - dw;
672 dwptr->start = ncover;
673 while (dwptr->pos < q + 1 && dwptr - dw < dlen)
675 (dwptr - 1)->finish = ncover;
676 len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
680 out = palloc(VARHDRSZ + len);
681 cptr = ((char *) out) + VARHDRSZ;
684 while (dwptr - dw < dlen)
688 sprintf(cptr, "{%d ", dwptr->start);
689 cptr = strchr(cptr, '\0');
691 memcpy(cptr, dwptr->w, dwptr->len);
697 sprintf(cptr, "}%d ", dwptr->finish);
698 cptr = strchr(cptr, '\0');
703 VARATT_SIZEP(out) = cptr - ((char *) out);
708 PG_FREE_IF_COPY(txt, 0);
709 PG_FREE_IF_COPY(query, 1);
710 PG_RETURN_POINTER(out);