From: Teodor Sigaev Date: Thu, 2 Mar 2006 19:07:19 +0000 (+0000) Subject: Significantly improve ranking: X-Git-Tag: REL8_2_BETA1~1336 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=38c4fe87ac86965253235edf46a5908e49c8478e;p=postgresql Significantly improve ranking: 1) rank_cd now uses the weights of lexemes; 2) rank_cd and rank can use any combination of normalization methods: no normalization; normalization by log(length of document); normalization by length of document; normalization by number of unique words in document; normalization by log(number of unique words in document); normalization by number of covers (rank_cd only). Improve the search for covers. TODO: update the documentation. --- diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out index 8fb16b045d..2a60e317cd 100644 --- a/contrib/tsearch2/expected/tsearch2.out +++ b/contrib/tsearch2/expected/tsearch2.out @@ -2315,9 +2315,9 @@ An hour of storm to place The sculpture of these granite seams, Upon a woman s face. E. J. Pratt (1882 1964) '), to_tsquery('sea&thousand&years')); - rank_cd ---------- - 1.2 + rank_cd +----------- + 0.0555556 (1 row) select rank_cd(to_tsvector('Erosion It took the sea a thousand years, @@ -2329,9 +2329,9 @@ An hour of storm to place The sculpture of these granite seams, Upon a woman s face. E. J. Pratt (1882 1964) '), to_tsquery('granite&sea')); - rank_cd ----------- - 0.880303 + rank_cd +----------- + 0.0238095 (1 row) select rank_cd(to_tsvector('Erosion It took the sea a thousand years, @@ -2345,7 +2345,7 @@ Upon a woman s face. E. J.
Pratt (1882 1964) '), to_tsquery('sea')); rank_cd --------- - 2 + 0.2 (1 row) select get_covers(to_tsvector('Erosion It took the sea a thousand years, diff --git a/contrib/tsearch2/rank.c b/contrib/tsearch2/rank.c index a8c30bf329..29732c1c91 100644 --- a/contrib/tsearch2/rank.c +++ b/contrib/tsearch2/rank.c @@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0}; #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) -#define DEF_NORM_METHOD 0 +#define RANK_NO_NORM 0x00 +#define RANK_NORM_LOGLENGTH 0x01 +#define RANK_NORM_LENGTH 0x02 +#define RANK_NORM_EXTDIST 0x04 +#define RANK_NORM_UNIQ 0x08 +#define RANK_NORM_LOGUNIQ 0x10 +#define DEF_NORM_METHOD RANK_NO_NORM static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q); static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q); @@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method) if (res < 0) res = 1e-20; - switch (method) - { - case 0: - break; - case 1: - res /= log((float) (cnt_length(t) + 1)) / log(2.0); - break; - case 2: - len = cnt_length(t); - if (len > 0) - res /= (float) len; - break; - default: - /* internal error */ - elog(ERROR, "unrecognized normalization method: %d", method); + if ( (method & RANK_NORM_LOGLENGTH) && t->size>0 ) + res /= log((double) (cnt_length(t) + 1)) / log(2.0); + + if ( method & RANK_NORM_LENGTH ) { + len = cnt_length(t); + if ( len>0 ) + res /= (float) len; } + if ( (method & RANK_NORM_UNIQ) && t->size > 0 ) + res /= (float)( t->size ); + + if ( (method & RANK_NORM_LOGUNIQ) && t->size > 0 ) + res /= log((double) (t->size + 1)) / log(2.0); + return res; } @@ -420,6 +424,7 @@ typedef struct ITEM **item; int16 nitem; bool needfree; + uint8 wclass; int32 pos; } DocRepresentation; @@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query) } } +typedef struct { + int pos; + int p; + int q; + DocRepresentation *begin; + DocRepresentation *end; +} Extention; + + static bool -Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int 
*pos, int *p, int *q) +Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention *ext) { DocRepresentation *ptr; - int lastpos = *pos; + int lastpos = ext->pos; int i; bool found = false; reset_istrue_flag(query); - *p = 0x7fffffff; - *q = 0; - ptr = doc + *pos; + ext->p = 0x7fffffff; + ext->q = 0; + ptr = doc + ext->pos; /* find upper bound of cover from current position, move up */ while (ptr - doc < len) @@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int ptr->item[i]->istrue = 1; if (TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM)) { - if (ptr->pos > *q) + if (ptr->pos > ext->q) { - *q = ptr->pos; + ext->q = ptr->pos; + ext->end = ptr; lastpos = ptr - doc; found = true; } @@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int ptr->item[i]->istrue = 1; if (TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM)) { - if (ptr->pos < *p) - *p = ptr->pos; + if (ptr->pos < ext->p) { + ext->begin = ptr; + ext->p = ptr->pos; + } break; } ptr--; } - if (*p <= *q) + if (ext->p <= ext->q) { /* * set position for next try to next lexeme after begining of founded * cover */ - *pos = (ptr - doc) + 1; + ext->pos = (ptr - doc) + 1; return true; } - (*pos)++; - return Cover(doc, len, query, pos, p, q); + ext->pos++; + return Cover(doc, len, query, ext); } static DocRepresentation * @@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen) doc[cur].item = doc[cur - 1].item; } doc[cur].pos = WEP_GETPOS(post[j]); + doc[cur].wclass = WEP_GETWEIGHT(post[j]); cur++; } } @@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen) return NULL; } - -Datum -rank_cd(PG_FUNCTION_ARGS) -{ - int K = PG_GETARG_INT32(0); - tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1)); - QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2)); - int method = DEF_NORM_METHOD; +static float4 
+calc_rank_cd(float4 *arrdata, tsvector *txt, QUERYTYPE *query, int method) { DocRepresentation *doc; - float res = 0.0; - int p = 0, - q = 0, - len, - cur, + int len, i, doclen = 0; + Extention ext; + double Wdoc = 0.0; + double invws[lengthof(weights)]; + double SumDist=0.0, PrevExtPos=0.0, CurExtPos=0.0; + int NExtent=0; - doc = get_docrep(txt, query, &doclen); - if (!doc) + for (i = 0; i < lengthof(weights); i++) { - PG_FREE_IF_COPY(txt, 1); - PG_FREE_IF_COPY(query, 2); - PG_RETURN_FLOAT4(0.0); + invws[i] = ((double)((arrdata[i] >= 0) ? arrdata[i] : weights[i])); + if (invws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + invws[i] = 1.0/invws[i]; } - cur = 0; - if (K <= 0) - K = 4; - while (Cover(doc, doclen, query, &cur, &p, &q)) - res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0; + doc = get_docrep(txt, query, &doclen); + if (!doc) + return 0.0; - if (PG_NARGS() == 4) - method = PG_GETARG_INT32(3); + MemSet( &ext, 0, sizeof(Extention) ); + while (Cover(doc, doclen, query, &ext)) { + double Cpos = 0.0; + double InvSum = 0.0; + DocRepresentation *ptr = ext.begin; - switch (method) - { - case 0: - break; - case 1: - res /= log((float) (cnt_length(txt) + 1)); - break; - case 2: - len = cnt_length(txt); - if (len > 0) - res /= (float) len; - break; - default: - /* internal error */ - elog(ERROR, "unrecognized normalization method: %d", method); + while ( ptr<=ext.end ) { + InvSum += invws[ ptr->wclass ]; + ptr++; + } + + Cpos = ((double)( ext.end-ext.begin+1 )) / InvSum; + Wdoc += Cpos / ( (double) Max( 1, 1 + (ext.q - ext.p) - (ext.end - ext.begin) ) ); /* clamp denominator to >= 1; it is <= 0 when the cover holds more doc entries than positions */ + + CurExtPos = ((double)(ext.q + ext.p))/2.0; + if ( NExtent>0 && CurExtPos > PrevExtPos /* prevent division by zero in the case of multiple lexize results */ ) + SumDist += 1.0/( CurExtPos - PrevExtPos ); + + PrevExtPos = CurExtPos; + NExtent++; + } + + if ( (method & RANK_NORM_LOGLENGTH) && txt->size > 0 ) + Wdoc /= log((double) (cnt_length(txt) +
1)); + + if ( method & RANK_NORM_LENGTH ) { + len = cnt_length(txt); + if ( len>0 ) + Wdoc /= (double) len; } + if ( (method & RANK_NORM_EXTDIST) && SumDist > 0 ) + Wdoc /= ((double)NExtent) / SumDist; + + if ( (method & RANK_NORM_UNIQ) && txt->size > 0 ) + Wdoc /= (double)( txt->size ); + + if ( (method & RANK_NORM_LOGUNIQ) && txt->size > 0 ) + Wdoc /= log((double) (txt->size + 1)) / log(2.0); + for (i = 0; i < doclen; i++) if (doc[i].needfree) pfree(doc[i].item); pfree(doc); + + return (float4)Wdoc; +} + +Datum +rank_cd(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1)); + QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2)); + int method = DEF_NORM_METHOD; + float4 res; + + if (ARR_NDIM(win) != 1) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight must be one-dimensional"))); + + if (ARRNELEMS(win) < lengthof(weights)) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight is too short"))); + + if (ARR_HASNULL(win)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array of weight must not contain nulls"))); + + if (PG_NARGS() == 4) + method = PG_GETARG_INT32(3); + + res = calc_rank_cd( (float4 *) ARR_DATA_PTR(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); PG_FREE_IF_COPY(txt, 1); PG_FREE_IF_COPY(query, 2); @@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS) Datum rank_cd_def(PG_FUNCTION_ARGS) { - PG_RETURN_DATUM(DirectFunctionCall4( - rank_cd, - Int32GetDatum(-1), - PG_GETARG_DATUM(0), - PG_GETARG_DATUM(1), - (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD) - )); + tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)); + float4 res; + + res = calc_rank_cd( weights, txt, query, (PG_NARGS() == 3) ? 
PG_GETARG_INT32(2) : DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); /* txt was fetched from argument 0 */ + PG_FREE_IF_COPY(query, 1); /* query was fetched from argument 1 */ + + PG_RETURN_FLOAT4(res); } /**************debug*************/ @@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS) text *out; char *cptr; DocRepresentation *doc; - int pos = 0, - p, - q, - olddwpos = 0; + int olddwpos = 0; int ncover = 1; + Extention ext; doc = get_docrep(txt, query, &rlen); @@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS) } qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord); - while (Cover(doc, rlen, query, &pos, &p, &q)) + MemSet( &ext, 0, sizeof(Extention) ); + while (Cover(doc, rlen, query, &ext)) { dwptr = dw + olddwpos; - while (dwptr->pos < p && dwptr - dw < dlen) + while (dwptr - dw < dlen && dwptr->pos < ext.p) /* check bounds before reading dwptr->pos */ dwptr++; olddwpos = dwptr - dw; dwptr->start = ncover; - while (dwptr->pos < q + 1 && dwptr - dw < dlen) + while (dwptr - dw < dlen && dwptr->pos < ext.q + 1) /* check bounds before reading dwptr->pos */ dwptr++; (dwptr - 1)->finish = ncover; len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ; diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in index c30a80e51f..0ab0887b45 100644 --- a/contrib/tsearch2/tsearch.sql.in +++ b/contrib/tsearch2/tsearch.sql.in @@ -534,12 +534,12 @@ RETURNS float4 AS 'MODULE_PATHNAME', 'rank_def' LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE; -CREATE FUNCTION rank_cd(int4, tsvector, tsquery) +CREATE FUNCTION rank_cd(float4[], tsvector, tsquery) RETURNS float4 AS 'MODULE_PATHNAME' LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE; -CREATE FUNCTION rank_cd(int4, tsvector, tsquery, int4) +CREATE FUNCTION rank_cd(float4[], tsvector, tsquery, int4) RETURNS float4 AS 'MODULE_PATHNAME' LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;