From 98edd617f3b62a02cb2df9b418fcc4ece45c7ec0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 15 May 2015 17:59:46 +0300 Subject: [PATCH] Fix datatype confusion with the new lossy GiST distance functions. We can only support a lossy distance function when the distance function's datatype is comparable with the original ordering operator's datatype. The distance function always returns a float8, so we are limited to float8, and float4 (by a hard-coded cast of the float8 to float4). In light of this limitation, it seems like a good idea to have a separate 'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy distance function, it still works with lossy quals. There are cases like that with the build-in or contrib opclasses, but it's plausible. There was a hidden assumption that the ORDER BY values returned by GiST match the original ordering operator's return type, but there are plenty of examples where that's not true, e.g. in btree_gist and pg_trgm. As long as the distance function is not lossy, we can tolerate that and just not return the distance to the executor (or rather, always return NULL). The executor doesn't need the distances if there are no lossy results. There was another little bug: the recheck variable was not initialized before calling the distance function. That revealed the bigger issue, as the executor tried to reorder tuples that didn't need reordering, and that failed because of the datatype mismatch. --- doc/src/sgml/gist.sgml | 13 +++++-- src/backend/access/gist/gistget.c | 56 ++++++++++++++++++++++------ src/backend/access/gist/gistscan.c | 16 ++++++++ src/backend/executor/nodeIndexscan.c | 5 +++ src/include/access/gist_private.h | 3 ++ src/include/access/relscan.h | 1 + 6 files changed, 78 insertions(+), 16 deletions(-) diff --git a/doc/src/sgml/gist.sgml b/doc/src/sgml/gist.sgml index 1291f8dd0c..641d1d04ba 100644 --- a/doc/src/sgml/gist.sgml +++ b/doc/src/sgml/gist.sgml @@ -818,10 +818,15 @@ my_distance(PG_FUNCTION_ARGS) - The result value can be any finite float8 value. (Infinity and - minus infinity are used internally to handle cases such as nulls, so it - is not recommended that distance functions return these - values.) + If the distance function returns *recheck=true for a leaf node, the + original ordering operator's return type must be float8 or float4, and + the distance function's return value must be comparable with the actual + distance operator. Otherwise, the distance function's return type can + be any finit float8 value, as long as the relative order of + the returned values matches the order returned by the ordering operator. + (Infinity and minus infinity are used internally to handle cases such as + nulls, so it is not recommended that distance functions + return these values.) diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 1f791a4f8e..09fba06f73 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -16,6 +16,7 @@ #include "access/gist_private.h" #include "access/relscan.h" +#include "catalog/pg_type.h" #include "miscadmin.h" #include "pgstat.h" #include "lib/pairingheap.h" @@ -31,9 +32,11 @@ * depending on whether the containing page is a leaf page or not. * * On success return for a heap tuple, *recheck_p is set to indicate whether - * recheck is needed. We recheck if any of the consistent() or distance() + * the quals need to be rechecked. We recheck if any of the consistent() * functions request it. recheck is not interesting when examining a non-leaf * entry, since we must visit the lower index page if there's any doubt. + * Similarly, *recheck_distances_p is set to indicate whether the distances + * need to be rechecked, and it is also ignored for non-leaf entries. * * If we are doing an ordered scan, so->distances[] is filled with distance * data from the distance() functions before returning success. @@ -50,7 +53,8 @@ gistindex_keytest(IndexScanDesc scan, IndexTuple tuple, Page page, OffsetNumber offset, - bool *recheck_p) + bool *recheck_p, + bool *recheck_distances_p) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; GISTSTATE *giststate = so->giststate; @@ -60,6 +64,7 @@ gistindex_keytest(IndexScanDesc scan, Relation r = scan->indexRelation; *recheck_p = false; + *recheck_distances_p = false; /* * If it's a leftover invalid tuple from pre-9.1, treat it as a match with @@ -194,11 +199,14 @@ gistindex_keytest(IndexScanDesc scan, * use.) * * Distance functions get a recheck argument as well. In this - * case the returned distance is the lower bound of distance - * and needs to be rechecked. We return single recheck flag - * which means that both quals and distances are to be - * rechecked. + * case the returned distance is the lower bound of distance and + * needs to be rechecked. We return single recheck flag which + * means that both quals and distances are to be rechecked. We + * initialize the flag to 'false'. The flag was added in version + * 9.5 and the distance operators written before that won't know + * about the flag, and are never lossy. */ + recheck = false; dist = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), @@ -206,9 +214,7 @@ gistindex_keytest(IndexScanDesc scan, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); - - *recheck_p |= recheck; - + *recheck_distances_p |= recheck; *distance_p = DatumGetFloat8(dist); } @@ -310,6 +316,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, IndexTuple it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); bool match; bool recheck; + bool recheck_distances; /* * Must call gistindex_keytest in tempCxt, and clean up any leftover @@ -317,7 +324,8 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, */ oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt); - match = gistindex_keytest(scan, it, page, i, &recheck); + match = gistindex_keytest(scan, it, page, i, + &recheck, &recheck_distances); MemoryContextSwitchTo(oldcxt); MemoryContextReset(so->giststate->tempCxt); @@ -375,6 +383,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, item->blkno = InvalidBlockNumber; item->data.heap.heapPtr = it->t_tid; item->data.heap.recheck = recheck; + item->data.heap.recheckDistances = recheck_distances; /* * In an index-only scan, also fetch the data from the tuple. @@ -461,10 +470,33 @@ getNextNearest(IndexScanDesc scan) /* found a heap item at currently minimal distance */ scan->xs_ctup.t_self = item->data.heap.heapPtr; scan->xs_recheck = item->data.heap.recheck; + scan->xs_recheckorderby = item->data.heap.recheckDistances; for (i = 0; i < scan->numberOfOrderBys; i++) { - scan->xs_orderbyvals[i] = Float8GetDatum(item->distances[i]); - scan->xs_orderbynulls[i] = false; + if (so->orderByTypes[i] == FLOAT8OID) + { + scan->xs_orderbyvals[i] = Float8GetDatum(item->distances[i]); + scan->xs_orderbynulls[i] = false; + } + else if (so->orderByTypes[i] == FLOAT4OID) + { + scan->xs_orderbyvals[i] = Float4GetDatum((float4) item->distances[i]); + scan->xs_orderbynulls[i] = false; + } + else + { + /* + * If the ordering operator's return value is anything + * else, we don't know how to convert the float8 bound + * calculated by the distance function to that. The + * executor won't actually need the order by values we + * return here, if there are no lossy results, so only + * insist on the datatype if the *recheck is set. + */ + if (scan->xs_recheckorderby) + elog(ERROR, "GiST operator family's FOR ORDER BY operator must return float8 or float4 if the distance function is lossy"); + scan->xs_orderbynulls[i] = true; + } } /* in an index-only scan, also return the reconstructed tuple. */ diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 099849a606..d0fea2b86c 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -17,6 +17,7 @@ #include "access/gist_private.h" #include "access/gistscan.h" #include "access/relscan.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -263,6 +264,8 @@ gistrescan(PG_FUNCTION_ARGS) memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); + so->orderByTypes = (Oid *) palloc(scan->numberOfOrderBys * sizeof(Oid)); + /* * Modify the order-by key so that the Distance method is called for * all comparisons. The original operator is passed to the Distance @@ -281,6 +284,19 @@ gistrescan(PG_FUNCTION_ARGS) GIST_DISTANCE_PROC, skey->sk_attno, RelationGetRelationName(scan->indexRelation)); + /* + * Look up the datatype returned by the original ordering operator. + * GiST always uses a float8 for the distance function, but the + * ordering operator could be anything else. + * + * XXX: The distance function is only allowed to be lossy if the + * ordering operator's result type is float4 or float8. Otherwise + * we don't know how to return the distance to the executor. But + * we cannot check that here, as we won't know if the distance + * function is lossy until it returns *recheck = true for the + * first time. + */ + so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid); fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt); /* Restore prior fn_extra pointers, if not first time */ diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2164da0c84..434a1d95f0 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -236,7 +236,12 @@ next_indextuple: InstrCountFiltered2(node, 1); goto next_indextuple; } + } + if (scandesc->xs_recheckorderby) + { + econtext->ecxt_scantuple = slot; + ResetExprContext(econtext); EvalOrderByExpressions(node, econtext); /* diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 9d3714d27d..0e819d7b51 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -121,6 +121,7 @@ typedef struct GISTSearchHeapItem { ItemPointerData heapPtr; bool recheck; /* T if quals must be rechecked */ + bool recheckDistances; /* T if distances must be rechecked */ IndexTuple ftup; /* data fetched back from the index, used in * index-only scans */ } GISTSearchHeapItem; @@ -150,6 +151,8 @@ typedef struct GISTSearchItem typedef struct GISTScanOpaqueData { GISTSTATE *giststate; /* index information, see above */ + Oid *orderByTypes; /* datatypes of ORDER BY expressions */ + pairingheap *queue; /* queue of unvisited items */ MemoryContext queueCxt; /* context holding the queue */ bool qual_ok; /* false if qual can never be satisfied */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 865d36403a..5a0d724aca 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -99,6 +99,7 @@ typedef struct IndexScanDescData */ Datum *xs_orderbyvals; bool *xs_orderbynulls; + bool xs_recheckorderby; /* T means ORDER BY exprs must be rechecked */ /* state data for traversing HOT chains in index_getnext */ bool xs_continue_hot; /* T if must keep walking HOT chain */ -- 2.40.0