From ae99a524037e76c041d18f410cd3fb037d81e33b Mon Sep 17 00:00:00 2001 From: Paul Ramsey Date: Tue, 13 Nov 2012 22:10:35 +0000 Subject: [PATCH] #1895, New node splitting algorithm for GiST Set the KOROTKOV_SPLIT define to 1 to use the new approach, to 0 to use the old approach. After testing is complete, we can set the new split as the default. git-svn-id: http://svn.osgeo.org/postgis/trunk@10665 b70326c6-7e19-0410-871a-916f4a2858ee --- NEWS | 1 + postgis/gserialized_gist_2d.c | 717 +++++++++++++++++++++++++++++++++- 2 files changed, 716 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index bae94c8b4..ebff2595e 100644 --- a/NEWS +++ b/NEWS @@ -47,6 +47,7 @@ PostGIS 2.1.0 (Bborie Park / UC Davis) - ST_Tile(raster) to break up a raster into tiles (Bborie Park / UC Davis) - #739, UpdateRasterSRID() + - #1895, new r-tree node splitting algorithm (Alex Korotkov) * Enhancements * - #823, tiger geocoder: Make loader_generate_script download portion diff --git a/postgis/gserialized_gist_2d.c b/postgis/gserialized_gist_2d.c index 39e20a6ba..1c6213f7a 100644 --- a/postgis/gserialized_gist_2d.c +++ b/postgis/gserialized_gist_2d.c @@ -20,6 +20,8 @@ ** [3] N. Beckmann, H.-P. Kriegel, R. Schneider, B. Seeger. The R*tree: an ** efficient and robust access method for points and rectangles. ** Proceedings of the ACM SIGMOD Conference. June 1990. +** [4] A. Korotkov, "A new double sorting-based node splitting algorithm for R-tree", +** http://syrcose.ispras.ru/2011/files/SYRCoSE2011_Proceedings.pdf#page=36 */ #include "postgres.h" @@ -40,6 +42,12 @@ */ #define LIMIT_RATIO 0.1 +/* +** 0 == don't use it +** 1 == use it +*/ +#define KOROTKOV_SPLIT 0 + /* ** For debugging */ @@ -133,8 +141,7 @@ static void box2df_merge(BOX2DF *b_union, BOX2DF *b_new) return; } - - +#if KOROTKOV_SPLIT < 1 static bool box2df_intersection(const BOX2DF *a, const BOX2DF *b, BOX2DF *n) { POSTGIS_DEBUGF(5, "calculating intersection of %s with %s", box2df_to_string(a), box2df_to_string(b)); @@ -154,6 +161,7 @@ static bool box2df_intersection(const BOX2DF *a, const BOX2DF *b, BOX2DF *n) return TRUE; } +#endif static float box2df_size(const BOX2DF *a) { @@ -1107,6 +1115,709 @@ Datum gserialized_gist_same_2d(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } +#if KOROTKOV_SPLIT > 0 +/* + * Adjust BOX2DF b boundaries with insertion of addon. + */ +static void +adjustBox(BOX2DF *b, BOX2DF *addon) +{ + if (b->xmax < addon->xmax) + b->xmax = addon->xmax; + if (b->xmin > addon->xmin) + b->xmin = addon->xmin; + if (b->ymax < addon->ymax) + b->ymax = addon->ymax; + if (b->ymin > addon->ymin) + b->ymin = addon->ymin; +} + +/* + * Trivial split: half of entries will be placed on one page + * and another half - to another + */ +static void +fallbackSplit(GistEntryVector *entryvec, GIST_SPLITVEC *v) +{ + OffsetNumber i, + maxoff; + BOX2DF *unionL = NULL, + *unionR = NULL; + int nbytes; + + maxoff = entryvec->n - 1; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + BOX2DF *cur = (BOX2DF *) DatumGetPointer(entryvec->vector[i].key); + + if (i <= (maxoff - FirstOffsetNumber + 1) / 2) + { + v->spl_left[v->spl_nleft] = i; + if (unionL == NULL) + { + unionL = (BOX2DF *) palloc(sizeof(BOX2DF)); + *unionL = *cur; + } + else + adjustBox(unionL, cur); + + v->spl_nleft++; + } + else + { + v->spl_right[v->spl_nright] = i; + if (unionR == NULL) + { + unionR = (BOX2DF *) palloc(sizeof(BOX2DF)); + *unionR = *cur; + } + else + adjustBox(unionR, cur); + + v->spl_nright++; + } + } + + if (v->spl_ldatum_exists) + adjustBox(unionL, (BOX2DF *) DatumGetPointer(v->spl_ldatum)); + v->spl_ldatum = BoxPGetDatum(unionL); + + if (v->spl_rdatum_exists) + adjustBox(unionR, (BOX2DF *) DatumGetPointer(v->spl_rdatum)); + v->spl_rdatum = BoxPGetDatum(unionR); + + v->spl_ldatum_exists = v->spl_rdatum_exists = false; +} + +/* + * Represents information about an entry that can be placed to either group + * without affecting overlap over selected axis ("common entry"). + */ +typedef struct +{ + /* Index of entry in the initial array */ + int index; + /* Delta between penalties of entry insertion into different groups */ + float delta; +} CommonEntry; + +/* + * Context for g_box_consider_split. Contains information about currently + * selected split and some general information. + */ +typedef struct +{ + int entriesCount; /* total number of entries being split */ + BOX2DF boundingBox; /* minimum bounding box across all entries */ + + /* Information about currently selected split follows */ + + bool first; /* true if no split was selected yet */ + + float leftUpper; /* upper bound of left interval */ + float rightLower; /* lower bound of right interval */ + + float4 ratio; + float4 overlap; + int dim; /* axis of this split */ + float range; /* width of general MBR projection to the + * selected axis */ +} ConsiderSplitContext; + +/* + * Interval represents projection of box to axis. + */ +typedef struct +{ + float lower, + upper; +} SplitInterval; + +/* + * Interval comparison function by lower bound of the interval; + */ +static int +interval_cmp_lower(const void *i1, const void *i2) +{ + float lower1 = ((const SplitInterval *) i1)->lower, + lower2 = ((const SplitInterval *) i2)->lower; + + if (lower1 < lower2) + return -1; + else if (lower1 > lower2) + return 1; + else + return 0; +} + +/* + * Interval comparison function by upper bound of the interval; + */ +static int +interval_cmp_upper(const void *i1, const void *i2) +{ + float upper1 = ((const SplitInterval *) i1)->upper, + upper2 = ((const SplitInterval *) i2)->upper; + + if (upper1 < upper2) + return -1; + else if (upper1 > upper2) + return 1; + else + return 0; +} + +/* + * Replace negative value with zero. + */ +static inline float +non_negative(float val) +{ + if (val >= 0.0f) + return val; + else + return 0.0f; +} + +/* + * Consider replacement of currently selected split with the better one. + */ +static inline void +g_box_consider_split(ConsiderSplitContext *context, int dimNum, + float rightLower, int minLeftCount, + float leftUpper, int maxLeftCount) +{ + int leftCount, + rightCount; + float4 ratio, + overlap; + float range; + + POSTGIS_DEBUGF(5, "consider split: dimNum = %d, rightLower = %f, " + "minLeftCount = %d, leftUpper = %f, maxLeftCount = %d ", + dimNum, rightLower, minLeftCount, leftUpper, maxLeftCount); + + /* + * Calculate entries distribution ratio assuming most uniform distribution + * of common entries. + */ + if (minLeftCount >= (context->entriesCount + 1) / 2) + { + leftCount = minLeftCount; + } + else + { + if (maxLeftCount <= context->entriesCount / 2) + leftCount = maxLeftCount; + else + leftCount = context->entriesCount / 2; + } + rightCount = context->entriesCount - leftCount; + + /* + * Ratio of split - quotient between size of lesser group and total + * entries count. + */ + ratio = ((float4) Min(leftCount, rightCount)) / + ((float4) context->entriesCount); + + if (ratio > LIMIT_RATIO) + { + bool selectthis = false; + + /* + * The ratio is acceptable, so compare current split with previously + * selected one. Between splits of one dimension we search for minimal + * overlap (allowing negative values) and minimal ration (between same + * overlaps. We switch dimension if find less overlap (non-negative) + * or less range with same overlap. + */ + if (dimNum == 0) + range = context->boundingBox.xmax - context->boundingBox.xmin; + else + range = context->boundingBox.ymax - context->boundingBox.ymin; + + overlap = (leftUpper - rightLower) / range; + + /* If there is no previous selection, select this */ + if (context->first) + selectthis = true; + else if (context->dim == dimNum) + { + /* + * Within the same dimension, choose the new split if it has a + * smaller overlap, or same overlap but better ratio. + */ + if (overlap < context->overlap || + (overlap == context->overlap && ratio > context->ratio)) + selectthis = true; + } + else + { + /* + * Across dimensions, choose the new split if it has a smaller + * *non-negative* overlap, or same *non-negative* overlap but + * bigger range. This condition differs from the one described in + * the article. On the datasets where leaf MBRs don't overlap + * themselves, non-overlapping splits (i.e. splits which have zero + * *non-negative* overlap) are frequently possible. In this case + * splits tends to be along one dimension, because most distant + * non-overlapping splits (i.e. having lowest negative overlap) + * appears to be in the same dimension as in the previous split. + * Therefore MBRs appear to be very prolonged along another + * dimension, which leads to bad search performance. Using range + * as the second split criteria makes MBRs more quadratic. Using + * *non-negative* overlap instead of overlap as the first split + * criteria gives to range criteria a chance to matter, because + * non-overlapping splits are equivalent in this criteria. + */ + if (non_negative(overlap) < non_negative(context->overlap) || + (range > context->range && + non_negative(overlap) <= non_negative(context->overlap))) + selectthis = true; + } + + if (selectthis) + { + /* save information about selected split */ + context->first = false; + context->ratio = ratio; + context->range = range; + context->overlap = overlap; + context->rightLower = rightLower; + context->leftUpper = leftUpper; + context->dim = dimNum; + POSTGIS_DEBUG(5, "split selected"); + } + } +} + +/* + * Return increase of original BOX2DF area by new BOX2DF area insertion. + */ +static float +box_penalty(BOX2DF *original, BOX2DF *new) +{ + float union_width, + union_height; + + union_width = Max(original->xmax, new->xmax) - + Min(original->xmin, new->xmin); + union_height = Max(original->ymax, new->ymax) - + Min(original->ymin, new->ymin); + return union_width * union_height - (original->xmax - original->xmin) * + (original->ymax - original->ymin); +} + +/* + * Compare common entries by their deltas. + */ +static int +common_entry_cmp(const void *i1, const void *i2) +{ + float delta1 = ((const CommonEntry *) i1)->delta, + delta2 = ((const CommonEntry *) i2)->delta; + + if (delta1 < delta2) + return -1; + else if (delta1 > delta2) + return 1; + else + return 0; +} + +/* + * -------------------------------------------------------------------------- + * Double sorting split algorithm. This is used for both boxes and points. + * + * The algorithm finds split of boxes by considering splits along each axis. + * Each entry is first projected as an interval on the X-axis, and different + * ways to split the intervals into two groups are considered, trying to + * minimize the overlap of the groups. Then the same is repeated for the + * Y-axis, and the overall best split is chosen. The quality of a split is + * determined by overlap along that axis and some other criteria (see + * g_box_consider_split). + * + * After that, all the entries are divided into three groups: + * + * 1) Entries which should be placed to the left group + * 2) Entries which should be placed to the right group + * 3) "Common entries" which can be placed to any of groups without affecting + * of overlap along selected axis. + * + * The common entries are distributed by minimizing penalty. + * + * For details see: + * "A new double sorting-based node splitting algorithm for R-tree", A. Korotkov + * http://syrcose.ispras.ru/2011/files/SYRCoSE2011_Proceedings.pdf#page=36 + * -------------------------------------------------------------------------- + */ +PG_FUNCTION_INFO_V1(gserialized_gist_picksplit_2d); +Datum gserialized_gist_picksplit_2d(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber i, + maxoff; + ConsiderSplitContext context; + BOX2DF *box, + *leftBox, + *rightBox; + int dim, + commonEntriesCount; + SplitInterval *intervalsLower, + *intervalsUpper; + CommonEntry *commonEntries; + int nentries; + + POSTGIS_DEBUG(3, "[GIST] 'picksplit' entered"); + + memset(&context, 0, sizeof(ConsiderSplitContext)); + + maxoff = entryvec->n - 1; + nentries = context.entriesCount = maxoff - FirstOffsetNumber + 1; + + /* Allocate arrays for intervals along axes */ + intervalsLower = (SplitInterval *) palloc(nentries * sizeof(SplitInterval)); + intervalsUpper = (SplitInterval *) palloc(nentries * sizeof(SplitInterval)); + + /* + * Calculate the overall minimum bounding box over all the entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + box = (BOX2DF *) DatumGetPointer(entryvec->vector[i].key); + if (i == FirstOffsetNumber) + context.boundingBox = *box; + else + adjustBox(&context.boundingBox, box); + } + + POSTGIS_DEBUGF(4, "boundingBox is %s", box2df_to_string( + &context.boundingBox)); + + /* + * Iterate over axes for optimal split searching. + */ + context.first = true; /* nothing selected yet */ + for (dim = 0; dim < 2; dim++) + { + float leftUpper, + rightLower; + int i1, + i2; + + /* Project each entry as an interval on the selected axis. */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + box = (BOX2DF *) DatumGetPointer(entryvec->vector[i].key); + if (dim == 0) + { + intervalsLower[i - FirstOffsetNumber].lower = box->xmin; + intervalsLower[i - FirstOffsetNumber].upper = box->xmax; + } + else + { + intervalsLower[i - FirstOffsetNumber].lower = box->ymin; + intervalsLower[i - FirstOffsetNumber].upper = box->ymax; + } + } + + /* + * Make two arrays of intervals: one sorted by lower bound and another + * sorted by upper bound. + */ + memcpy(intervalsUpper, intervalsLower, + sizeof(SplitInterval) * nentries); + qsort(intervalsLower, nentries, sizeof(SplitInterval), + interval_cmp_lower); + qsort(intervalsUpper, nentries, sizeof(SplitInterval), + interval_cmp_upper); + + /*---- + * The goal is to form a left and right interval, so that every entry + * interval is contained by either left or right interval (or both). + * + * For example, with the intervals (0,1), (1,3), (2,3), (2,4): + * + * 0 1 2 3 4 + * +-+ + * +---+ + * +-+ + * +---+ + * + * The left and right intervals are of the form (0,a) and (b,4). + * We first consider splits where b is the lower bound of an entry. + * We iterate through all entries, and for each b, calculate the + * smallest possible a. Then we consider splits where a is the + * uppper bound of an entry, and for each a, calculate the greatest + * possible b. + * + * In the above example, the first loop would consider splits: + * b=0: (0,1)-(0,4) + * b=1: (0,1)-(1,4) + * b=2: (0,3)-(2,4) + * + * And the second loop: + * a=1: (0,1)-(1,4) + * a=3: (0,3)-(2,4) + * a=4: (0,4)-(2,4) + */ + + /* + * Iterate over lower bound of right group, finding smallest possible + * upper bound of left group. + */ + i1 = 0; + i2 = 0; + rightLower = intervalsLower[i1].lower; + leftUpper = intervalsUpper[i2].lower; + while (true) + { + /* + * Find next lower bound of right group. + */ + while (i1 < nentries && rightLower == intervalsLower[i1].lower) + { + leftUpper = Max(leftUpper, intervalsLower[i1].upper); + i1++; + } + if (i1 >= nentries) + break; + rightLower = intervalsLower[i1].lower; + + /* + * Find count of intervals which anyway should be placed to the + * left group. + */ + while (i2 < nentries && intervalsUpper[i2].upper <= leftUpper) + i2++; + + /* + * Consider found split. + */ + g_box_consider_split(&context, dim, rightLower, i1, leftUpper, i2); + } + + /* + * Iterate over upper bound of left group finding greates possible + * lower bound of right group. + */ + i1 = nentries - 1; + i2 = nentries - 1; + rightLower = intervalsLower[i1].upper; + leftUpper = intervalsUpper[i2].upper; + while (true) + { + /* + * Find next upper bound of left group. + */ + while (i2 >= 0 && leftUpper == intervalsUpper[i2].upper) + { + rightLower = Min(rightLower, intervalsUpper[i2].lower); + i2--; + } + if (i2 < 0) + break; + leftUpper = intervalsUpper[i2].upper; + + /* + * Find count of intervals which anyway should be placed to the + * right group. + */ + while (i1 >= 0 && intervalsLower[i1].lower >= rightLower) + i1--; + + /* + * Consider found split. + */ + g_box_consider_split(&context, dim, + rightLower, i1 + 1, leftUpper, i2 + 1); + } + } + + /* + * If we failed to find any acceptable splits, use trivial split. + */ + if (context.first) + { + POSTGIS_DEBUG(4, "no acceptable splits, trivial split"); + fallbackSplit(entryvec, v); + PG_RETURN_POINTER(v); + } + + /* + * Ok, we have now selected the split across one axis. + * + * While considering the splits, we already determined that there will be + * enough entries in both groups to reach the desired ratio, but we did + * not memorize which entries go to which group. So determine that now. + */ + + POSTGIS_DEBUGF(4, "split direction: %d", context.dim); + + /* Allocate vectors for results */ + v->spl_left = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_right = (OffsetNumber *) palloc(nentries * sizeof(OffsetNumber)); + v->spl_nleft = 0; + v->spl_nright = 0; + + /* Allocate bounding boxes of left and right groups */ + leftBox = palloc0(sizeof(BOX2DF)); + rightBox = palloc0(sizeof(BOX2DF)); + + /* + * Allocate an array for "common entries" - entries which can be placed to + * either group without affecting overlap along selected axis. + */ + commonEntriesCount = 0; + commonEntries = (CommonEntry *) palloc(nentries * sizeof(CommonEntry)); + + /* Helper macros to place an entry in the left or right group */ +#define PLACE_LEFT(box, off) \ + do { \ + if (v->spl_nleft > 0) \ + adjustBox(leftBox, box); \ + else \ + *leftBox = *(box); \ + v->spl_left[v->spl_nleft++] = off; \ + } while(0) + +#define PLACE_RIGHT(box, off) \ + do { \ + if (v->spl_nright > 0) \ + adjustBox(rightBox, box); \ + else \ + *rightBox = *(box); \ + v->spl_right[v->spl_nright++] = off; \ + } while(0) + + /* + * Distribute entries which can be distributed unambiguously, and collect + * common entries. + */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + float lower, + upper; + + /* + * Get upper and lower bounds along selected axis. + */ + box = (BOX2DF *) DatumGetPointer(entryvec->vector[i].key); + if (context.dim == 0) + { + lower = box->xmin; + upper = box->xmax; + } + else + { + lower = box->ymin; + upper = box->ymax; + } + + if (upper <= context.leftUpper) + { + /* Fits to the left group */ + if (lower >= context.rightLower) + { + /* Fits also to the right group, so "common entry" */ + commonEntries[commonEntriesCount++].index = i; + } + else + { + /* Doesn't fit to the right group, so join to the left group */ + PLACE_LEFT(box, i); + } + } + else + { + /* + * Each entry should fit on either left or right group. Since this + * entry didn't fit on the left group, it better fit in the right + * group. + */ + Assert(lower >= context.rightLower); + + /* Doesn't fit to the left group, so join to the right group */ + PLACE_RIGHT(box, i); + } + } + + POSTGIS_DEBUGF(4, "leftBox is %s", box2df_to_string(leftBox)); + POSTGIS_DEBUGF(4, "rightBox is %s", box2df_to_string(rightBox)); + + /* + * Distribute "common entries", if any. + */ + if (commonEntriesCount > 0) + { + /* + * Calculate minimum number of entries that must be placed in both + * groups, to reach LIMIT_RATIO. + */ + int m = ceil(LIMIT_RATIO * (double) nentries); + + /* + * Calculate delta between penalties of join "common entries" to + * different groups. + */ + for (i = 0; i < commonEntriesCount; i++) + { + box = (BOX2DF *) DatumGetPointer(entryvec->vector[ + commonEntries[i].index].key); + commonEntries[i].delta = Abs(box_penalty(leftBox, box) - + box_penalty(rightBox, box)); + } + + /* + * Sort "common entries" by calculated deltas in order to distribute + * the most ambiguous entries first. + */ + qsort(commonEntries, commonEntriesCount, sizeof(CommonEntry), common_entry_cmp); + + /* + * Distribute "common entries" between groups. + */ + for (i = 0; i < commonEntriesCount; i++) + { + box = (BOX2DF *) DatumGetPointer(entryvec->vector[ + commonEntries[i].index].key); + + /* + * Check if we have to place this entry in either group to achieve + * LIMIT_RATIO. + */ + if (v->spl_nleft + (commonEntriesCount - i) <= m) + PLACE_LEFT(box, commonEntries[i].index); + else if (v->spl_nright + (commonEntriesCount - i) <= m) + PLACE_RIGHT(box, commonEntries[i].index); + else + { + /* Otherwise select the group by minimal penalty */ + if (box_penalty(leftBox, box) < box_penalty(rightBox, box)) + PLACE_LEFT(box, commonEntries[i].index); + else + PLACE_RIGHT(box, commonEntries[i].index); + } + } + } + v->spl_ldatum = PointerGetDatum(leftBox); + v->spl_rdatum = PointerGetDatum(rightBox); + + POSTGIS_DEBUG(4, "[GIST] 'picksplit' completed"); + + PG_RETURN_POINTER(v); +} + +#else /* !KOROTOV_SPLIT */ + typedef struct { BOX2DF *key; @@ -1363,6 +2074,8 @@ Datum gserialized_gist_picksplit_2d(PG_FUNCTION_ARGS) PG_RETURN_POINTER(v); } +#endif + /* ** The BOX32DF key must be defined as a PostgreSQL type, even though it is only -- 2.50.1