1 /*-------------------------------------------------------------------------
4 * Partitioning related data structures and functions.
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/catalog/partition.c
13 *-------------------------------------------------------------------------
18 #include "access/heapam.h"
19 #include "access/htup_details.h"
20 #include "access/nbtree.h"
21 #include "access/sysattr.h"
22 #include "catalog/dependency.h"
23 #include "catalog/indexing.h"
24 #include "catalog/objectaddress.h"
25 #include "catalog/partition.h"
26 #include "catalog/pg_collation.h"
27 #include "catalog/pg_inherits.h"
28 #include "catalog/pg_inherits_fn.h"
29 #include "catalog/pg_opclass.h"
30 #include "catalog/pg_type.h"
31 #include "executor/executor.h"
32 #include "miscadmin.h"
33 #include "nodes/makefuncs.h"
34 #include "nodes/nodeFuncs.h"
35 #include "nodes/parsenodes.h"
36 #include "optimizer/clauses.h"
37 #include "optimizer/planmain.h"
38 #include "optimizer/var.h"
39 #include "rewrite/rewriteManip.h"
40 #include "storage/lmgr.h"
41 #include "utils/array.h"
42 #include "utils/builtins.h"
43 #include "utils/datum.h"
44 #include "utils/memutils.h"
45 #include "utils/fmgroids.h"
46 #include "utils/inval.h"
47 #include "utils/lsyscache.h"
48 #include "utils/rel.h"
49 #include "utils/ruleutils.h"
50 #include "utils/syscache.h"
53 * Information about bounds of a partitioned relation
55 * A list partition datum that is known to be NULL is never put into the
56 * datums array. Instead, it is tracked using has_null and null_index fields.
58 * In the case of range partitioning, ndatums will typically be far less than
59 * 2 * nparts, because a partition's upper bound and the next partition's lower
60 * bound are the same in most common cases, and we only store one of them.
62 * In the case of list partitioning, the indexes array stores one entry for
63 * every datum, which is the index of the partition that accepts a given datum.
64 * In case of range partitioning, it stores one entry per distinct range
65 * datum, which is the index of the partition for which a given datum
/*
 * Ternary value to represent what's contained in a range bound datum:
 * either a real (finite) value stored in the datums array, or one of the
 * two infinities used for open-ended range bounds.
 */
typedef enum RangeDatumContent
	RANGE_DATUM_FINITE = 0,		/* actual datum stored elsewhere */
	RANGE_DATUM_NEG_INF,		/* negative infinity */
	RANGE_DATUM_POS_INF			/* positive infinity */
/*
 * Canonical, catalog-independent representation of a partitioned table's
 * bounds (see the file-header comment above for how list and range
 * strategies populate these fields differently).
 */
typedef struct PartitionBoundInfoData
	char		strategy;		/* list or range bounds? */
	int			ndatums;		/* Length of the datums following array */
	Datum	  **datums;			/* Array of datum-tuples with key->partnatts
	RangeDatumContent **content;	/* what's contained in each range bound
									 * datum? (see the above enum); NULL for
									 * list partitioned tables */
	int		   *indexes;		/* Partition indexes; one entry per member of
								 * the datums array (plus one if range
								 * partitioned table) */
	bool		has_null;		/* Is there a null-accepting partition? false
								 * for range partitioned tables */
	int			null_index;		/* Index of the null-accepting partition; -1
								 * for range partitioned tables */
} PartitionBoundInfoData;
/*
 * When qsort'ing partition bounds after reading from the catalog, each bound
 * is represented with one of the following structs.
 */

/* One value coming from some (index'th) list partition */
typedef struct PartitionListValue
} PartitionListValue;
/* One bound (lower or upper) of a range partition */
typedef struct PartitionRangeBound
	Datum	   *datums;			/* range bound datums */
	RangeDatumContent *content; /* what's contained in each datum? */
	bool		lower;			/* this is the lower (vs upper) bound */
} PartitionRangeBound;
116 static int32 qsort_partition_list_value_cmp(const void *a, const void *b,
118 static int32 qsort_partition_rbound_cmp(const void *a, const void *b,
121 static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec);
122 static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec);
123 static Oid get_partition_operator(PartitionKey key, int col,
124 StrategyNumber strategy, bool *need_relabel);
125 static List *generate_partition_qual(Relation rel);
127 static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index,
128 List *datums, bool lower);
129 static int32 partition_rbound_cmp(PartitionKey key,
130 Datum *datums1, RangeDatumContent *content1, bool lower1,
131 PartitionRangeBound *b2);
132 static int32 partition_rbound_datum_cmp(PartitionKey key,
133 Datum *rb_datums, RangeDatumContent *rb_content,
134 Datum *tuple_datums);
136 static int32 partition_bound_cmp(PartitionKey key,
137 PartitionBoundInfo boundinfo,
138 int offset, void *probe, bool probe_is_bound);
139 static int partition_bound_bsearch(PartitionKey key,
140 PartitionBoundInfo boundinfo,
141 void *probe, bool probe_is_bound, bool *is_equal);
/*
 * RelationBuildPartitionDesc
 *		Form rel's partition descriptor (rel->rd_partdesc)
 *
 * Reads each partition's relpartbound from pg_class, converts the bound
 * specs into the canonical PartitionBoundInfoData representation, and stores
 * the result in a dedicated memory context (rel->rd_pdcxt).
 *
 * Not flushed from the cache by RelationClearRelation() unless changed
 * because of addition or removal of a partition.
 */
RelationBuildPartitionDesc(Relation rel)
	List	   *boundspecs = NIL;
	PartitionKey key = RelationGetPartitionKey(rel);
	PartitionDesc result;
	MemoryContext oldcxt;

	/* List partitioning specific */
	PartitionListValue **all_values = NULL;
	bool		found_null = false;

	/* Range partitioning specific */
	PartitionRangeBound **rbounds = NULL;

	/*
	 * The following could happen in situations where rel has a pg_class entry
	 * but not the pg_partitioned_table entry yet.
	 */

	/* Get partition oids from pg_inherits */
	inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);

	/* Collect bound spec nodes in a list */
	foreach(cell, inhoids)
		Oid			inhrelid = lfirst_oid(cell);

		tuple = SearchSysCache1(RELOID, inhrelid);
		if (!HeapTupleIsValid(tuple))
			elog(ERROR, "cache lookup failed for relation %u", inhrelid);

		/*
		 * It is possible that the pg_class tuple of a partition has not been
		 * updated yet to set its relpartbound field.  The only case where
		 * this happens is when we open the parent relation to check using its
		 * partition descriptor that a new partition's bound does not overlap
		 * some existing partition.
		 */
		if (!((Form_pg_class) GETSTRUCT(tuple))->relispartition)
			ReleaseSysCache(tuple);

		datum = SysCacheGetAttr(RELOID, tuple,
								Anum_pg_class_relpartbound,
		boundspec = (Node *) stringToNode(TextDatumGetCString(datum));
		boundspecs = lappend(boundspecs, boundspec);
		partoids = lappend_oid(partoids, inhrelid);
		ReleaseSysCache(tuple);

	nparts = list_length(partoids);

	oids = (Oid *) palloc(nparts * sizeof(Oid));
	foreach(cell, partoids)
		oids[i++] = lfirst_oid(cell);

	/* Convert from node to the internal representation */
	if (key->strategy == PARTITION_STRATEGY_LIST)
		List	   *non_null_values = NIL;

		/*
		 * Create a unified list of non-null values across all partitions.
		 */
		foreach(cell, boundspecs)
			PartitionBoundSpec *spec = lfirst(cell);

			if (spec->strategy != PARTITION_STRATEGY_LIST)
				elog(ERROR, "invalid strategy in partition bound spec");

			foreach(c, spec->listdatums)
				Const	   *val = lfirst(c);
				PartitionListValue *list_value = NULL;

				if (!val->constisnull)
					list_value = (PartitionListValue *)
						palloc0(sizeof(PartitionListValue));
					list_value->index = i;
					list_value->value = val->constvalue;

				/*
				 * Never put a null into the values array; flag it
				 * instead for the code further down below where we
				 * construct the actual relcache struct.
				 */
					elog(ERROR, "found null more than once");

				non_null_values = lappend(non_null_values,

		ndatums = list_length(non_null_values);

		/*
		 * Collect all list values in one array.  Alongside the value, we
		 * also save the index of the partition the value comes from.
		 */
		all_values = (PartitionListValue **) palloc(ndatums *
													sizeof(PartitionListValue *));
		foreach(cell, non_null_values)
			PartitionListValue *src = lfirst(cell);

			all_values[i] = (PartitionListValue *)
				palloc(sizeof(PartitionListValue));
			all_values[i]->value = src->value;
			all_values[i]->index = src->index;

		qsort_arg(all_values, ndatums, sizeof(PartitionListValue *),
				  qsort_partition_list_value_cmp, (void *) key);
	else if (key->strategy == PARTITION_STRATEGY_RANGE)
		PartitionRangeBound **all_bounds,
		bool	   *distinct_indexes;

		all_bounds = (PartitionRangeBound **) palloc0(2 * nparts *
													  sizeof(PartitionRangeBound *));
		distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool));

		/*
		 * Create a unified list of range bounds across all the
		 * partitions (one lower and one upper bound per partition).
		 */
		foreach(cell, boundspecs)
			PartitionBoundSpec *spec = lfirst(cell);
			PartitionRangeBound *lower,

			if (spec->strategy != PARTITION_STRATEGY_RANGE)
				elog(ERROR, "invalid strategy in partition bound spec");
			lower = make_one_range_bound(key, i, spec->lowerdatums,
			upper = make_one_range_bound(key, i, spec->upperdatums,
			all_bounds[j] = lower;
			all_bounds[j + 1] = upper;
		Assert(j == 2 * nparts);

		/* Sort all the bounds in ascending order */
		qsort_arg(all_bounds, 2 * nparts,
				  sizeof(PartitionRangeBound *),
				  qsort_partition_rbound_cmp,

		/*
		 * Count the number of distinct bounds to allocate an array of
		 * that size, so that duplicated bounds are stored only once.
		 */
		for (i = 0; i < 2 * nparts; i++)
			PartitionRangeBound *cur = all_bounds[i];
			bool		is_distinct = false;

			/* Is the current bound distinct from the previous one? */
			for (j = 0; j < key->partnatts; j++)
				/*
				 * If either of them has an infinite element, we can't equate
				 * them.  Even when both are infinite, they'd have
				 * opposite signs, because only one of cur and prev is a
				 */
				if (cur->content[j] != RANGE_DATUM_FINITE ||
					prev->content[j] != RANGE_DATUM_FINITE)
				cmpval = FunctionCall2Coll(&key->partsupfunc[j],
										   key->partcollation[j],
				if (DatumGetInt32(cmpval) != 0)

			/*
			 * Count the current bound if it is distinct from the previous
			 * one.  Also, store whether index i contains a distinct bound
			 * that we'd like to put in the relcache array.
			 */
				distinct_indexes[i] = true;
				distinct_indexes[i] = false;

		/*
		 * Finally save them in an array from where they will be copied
		 * into the relcache representation below.
		 */
		rbounds = (PartitionRangeBound **) palloc(ndatums *
												  sizeof(PartitionRangeBound *));
		for (i = 0; i < 2 * nparts; i++)
			if (distinct_indexes[i])
				rbounds[k++] = all_bounds[i];
		Assert(k == ndatums);
		elog(ERROR, "unexpected partition strategy: %d",
			 (int) key->strategy);

	/* Now build the actual relcache partition descriptor */
	rel->rd_pdcxt = AllocSetContextCreate(CacheMemoryContext,
										  RelationGetRelationName(rel),
										  ALLOCSET_DEFAULT_SIZES);
	oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);

	result = (PartitionDescData *) palloc0(sizeof(PartitionDescData));
	result->nparts = nparts;
		PartitionBoundInfo boundinfo;
		result->oids = (Oid *) palloc0(nparts * sizeof(Oid));
		boundinfo = (PartitionBoundInfoData *)
			palloc0(sizeof(PartitionBoundInfoData));
		boundinfo->strategy = key->strategy;
		boundinfo->ndatums = ndatums;
		boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));

		/* Initialize mapping array with invalid values */
		mapping = (int *) palloc(sizeof(int) * nparts);
		for (i = 0; i < nparts; i++)

		switch (key->strategy)
			case PARTITION_STRATEGY_LIST:
					boundinfo->has_null = found_null;
					boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));

					/*
					 * Copy values.  Indexes of individual values are mapped
					 * to canonical values so that they match for any two list
					 * partitioned tables with same number of partitions and
					 * same lists per partition.  One way to canonicalize is
					 * to assign the index in all_values[] of the smallest
					 * value of each partition, as the index of all of the
					 * partition's values.
					 */
					for (i = 0; i < ndatums; i++)
						boundinfo->datums[i] = (Datum *) palloc(sizeof(Datum));
						boundinfo->datums[i][0] = datumCopy(all_values[i]->value,
															key->parttypbyval[0],

						/* If the old index has no mapping, assign one */
						if (mapping[all_values[i]->index] == -1)
							mapping[all_values[i]->index] = next_index++;
						boundinfo->indexes[i] = mapping[all_values[i]->index];

					/*
					 * If the null-accepting partition has no mapped index
					 * yet, assign one.  This could happen if such a partition
					 * accepts only null and hence is not covered in the above
					 * loop which only handled non-null values.
					 */
						Assert(null_index >= 0);
						if (mapping[null_index] == -1)
							mapping[null_index] = next_index++;

					/* All partitions must now have a valid mapping */
					Assert(next_index == nparts);
						boundinfo->null_index = mapping[null_index];
						boundinfo->null_index = -1;

			case PARTITION_STRATEGY_RANGE:
					boundinfo->content = (RangeDatumContent **) palloc(ndatums *
																	   sizeof(RangeDatumContent *));
					boundinfo->indexes = (int *) palloc((ndatums + 1) *

					for (i = 0; i < ndatums; i++)
						boundinfo->datums[i] = (Datum *) palloc(key->partnatts *
						boundinfo->content[i] = (RangeDatumContent *)
							palloc(key->partnatts *
								   sizeof(RangeDatumContent));
						for (j = 0; j < key->partnatts; j++)
							if (rbounds[i]->content[j] == RANGE_DATUM_FINITE)
								boundinfo->datums[i][j] =
									datumCopy(rbounds[i]->datums[j],
											  key->parttypbyval[j],
							/* Remember, we are storing the tri-state value. */
							boundinfo->content[i][j] = rbounds[i]->content[j];

						/*
						 * There is no mapping for invalid indexes.
						 *
						 * Any lower bounds in the rbounds array have invalid
						 * indexes assigned, because the values between the
						 * previous bound (if there is one) and this (lower)
						 * bound are not part of the range of any existing
						 */
						if (rbounds[i]->lower)
							boundinfo->indexes[i] = -1;
							int			orig_index = rbounds[i]->index;

							/* If the old index has no mapping, assign one */
							if (mapping[orig_index] == -1)
								mapping[orig_index] = next_index++;
							boundinfo->indexes[i] = mapping[orig_index];
					boundinfo->indexes[i] = -1;
				elog(ERROR, "unexpected partition strategy: %d",
					 (int) key->strategy);
		result->boundinfo = boundinfo;

		/*
		 * Now assign OIDs from the original array into mapped indexes of the
		 * result array.  Order of OIDs in the former is defined by the
		 * catalog scan that retrieved them, whereas that in the latter is
		 * defined by canonicalized representation of the list values or the
		 */
		for (i = 0; i < nparts; i++)
			result->oids[mapping[i]] = oids[i];

	MemoryContextSwitchTo(oldcxt);
	rel->rd_partdesc = result;
/*
 * Are two partition bound collections logically equal?
 *
 * Used in the keep logic of relcache.c (ie, in RelationClearRelation()).
 * This is also useful when b1 and b2 are bound collections of two separate
 * relations, respectively, because PartitionBoundInfo is a canonical
 * representation of partition bounds.
 */
partition_bounds_equal(PartitionKey key,
					   PartitionBoundInfo b1, PartitionBoundInfo b2)
	if (b1->strategy != b2->strategy)
	if (b1->ndatums != b2->ndatums)
	if (b1->has_null != b2->has_null)
	if (b1->null_index != b2->null_index)
	for (i = 0; i < b1->ndatums; i++)
		for (j = 0; j < key->partnatts; j++)
			/* For range partitions, the bounds might not be finite. */
			if (b1->content != NULL)
				/*
				 * A finite bound always differs from an infinite bound, and
				 * different kinds of infinities differ from each other.
				 */
				if (b1->content[i][j] != b2->content[i][j])

				/* Non-finite bounds are equal without further examination. */
				if (b1->content[i][j] != RANGE_DATUM_FINITE)

			/*
			 * Compare the actual values.  Note that it would be both
			 * incorrect and unsafe to invoke the comparison operator derived
			 * from the partitioning specification here.  It would be
			 * incorrect because we want the relcache entry to be updated for
			 * ANY change to the partition bounds, not just those that the
			 * partitioning operator thinks are significant.  It would be
			 * unsafe because we might reach this code in the context of an
			 * aborted transaction, and an arbitrary partitioning operator
			 * might not be safe in that context.  datumIsEqual() should be
			 * simple enough to be safe.
			 */
			if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j],
							  key->parttypbyval[j],
		if (b1->indexes[i] != b2->indexes[i])

	/* There are ndatums+1 indexes in case of range partitions */
	if (key->strategy == PARTITION_STRATEGY_RANGE &&
		b1->indexes[i] != b2->indexes[i])
/*
 * check_new_partition_bound
 *
 * Checks if the new partition's bound overlaps any of the existing
 * partitions of parent.  Also performs additional checks as necessary per
 * strategy.  On overlap, reports an error naming both partitions.
 */
check_new_partition_bound(char *relname, Relation parent, Node *bound)
	PartitionBoundSpec *spec = (PartitionBoundSpec *) bound;
	PartitionKey key = RelationGetPartitionKey(parent);
	PartitionDesc partdesc = RelationGetPartitionDesc(parent);
	ParseState *pstate = make_parsestate(NULL);
	bool		overlap = false;

	switch (key->strategy)
		case PARTITION_STRATEGY_LIST:
				Assert(spec->strategy == PARTITION_STRATEGY_LIST);

				if (partdesc->nparts > 0)
					PartitionBoundInfo boundinfo = partdesc->boundinfo;

						   boundinfo->strategy == PARTITION_STRATEGY_LIST &&
						   (boundinfo->ndatums > 0 || boundinfo->has_null));

					foreach(cell, spec->listdatums)
						Const	   *val = lfirst(cell);

						if (!val->constisnull)
							/* an exact match means an existing partition
							 * already accepts this value */
							offset = partition_bound_bsearch(key, boundinfo,
							if (offset >= 0 && equal)
								with = boundinfo->indexes[offset];
						else if (boundinfo->has_null)
							with = boundinfo->null_index;

		case PARTITION_STRATEGY_RANGE:
				PartitionRangeBound *lower,

				Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
				lower = make_one_range_bound(key, -1, spec->lowerdatums, true);
				upper = make_one_range_bound(key, -1, spec->upperdatums, false);

				/*
				 * First check if the resulting range would be empty with the
				 * specified lower and upper bounds
				 */
				if (partition_rbound_cmp(key, lower->datums, lower->content, true,
							(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
							 errmsg("cannot create range partition with empty range"),
							 parser_errposition(pstate, spec->location)));

				if (partdesc->nparts > 0)
					PartitionBoundInfo boundinfo = partdesc->boundinfo;

					Assert(boundinfo && boundinfo->ndatums > 0 &&
						   boundinfo->strategy == PARTITION_STRATEGY_RANGE);

					/*
					 * Firstly, find the greatest range bound that is less
					 * than or equal to the new lower bound.
					 */
					off1 = partition_bound_bsearch(key, boundinfo, lower, true,

					/*
					 * off1 == -1 means that all existing bounds are greater
					 * than the new lower bound.  In that case and the case
					 * where no partition is defined between the bounds at
					 * off1 and off1 + 1, we have a "gap" in the range that
					 * could be occupied by the new partition.  We confirm if
					 * so by checking whether the new upper bound is confined
					 */
					if (!equal && boundinfo->indexes[off1 + 1] < 0)
						off2 = partition_bound_bsearch(key, boundinfo, upper,

						/*
						 * If the new upper bound is returned to be equal to
						 * the bound at off2, the latter must be the upper
						 * bound of some partition with which the new
						 * partition clearly overlaps.
						 *
						 * Also, if bound at off2 is not same as the one
						 * returned for the new lower bound (IOW, off1 !=
						 * off2), then the new partition overlaps at least one
						 */
						if (equal || off1 != off2)
							/*
							 * The bound at off2 could be the lower bound of
							 * the partition with which the new partition
							 * overlaps.  In that case, use the upper bound
							 * (that is, the bound at off2 + 1) to get the
							 * index of that partition.
							 */
							if (boundinfo->indexes[off2] < 0)
								with = boundinfo->indexes[off2 + 1];
								with = boundinfo->indexes[off2];

						/*
						 * Equal has been set to true and there is no "gap"
						 * between the bound at off1 and that at off1 + 1, so
						 * the new partition will overlap some partition.  In
						 * the former case, the new lower bound is found to be
						 * equal to the bound at off1, which could only ever
						 * be true if the latter is the lower bound of some
						 * partition.  It's clear in such a case that the new
						 * partition overlaps that partition, whose index we
						 * get using its upper bound (that is, using the bound
						 */
							with = boundinfo->indexes[off1 + 1];
		elog(ERROR, "unexpected partition strategy: %d",
			 (int) key->strategy);

		(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
		 errmsg("partition \"%s\" would overlap partition \"%s\"",
				relname, get_rel_name(partdesc->oids[with])),
		 parser_errposition(pstate, spec->location)));
/*
 * get_partition_parent
 *
 * Returns the inheritance parent of a partition by scanning pg_inherits
 * (via the (inhrelid, inhseqno) index, taking AccessShareLock only).
 *
 * Note: Because this function assumes that the relation whose OID is passed
 * as an argument will have precisely one parent, it should only be called
 * when it is known that the relation is a partition.
 */
get_partition_parent(Oid relid)
	Form_pg_inherits form;
	Relation	catalogRelation;

	catalogRelation = heap_open(InheritsRelationId, AccessShareLock);
				Anum_pg_inherits_inhrelid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(relid));
				Anum_pg_inherits_inhseqno,
				BTEqualStrategyNumber, F_INT4EQ,
	scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true,
	/* a partition is expected to have exactly one pg_inherits entry */
	tuple = systable_getnext(scan);
	Assert(HeapTupleIsValid(tuple));
	form = (Form_pg_inherits) GETSTRUCT(tuple);
	result = form->inhparent;

	systable_endscan(scan);
	heap_close(catalogRelation, AccessShareLock);
/*
 * get_qual_from_partbound
 *		Given a parser node for a partition bound, return the list of
 *		executable expressions as the partition constraint
 *
 * Dispatches on the parent's partitioning strategy to the appropriate
 * strategy-specific helper.
 */
get_qual_from_partbound(Relation rel, Relation parent, Node *bound)
	PartitionBoundSpec *spec = (PartitionBoundSpec *) bound;
	PartitionKey key = RelationGetPartitionKey(parent);

	switch (key->strategy)
		case PARTITION_STRATEGY_LIST:
			Assert(spec->strategy == PARTITION_STRATEGY_LIST);
			my_qual = get_qual_for_list(key, spec);

		case PARTITION_STRATEGY_RANGE:
			Assert(spec->strategy == PARTITION_STRATEGY_RANGE);
			my_qual = get_qual_for_range(key, spec);

			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
/*
 * map_partition_varattnos - maps varattno of any Vars in expr from the
 * parent attno to partition attno.
 *
 * We must allow for cases where physical attnos of a partition can be
 * different from the parent's.
 *
 * Note: this will work on any node tree, so really the argument and result
 * should be declared "Node *".  But a substantial majority of the callers
 * are working on Lists, so it's less messy to do the casts internally.
 */
map_partition_varattnos(List *expr, int target_varno,
						Relation partrel, Relation parent)
	AttrNumber *part_attnos;
	bool		found_whole_row;

	part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel),
											 RelationGetDescr(parent),
											 gettext_noop("could not convert row type"));
	expr = (List *) map_variable_attnos((Node *) expr,
										RelationGetDescr(parent)->natts,
	/* There can never be a whole-row reference here */
		elog(ERROR, "unexpected whole-row reference found in partition key");
/*
 * RelationGetPartitionQual
 *
 * Returns a list of partition quals; non-partitions get no quals
 * (only relations marked relispartition proceed to the generator).
 */
RelationGetPartitionQual(Relation rel)
	if (!rel->rd_rel->relispartition)
	return generate_partition_qual(rel);
/*
 * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
 * append pointer rel to the list 'parents'.
 *
 * NOTE(review): this macro references a loop variable 'i' from the caller's
 * scope — callers must not be using 'i' for anything else at the call site.
 */
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
	for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
		(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
		(parents) = lappend((parents), (rel));\
/*
 * RelationGetPartitionDispatchInfo
 *		Returns information necessary to route tuples down a partition tree
 *
 * All the partitions will be locked with lockmode, unless it is NoLock.
 * A list of the OIDs of all the leaf partitions of rel is returned in
 * *leaf_part_oids; *num_parted receives the count of partitioned tables.
 */
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
								 int *num_parted, List **leaf_part_oids)
	PartitionDispatchData **pd;
	List	   *all_parts = NIL,
			   *parted_rel_parents;

	/*
	 * Lock partitions and make a list of the partitioned ones to prepare
	 * their PartitionDispatch objects below.
	 *
	 * Cannot use find_all_inheritors() here, because then the order of OIDs
	 * in the parted_rels list would be unknown, which does not help, because
	 * we assign indexes within individual PartitionDispatch in an order that
	 * is predetermined (determined by the order of OIDs in individual
	 * partition descriptors).
	 */
	parted_rels = list_make1(rel);
	/* Root partitioned table has no parent, so NULL for parent */
	parted_rel_parents = list_make1(NULL);
	APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
	forboth(lc1, all_parts, lc2, all_parents)
		Relation	partrel = heap_open(lfirst_oid(lc1), lockmode);
		Relation	parent = lfirst(lc2);
		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);

		/*
		 * If this partition is a partitioned table, add its children to the
		 * end of the list, so that they are processed as well.
		 */
			parted_rels = lappend(parted_rels, partrel);
			parted_rel_parents = lappend(parted_rel_parents, parent);
			APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
			heap_close(partrel, NoLock);

	/*
	 * We keep the partitioned ones open until we're done using the
	 * information being collected here (for example, see
	 * ExecEndModifyTable).
	 */

	/*
	 * We want to create two arrays - one for leaf partitions and another for
	 * partitioned tables (including the root table and internal partitions).
	 * While we only create the latter here, leaf partition array of suitable
	 * objects (such as, ResultRelInfo) is created by the caller using the
	 * list of OIDs we return.  Indexes into these arrays get assigned in a
	 * breadth-first manner, whereby partitions of any given level are placed
	 * consecutively in the respective arrays.
	 */
	pd = (PartitionDispatchData **) palloc(*num_parted *
										   sizeof(PartitionDispatchData *));
	*leaf_part_oids = NIL;
	forboth(lc1, parted_rels, lc2, parted_rel_parents)
		Relation	partrel = lfirst(lc1);
		Relation	parent = lfirst(lc2);
		PartitionKey partkey = RelationGetPartitionKey(partrel);
		TupleDesc	tupdesc = RelationGetDescr(partrel);
		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);

		pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
		pd[i]->reldesc = partrel;
		pd[i]->key = partkey;
		pd[i]->keystate = NIL;
		pd[i]->partdesc = partdesc;

		/*
		 * For every partitioned table other than the root, we must store a
		 * tuple table slot initialized with its tuple descriptor and a
		 * tuple conversion map to convert a tuple from its parent's
		 * rowtype to its own.  That is to make sure that we are looking at
		 * the correct row using the correct tuple descriptor when
		 * computing its partition key for tuple routing.
		 */
			pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
			pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
												   gettext_noop("could not convert row type"));

			/* Not required for the root partitioned table */
			pd[i]->tupslot = NULL;
			pd[i]->tupmap = NULL;
		pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));

		/*
		 * Indexes corresponding to the internal partitions are multiplied by
		 * -1 to distinguish them from those of leaf partitions.  Encountering
		 * an index >= 0 means we found a leaf partition, which is immediately
		 * returned as the partition we are looking for.  A negative index
		 * means we found a partitioned table, whose PartitionDispatch object
		 * is located at the above index multiplied back by -1.  Using the
		 * PartitionDispatch object, search is continued further down the
		 */
		for (j = 0; j < partdesc->nparts; j++)
			Oid			partrelid = partdesc->oids[j];

			if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
				*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
				pd[i]->indexes[j] = k++;

				/*
				 * offset denotes the number of partitioned tables of upper
				 * levels including those of the current level.  Any partition
				 * of this table must belong to the next level and hence will
				 * be placed after the last partitioned table of this level.
				 */
				pd[i]->indexes[j] = -(1 + offset + m);

		/*
		 * This counts the number of partitioned tables at upper levels
		 * including those of the current level.
		 */
1146 /* Module-local functions */
/*
 * get_qual_for_list
 *
 * Returns a list of expressions to use as a list partition's constraint:
 * essentially "keyCol = ANY (listdatums)", combined with IS [NOT] NULL
 * tests depending on whether the partition accepts NULL.
 */
get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec)
	ScalarArrayOpExpr *opexpr;
	list_has_null = false;
	NullTest   *nulltest1 = NULL,

	/* Left operand is either a simple Var or arbitrary expression */
	if (key->partattrs[0] != 0)
		keyCol = (Node *) makeVar(1,
								  key->parttypcoll[0],
		keyCol = (Node *) copyObject(linitial(key->partexprs));

	/*
	 * We must remove any NULL value in the list; we handle it separately
	 * with an IS NULL test below.
	 */
	for (cell = list_head(spec->listdatums); cell; cell = next)
		Const	   *val = (Const *) lfirst(cell);

		if (val->constisnull)
			list_has_null = true;
			spec->listdatums = list_delete_cell(spec->listdatums,

		/*
		 * Gin up a col IS NOT NULL test that will be AND'd with other
		 * expressions (partition does not accept NULL).
		 */
		nulltest1 = makeNode(NullTest);
		nulltest1->arg = (Expr *) keyCol;
		nulltest1->nulltesttype = IS_NOT_NULL;
		nulltest1->argisrow = false;
		nulltest1->location = -1;

		/*
		 * Gin up a col IS NULL test that will be OR'd with other expressions
		 */
		nulltest2 = makeNode(NullTest);
		nulltest2->arg = (Expr *) keyCol;
		nulltest2->nulltesttype = IS_NULL;
		nulltest2->argisrow = false;
		nulltest2->location = -1;

	/* Right operand is an ArrayExpr containing this partition's values */
	arr = makeNode(ArrayExpr);
	arr->array_typeid = !type_is_array(key->parttypid[0])
		? get_array_type(key->parttypid[0])
		: key->parttypid[0];
	arr->array_collid = key->parttypcoll[0];
	arr->element_typeid = key->parttypid[0];
	arr->elements = spec->listdatums;
	arr->multidims = false;

	/* Get the correct btree equality operator */
	operoid = get_partition_operator(key, 0, BTEqualStrategyNumber,
	if (need_relabel || key->partcollation[0] != key->parttypcoll[0])
		keyCol = (Node *) makeRelabelType((Expr *) keyCol,
										  key->partopcintype[0],
										  key->partcollation[0],
										  COERCE_EXPLICIT_CAST);

	/* Build leftop = ANY (rightop) */
	opexpr = makeNode(ScalarArrayOpExpr);
	opexpr->opno = operoid;
	opexpr->opfuncid = get_opcode(operoid);
	opexpr->useOr = true;
	opexpr->inputcollid = key->partcollation[0];
	opexpr->args = list_make2(keyCol, arr);
	opexpr->location = -1;

	result = list_make2(nulltest1, opexpr);
		or = makeBoolExpr(OR_EXPR, list_make2(nulltest2, opexpr), -1);
		result = list_make1(or);
	result = list_make1(opexpr);
1271 * get_qual_for_range
1273 * Get a list of OpExpr's to use as a range partition's constraint.
1276 get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec)
1285 * Iterate over columns of the key, emitting an OpExpr for each using the
1286 * corresponding lower and upper datums as constant operands.
1289 partexprs_item = list_head(key->partexprs);
1290 forboth(cell1, spec->lowerdatums, cell2, spec->upperdatums)
1292 PartitionRangeDatum *ldatum = lfirst(cell1),
1293 *udatum = lfirst(cell2);
1295 Const *lower_val = NULL,
1298 MemoryContext oldcxt;
1300 ExprState *test_exprstate;
1303 bool need_relabel = false;
/* Simple column key: reference it with a Var (varno 1 = this relation) */
1308 if (key->partattrs[i] != 0)
1310 keyCol = (Node *) makeVar(1,
1314 key->parttypcoll[i],
/* Expression key: use a copy of the stored key expression */
1319 keyCol = (Node *) copyObject(lfirst(partexprs_item));
1320 partexprs_item = lnext(partexprs_item);
1324 * Emit an IS NOT NULL expression for non-Var keys, because whereas
1325 * simple attributes are covered by NOT NULL constraints, expression
1326 * keys are still nullable which is not acceptable in case of range
1329 if (!IsA(keyCol, Var))
1331 nulltest = makeNode(NullTest);
1332 nulltest->arg = (Expr *) keyCol;
1333 nulltest->nulltesttype = IS_NOT_NULL;
1334 nulltest->argisrow = false;
1335 nulltest->location = -1;
1336 result = lappend(result, nulltest);
1340 * Stop at this column if either of lower or upper datum is infinite,
1341 * but do emit an OpExpr for the non-infinite datum.
1343 if (!ldatum->infinite)
1344 lower_val = (Const *) ldatum->value;
1345 if (!udatum->infinite)
1346 upper_val = (Const *) udatum->value;
1349 * If lower_val and upper_val are both finite and happen to be equal,
1350 * emit only (keyCol = lower_val) for this column, because all rows in
1351 * this partition could only ever contain this value (ie, lower_val)
1352 * in the current partitioning column. We must consider further
1353 * columns because the above condition does not fully constrain the
1354 * rows of this partition.
1356 if (lower_val && upper_val)
1358 /* Get the correct btree equality operator for the test */
1359 operoid = get_partition_operator(key, i, BTEqualStrategyNumber,
/*
 * Evaluate (lower_val = upper_val) right now, using a throwaway
 * executor state; the result decides which expression shape to emit.
 */
1362 /* Create the test expression */
1363 estate = CreateExecutorState();
1364 oldcxt = MemoryContextSwitchTo(estate->es_query_cxt);
1365 test_expr = make_opclause(operoid,
1371 key->partcollation[i]);
1372 fix_opfuncids((Node *) test_expr);
1373 test_exprstate = ExecInitExpr(test_expr, NULL);
1374 test_result = ExecEvalExprSwitchContext(test_exprstate,
1375 GetPerTupleExprContext(estate),
1377 MemoryContextSwitchTo(oldcxt);
1378 FreeExecutorState(estate);
1380 if (DatumGetBool(test_result))
1382 /* This can never be, but it's better to make sure */
1383 if (i == key->partnatts - 1)
1384 elog(ERROR, "invalid range bound specification");
/* Relabel the key column if the operator works on the opclass input type */
1386 if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
1387 keyCol = (Node *) makeRelabelType((Expr *) keyCol,
1388 key->partopcintype[i],
1390 key->partcollation[i],
1391 COERCE_EXPLICIT_CAST);
1392 result = lappend(result,
1393 make_opclause(operoid,
1399 key->partcollation[i]));
1401 /* Go over to consider the next column. */
1408 * We can say here that lower_val != upper_val. Emit expressions
1409 * (keyCol >= lower_val) and (keyCol < upper_val), then stop.
1413 operoid = get_partition_operator(key, i,
1414 BTGreaterEqualStrategyNumber,
1417 if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
1418 keyCol = (Node *) makeRelabelType((Expr *) keyCol,
1419 key->partopcintype[i],
1421 key->partcollation[i],
1422 COERCE_EXPLICIT_CAST);
1423 result = lappend(result,
1424 make_opclause(operoid,
1430 key->partcollation[i]));
/* Likewise emit (keyCol < upper_val) with the btree less-than operator */
1435 operoid = get_partition_operator(key, i,
1436 BTLessStrategyNumber,
1439 if (need_relabel || key->partcollation[i] != key->parttypcoll[i])
1440 keyCol = (Node *) makeRelabelType((Expr *) keyCol,
1441 key->partopcintype[i],
1443 key->partcollation[i],
1444 COERCE_EXPLICIT_CAST);
1446 result = lappend(result,
1447 make_opclause(operoid,
1453 key->partcollation[i]));
1457 * We can stop at this column, because we would not have checked the
1458 * next column when routing a given row into this partition.
1467 * get_partition_operator
1469 * Return oid of the operator of given strategy for a given partition key
 * column.  On return, *need_relabel tells the caller whether the key
 * expression must be wrapped in a RelabelType node (see below).
1473 get_partition_operator(PartitionKey key, int col, StrategyNumber strategy,
1479 * First check if there exists an operator of the given strategy, with
1480 * this column's type as both its lefttype and righttype, in the
1481 * partitioning operator family specified for the column.
1483 operoid = get_opfamily_member(key->partopfamily[col],
1484 key->parttypid[col],
1485 key->parttypid[col],
1489 * If one doesn't exist, we must resort to using an operator in the same
1490 * operator family but with the operator class declared input type. It is
1491 * OK to do so, because the column's type is known to be binary-coercible
1492 * with the operator class input type (otherwise, the operator class in
1493 * question would not have been accepted as the partitioning operator
1494 * class). We must however inform the caller to wrap the non-Const
1495 * expression with a RelabelType node to denote the implicit coercion. It
1496 * ensures that the resulting expression structurally matches similarly
1497 * processed expressions within the optimizer.
1499 if (!OidIsValid(operoid))
1501 operoid = get_opfamily_member(key->partopfamily[col],
1502 key->partopcintype[col],
1503 key->partopcintype[col],
1505 *need_relabel = true;
1508 *need_relabel = false;
1510 if (!OidIsValid(operoid))
1511 elog(ERROR, "could not find operator for partitioning")
1517 * generate_partition_qual
1519 * Generate partition predicate from rel's partition bound expression
1521 * Result expression tree is stored in CacheMemoryContext to ensure it survives
1522 * as long as the relcache entry. But we should be running in a less long-lived
1523 * working context. To avoid leaking cache memory if this routine fails partway
1524 * through, we build in working memory and then copy the completed structure
1525 * into cache memory.
1528 generate_partition_qual(Relation rel)
1531 MemoryContext oldcxt;
1535 List *my_qual = NIL,
1539 /* Guard against stack overflow due to overly deep partition tree */
1540 check_stack_depth();
/* If we already have a cached copy in the relcache, return a copy of that */
1543 if (rel->rd_partcheck != NIL)
1544 return copyObject(rel->rd_partcheck);
1546 /* Grab at least an AccessShareLock on the parent table */
1547 parent = heap_open(get_partition_parent(RelationGetRelid(rel)),
1550 /* Get pg_class.relpartbound */
1551 tuple = SearchSysCache1(RELOID, RelationGetRelid(rel));
1552 if (!HeapTupleIsValid(tuple))
1553 elog(ERROR, "cache lookup failed for relation %u",
1554 RelationGetRelid(rel));
1556 boundDatum = SysCacheGetAttr(RELOID, tuple,
1557 Anum_pg_class_relpartbound,
1559 if (isnull) /* should not happen */
1560 elog(ERROR, "relation \"%s\" has relpartbound = null",
1561 RelationGetRelationName(rel));
1562 bound = stringToNode(TextDatumGetCString(boundDatum));
1563 ReleaseSysCache(tuple);
1565 my_qual = get_qual_from_partbound(rel, parent, bound);
1567 /* Add the parent's quals to the list (if any) */
1568 if (parent->rd_rel->relispartition)
1569 result = list_concat(generate_partition_qual(parent), my_qual);
1574 * Change Vars to have partition's attnos instead of the parent's. We do
1575 * this after we concatenate the parent's quals, because we want every Var
1576 * in it to bear this relation's attnos. It's safe to assume varno = 1
1579 result = map_partition_varattnos(result, 1, rel, parent);
1581 /* Save a copy in the relcache */
1582 oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
1583 rel->rd_partcheck = copyObject(result);
1584 MemoryContextSwitchTo(oldcxt);
1586 /* Keep the parent locked until commit */
1587 heap_close(parent, NoLock);
1593 * FormPartitionKeyDatum
1594 * Construct values[] and isnull[] arrays for the partition key
1597 * pd Partition dispatch object of the partitioned table
1598 * slot Heap tuple from which to extract partition key
1599 * estate executor state for evaluating any partition key
1600 * expressions (must be non-NULL)
1601 * values Array of partition key Datums (output area)
1602 * isnull Array of is-null indicators (output area)
1604 * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1605 * the heap tuple passed in.
1609 FormPartitionKeyDatum(PartitionDispatch pd,
1610 TupleTableSlot *slot,
1615 ListCell *partexpr_item;
/* Lazily build the per-dispatch ExprState list for expression key columns */
1618 if (pd->key->partexprs != NIL && pd->keystate == NIL)
1620 /* Check caller has set up context correctly */
1621 Assert(estate != NULL &&
1622 GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1624 /* First time through, set up expression evaluation state */
1625 pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1628 partexpr_item = list_head(pd->keystate);
1629 for (i = 0; i < pd->key->partnatts; i++)
1631 AttrNumber keycol = pd->key->partattrs[i];
1637 /* Plain column; get the value directly from the heap tuple */
1638 datum = slot_getattr(slot, keycol, &isNull);
1642 /* Expression; need to evaluate it */
1643 if (partexpr_item == NULL)
1644 elog(ERROR, "wrong number of partition key expressions");
1645 datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1646 GetPerTupleExprContext(estate),
1648 partexpr_item = lnext(partexpr_item);
/* Cross-check: all prepared expression states must have been consumed */
1654 if (partexpr_item != NULL)
1655 elog(ERROR, "wrong number of partition key expressions");
1659 * get_partition_for_tuple
1660 * Finds a leaf partition for tuple contained in *slot
1662 * Returned value is the sequence number of the leaf partition thus found,
1663 * or -1 if no leaf partition is found for the tuple. *failed_at is set
1664 * to the OID of the partitioned table whose partition was not found in
1668 get_partition_for_tuple(PartitionDispatch *pd,
1669 TupleTableSlot *slot,
1671 PartitionDispatchData **failed_at,
1672 TupleTableSlot **failed_slot)
1674 PartitionDispatch parent;
1675 Datum values[PARTITION_MAX_KEYS];
1676 bool isnull[PARTITION_MAX_KEYS];
1681 ExprContext *ecxt = GetPerTupleExprContext(estate);
1682 TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
1684 /* start with the root partitioned table */
1688 PartitionKey key = parent->key;
1689 PartitionDesc partdesc = parent->partdesc;
1690 TupleTableSlot *myslot = parent->tupslot;
1691 TupleConversionMap *map = parent->tupmap;
/* Convert the tuple into this table's rowtype, if a conversion map exists */
1693 if (myslot != NULL && map != NULL)
1695 HeapTuple tuple = ExecFetchSlotTuple(slot);
1697 ExecClearTuple(myslot);
1698 tuple = do_convert_tuple(tuple, map);
1699 ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
/* This parent has no partitions at all: routing fails here */
1704 if (partdesc->nparts == 0)
1706 *failed_at = parent;
1707 *failed_slot = slot;
1712 * Extract partition key from tuple. Expression evaluation machinery
1713 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
1714 * point to the correct tuple slot. The slot might have changed from
1715 * what was used for the parent table if the table of the current
1716 * partitioning level has different tuple descriptor from the parent.
1717 * So update ecxt_scantuple accordingly.
1719 ecxt->ecxt_scantuple = slot;
1720 FormPartitionKeyDatum(parent, slot, estate, values, isnull);
1722 if (key->strategy == PARTITION_STRATEGY_RANGE)
1724 /* Disallow nulls in the range partition key of the tuple */
1725 for (i = 0; i < key->partnatts; i++)
1728 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
1729 errmsg("range partition key of row contains null")));
1732 if (partdesc->boundinfo->has_null && isnull[0])
1733 /* Tuple maps to the null-accepting list partition */
1734 cur_index = partdesc->boundinfo->null_index;
1737 /* Else bsearch in partdesc->boundinfo */
1740 cur_offset = partition_bound_bsearch(key, partdesc->boundinfo,
1741 values, false, &equal);
1742 switch (key->strategy)
1744 case PARTITION_STRATEGY_LIST:
/* List bounds match only on exact equality */
1745 if (cur_offset >= 0 && equal)
1746 cur_index = partdesc->boundinfo->indexes[cur_offset];
1751 case PARTITION_STRATEGY_RANGE:
1754 * Offset returned is such that the bound at offset is
1755 * found to be less or equal with the tuple. So, the bound
1756 * at offset+1 would be the upper bound.
1758 cur_index = partdesc->boundinfo->indexes[cur_offset + 1];
1762 elog(ERROR, "unexpected partition strategy: %d",
1763 (int) key->strategy);
1768 * cur_index < 0 means we failed to find a partition of this parent.
1769 * cur_index >= 0 means we either found the leaf partition, or the
1770 * next parent to find a partition of.
1775 *failed_at = parent;
1776 *failed_slot = slot;
1779 else if (parent->indexes[cur_index] >= 0)
1781 result = parent->indexes[cur_index];
/* Negative index encodes the next level's PartitionDispatch to descend to */
1785 parent = pd[-parent->indexes[cur_index]];
/* Restore the caller's scan-tuple slot before returning */
1788 ecxt->ecxt_scantuple = ecxt_scantuple_old;
1793 * qsort_partition_list_value_cmp
1795 * Compare two list partition bound datums, using the first key column's
 * comparison support function and collation (list partitioning uses a
 * single key column).  qsort_arg-style comparator; arg is the PartitionKey.
1798 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg)
1800 Datum val1 = (*(const PartitionListValue **) a)->value,
1801 val2 = (*(const PartitionListValue **) b)->value;
1802 PartitionKey key = (PartitionKey) arg;
1804 return DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
1805 key->partcollation[0],
1810 * make_one_range_bound
1812 * Return a PartitionRangeBound given a list of PartitionRangeDatum elements
1813 * and a flag telling whether the bound is lower or not. Made into a function
1814 * because there are multiple sites that want to use this facility.
 *
 * The result (and its datums/content arrays) is palloc'd in the current
 * memory context.
1816 static PartitionRangeBound *
1817 make_one_range_bound(PartitionKey key, int index, List *datums, bool lower)
1819 PartitionRangeBound *bound;
1823 bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound));
1824 bound->index = index;
1825 bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum));
1826 bound->content = (RangeDatumContent *) palloc0(key->partnatts *
1827 sizeof(RangeDatumContent));
1828 bound->lower = lower;
1831 foreach(cell, datums)
1833 PartitionRangeDatum *datum = lfirst(cell);
1835 /* What's contained in this range datum? */
1836 bound->content[i] = !datum->infinite
1837 ? RANGE_DATUM_FINITE
1838 : (lower ? RANGE_DATUM_NEG_INF
1839 : RANGE_DATUM_POS_INF);
1841 if (bound->content[i] == RANGE_DATUM_FINITE)
1843 Const *val = (Const *) datum->value;
/* A finite range bound datum must be a non-null Const */
1845 if (val->constisnull)
1846 elog(ERROR, "invalid range bound datum");
1847 bound->datums[i] = val->constvalue;
1856 /* Used when sorting range bounds across all range partitions */
/* qsort_arg-style comparator; arg is the PartitionKey */
1858 qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
1860 PartitionRangeBound *b1 = (*(PartitionRangeBound *const *) a);
1861 PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b);
1862 PartitionKey key = (PartitionKey) arg;
1864 return partition_rbound_cmp(key, b1->datums, b1->content, b1->lower, b2);
1868 * partition_rbound_cmp
1870 * Return a negative value, zero, or a positive value according as the first
1871 * bound (specified in datums1, content1, and lower1) is less than, equal to,
 * or greater than the bound specified in *b2.
1874 partition_rbound_cmp(PartitionKey key,
1875 Datum *datums1, RangeDatumContent *content1, bool lower1,
1876 PartitionRangeBound *b2)
1878 int32 cmpval = 0; /* placate compiler */
1880 Datum *datums2 = b2->datums;
1881 RangeDatumContent *content2 = b2->content;
1882 bool lower2 = b2->lower;
1884 for (i = 0; i < key->partnatts; i++)
1887 * First, handle cases involving infinity, which don't require
1888 * invoking the comparison proc.
1890 if (content1[i] != RANGE_DATUM_FINITE &&
1891 content2[i] != RANGE_DATUM_FINITE)
1894 * Both are infinity, so they are equal unless one is negative
1895 * infinity and other positive (or vice versa)
1897 return content1[i] == content2[i] ? 0
1898 : (content1[i] < content2[i] ? -1 : 1);
1899 else if (content1[i] != RANGE_DATUM_FINITE)
1900 return content1[i] == RANGE_DATUM_NEG_INF ? -1 : 1;
1901 else if (content2[i] != RANGE_DATUM_FINITE)
1902 return content2[i] == RANGE_DATUM_NEG_INF ? 1 : -1;
/* Both finite: compare with this column's btree support function */
1904 cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i],
1905 key->partcollation[i],
1913 * If the comparison is anything other than equal, we're done. If they
1914 * compare equal though, we still have to consider whether the boundaries
1915 * are inclusive or exclusive. Exclusive one is considered smaller of the
1918 if (cmpval == 0 && lower1 != lower2)
1919 cmpval = lower1 ? 1 : -1;
1925 * partition_rbound_datum_cmp
1927 * Return a negative value, zero, or a positive value according as the range
1928 * bound (specified in rb_datums and rb_content) is less than, equal to, or
 * greater than the partition key of a tuple (tuple_datums).
1931 partition_rbound_datum_cmp(PartitionKey key,
1932 Datum *rb_datums, RangeDatumContent *rb_content,
1933 Datum *tuple_datums)
1938 for (i = 0; i < key->partnatts; i++)
/* An infinite bound column decides the comparison immediately */
1940 if (rb_content[i] != RANGE_DATUM_FINITE)
1941 return rb_content[i] == RANGE_DATUM_NEG_INF ? -1 : 1;
1943 cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i],
1944 key->partcollation[i],
1955 * partition_bound_cmp
1957 * Return a negative value, zero, or a positive value according as the bound
1958 * at offset in boundinfo is less than, equal to, or greater than the argument
 * specified in *probe.  probe_is_bound tells whether *probe is a
 * PartitionRangeBound or a Datum array of tuple key values (range case).
1961 partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo,
1962 int offset, void *probe, bool probe_is_bound)
1964 Datum *bound_datums = boundinfo->datums[offset];
1967 switch (key->strategy)
1969 case PARTITION_STRATEGY_LIST:
1970 cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
1971 key->partcollation[0],
1976 case PARTITION_STRATEGY_RANGE:
1978 RangeDatumContent *content = boundinfo->content[offset];
1983 * We need to pass whether the existing bound is a lower
1984 * bound, so that two equal-valued lower and upper bounds
1985 * are not regarded equal.
1987 bool lower = boundinfo->indexes[offset] < 0;
1989 cmpval = partition_rbound_cmp(key,
1990 bound_datums, content, lower,
1991 (PartitionRangeBound *) probe);
/* probe is a Datum array holding a tuple's partition key values */
1994 cmpval = partition_rbound_datum_cmp(key,
1995 bound_datums, content,
2001 elog(ERROR, "unexpected partition strategy: %d",
2002 (int) key->strategy);
2009 * Binary search on a collection of partition bounds. Returns greatest
2010 * bound in array boundinfo->datums which is less than or equal to *probe
2011 * If all bounds in the array are greater than *probe, -1 is returned.
2013 * *probe could either be a partition bound or a Datum array representing
2014 * the partition key of a tuple being routed; probe_is_bound tells which.
2015 * We pass that down to the comparison function so that it can interpret the
2016 * contents of *probe accordingly.
2018 * *is_equal is set to whether the bound at the returned index is equal with
2022 partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo,
2023 void *probe, bool probe_is_bound, bool *is_equal)
2030 hi = boundinfo->ndatums - 1;
2035 mid = (lo + hi + 1) / 2;
2036 cmpval = partition_bound_cmp(key, boundinfo, mid, probe,
2041 *is_equal = (cmpval == 0);