]> granicus.if.org Git - postgresql/blob - src/backend/executor/execPartition.c
59a0ca4597c2f4631750c9bd54015c8820c10639
[postgresql] / src / backend / executor / execPartition.c
1 /*-------------------------------------------------------------------------
2  *
3  * execPartition.c
4  *        Support routines for partitioning.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *        src/backend/executor/execPartition.c
11  *
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "catalog/pg_inherits_fn.h"
18 #include "executor/execPartition.h"
19 #include "executor/executor.h"
20 #include "mb/pg_wchar.h"
21 #include "miscadmin.h"
22 #include "utils/lsyscache.h"
23 #include "utils/rls.h"
24 #include "utils/ruleutils.h"
25
26 static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel,
27                                                                  int *num_parted, List **leaf_part_oids);
28 static void get_partition_dispatch_recurse(Relation rel, Relation parent,
29                                                            List **pds, List **leaf_part_oids);
30 static void FormPartitionKeyDatum(PartitionDispatch pd,
31                                           TupleTableSlot *slot,
32                                           EState *estate,
33                                           Datum *values,
34                                           bool *isnull);
35 static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
36                                                                          Datum *values,
37                                                                          bool *isnull,
38                                                                          int maxfieldlen);
39
40 /*
41  * ExecSetupPartitionTupleRouting - set up information needed during
42  * tuple routing for partitioned tables
43  *
44  * Output arguments:
45  * 'pd' receives an array of PartitionDispatch objects with one entry for
46  *              every partitioned table in the partition tree
47  * 'partitions' receives an array of ResultRelInfo* objects with one entry for
48  *              every leaf partition in the partition tree
49  * 'tup_conv_maps' receives an array of TupleConversionMap objects with one
50  *              entry for every leaf partition (required to convert input tuple based
51  *              on the root table's rowtype to a leaf partition's rowtype after tuple
52  *              routing is done)
53  * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used
54  *              to manipulate any given leaf partition's rowtype after that partition
55  *              is chosen by tuple-routing.
56  * 'num_parted' receives the number of partitioned tables in the partition
57  *              tree (= the number of entries in the 'pd' output array)
58  * 'num_partitions' receives the number of leaf partitions in the partition
59  *              tree (= the number of entries in the 'partitions' and 'tup_conv_maps'
60  *              output arrays
61  *
62  * Note that all the relations in the partition tree are locked using the
63  * RowExclusiveLock mode upon return from this function.
64  */
65 void
66 ExecSetupPartitionTupleRouting(Relation rel,
67                                                            Index resultRTindex,
68                                                            EState *estate,
69                                                            PartitionDispatch **pd,
70                                                            ResultRelInfo ***partitions,
71                                                            TupleConversionMap ***tup_conv_maps,
72                                                            TupleTableSlot **partition_tuple_slot,
73                                                            int *num_parted, int *num_partitions)
74 {
75         TupleDesc       tupDesc = RelationGetDescr(rel);
76         List       *leaf_parts;
77         ListCell   *cell;
78         int                     i;
79         ResultRelInfo *leaf_part_rri;
80
81         /*
82          * Get the information about the partition tree after locking all the
83          * partitions.
84          */
85         (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL);
86         *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts);
87         *num_partitions = list_length(leaf_parts);
88         *partitions = (ResultRelInfo **) palloc(*num_partitions *
89                                                                                         sizeof(ResultRelInfo *));
90         *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions *
91                                                                                                          sizeof(TupleConversionMap *));
92
93         /*
94          * Initialize an empty slot that will be used to manipulate tuples of any
95          * given partition's rowtype.  It is attached to the caller-specified node
96          * (such as ModifyTableState) and released when the node finishes
97          * processing.
98          */
99         *partition_tuple_slot = MakeTupleTableSlot();
100
101         leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions *
102                                                                                           sizeof(ResultRelInfo));
103         i = 0;
104         foreach(cell, leaf_parts)
105         {
106                 Relation        partrel;
107                 TupleDesc       part_tupdesc;
108
109                 /*
110                  * We locked all the partitions above including the leaf partitions.
111                  * Note that each of the relations in *partitions are eventually
112                  * closed by the caller.
113                  */
114                 partrel = heap_open(lfirst_oid(cell), NoLock);
115                 part_tupdesc = RelationGetDescr(partrel);
116
117                 /*
118                  * Save a tuple conversion map to convert a tuple routed to this
119                  * partition from the parent's type to the partition's.
120                  */
121                 (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc,
122                                                                                                          gettext_noop("could not convert row type"));
123
124                 InitResultRelInfo(leaf_part_rri,
125                                                   partrel,
126                                                   resultRTindex,
127                                                   rel,
128                                                   estate->es_instrument);
129
130                 /*
131                  * Verify result relation is a valid target for INSERT.
132                  */
133                 CheckValidResultRel(leaf_part_rri, CMD_INSERT);
134
135                 /*
136                  * Open partition indices (remember we do not support ON CONFLICT in
137                  * case of partitioned tables, so we do not need support information
138                  * for speculative insertion)
139                  */
140                 if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex &&
141                         leaf_part_rri->ri_IndexRelationDescs == NULL)
142                         ExecOpenIndices(leaf_part_rri, false);
143
144                 estate->es_leaf_result_relations =
145                         lappend(estate->es_leaf_result_relations, leaf_part_rri);
146
147                 (*partitions)[i] = leaf_part_rri++;
148                 i++;
149         }
150 }
151
152 /*
153  * ExecFindPartition -- Find a leaf partition in the partition tree rooted
154  * at parent, for the heap tuple contained in *slot
155  *
156  * estate must be non-NULL; we'll need it to compute any expressions in the
157  * partition key(s)
158  *
159  * If no leaf partition is found, this routine errors out with the appropriate
160  * error message, else it returns the leaf partition sequence number
161  * as an index into the array of (ResultRelInfos of) all leaf partitions in
162  * the partition tree.
163  */
164 int
165 ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
166                                   TupleTableSlot *slot, EState *estate)
167 {
168         int                     result;
169         Datum           values[PARTITION_MAX_KEYS];
170         bool            isnull[PARTITION_MAX_KEYS];
171         Relation        rel;
172         PartitionDispatch parent;
173         ExprContext *ecxt = GetPerTupleExprContext(estate);
174         TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
175
176         /*
177          * First check the root table's partition constraint, if any.  No point in
178          * routing the tuple if it doesn't belong in the root table itself.
179          */
180         if (resultRelInfo->ri_PartitionCheck)
181                 ExecPartitionCheck(resultRelInfo, slot, estate);
182
183         /* start with the root partitioned table */
184         parent = pd[0];
185         while (true)
186         {
187                 PartitionDesc partdesc;
188                 TupleTableSlot *myslot = parent->tupslot;
189                 TupleConversionMap *map = parent->tupmap;
190                 int                     cur_index = -1;
191
192                 rel = parent->reldesc;
193                 partdesc = RelationGetPartitionDesc(rel);
194
195                 /*
196                  * Convert the tuple to this parent's layout so that we can do certain
197                  * things we do below.
198                  */
199                 if (myslot != NULL && map != NULL)
200                 {
201                         HeapTuple       tuple = ExecFetchSlotTuple(slot);
202
203                         ExecClearTuple(myslot);
204                         tuple = do_convert_tuple(tuple, map);
205                         ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
206                         slot = myslot;
207                 }
208
209                 /* Quick exit */
210                 if (partdesc->nparts == 0)
211                 {
212                         result = -1;
213                         break;
214                 }
215
216                 /*
217                  * Extract partition key from tuple. Expression evaluation machinery
218                  * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
219                  * point to the correct tuple slot.  The slot might have changed from
220                  * what was used for the parent table if the table of the current
221                  * partitioning level has different tuple descriptor from the parent.
222                  * So update ecxt_scantuple accordingly.
223                  */
224                 ecxt->ecxt_scantuple = slot;
225                 FormPartitionKeyDatum(parent, slot, estate, values, isnull);
226                 cur_index = get_partition_for_tuple(rel, values, isnull);
227
228                 /*
229                  * cur_index < 0 means we failed to find a partition of this parent.
230                  * cur_index >= 0 means we either found the leaf partition, or the
231                  * next parent to find a partition of.
232                  */
233                 if (cur_index < 0)
234                 {
235                         result = -1;
236                         break;
237                 }
238                 else if (parent->indexes[cur_index] >= 0)
239                 {
240                         result = parent->indexes[cur_index];
241                         break;
242                 }
243                 else
244                         parent = pd[-parent->indexes[cur_index]];
245         }
246
247         /* A partition was not found. */
248         if (result < 0)
249         {
250                 char       *val_desc;
251
252                 val_desc = ExecBuildSlotPartitionKeyDescription(rel,
253                                                                                                                 values, isnull, 64);
254                 Assert(OidIsValid(RelationGetRelid(rel)));
255                 ereport(ERROR,
256                                 (errcode(ERRCODE_CHECK_VIOLATION),
257                                  errmsg("no partition of relation \"%s\" found for row",
258                                                 RelationGetRelationName(rel)),
259                                  val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0));
260         }
261
262         ecxt->ecxt_scantuple = ecxt_scantuple_old;
263         return result;
264 }
265
266 /*
267  * RelationGetPartitionDispatchInfo
268  *              Returns information necessary to route tuples down a partition tree
269  *
270  * The number of elements in the returned array (that is, the number of
271  * PartitionDispatch objects for the partitioned tables in the partition tree)
272  * is returned in *num_parted and a list of the OIDs of all the leaf
273  * partitions of rel is returned in *leaf_part_oids.
274  *
275  * All the relations in the partition tree (including 'rel') must have been
276  * locked (using at least the AccessShareLock) by the caller.
277  */
278 static PartitionDispatch *
279 RelationGetPartitionDispatchInfo(Relation rel,
280                                                                  int *num_parted, List **leaf_part_oids)
281 {
282         List       *pdlist = NIL;
283         PartitionDispatchData **pd;
284         ListCell   *lc;
285         int                     i;
286
287         Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
288
289         *num_parted = 0;
290         *leaf_part_oids = NIL;
291
292         get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids);
293         *num_parted = list_length(pdlist);
294         pd = (PartitionDispatchData **) palloc(*num_parted *
295                                                                                    sizeof(PartitionDispatchData *));
296         i = 0;
297         foreach(lc, pdlist)
298         {
299                 pd[i++] = lfirst(lc);
300         }
301
302         return pd;
303 }
304
305 /*
306  * get_partition_dispatch_recurse
307  *              Recursively expand partition tree rooted at rel
308  *
309  * As the partition tree is expanded in a depth-first manner, we maintain two
310  * global lists: of PartitionDispatch objects corresponding to partitioned
311  * tables in *pds and of the leaf partition OIDs in *leaf_part_oids.
312  *
313  * Note that the order of OIDs of leaf partitions in leaf_part_oids matches
314  * the order in which the planner's expand_partitioned_rtentry() processes
315  * them.  It's not necessarily the case that the offsets match up exactly,
316  * because constraint exclusion might prune away some partitions on the
317  * planner side, whereas we'll always have the complete list; but unpruned
318  * partitions will appear in the same order in the plan as they are returned
319  * here.
320  */
321 static void
322 get_partition_dispatch_recurse(Relation rel, Relation parent,
323                                                            List **pds, List **leaf_part_oids)
324 {
325         TupleDesc       tupdesc = RelationGetDescr(rel);
326         PartitionDesc partdesc = RelationGetPartitionDesc(rel);
327         PartitionKey partkey = RelationGetPartitionKey(rel);
328         PartitionDispatch pd;
329         int                     i;
330
331         check_stack_depth();
332
333         /* Build a PartitionDispatch for this table and add it to *pds. */
334         pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
335         *pds = lappend(*pds, pd);
336         pd->reldesc = rel;
337         pd->key = partkey;
338         pd->keystate = NIL;
339         pd->partdesc = partdesc;
340         if (parent != NULL)
341         {
342                 /*
343                  * For every partitioned table other than the root, we must store a
344                  * tuple table slot initialized with its tuple descriptor and a tuple
345                  * conversion map to convert a tuple from its parent's rowtype to its
346                  * own. That is to make sure that we are looking at the correct row
347                  * using the correct tuple descriptor when computing its partition key
348                  * for tuple routing.
349                  */
350                 pd->tupslot = MakeSingleTupleTableSlot(tupdesc);
351                 pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
352                                                                                         tupdesc,
353                                                                                         gettext_noop("could not convert row type"));
354         }
355         else
356         {
357                 /* Not required for the root partitioned table */
358                 pd->tupslot = NULL;
359                 pd->tupmap = NULL;
360         }
361
362         /*
363          * Go look at each partition of this table.  If it's a leaf partition,
364          * simply add its OID to *leaf_part_oids.  If it's a partitioned table,
365          * recursively call get_partition_dispatch_recurse(), so that its
366          * partitions are processed as well and a corresponding PartitionDispatch
367          * object gets added to *pds.
368          *
369          * About the values in pd->indexes: for a leaf partition, it contains the
370          * leaf partition's position in the global list *leaf_part_oids minus 1,
371          * whereas for a partitioned table partition, it contains the partition's
372          * position in the global list *pds multiplied by -1.  The latter is
373          * multiplied by -1 to distinguish partitioned tables from leaf partitions
374          * when going through the values in pd->indexes.  So, for example, when
375          * using it during tuple-routing, encountering a value >= 0 means we found
376          * a leaf partition.  It is immediately returned as the index in the array
377          * of ResultRelInfos of all the leaf partitions, using which we insert the
378          * tuple into that leaf partition.  A negative value means we found a
379          * partitioned table.  The value multiplied by -1 is returned as the index
380          * in the array of PartitionDispatch objects of all partitioned tables in
381          * the tree.  This value is used to continue the search in the next level
382          * of the partition tree.
383          */
384         pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
385         for (i = 0; i < partdesc->nparts; i++)
386         {
387                 Oid                     partrelid = partdesc->oids[i];
388
389                 if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
390                 {
391                         *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
392                         pd->indexes[i] = list_length(*leaf_part_oids) - 1;
393                 }
394                 else
395                 {
396                         /*
397                          * We assume all tables in the partition tree were already locked
398                          * by the caller.
399                          */
400                         Relation        partrel = heap_open(partrelid, NoLock);
401
402                         pd->indexes[i] = -list_length(*pds);
403                         get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids);
404                 }
405         }
406 }
407
408 /* ----------------
409  *              FormPartitionKeyDatum
410  *                      Construct values[] and isnull[] arrays for the partition key
411  *                      of a tuple.
412  *
413  *      pd                              Partition dispatch object of the partitioned table
414  *      slot                    Heap tuple from which to extract partition key
415  *      estate                  executor state for evaluating any partition key
416  *                                      expressions (must be non-NULL)
417  *      values                  Array of partition key Datums (output area)
418  *      isnull                  Array of is-null indicators (output area)
419  *
420  * the ecxt_scantuple slot of estate's per-tuple expr context must point to
421  * the heap tuple passed in.
422  * ----------------
423  */
424 static void
425 FormPartitionKeyDatum(PartitionDispatch pd,
426                                           TupleTableSlot *slot,
427                                           EState *estate,
428                                           Datum *values,
429                                           bool *isnull)
430 {
431         ListCell   *partexpr_item;
432         int                     i;
433
434         if (pd->key->partexprs != NIL && pd->keystate == NIL)
435         {
436                 /* Check caller has set up context correctly */
437                 Assert(estate != NULL &&
438                            GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
439
440                 /* First time through, set up expression evaluation state */
441                 pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
442         }
443
444         partexpr_item = list_head(pd->keystate);
445         for (i = 0; i < pd->key->partnatts; i++)
446         {
447                 AttrNumber      keycol = pd->key->partattrs[i];
448                 Datum           datum;
449                 bool            isNull;
450
451                 if (keycol != 0)
452                 {
453                         /* Plain column; get the value directly from the heap tuple */
454                         datum = slot_getattr(slot, keycol, &isNull);
455                 }
456                 else
457                 {
458                         /* Expression; need to evaluate it */
459                         if (partexpr_item == NULL)
460                                 elog(ERROR, "wrong number of partition key expressions");
461                         datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
462                                                                                           GetPerTupleExprContext(estate),
463                                                                                           &isNull);
464                         partexpr_item = lnext(partexpr_item);
465                 }
466                 values[i] = datum;
467                 isnull[i] = isNull;
468         }
469
470         if (partexpr_item != NULL)
471                 elog(ERROR, "wrong number of partition key expressions");
472 }
473
474 /*
475  * ExecBuildSlotPartitionKeyDescription
476  *
477  * This works very much like BuildIndexValueDescription() and is currently
478  * used for building error messages when ExecFindPartition() fails to find
479  * partition for a row.
480  */
481 static char *
482 ExecBuildSlotPartitionKeyDescription(Relation rel,
483                                                                          Datum *values,
484                                                                          bool *isnull,
485                                                                          int maxfieldlen)
486 {
487         StringInfoData buf;
488         PartitionKey key = RelationGetPartitionKey(rel);
489         int                     partnatts = get_partition_natts(key);
490         int                     i;
491         Oid                     relid = RelationGetRelid(rel);
492         AclResult       aclresult;
493
494         if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
495                 return NULL;
496
497         /* If the user has table-level access, just go build the description. */
498         aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
499         if (aclresult != ACLCHECK_OK)
500         {
501                 /*
502                  * Step through the columns of the partition key and make sure the
503                  * user has SELECT rights on all of them.
504                  */
505                 for (i = 0; i < partnatts; i++)
506                 {
507                         AttrNumber      attnum = get_partition_col_attnum(key, i);
508
509                         /*
510                          * If this partition key column is an expression, we return no
511                          * detail rather than try to figure out what column(s) the
512                          * expression includes and if the user has SELECT rights on them.
513                          */
514                         if (attnum == InvalidAttrNumber ||
515                                 pg_attribute_aclcheck(relid, attnum, GetUserId(),
516                                                                           ACL_SELECT) != ACLCHECK_OK)
517                                 return NULL;
518                 }
519         }
520
521         initStringInfo(&buf);
522         appendStringInfo(&buf, "(%s) = (",
523                                          pg_get_partkeydef_columns(relid, true));
524
525         for (i = 0; i < partnatts; i++)
526         {
527                 char       *val;
528                 int                     vallen;
529
530                 if (isnull[i])
531                         val = "null";
532                 else
533                 {
534                         Oid                     foutoid;
535                         bool            typisvarlena;
536
537                         getTypeOutputInfo(get_partition_col_typid(key, i),
538                                                           &foutoid, &typisvarlena);
539                         val = OidOutputFunctionCall(foutoid, values[i]);
540                 }
541
542                 if (i > 0)
543                         appendStringInfoString(&buf, ", ");
544
545                 /* truncate if needed */
546                 vallen = strlen(val);
547                 if (vallen <= maxfieldlen)
548                         appendStringInfoString(&buf, val);
549                 else
550                 {
551                         vallen = pg_mbcliplen(val, vallen, maxfieldlen);
552                         appendBinaryStringInfo(&buf, val, vallen);
553                         appendStringInfoString(&buf, "...");
554                 }
555         }
556
557         appendStringInfoChar(&buf, ')');
558
559         return buf.data;
560 }