the system will be able to skip the scan to validate the implicit
partition constraint. Without such a constraint, the table will be
scanned to validate the partition constraint while holding an
- <literal>ACCESS EXCLUSIVE</literal> lock on the parent table.
+ <literal>ACCESS EXCLUSIVE</literal> lock on that partition
+ and a <literal>SHARE UPDATE EXCLUSIVE</literal> lock on the parent table.
One may then drop the constraint after <command>ATTACH PARTITION</command>
is finished, because it is no longer necessary.
</para>
* CopyFrom tuple routing.
*/
if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
- proute = ExecSetupPartitionTupleRouting(NULL, cstate->rel);
+ proute = ExecSetupPartitionTupleRouting(estate, NULL, cstate->rel);
if (cstate->whereClause)
cstate->qualexpr = ExecInitQual(castNode(List, cstate->whereClause),
break;
case AT_AttachPartition:
+ cmd_lockmode = ShareUpdateExclusiveLock;
+ break;
+
case AT_DetachPartition:
cmd_lockmode = AccessExclusiveLock;
break;
PartitionDispatch dispatch,
ResultRelInfo *partRelInfo,
int partidx);
-static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute,
+static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
+ PartitionTupleRouting *proute,
Oid partoid, PartitionDispatch parent_pd, int partidx);
static void FormPartitionKeyDatum(PartitionDispatch pd,
TupleTableSlot *slot,
* it should be estate->es_query_cxt.
*/
PartitionTupleRouting *
-ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
+ExecSetupPartitionTupleRouting(EState *estate, ModifyTableState *mtstate,
+ Relation rel)
{
PartitionTupleRouting *proute;
ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL;
* parent as NULL as we don't need to care about any parent of the target
* partitioned table.
*/
- ExecInitPartitionDispatchInfo(proute, RelationGetRelid(rel), NULL, 0);
+ ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
+ NULL, 0);
/*
* If performing an UPDATE with tuple routing, we can reuse partition
* Create the new PartitionDispatch. We pass the current one
* in as the parent PartitionDispatch
*/
- subdispatch = ExecInitPartitionDispatchInfo(proute,
+ subdispatch = ExecInitPartitionDispatchInfo(mtstate->ps.state,
+ proute,
partdesc->oids[partidx],
dispatch, partidx);
Assert(dispatch->indexes[partidx] >= 0 &&
* PartitionDispatch later.
*/
static PartitionDispatch
-ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid,
+ExecInitPartitionDispatchInfo(EState *estate,
+ PartitionTupleRouting *proute, Oid partoid,
PartitionDispatch parent_pd, int partidx)
{
Relation rel;
int dispatchidx;
MemoryContext oldcxt;
+ if (estate->es_partition_directory == NULL)
+ estate->es_partition_directory =
+ CreatePartitionDirectory(estate->es_query_cxt);
+
oldcxt = MemoryContextSwitchTo(proute->memcxt);
/*
rel = table_open(partoid, RowExclusiveLock);
else
rel = proute->partition_root;
- partdesc = RelationGetPartitionDesc(rel);
+ partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);
pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
partdesc->nparts * sizeof(int));
ListCell *lc;
int i;
+ if (estate->es_partition_directory == NULL)
+ estate->es_partition_directory =
+ CreatePartitionDirectory(estate->es_query_cxt);
+
n_part_hierarchies = list_length(partitionpruneinfo->prune_infos);
Assert(n_part_hierarchies > 0);
int n_steps;
ListCell *lc3;
- /*
- * We must copy the subplan_map rather than pointing directly to
- * the plan's version, as we may end up making modifications to it
- * later.
- */
- pprune->subplan_map = palloc(sizeof(int) * pinfo->nparts);
- memcpy(pprune->subplan_map, pinfo->subplan_map,
- sizeof(int) * pinfo->nparts);
-
- /* We can use the subpart_map verbatim, since we never modify it */
- pprune->subpart_map = pinfo->subpart_map;
-
/* present_parts is also subject to later modification */
pprune->present_parts = bms_copy(pinfo->present_parts);
*/
partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex);
partkey = RelationGetPartitionKey(partrel);
- partdesc = RelationGetPartitionDesc(partrel);
+ partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
+ partrel);
+
+ /*
+ * Initialize the subplan_map and subpart_map. Since detaching a
+ * partition requires AccessExclusiveLock, no partitions can have
+ * disappeared, nor can the bounds for any partition have changed.
+ * However, new partitions may have been added.
+ */
+ Assert(partdesc->nparts >= pinfo->nparts);
+ pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts);
+ if (partdesc->nparts == pinfo->nparts)
+ {
+ /*
+ * There are no new partitions, so this is simple. We can
+ * simply point to the subpart_map from the plan, but we must
+ * copy the subplan_map since we may change it later.
+ */
+ pprune->subpart_map = pinfo->subpart_map;
+ memcpy(pprune->subplan_map, pinfo->subplan_map,
+ sizeof(int) * pinfo->nparts);
+
+ /* Double-check that list of relations has not changed. */
+ Assert(memcmp(partdesc->oids, pinfo->relid_map,
+ pinfo->nparts * sizeof(Oid)) == 0);
+ }
+ else
+ {
+ int pd_idx = 0;
+ int pp_idx;
+
+ /*
+ * Some new partitions have appeared since plan time, and
+ * those are reflected in our PartitionDesc but were not
+ * present in the one used to construct subplan_map and
+ * subpart_map. So we must construct new and longer arrays
+ * where the partitions that were originally present map to the
+ * same place, and any added indexes map to -1, as if the
+ * new partitions had been pruned.
+ */
+ pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts);
+ for (pp_idx = 0; pp_idx < partdesc->nparts; ++pp_idx)
+ {
+ if (pinfo->relid_map[pd_idx] != partdesc->oids[pp_idx])
+ {
+ pprune->subplan_map[pp_idx] = -1;
+ pprune->subpart_map[pp_idx] = -1;
+ }
+ else
+ {
+ pprune->subplan_map[pp_idx] =
+ pinfo->subplan_map[pd_idx];
+ pprune->subpart_map[pp_idx] =
+ pinfo->subpart_map[pd_idx++];
+ }
+ }
+ Assert(pd_idx == pinfo->nparts);
+ }
n_steps = list_length(pinfo->pruning_steps);
#include "mb/pg_wchar.h"
#include "nodes/nodeFuncs.h"
#include "parser/parsetree.h"
+#include "partitioning/partdesc.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
estate->es_jit = NULL;
}
+ /* release partition directory, if allocated */
+ if (estate->es_partition_directory)
+ {
+ DestroyPartitionDirectory(estate->es_partition_directory);
+ estate->es_partition_directory = NULL;
+ }
+
/*
* Free the per-query memory context, thereby releasing all working
* memory, including the EState node itself.
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
(operation == CMD_INSERT || update_tuple_routing_needed))
mtstate->mt_partition_tuple_routing =
- ExecSetupPartitionTupleRouting(mtstate, rel);
+ ExecSetupPartitionTupleRouting(estate, mtstate, rel);
/*
* Build state for collecting transition tuples. This requires having a
COPY_SCALAR_FIELD(nexprs);
COPY_POINTER_FIELD(subplan_map, from->nparts * sizeof(int));
COPY_POINTER_FIELD(subpart_map, from->nparts * sizeof(int));
+ COPY_POINTER_FIELD(relid_map, from->nparts * sizeof(int));
COPY_POINTER_FIELD(hasexecparam, from->nexprs * sizeof(bool));
COPY_SCALAR_FIELD(do_initial_prune);
COPY_SCALAR_FIELD(do_exec_prune);
WRITE_INT_FIELD(nexprs);
WRITE_INT_ARRAY(subplan_map, node->nparts);
WRITE_INT_ARRAY(subpart_map, node->nparts);
+ WRITE_OID_ARRAY(relid_map, node->nparts);
WRITE_BOOL_ARRAY(hasexecparam, node->nexprs);
WRITE_BOOL_FIELD(do_initial_prune);
WRITE_BOOL_FIELD(do_exec_prune);
READ_INT_FIELD(nexprs);
READ_INT_ARRAY(subplan_map, local_node->nparts);
READ_INT_ARRAY(subpart_map, local_node->nparts);
+ READ_OID_ARRAY(relid_map, local_node->nparts);
READ_BOOL_ARRAY(hasexecparam, local_node->nexprs);
READ_BOOL_FIELD(do_initial_prune);
READ_BOOL_FIELD(do_exec_prune);
#include "parser/analyze.h"
#include "parser/parsetree.h"
#include "parser/parse_agg.h"
+#include "partitioning/partdesc.h"
#include "rewrite/rewriteManip.h"
#include "storage/dsm_impl.h"
#include "utils/rel.h"
result->jitFlags |= PGJIT_DEFORM;
}
+ if (glob->partition_directory != NULL)
+ DestroyPartitionDirectory(glob->partition_directory);
+
return result;
}
{
Assert(rte->relkind == RELKIND_PARTITIONED_TABLE);
+ if (root->glob->partition_directory == NULL)
+ root->glob->partition_directory =
+ CreatePartitionDirectory(CurrentMemoryContext);
+
/*
* If this table has partitions, recursively expand and lock them.
* While at it, also extract the partition key columns of all the
int i;
RangeTblEntry *childrte;
Index childRTindex;
- PartitionDesc partdesc = RelationGetPartitionDesc(parentrel);
+ PartitionDesc partdesc;
+
+ partdesc = PartitionDirectoryLookup(root->glob->partition_directory,
+ parentrel);
check_stack_depth();
Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
- partdesc = RelationGetPartitionDesc(relation);
+ partdesc = PartitionDirectoryLookup(root->glob->partition_directory,
+ relation);
partkey = RelationGetPartitionKey(relation);
rel->part_scheme = find_partition_scheme(root, relation);
Assert(partdesc != NULL && rel->part_scheme != NULL);
#include "postgres.h"
+#include "access/genam.h"
+#include "access/htup_details.h"
+#include "access/table.h"
+#include "catalog/indexing.h"
#include "catalog/partition.h"
#include "catalog/pg_inherits.h"
#include "partitioning/partbounds.h"
#include "partitioning/partdesc.h"
+#include "storage/bufmgr.h"
+#include "storage/sinval.h"
#include "utils/builtins.h"
+#include "utils/inval.h"
+#include "utils/fmgroids.h"
+#include "utils/hsearch.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/partcache.h"
#include "utils/syscache.h"
+typedef struct PartitionDirectoryData
+{
+ MemoryContext pdir_mcxt;
+ HTAB *pdir_hash;
+} PartitionDirectoryData;
+
+typedef struct PartitionDirectoryEntry
+{
+ Oid reloid;
+ Relation rel;
+ PartitionDesc pd;
+} PartitionDirectoryEntry;
+
/*
* RelationBuildPartitionDesc
* Form rel's partition descriptor
MemoryContext oldcxt;
int *mapping;
- /* Get partition oids from pg_inherits */
+ /*
+ * Get partition oids from pg_inherits. This uses a single snapshot to
+ * fetch the list of children, so while more children may be getting
+ * added concurrently, whatever this function returns will be accurate
+ * as of some well-defined point in time.
+ */
inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock);
nparts = list_length(inhoids);
+ /* Allocate arrays for OIDs and boundspecs. */
if (nparts > 0)
{
oids = palloc(nparts * sizeof(Oid));
boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
}
- /* Collect bound spec nodes for each partition */
+ /* Collect bound spec nodes for each partition. */
i = 0;
foreach(cell, inhoids)
{
Oid inhrelid = lfirst_oid(cell);
HeapTuple tuple;
- Datum datum;
- bool isnull;
- PartitionBoundSpec *boundspec;
+ PartitionBoundSpec *boundspec = NULL;
+ /* Try fetching the tuple from the catcache, for speed. */
tuple = SearchSysCache1(RELOID, inhrelid);
- if (!HeapTupleIsValid(tuple))
- elog(ERROR, "cache lookup failed for relation %u", inhrelid);
-
- datum = SysCacheGetAttr(RELOID, tuple,
- Anum_pg_class_relpartbound,
- &isnull);
- if (isnull)
- elog(ERROR, "null relpartbound for relation %u", inhrelid);
- boundspec = stringToNode(TextDatumGetCString(datum));
+ if (HeapTupleIsValid(tuple))
+ {
+ Datum datum;
+ bool isnull;
+
+ datum = SysCacheGetAttr(RELOID, tuple,
+ Anum_pg_class_relpartbound,
+ &isnull);
+ if (!isnull)
+ boundspec = stringToNode(TextDatumGetCString(datum));
+ ReleaseSysCache(tuple);
+ }
+
+ /*
+ * The system cache may be out of date; if so, we may find no pg_class
+ * tuple or an old one where relpartbound is NULL. In that case, try
+ * the table directly. We can't just AcceptInvalidationMessages() and
+ * retry the system cache lookup because it's possible that a
+ * concurrent ATTACH PARTITION operation has removed itself to the
+ * ProcArray but yet added invalidation messages to the shared queue;
+ * InvalidateSystemCaches() would work, but seems excessive.
+ *
+ * Note that this algorithm assumes that PartitionBoundSpec we manage
+ * to fetch is the right one -- so this is only good enough for
+ * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION
+ * or some hypothetical operation that changes the partition bounds.
+ */
+ if (boundspec == NULL)
+ {
+ Relation pg_class;
+ SysScanDesc scan;
+ ScanKeyData key[1];
+ Datum datum;
+ bool isnull;
+
+ pg_class = table_open(RelationRelationId, AccessShareLock);
+ ScanKeyInit(&key[0],
+ Anum_pg_class_oid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(inhrelid));
+ scan = systable_beginscan(pg_class, ClassOidIndexId, true,
+ NULL, 1, key);
+ tuple = systable_getnext(scan);
+ datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
+ RelationGetDescr(pg_class), &isnull);
+ if (!isnull)
+ boundspec = stringToNode(TextDatumGetCString(datum));
+ systable_endscan(scan);
+ table_close(pg_class, AccessShareLock);
+ }
+
+ /* Sanity checks. */
+ if (!boundspec)
+ elog(ERROR, "missing relpartbound for relation %u", inhrelid);
if (!IsA(boundspec, PartitionBoundSpec))
elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
/*
- * Sanity check: If the PartitionBoundSpec says this is the default
- * partition, its OID should correspond to whatever's stored in
- * pg_partitioned_table.partdefid; if not, the catalog is corrupt.
+ * If the PartitionBoundSpec says this is the default partition, its
+ * OID should match pg_partitioned_table.partdefid; if not, the
+ * catalog is corrupt.
*/
if (boundspec->is_default)
{
inhrelid, partdefid);
}
+ /* Save results. */
oids[i] = inhrelid;
boundspecs[i] = boundspec;
++i;
- ReleaseSysCache(tuple);
}
/* Now build the actual relcache partition descriptor */
partdesc->oids[index] = oids[i];
/* Record if the partition is a leaf partition */
partdesc->is_leaf[index] =
- (get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
+ (get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE);
}
MemoryContextSwitchTo(oldcxt);
rel->rd_partdesc = partdesc;
}
+/*
+ * CreatePartitionDirectory
+ * Create a new partition directory object.
+ */
+PartitionDirectory
+CreatePartitionDirectory(MemoryContext mcxt)
+{
+ MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
+ PartitionDirectory pdir;
+ HASHCTL ctl;
+
+ MemSet(&ctl, 0, sizeof(HASHCTL));
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(PartitionDirectoryEntry);
+ ctl.hcxt = mcxt;
+
+ pdir = palloc(sizeof(PartitionDirectoryData));
+ pdir->pdir_mcxt = mcxt;
+ pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ MemoryContextSwitchTo(oldcontext);
+ return pdir;
+}
+
+/*
+ * PartitionDirectoryLookup
+ * Look up the partition descriptor for a relation in the directory.
+ *
+ * The purpose of this function is to ensure that we get the same
+ * PartitionDesc for each relation every time we look it up. In the
+ * face of current DDL, different PartitionDescs may be constructed with
+ * different views of the catalog state, but any single particular OID
+ * will always get the same PartitionDesc for as long as the same
+ * PartitionDirectory is used.
+ */
+PartitionDesc
+PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
+{
+ PartitionDirectoryEntry *pde;
+ Oid relid = RelationGetRelid(rel);
+ bool found;
+
+ pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
+ if (!found)
+ {
+ /*
+ * We must keep a reference count on the relation so that the
+ * PartitionDesc to which we are pointing can't get destroyed.
+ */
+ RelationIncrementReferenceCount(rel);
+ pde->rel = rel;
+ pde->pd = RelationGetPartitionDesc(rel);
+ Assert(pde->pd != NULL);
+ }
+ return pde->pd;
+}
+
+/*
+ * DestroyPartitionDirectory
+ * Destroy a partition directory.
+ *
+ * Release the reference counts we're holding.
+ */
+void
+DestroyPartitionDirectory(PartitionDirectory pdir)
+{
+ HASH_SEQ_STATUS status;
+ PartitionDirectoryEntry *pde;
+
+ hash_seq_init(&status, pdir->pdir_hash);
+ while ((pde = hash_seq_search(&status)) != NULL)
+ RelationDecrementReferenceCount(pde->rel);
+}
+
/*
* equalPartitionDescs
* Compare two partition descriptors for logical equality
#include "optimizer/appendinfo.h"
#include "optimizer/optimizer.h"
#include "optimizer/pathnode.h"
-#include "partitioning/partprune.h"
+#include "parser/parsetree.h"
#include "partitioning/partbounds.h"
+#include "partitioning/partprune.h"
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
int partnatts = subpart->part_scheme->partnatts;
int *subplan_map;
int *subpart_map;
+ Oid *relid_map;
List *partprunequal;
List *pruning_steps;
bool contradictory;
*/
subplan_map = (int *) palloc(nparts * sizeof(int));
subpart_map = (int *) palloc(nparts * sizeof(int));
+ relid_map = (Oid *) palloc(nparts * sizeof(int));
present_parts = NULL;
for (i = 0; i < nparts; i++)
subplan_map[i] = subplanidx;
subpart_map[i] = subpartidx;
+ relid_map[i] = planner_rt_fetch(partrel->relid, root)->relid;
if (subplanidx >= 0)
{
present_parts = bms_add_member(present_parts, i);
pinfo->nparts = nparts;
pinfo->subplan_map = subplan_map;
pinfo->subpart_map = subpart_map;
+ pinfo->relid_map = relid_map;
/* Determine which pruning types should be enabled at this level */
doruntimeprune |= analyze_partkey_exprs(pinfo, pruning_steps,
SWAPFIELD(PartitionDesc, rd_partdesc);
SWAPFIELD(MemoryContext, rd_pdcxt);
}
+ else if (rebuild && newrel->rd_pdcxt != NULL)
+ {
+ /*
+ * We are rebuilding a partitioned relation with a non-zero
+ * reference count, so keep the old partition descriptor around,
+ * in case there's a PartitionDirectory with a pointer to it.
+ * Attach it to the new rd_pdcxt so that it gets cleaned up
+ * eventually. In the case where the reference count is 0, this
+ * code is not reached, which should be OK because in that case
+ * there should be no PartitionDirectory with a pointer to the old
+ * entry.
+ *
+ * Note that newrel and relation have already been swapped, so
+ * the "old" partition descriptor is actually the one hanging off
+ * of newrel.
+ */
+ MemoryContextSetParent(newrel->rd_pdcxt, relation->rd_pdcxt);
+ newrel->rd_partdesc = NULL;
+ newrel->rd_pdcxt = NULL;
+ }
#undef SWAPFIELD
PartitionPruningData *partprunedata[FLEXIBLE_ARRAY_MEMBER];
} PartitionPruneState;
-extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
+extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(EState *estate,
+ ModifyTableState *mtstate,
Relation rel);
extern ResultRelInfo *ExecFindPartition(ModifyTableState *mtstate,
ResultRelInfo *rootResultRelInfo,
#include "lib/pairingheap.h"
#include "nodes/params.h"
#include "nodes/plannodes.h"
+#include "partitioning/partdefs.h"
#include "utils/hsearch.h"
#include "utils/queryenvironment.h"
#include "utils/reltrigger.h"
*/
ResultRelInfo *es_root_result_relations; /* array of ResultRelInfos */
int es_num_root_result_relations; /* length of the array */
+ PartitionDirectory es_partition_directory; /* for PartitionDesc lookup */
/*
* The following list contains ResultRelInfos created by the tuple routing
bool parallelModeNeeded; /* parallel mode actually required? */
char maxParallelHazard; /* worst PROPARALLEL hazard level */
+
+ PartitionDirectory partition_directory; /* partition descriptors */
} PlannerGlobal;
/* macro for fetching the Plan associated with a SubPlan node */
int nexprs; /* Length of hasexecparam[] */
int *subplan_map; /* subplan index by partition index, or -1 */
int *subpart_map; /* subpart index by partition index, or -1 */
+ Oid *relid_map; /* relation OID by partition index, or -1 */
bool *hasexecparam; /* true if corresponding pruning_step contains
* any PARAM_EXEC Params. */
bool do_initial_prune; /* true if pruning should be performed
typedef struct PartitionDescData *PartitionDesc;
+typedef struct PartitionDirectoryData *PartitionDirectory;
+
#endif /* PARTDEFS_H */
extern void RelationBuildPartitionDesc(Relation rel);
+extern PartitionDirectory CreatePartitionDirectory(MemoryContext mcxt);
+extern PartitionDesc PartitionDirectoryLookup(PartitionDirectory, Relation);
+extern void DestroyPartitionDirectory(PartitionDirectory pdir);
+
extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc);
extern bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,