From 312b1a983fb57aff6a73fb65718e9121468ba472 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 24 Oct 2008 23:42:35 +0000 Subject: [PATCH] Reduce the memory footprint of large pending-trigger-event lists, as per my recent proposal. In typical cases, we now need 12 bytes per insert or delete event and 16 bytes per update event; previously we needed 40 bytes per event on 32-bit hardware and 80 bytes per event on 64-bit hardware. Even in the worst case usage pattern with a large number of distinct triggers being fired in one query, usage is at most 32 bytes per event. It seems to be a bit faster than the old code as well, due to reduction of palloc overhead. This commit doesn't address the TODO item of allowing the event list to spill to disk; rather it's trying to stave off the need for that. However, it probably makes that task a bit easier by reducing the data structure's dependency on pointers. It would now be practical to dump an event list to disk by "chunks" instead of individual events. --- src/backend/commands/trigger.c | 837 ++++++++++++++++++++------------- src/include/commands/trigger.h | 8 +- 2 files changed, 516 insertions(+), 329 deletions(-) diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 3461056c57..ffdb168236 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.237 2008/09/01 20:42:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.238 2008/10/24 23:42:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2234,12 +2234,14 @@ ltrmark:; * during the current transaction tree. (BEFORE triggers are fired * immediately so we don't need any persistent state about them.) The struct * and most of its subsidiary data are kept in TopTransactionContext; however - * the individual event records are kept in separate contexts, to make them - * easy to delete during subtransaction abort. + * the individual event records are kept in a separate sub-context. This is + * done mainly so that it's easy to tell from a memory context dump how much + * space is being eaten by trigger events. * - * Because the list of pending events can grow large, we go to some effort - * to minimize memory consumption. We do not use the generic List mechanism - * but thread the events manually. + * Because the list of pending events can grow large, we go to some + * considerable effort to minimize per-event memory consumption. The event + * records are grouped into chunks and common data for similar events in the + * same chunk is only stored once. * * XXX We need to be able to save the per-event data in a file if it grows too * large. @@ -2280,30 +2282,101 @@ typedef SetConstraintStateData *SetConstraintState; /* * Per-trigger-event data * - * Note: ate_firing_id is meaningful when either AFTER_TRIGGER_DONE - * or AFTER_TRIGGER_IN_PROGRESS is set. It indicates which trigger firing - * cycle the trigger was or will be fired in. + * The actual per-event data, AfterTriggerEventData, includes DONE/IN_PROGRESS + * status bits and one or two tuple CTIDs. Each event record also has an + * associated AfterTriggerSharedData that is shared across all instances + * of similar events within a "chunk". + * + * We arrange not to waste storage on ate_ctid2 for non-update events. + * We could go further and not store either ctid for statement-level triggers, + * but that seems unlikely to be worth the trouble. + * + * Note: ats_firing_id is initially zero and is set to something else when + * AFTER_TRIGGER_IN_PROGRESS is set. It indicates which trigger firing + * cycle the trigger will be fired in (or was fired in, if DONE is set). + * Although this is mutable state, we can keep it in AfterTriggerSharedData + * because all instances of the same type of event in a given event list will + * be fired at the same time, if they were queued between the same firing + * cycles. So we need only ensure that ats_firing_id is zero when attaching + * a new event to an existing AfterTriggerSharedData record. */ +typedef uint32 TriggerFlags; + +#define AFTER_TRIGGER_OFFSET 0x0FFFFFFF /* must be low-order bits */ +#define AFTER_TRIGGER_2CTIDS 0x10000000 +#define AFTER_TRIGGER_DONE 0x20000000 +#define AFTER_TRIGGER_IN_PROGRESS 0x40000000 + +typedef struct AfterTriggerSharedData *AfterTriggerShared; + +typedef struct AfterTriggerSharedData +{ + TriggerEvent ats_event; /* event type indicator, see trigger.h */ + Oid ats_tgoid; /* the trigger's ID */ + Oid ats_relid; /* the relation it's on */ + CommandId ats_firing_id; /* ID for firing cycle */ +} AfterTriggerSharedData; + typedef struct AfterTriggerEventData *AfterTriggerEvent; typedef struct AfterTriggerEventData { - AfterTriggerEvent ate_next; /* list link */ - TriggerEvent ate_event; /* event type and status bits */ - CommandId ate_firing_id; /* ID for firing cycle */ - Oid ate_tgoid; /* the trigger's ID */ - Oid ate_relid; /* the relation it's on */ - ItemPointerData ate_oldctid; /* specific tuple(s) involved */ - ItemPointerData ate_newctid; + TriggerFlags ate_flags; /* status bits and offset to shared data */ + ItemPointerData ate_ctid1; /* inserted, deleted, or old updated tuple */ + ItemPointerData ate_ctid2; /* new updated tuple */ } AfterTriggerEventData; +/* This struct must exactly match the one above except for not having ctid2 */ +typedef struct AfterTriggerEventDataOneCtid +{ + TriggerFlags ate_flags; /* status bits and offset to shared data */ + ItemPointerData ate_ctid1; /* inserted, deleted, or old updated tuple */ +} AfterTriggerEventDataOneCtid; + +#define SizeofTriggerEvent(evt) \ + (((evt)->ate_flags & AFTER_TRIGGER_2CTIDS) ? \ + sizeof(AfterTriggerEventData) : sizeof(AfterTriggerEventDataOneCtid)) + +#define GetTriggerSharedData(evt) \ + ((AfterTriggerShared) ((char *) (evt) + ((evt)->ate_flags & AFTER_TRIGGER_OFFSET))) + +/* + * To avoid palloc overhead, we keep trigger events in arrays in successively- + * larger chunks (a slightly more sophisticated version of an expansible + * array). The space between CHUNK_DATA_START and freeptr is occupied by + * AfterTriggerEventData records; the space between endfree and endptr is + * occupied by AfterTriggerSharedData records. + */ +typedef struct AfterTriggerEventChunk +{ + struct AfterTriggerEventChunk *next; /* list link */ + char *freeptr; /* start of free space in chunk */ + char *endfree; /* end of free space in chunk */ + char *endptr; /* end of chunk */ + /* event data follows here */ +} AfterTriggerEventChunk; + +#define CHUNK_DATA_START(cptr) ((char *) (cptr) + MAXALIGN(sizeof(AfterTriggerEventChunk))) + /* A list of events */ typedef struct AfterTriggerEventList { - AfterTriggerEvent head; - AfterTriggerEvent tail; + AfterTriggerEventChunk *head; + AfterTriggerEventChunk *tail; + char *tailfree; /* freeptr of tail chunk */ } AfterTriggerEventList; +/* Macros to help in iterating over a list of events */ +#define for_each_chunk(cptr, evtlist) \ + for (cptr = (evtlist).head; cptr != NULL; cptr = cptr->next) +#define for_each_event(eptr, cptr) \ + for (eptr = (AfterTriggerEvent) CHUNK_DATA_START(cptr); \ + (char *) eptr < (cptr)->freeptr; \ + eptr = (AfterTriggerEvent) (((char *) eptr) + SizeofTriggerEvent(eptr))) +/* Use this if no special per-chunk processing is needed */ +#define for_each_event_chunk(eptr, cptr, evtlist) \ + for_each_chunk(cptr, evtlist) for_each_event(eptr, cptr) + /* * All per-transaction data for the AFTER TRIGGERS module. @@ -2323,9 +2396,7 @@ typedef struct AfterTriggerEventList * all subtransactions of the current transaction. In a subtransaction * abort, we know that the events added by the subtransaction are at the * end of the list, so it is relatively easy to discard them. The event - * structs themselves are stored in event_cxt if generated by the top-level - * transaction, else in per-subtransaction contexts identified by the - * entries in cxt_stack. + * list chunks themselves are stored in event_cxt. * * query_depth is the current depth of nested AfterTriggerBeginQuery calls * (-1 when the stack is empty). @@ -2367,8 +2438,7 @@ typedef struct AfterTriggersData int query_depth; /* current query list index */ AfterTriggerEventList *query_stack; /* events pending from each query */ int maxquerydepth; /* allocated len of above array */ - MemoryContext event_cxt; /* top transaction's event context, if any */ - MemoryContext *cxt_stack; /* per-subtransaction event contexts */ + MemoryContext event_cxt; /* memory context for events, if any */ /* these fields are just for resetting at subtrans abort: */ @@ -2398,13 +2468,13 @@ static SetConstraintState SetConstraintStateAddItem(SetConstraintState state, /* ---------- * afterTriggerCheckState() * - * Returns true if the trigger identified by tgoid is actually - * in state DEFERRED. + * Returns true if the trigger event is actually in state DEFERRED. * ---------- */ static bool -afterTriggerCheckState(Oid tgoid, TriggerEvent eventstate) +afterTriggerCheckState(AfterTriggerShared evtshared) { + Oid tgoid = evtshared->ats_tgoid; SetConstraintState state = afterTriggers->state; int i; @@ -2412,7 +2482,7 @@ afterTriggerCheckState(Oid tgoid, TriggerEvent eventstate) * For not-deferrable triggers (i.e. normal AFTER ROW triggers and * constraints declared NOT DEFERRABLE), the state is always false. */ - if ((eventstate & AFTER_TRIGGER_DEFERRABLE) == 0) + if ((evtshared->ats_event & AFTER_TRIGGER_DEFERRABLE) == 0) return false; /* @@ -2433,37 +2503,184 @@ afterTriggerCheckState(Oid tgoid, TriggerEvent eventstate) /* * Otherwise return the default state for the trigger. */ - return ((eventstate & AFTER_TRIGGER_INITDEFERRED) != 0); + return ((evtshared->ats_event & AFTER_TRIGGER_INITDEFERRED) != 0); } /* ---------- * afterTriggerAddEvent() * - * Add a new trigger event to the current query's queue. + * Add a new trigger event to the specified queue. + * The passed-in event data is copied. * ---------- */ static void -afterTriggerAddEvent(AfterTriggerEvent event) +afterTriggerAddEvent(AfterTriggerEventList *events, + AfterTriggerEvent event, AfterTriggerShared evtshared) { - AfterTriggerEventList *events; + Size eventsize = SizeofTriggerEvent(event); + Size needed = eventsize + sizeof(AfterTriggerSharedData); + AfterTriggerEventChunk *chunk; + AfterTriggerShared newshared; + AfterTriggerEvent newevent; - Assert(event->ate_next == NULL); + /* + * If empty list or not enough room in the tail chunk, make a new chunk. + * We assume here that a new shared record will always be needed. + */ + chunk = events->tail; + if (chunk == NULL || + chunk->endfree - chunk->freeptr < needed) + { + Size chunksize; - /* Must be inside a query */ - Assert(afterTriggers->query_depth >= 0); + /* Create event context if we didn't already */ + if (afterTriggers->event_cxt == NULL) + afterTriggers->event_cxt = + AllocSetContextCreate(TopTransactionContext, + "AfterTriggerEvents", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); - events = &afterTriggers->query_stack[afterTriggers->query_depth]; - if (events->tail == NULL) + /* + * Chunk size starts at 1KB and is allowed to increase up to 1MB. + * These numbers are fairly arbitrary, though there is a hard limit at + * AFTER_TRIGGER_OFFSET; else we couldn't link event records to their + * shared records using the available space in ate_flags. Another + * constraint is that if the chunk size gets too huge, the search loop + * below would get slow given a (not too common) usage pattern with + * many distinct event types in a chunk. Therefore, we double the + * preceding chunk size only if there weren't too many shared records + * in the preceding chunk; otherwise we halve it. This gives us some + * ability to adapt to the actual usage pattern of the current query + * while still having large chunk sizes in typical usage. All chunk + * sizes used should be MAXALIGN multiples, to ensure that the shared + * records will be aligned safely. + */ +#define MIN_CHUNK_SIZE 1024 +#define MAX_CHUNK_SIZE (1024*1024) + +#if MAX_CHUNK_SIZE > (AFTER_TRIGGER_OFFSET+1) +#error MAX_CHUNK_SIZE must not exceed AFTER_TRIGGER_OFFSET +#endif + + if (chunk == NULL) + chunksize = MIN_CHUNK_SIZE; + else + { + /* preceding chunk size... */ + chunksize = chunk->endptr - (char *) chunk; + /* check number of shared records in preceding chunk */ + if ((chunk->endptr - chunk->endfree) <= + (100 * sizeof(AfterTriggerSharedData))) + chunksize *= 2; /* okay, double it */ + else + chunksize /= 2; /* too many shared records */ + chunksize = Min(chunksize, MAX_CHUNK_SIZE); + } + chunk = MemoryContextAlloc(afterTriggers->event_cxt, chunksize); + chunk->next = NULL; + chunk->freeptr = CHUNK_DATA_START(chunk); + chunk->endptr = chunk->endfree = (char *) chunk + chunksize; + Assert(chunk->endfree - chunk->freeptr >= needed); + + if (events->head == NULL) + events->head = chunk; + else + events->tail->next = chunk; + events->tail = chunk; + } + + /* + * Try to locate a matching shared-data record already in the chunk. + * If none, make a new one. + */ + for (newshared = ((AfterTriggerShared) chunk->endptr) - 1; + (char *) newshared >= chunk->endfree; + newshared--) + { + if (newshared->ats_tgoid == evtshared->ats_tgoid && + newshared->ats_relid == evtshared->ats_relid && + newshared->ats_event == evtshared->ats_event && + newshared->ats_firing_id == 0) + break; + } + if ((char *) newshared < chunk->endfree) { - /* first list entry */ - events->head = event; - events->tail = event; + *newshared = *evtshared; + newshared->ats_firing_id = 0; /* just to be sure */ + chunk->endfree = (char *) newshared; + } + + /* Insert the data */ + newevent = (AfterTriggerEvent) chunk->freeptr; + memcpy(newevent, event, eventsize); + /* ... and link the new event to its shared record */ + newevent->ate_flags &= ~AFTER_TRIGGER_OFFSET; + newevent->ate_flags |= (char *) newshared - (char *) newevent; + + chunk->freeptr += eventsize; + events->tailfree = chunk->freeptr; +} + +/* ---------- + * afterTriggerFreeEventList() + * + * Free all the event storage in the given list. + * ---------- + */ +static void +afterTriggerFreeEventList(AfterTriggerEventList *events) +{ + AfterTriggerEventChunk *chunk; + AfterTriggerEventChunk *next_chunk; + + for (chunk = events->head; chunk != NULL; chunk = next_chunk) + { + next_chunk = chunk->next; + pfree(chunk); + } + events->head = NULL; + events->tail = NULL; + events->tailfree = NULL; +} + +/* ---------- + * afterTriggerRestoreEventList() + * + * Restore an event list to its prior length, removing all the events + * added since it had the value old_events. + * ---------- + */ +static void +afterTriggerRestoreEventList(AfterTriggerEventList *events, + const AfterTriggerEventList *old_events) +{ + AfterTriggerEventChunk *chunk; + AfterTriggerEventChunk *next_chunk; + + if (old_events->tail == NULL) + { + /* restoring to a completely empty state, so free everything */ + afterTriggerFreeEventList(events); } else { - events->tail->ate_next = event; - events->tail = event; + *events = *old_events; + /* free any chunks after the last one we want to keep */ + for (chunk = events->tail->next; chunk != NULL; chunk = next_chunk) + { + next_chunk = chunk->next; + pfree(chunk); + } + /* and clean up the tail chunk to be the right length */ + events->tail->next = NULL; + events->tail->freeptr = events->tailfree; + /* + * We don't make any effort to remove now-unused shared data records. + * They might still be useful, anyway. + */ } } @@ -2494,13 +2711,14 @@ AfterTriggerExecute(AfterTriggerEvent event, FmgrInfo *finfo, Instrumentation *instr, MemoryContext per_tuple_context) { - Oid tgoid = event->ate_tgoid; + AfterTriggerShared evtshared = GetTriggerSharedData(event); + Oid tgoid = evtshared->ats_tgoid; TriggerData LocTriggerData; - HeapTupleData oldtuple; - HeapTupleData newtuple; + HeapTupleData tuple1; + HeapTupleData tuple2; HeapTuple rettuple; - Buffer oldbuffer = InvalidBuffer; - Buffer newbuffer = InvalidBuffer; + Buffer buffer1 = InvalidBuffer; + Buffer buffer2 = InvalidBuffer; int tgindx; /* @@ -2526,37 +2744,36 @@ AfterTriggerExecute(AfterTriggerEvent event, InstrStartNode(instr + tgindx); /* - * Fetch the required OLD and NEW tuples. + * Fetch the required tuple(s). */ - LocTriggerData.tg_trigtuple = NULL; - LocTriggerData.tg_newtuple = NULL; - LocTriggerData.tg_trigtuplebuf = InvalidBuffer; - LocTriggerData.tg_newtuplebuf = InvalidBuffer; - - if (ItemPointerIsValid(&(event->ate_oldctid))) + if (ItemPointerIsValid(&(event->ate_ctid1))) { - ItemPointerCopy(&(event->ate_oldctid), &(oldtuple.t_self)); - if (!heap_fetch(rel, SnapshotAny, &oldtuple, &oldbuffer, false, NULL)) - elog(ERROR, "failed to fetch old tuple for AFTER trigger"); - LocTriggerData.tg_trigtuple = &oldtuple; - LocTriggerData.tg_trigtuplebuf = oldbuffer; + ItemPointerCopy(&(event->ate_ctid1), &(tuple1.t_self)); + if (!heap_fetch(rel, SnapshotAny, &tuple1, &buffer1, false, NULL)) + elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); + LocTriggerData.tg_trigtuple = &tuple1; + LocTriggerData.tg_trigtuplebuf = buffer1; + } + else + { + LocTriggerData.tg_trigtuple = NULL; + LocTriggerData.tg_trigtuplebuf = InvalidBuffer; } - if (ItemPointerIsValid(&(event->ate_newctid))) + /* don't touch ctid2 if not there */ + if ((event->ate_flags & AFTER_TRIGGER_2CTIDS) && + ItemPointerIsValid(&(event->ate_ctid2))) { - ItemPointerCopy(&(event->ate_newctid), &(newtuple.t_self)); - if (!heap_fetch(rel, SnapshotAny, &newtuple, &newbuffer, false, NULL)) - elog(ERROR, "failed to fetch new tuple for AFTER trigger"); - if (LocTriggerData.tg_trigtuple != NULL) - { - LocTriggerData.tg_newtuple = &newtuple; - LocTriggerData.tg_newtuplebuf = newbuffer; - } - else - { - LocTriggerData.tg_trigtuple = &newtuple; - LocTriggerData.tg_trigtuplebuf = newbuffer; - } + ItemPointerCopy(&(event->ate_ctid2), &(tuple2.t_self)); + if (!heap_fetch(rel, SnapshotAny, &tuple2, &buffer2, false, NULL)) + elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); + LocTriggerData.tg_newtuple = &tuple2; + LocTriggerData.tg_newtuplebuf = buffer2; + } + else + { + LocTriggerData.tg_newtuple = NULL; + LocTriggerData.tg_newtuplebuf = InvalidBuffer; } /* @@ -2564,7 +2781,7 @@ AfterTriggerExecute(AfterTriggerEvent event, */ LocTriggerData.type = T_TriggerData; LocTriggerData.tg_event = - event->ate_event & (TRIGGER_EVENT_OPMASK | TRIGGER_EVENT_ROW); + evtshared->ats_event & (TRIGGER_EVENT_OPMASK | TRIGGER_EVENT_ROW); LocTriggerData.tg_relation = rel; MemoryContextReset(per_tuple_context); @@ -2578,16 +2795,16 @@ AfterTriggerExecute(AfterTriggerEvent event, finfo, NULL, per_tuple_context); - if (rettuple != NULL && rettuple != &oldtuple && rettuple != &newtuple) + if (rettuple != NULL && rettuple != &tuple1 && rettuple != &tuple2) heap_freetuple(rettuple); /* * Release buffers */ - if (oldbuffer != InvalidBuffer) - ReleaseBuffer(oldbuffer); - if (newbuffer != InvalidBuffer) - ReleaseBuffer(newbuffer); + if (buffer1 != InvalidBuffer) + ReleaseBuffer(buffer1); + if (buffer2 != InvalidBuffer) + ReleaseBuffer(buffer2); /* * If doing EXPLAIN ANALYZE, stop charging time to this trigger, and count @@ -2605,7 +2822,7 @@ AfterTriggerExecute(AfterTriggerEvent event, * that can be invoked now with the current firing ID. * * If move_list isn't NULL, events that are not to be invoked now are - * removed from the given list and appended to move_list. + * transferred to move_list. * * When immediate_only is TRUE, do not invoke currently-deferred triggers. * (This will be FALSE only at main transaction exit.) @@ -2618,26 +2835,22 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, bool immediate_only) { bool found = false; - AfterTriggerEvent event, - prev_event; - - prev_event = NULL; - event = events->head; + AfterTriggerEvent event; + AfterTriggerEventChunk *chunk; - while (event != NULL) + for_each_event_chunk(event, chunk, *events) { + AfterTriggerShared evtshared = GetTriggerSharedData(event); bool defer_it = false; - AfterTriggerEvent next_event; - if (!(event->ate_event & + if (!(event->ate_flags & (AFTER_TRIGGER_DONE | AFTER_TRIGGER_IN_PROGRESS))) { /* * This trigger hasn't been called or scheduled yet. Check if we * should call it now. */ - if (immediate_only && - afterTriggerCheckState(event->ate_tgoid, event->ate_event)) + if (immediate_only && afterTriggerCheckState(evtshared)) { defer_it = true; } @@ -2646,8 +2859,8 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, /* * Mark it as to be fired in this firing cycle. */ - event->ate_firing_id = afterTriggers->firing_counter; - event->ate_event |= AFTER_TRIGGER_IN_PROGRESS; + evtshared->ats_firing_id = afterTriggers->firing_counter; + event->ate_flags |= AFTER_TRIGGER_IN_PROGRESS; found = true; } } @@ -2655,45 +2868,19 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, /* * If it's deferred, move it to move_list, if requested. */ - next_event = event->ate_next; - if (defer_it && move_list != NULL) { - /* Delink it from input list */ - if (prev_event) - prev_event->ate_next = next_event; - else - events->head = next_event; - /* and add it to move_list */ - event->ate_next = NULL; - if (move_list->tail == NULL) - { - /* first list entry */ - move_list->head = event; - move_list->tail = event; - } - else - { - move_list->tail->ate_next = event; - move_list->tail = event; - } - } - else - { - /* Keep it in input list */ - prev_event = event; + /* add it to move_list */ + afterTriggerAddEvent(move_list, event, evtshared); + /* mark original copy "done" so we don't do it again */ + event->ate_flags |= AFTER_TRIGGER_DONE; } - - event = next_event; } - /* Update list tail pointer in case we moved tail event */ - events->tail = prev_event; - return found; } -/* ---------- +/* * afterTriggerInvokeEvents() * * Scan the given event list for events that are marked as to be fired @@ -2704,18 +2891,24 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, * make one locally to cache the info in case there are multiple trigger * events per rel. * - * When delete_ok is TRUE, it's okay to delete fully-processed events. - * The events list pointers are updated. - * ---------- + * When delete_ok is TRUE, it's safe to delete fully-processed events. + * (We are not very tense about that: we simply reset a chunk to be empty + * if all its events got fired. The objective here is just to avoid useless + * rescanning of events when a trigger queues new events during transaction + * end, so it's not necessary to worry much about the case where only + * some events are fired.) + * + * Returns TRUE if no unfired events remain in the list (this allows us + * to avoid repeating afterTriggerMarkEvents). */ -static void +static bool afterTriggerInvokeEvents(AfterTriggerEventList *events, CommandId firing_id, EState *estate, bool delete_ok) { - AfterTriggerEvent event, - prev_event; + bool all_fired = true; + AfterTriggerEventChunk *chunk; MemoryContext per_tuple_context; bool local_estate = false; Relation rel = NULL; @@ -2738,83 +2931,68 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); - prev_event = NULL; - event = events->head; - - while (event != NULL) + for_each_chunk(chunk, *events) { - AfterTriggerEvent next_event; + AfterTriggerEvent event; + bool all_fired_in_chunk = true; - /* - * Is it one for me to fire? - */ - if ((event->ate_event & AFTER_TRIGGER_IN_PROGRESS) && - event->ate_firing_id == firing_id) + for_each_event(event, chunk) { + AfterTriggerShared evtshared = GetTriggerSharedData(event); + /* - * So let's fire it... but first, find the correct relation if - * this is not the same relation as before. + * Is it one for me to fire? */ - if (rel == NULL || RelationGetRelid(rel) != event->ate_relid) + if ((event->ate_flags & AFTER_TRIGGER_IN_PROGRESS) && + evtshared->ats_firing_id == firing_id) { - ResultRelInfo *rInfo; - - rInfo = ExecGetTriggerResultRel(estate, event->ate_relid); - rel = rInfo->ri_RelationDesc; - trigdesc = rInfo->ri_TrigDesc; - finfo = rInfo->ri_TrigFunctions; - instr = rInfo->ri_TrigInstrument; - if (trigdesc == NULL) /* should not happen */ - elog(ERROR, "relation %u has no triggers", - event->ate_relid); - } + /* + * So let's fire it... but first, find the correct relation if + * this is not the same relation as before. + */ + if (rel == NULL || RelationGetRelid(rel) != evtshared->ats_relid) + { + ResultRelInfo *rInfo; + + rInfo = ExecGetTriggerResultRel(estate, evtshared->ats_relid); + rel = rInfo->ri_RelationDesc; + trigdesc = rInfo->ri_TrigDesc; + finfo = rInfo->ri_TrigFunctions; + instr = rInfo->ri_TrigInstrument; + if (trigdesc == NULL) /* should not happen */ + elog(ERROR, "relation %u has no triggers", + evtshared->ats_relid); + } - /* - * Fire it. Note that the AFTER_TRIGGER_IN_PROGRESS flag is still - * set, so recursive examinations of the event list won't try to - * re-fire it. - */ - AfterTriggerExecute(event, rel, trigdesc, finfo, instr, - per_tuple_context); + /* + * Fire it. Note that the AFTER_TRIGGER_IN_PROGRESS flag is + * still set, so recursive examinations of the event list + * won't try to re-fire it. + */ + AfterTriggerExecute(event, rel, trigdesc, finfo, instr, + per_tuple_context); - /* - * Mark the event as done. - */ - event->ate_event &= ~AFTER_TRIGGER_IN_PROGRESS; - event->ate_event |= AFTER_TRIGGER_DONE; + /* + * Mark the event as done. + */ + event->ate_flags &= ~AFTER_TRIGGER_IN_PROGRESS; + event->ate_flags |= AFTER_TRIGGER_DONE; + } + else if (!(event->ate_flags & AFTER_TRIGGER_DONE)) + { + /* something remains to be done */ + all_fired = all_fired_in_chunk = false; + } } - /* - * If it's now done, throw it away, if allowed. - * - * NB: it's possible the trigger call above added more events to the - * queue, or that calls we will do later will want to add more, so we - * have to be careful about maintaining list validity at all points - * here. - */ - next_event = event->ate_next; - - if ((event->ate_event & AFTER_TRIGGER_DONE) && delete_ok) + /* Clear the chunk if delete_ok and nothing left of interest */ + if (delete_ok && all_fired_in_chunk) { - /* Delink it from list and free it */ - if (prev_event) - prev_event->ate_next = next_event; - else - events->head = next_event; - pfree(event); - } - else - { - /* Keep it in list */ - prev_event = event; + chunk->freeptr = CHUNK_DATA_START(chunk); + chunk->endfree = chunk->endptr; } - - event = next_event; } - /* Update list tail pointer in case we just deleted tail event */ - events->tail = prev_event; - /* Release working resources */ MemoryContextDelete(per_tuple_context); @@ -2832,6 +3010,8 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, } FreeExecutorState(estate); } + + return all_fired; } @@ -2854,10 +3034,11 @@ AfterTriggerBeginXact(void) MemoryContextAlloc(TopTransactionContext, sizeof(AfterTriggersData)); - afterTriggers->firing_counter = FirstCommandId; + afterTriggers->firing_counter = (CommandId) 1; /* mustn't be 0 */ afterTriggers->state = SetConstraintStateCreate(8); afterTriggers->events.head = NULL; afterTriggers->events.tail = NULL; + afterTriggers->events.tailfree = NULL; afterTriggers->query_depth = -1; /* We initialize the query stack to a reasonable size */ @@ -2870,7 +3051,6 @@ AfterTriggerBeginXact(void) afterTriggers->event_cxt = NULL; /* Subtransaction stack is empty until/unless needed */ - afterTriggers->cxt_stack = NULL; afterTriggers->state_stack = NULL; afterTriggers->events_stack = NULL; afterTriggers->depth_stack = NULL; @@ -2891,6 +3071,8 @@ AfterTriggerBeginXact(void) void AfterTriggerBeginQuery(void) { + AfterTriggerEventList *events; + /* Must be inside a transaction */ Assert(afterTriggers != NULL); @@ -2912,8 +3094,10 @@ AfterTriggerBeginQuery(void) } /* Initialize this query's list to empty */ - afterTriggers->query_stack[afterTriggers->query_depth].head = NULL; - afterTriggers->query_stack[afterTriggers->query_depth].tail = NULL; + events = &afterTriggers->query_stack[afterTriggers->query_depth]; + events->head = NULL; + events->tail = NULL; + events->tailfree = NULL; } @@ -2950,20 +3134,32 @@ AfterTriggerEndQuery(EState *estate) * IMMEDIATE: all events we have decided to defer will be available for it * to fire. * - * We loop in case a trigger queues more events. + * We loop in case a trigger queues more events at the same query level + * (is that even possible?). Be careful here: firing a trigger could + * result in query_stack being repalloc'd, so we can't save its address + * across afterTriggerInvokeEvents calls. * * If we find no firable events, we don't have to increment * firing_counter. */ - events = &afterTriggers->query_stack[afterTriggers->query_depth]; - while (afterTriggerMarkEvents(events, &afterTriggers->events, true)) + for (;;) { - CommandId firing_id = afterTriggers->firing_counter++; + events = &afterTriggers->query_stack[afterTriggers->query_depth]; + if (afterTriggerMarkEvents(events, &afterTriggers->events, true)) + { + CommandId firing_id = afterTriggers->firing_counter++; - /* OK to delete the immediate events after processing them */ - afterTriggerInvokeEvents(events, firing_id, estate, true); + /* OK to delete the immediate events after processing them */ + if (afterTriggerInvokeEvents(events, firing_id, estate, true)) + break; /* all fired */ + } + else + break; } + /* Release query-local storage for events */ + afterTriggerFreeEventList(&afterTriggers->query_stack[afterTriggers->query_depth]); + afterTriggers->query_depth--; } @@ -3011,13 +3207,17 @@ AfterTriggerFireDeferred(void) { CommandId firing_id = afterTriggers->firing_counter++; - afterTriggerInvokeEvents(events, firing_id, NULL, true); + if (afterTriggerInvokeEvents(events, firing_id, NULL, true)) + break; /* all fired */ } + /* + * We don't bother freeing the event list, since it will go away anyway + * (and more efficiently than via pfree) in AfterTriggerEndXact. + */ + if (snap_pushed) PopActiveSnapshot(); - - Assert(events->head == NULL); } @@ -3045,10 +3245,6 @@ AfterTriggerEndXact(bool isCommit) * pending-events list could be large, and so it's useful to discard it as * soon as possible --- especially if we are aborting because we ran out * of memory for the list! - * - * (Note: any event_cxts of child subtransactions could also be deleted - * here, but we have no convenient way to find them, so we leave it to - * TopTransactionContext reset to clean them up.) */ if (afterTriggers && afterTriggers->event_cxt) MemoryContextDelete(afterTriggers->event_cxt); @@ -3087,8 +3283,6 @@ AfterTriggerBeginSubXact(void) old_cxt = MemoryContextSwitchTo(TopTransactionContext); #define DEFTRIG_INITALLOC 8 - afterTriggers->cxt_stack = (MemoryContext *) - palloc(DEFTRIG_INITALLOC * sizeof(MemoryContext)); afterTriggers->state_stack = (SetConstraintState *) palloc(DEFTRIG_INITALLOC * sizeof(SetConstraintState)); afterTriggers->events_stack = (AfterTriggerEventList *) @@ -3106,9 +3300,6 @@ AfterTriggerBeginSubXact(void) /* repalloc will keep the stacks in the same context */ int new_alloc = afterTriggers->maxtransdepth * 2; - afterTriggers->cxt_stack = (MemoryContext *) - repalloc(afterTriggers->cxt_stack, - new_alloc * sizeof(MemoryContext)); afterTriggers->state_stack = (SetConstraintState *) repalloc(afterTriggers->state_stack, new_alloc * sizeof(SetConstraintState)); @@ -3130,7 +3321,6 @@ AfterTriggerBeginSubXact(void) * is not saved until/unless changed. Likewise, we don't make a * per-subtransaction event context until needed. */ - afterTriggers->cxt_stack[my_level] = NULL; afterTriggers->state_stack[my_level] = NULL; afterTriggers->events_stack[my_level] = afterTriggers->events; afterTriggers->depth_stack[my_level] = afterTriggers->query_depth; @@ -3148,6 +3338,7 @@ AfterTriggerEndSubXact(bool isCommit) int my_level = GetCurrentTransactionNestLevel(); SetConstraintState state; AfterTriggerEvent event; + AfterTriggerEventChunk *chunk; CommandId subxact_firing_id; /* @@ -3172,55 +3363,27 @@ AfterTriggerEndSubXact(bool isCommit) afterTriggers->state_stack[my_level] = NULL; Assert(afterTriggers->query_depth == afterTriggers->depth_stack[my_level]); - - /* - * It's entirely possible that the subxact created an event_cxt but - * there is not anything left in it (because all the triggers were - * fired at end-of-statement). If so, we should release the context - * to prevent memory leakage in a long sequence of subtransactions. We - * can detect whether there's anything of use in the context by seeing - * if anything was added to the global events list since subxact - * start. (This test doesn't catch every case where the context is - * deletable; for instance maybe the only additions were from a - * sub-sub-xact. But it handles the common case.) - */ - if (afterTriggers->cxt_stack[my_level] && - afterTriggers->events.tail == afterTriggers->events_stack[my_level].tail) - { - MemoryContextDelete(afterTriggers->cxt_stack[my_level]); - /* avoid double delete if abort later */ - afterTriggers->cxt_stack[my_level] = NULL; - } } else { /* - * Aborting. We don't really need to release the subxact's event_cxt, - * since it will go away anyway when CurTransactionContext gets reset, - * but doing so early in subxact abort helps free space we might need. - * - * (Note: any event_cxts of child subtransactions could also be - * deleted here, but we have no convenient way to find them, so we - * leave it to CurTransactionContext reset to clean them up.) + * Aborting. Release any event lists from queries being aborted, + * and restore query_depth to its pre-subxact value. */ - if (afterTriggers->cxt_stack[my_level]) + while (afterTriggers->query_depth > afterTriggers->depth_stack[my_level]) { - MemoryContextDelete(afterTriggers->cxt_stack[my_level]); - /* avoid double delete if repeated aborts */ - afterTriggers->cxt_stack[my_level] = NULL; + afterTriggerFreeEventList(&afterTriggers->query_stack[afterTriggers->query_depth]); + afterTriggers->query_depth--; } + Assert(afterTriggers->query_depth == + afterTriggers->depth_stack[my_level]); /* - * Restore the pointers from the stacks. - */ - afterTriggers->events = afterTriggers->events_stack[my_level]; - afterTriggers->query_depth = afterTriggers->depth_stack[my_level]; - - /* - * Cleanup the tail of the list. + * Restore the global deferred-event list to its former length, + * discarding any events queued by the subxact. */ - if (afterTriggers->events.tail != NULL) - afterTriggers->events.tail->ate_next = NULL; + afterTriggerRestoreEventList(&afterTriggers->events, + &afterTriggers->events_stack[my_level]); /* * Restore the trigger state. If the saved state is NULL, then this @@ -3244,15 +3407,15 @@ AfterTriggerEndSubXact(bool isCommit) * subxacts started after it.) */ subxact_firing_id = afterTriggers->firing_stack[my_level]; - for (event = afterTriggers->events.head; - event != NULL; - event = event->ate_next) + for_each_event_chunk(event, chunk, afterTriggers->events) { - if (event->ate_event & + AfterTriggerShared evtshared = GetTriggerSharedData(event); + + if (event->ate_flags & (AFTER_TRIGGER_DONE | AFTER_TRIGGER_IN_PROGRESS)) { - if (event->ate_firing_id >= subxact_firing_id) - event->ate_event &= + if (evtshared->ats_firing_id >= subxact_firing_id) + event->ate_flags &= ~(AFTER_TRIGGER_DONE | AFTER_TRIGGER_IN_PROGRESS); } } @@ -3579,8 +3742,9 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt) * but we'd better not if inside a subtransaction, since the * subtransaction could later get rolled back. */ - afterTriggerInvokeEvents(events, firing_id, NULL, - !IsSubTransaction()); + if (afterTriggerInvokeEvents(events, firing_id, NULL, + !IsSubTransaction())) + break; /* all fired */ } } } @@ -3604,6 +3768,7 @@ bool AfterTriggerPendingOnRel(Oid relid) { AfterTriggerEvent event; + AfterTriggerEventChunk *chunk; int depth; /* No-op if we aren't in a transaction. (Shouldn't happen?) */ @@ -3611,19 +3776,19 @@ AfterTriggerPendingOnRel(Oid relid) return false; /* Scan queued events */ - for (event = afterTriggers->events.head; - event != NULL; - event = event->ate_next) + for_each_event_chunk(event, chunk, afterTriggers->events) { + AfterTriggerShared evtshared = GetTriggerSharedData(event); + /* * We can ignore completed events. (Even if a DONE flag is rolled * back by subxact abort, it's OK because the effects of the TRUNCATE * or whatever must get rolled back too.) */ - if (event->ate_event & AFTER_TRIGGER_DONE) + if (event->ate_flags & AFTER_TRIGGER_DONE) continue; - if (event->ate_relid == relid) + if (evtshared->ats_relid == relid) return true; } @@ -3634,14 +3799,14 @@ AfterTriggerPendingOnRel(Oid relid) */ for (depth = 0; depth <= afterTriggers->query_depth; depth++) { - for (event = afterTriggers->query_stack[depth].head; - event != NULL; - event = event->ate_next) + for_each_event_chunk(event, chunk, afterTriggers->query_stack[depth]) { - if (event->ate_event & AFTER_TRIGGER_DONE) + AfterTriggerShared evtshared = GetTriggerSharedData(event); + + if (event->ate_flags & AFTER_TRIGGER_DONE) continue; - if (event->ate_relid == relid) + if (evtshared->ats_relid == relid) return true; } } @@ -3665,35 +3830,85 @@ AfterTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger, { Relation rel = relinfo->ri_RelationDesc; TriggerDesc *trigdesc = relinfo->ri_TrigDesc; - int my_level = GetCurrentTransactionNestLevel(); - MemoryContext *cxtptr; - AfterTriggerEvent new_event; + AfterTriggerEventData new_event; + AfterTriggerSharedData new_shared; int i; int ntriggers; int *tgindx; - ItemPointerData oldctid; - ItemPointerData newctid; if (afterTriggers == NULL) elog(ERROR, "AfterTriggerSaveEvent() called outside of transaction"); + Assert(afterTriggers->query_depth >= 0); /* - * event is used both as a bitmask and an array offset, - * so make sure we don't walk off the edge of our arrays - */ - Assert(event >= 0 && event < TRIGGER_NUM_EVENT_CLASSES); - - /* - * Get the CTID's of OLD and NEW + * Validate the event code and collect the associated tuple CTIDs. + * + * The event code will be used both as a bitmask and an array offset, so + * validation is important to make sure we don't walk off the edge of our + * arrays. */ - if (oldtup != NULL) - ItemPointerCopy(&(oldtup->t_self), &(oldctid)); - else - ItemPointerSetInvalid(&(oldctid)); - if (newtup != NULL) - ItemPointerCopy(&(newtup->t_self), &(newctid)); - else - ItemPointerSetInvalid(&(newctid)); + new_event.ate_flags = 0; + switch (event) + { + case TRIGGER_EVENT_INSERT: + if (row_trigger) + { + Assert(oldtup == NULL); + Assert(newtup != NULL); + ItemPointerCopy(&(newtup->t_self), &(new_event.ate_ctid1)); + ItemPointerSetInvalid(&(new_event.ate_ctid2)); + } + else + { + Assert(oldtup == NULL); + Assert(newtup == NULL); + ItemPointerSetInvalid(&(new_event.ate_ctid1)); + ItemPointerSetInvalid(&(new_event.ate_ctid2)); + } + break; + case TRIGGER_EVENT_DELETE: + if (row_trigger) + { + Assert(oldtup != NULL); + Assert(newtup == NULL); + ItemPointerCopy(&(oldtup->t_self), &(new_event.ate_ctid1)); + ItemPointerSetInvalid(&(new_event.ate_ctid2)); + } + else + { + Assert(oldtup == NULL); + Assert(newtup == NULL); + ItemPointerSetInvalid(&(new_event.ate_ctid1)); + ItemPointerSetInvalid(&(new_event.ate_ctid2)); + } + break; + case TRIGGER_EVENT_UPDATE: + if (row_trigger) + { + Assert(oldtup != NULL); + Assert(newtup != NULL); + ItemPointerCopy(&(oldtup->t_self), &(new_event.ate_ctid1)); + ItemPointerCopy(&(newtup->t_self), &(new_event.ate_ctid2)); + new_event.ate_flags |= AFTER_TRIGGER_2CTIDS; + } + else + { + Assert(oldtup == NULL); + Assert(newtup == NULL); + ItemPointerSetInvalid(&(new_event.ate_ctid1)); + ItemPointerSetInvalid(&(new_event.ate_ctid2)); + } + break; + case TRIGGER_EVENT_TRUNCATE: + Assert(oldtup == NULL); + Assert(newtup == NULL); + ItemPointerSetInvalid(&(new_event.ate_ctid1)); + ItemPointerSetInvalid(&(new_event.ate_ctid2)); + break; + default: + elog(ERROR, "invalid after-trigger event code: %d", event); + break; + } /* * Scan the appropriate set of triggers @@ -3772,44 +3987,18 @@ AfterTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger, } /* - * If we don't yet have an event context for the current (sub)xact, - * create one. Make it a child of CurTransactionContext to ensure it - * will go away if the subtransaction aborts. - */ - if (my_level > 1) /* subtransaction? */ - { - Assert(my_level < afterTriggers->maxtransdepth); - cxtptr = &afterTriggers->cxt_stack[my_level]; - } - else - cxtptr = &afterTriggers->event_cxt; - if (*cxtptr == NULL) - *cxtptr = AllocSetContextCreate(CurTransactionContext, - "AfterTriggerEvents", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - - /* - * Create a new event. + * Fill in event structure and add it to the current query's queue. */ - new_event = (AfterTriggerEvent) - MemoryContextAlloc(*cxtptr, sizeof(AfterTriggerEventData)); - new_event->ate_next = NULL; - new_event->ate_event = + new_shared.ats_event = (event & TRIGGER_EVENT_OPMASK) | (row_trigger ? TRIGGER_EVENT_ROW : 0) | (trigger->tgdeferrable ? AFTER_TRIGGER_DEFERRABLE : 0) | (trigger->tginitdeferred ? AFTER_TRIGGER_INITDEFERRED : 0); - new_event->ate_firing_id = 0; - new_event->ate_tgoid = trigger->tgoid; - new_event->ate_relid = rel->rd_id; - ItemPointerCopy(&oldctid, &(new_event->ate_oldctid)); - ItemPointerCopy(&newctid, &(new_event->ate_newctid)); + new_shared.ats_tgoid = trigger->tgoid; + new_shared.ats_relid = RelationGetRelid(rel); + new_shared.ats_firing_id = 0; - /* - * Add the new event to the queue. - */ - afterTriggerAddEvent(new_event); + afterTriggerAddEvent(&afterTriggers->query_stack[afterTriggers->query_depth], + &new_event, &new_shared); } } diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index ae4f999776..bdc089f33a 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/commands/trigger.h,v 1.68 2008/09/19 14:43:46 mha Exp $ + * $PostgreSQL: pgsql/src/include/commands/trigger.h,v 1.69 2008/10/24 23:42:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -56,10 +56,8 @@ typedef struct TriggerData /* More TriggerEvent flags, used only within trigger.c */ -#define AFTER_TRIGGER_DONE 0x00000010 -#define AFTER_TRIGGER_IN_PROGRESS 0x00000020 -#define AFTER_TRIGGER_DEFERRABLE 0x00000040 -#define AFTER_TRIGGER_INITDEFERRED 0x00000080 +#define AFTER_TRIGGER_DEFERRABLE 0x00000010 +#define AFTER_TRIGGER_INITDEFERRED 0x00000020 #define TRIGGER_FIRED_BY_INSERT(event) \ (((TriggerEvent) (event) & TRIGGER_EVENT_OPMASK) == \ -- 2.40.0