granicus.if.org Git - postgresql - src/backend/utils/sort/tuplestore.c
Permit super-MaxAllocSize allocations with MemoryContextAllocHuge().
[postgresql] / src / backend / utils / sort / tuplestore.c
1 /*-------------------------------------------------------------------------
2  *
3  * tuplestore.c
4  *        Generalized routines for temporary tuple storage.
5  *
6  * This module handles temporary storage of tuples for purposes such
7  * as Materialize nodes, hashjoin batch files, etc.  It is essentially
8  * a dumbed-down version of tuplesort.c; it does no sorting of tuples
9  * but can only store and regurgitate a sequence of tuples.  However,
10  * because no sort is required, it is allowed to start reading the sequence
11  * before it has all been written.      This is particularly useful for cursors,
12  * because it allows random access within the already-scanned portion of
13  * a query without having to process the underlying scan to completion.
14  * Also, it is possible to support multiple independent read pointers.
15  *
16  * A temporary file is used to handle the data if it exceeds the
17  * space limit specified by the caller.
18  *
19  * The (approximate) amount of memory allowed to the tuplestore is specified
20  * in kilobytes by the caller.  We absorb tuples and simply store them in an
21  * in-memory array as long as we haven't exceeded maxKBytes.  If we do exceed
22  * maxKBytes, we dump all the tuples into a temp file and then read from that
23  * when needed.
24  *
25  * Upon creation, a tuplestore supports a single read pointer, numbered 0.
26  * Additional read pointers can be created using tuplestore_alloc_read_pointer.
27  * Mark/restore behavior is supported by copying read pointers.
28  *
29  * When the caller requests backward-scan capability, we write the temp file
30  * in a format that allows either forward or backward scan.  Otherwise, only
31  * forward scan is allowed.  A request for backward scan must be made before
32  * putting any tuples into the tuplestore.      Rewind is normally allowed but
33  * can be turned off via tuplestore_set_eflags; turning off rewind for all
34  * read pointers enables truncation of the tuplestore at the oldest read point
35  * for minimal memory usage.  (The caller must explicitly call tuplestore_trim
36  * at appropriate times for truncation to actually happen.)
37  *
38  * Note: in TSS_WRITEFILE state, the temp file's seek position is the
39  * current write position, and the write-position variables in the tuplestore
40  * aren't kept up to date.  Similarly, in TSS_READFILE state the temp file's
41  * seek position is the active read pointer's position, and that read pointer
42  * isn't kept up to date.  We update the appropriate variables using ftell()
43  * before switching to the other state or activating a different read pointer.
44  *
45  *
46  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
47  * Portions Copyright (c) 1994, Regents of the University of California
48  *
49  * IDENTIFICATION
50  *        src/backend/utils/sort/tuplestore.c
51  *
52  *-------------------------------------------------------------------------
53  */
54
55 #include "postgres.h"
56
57 #include "access/htup_details.h"
58 #include "commands/tablespace.h"
59 #include "executor/executor.h"
60 #include "storage/buffile.h"
61 #include "utils/memutils.h"
62 #include "utils/resowner.h"
63
64
/*
 * Possible states of a Tuplestore object.  These denote the states that
 * persist between calls of Tuplestore routines.  A tuplestore starts in
 * TSS_INMEM and moves (one-way) to the file-based states once the memory
 * budget is exceeded.
 */
typedef enum
{
	TSS_INMEM,					/* Tuples still fit in memory */
	TSS_WRITEFILE,				/* Writing to temp file */
	TSS_READFILE				/* Reading from temp file */
} TupStoreStatus;
75
/*
 * State for a single read pointer.  If we are in state INMEM then all the
 * read pointers' "current" fields denote the read positions.  In state
 * WRITEFILE, the file/offset fields denote the read positions.  In state
 * READFILE, inactive read pointers have valid file/offset, but the active
 * read pointer implicitly has position equal to the temp file's seek position.
 *
 * Special case: if eof_reached is true, then the pointer's read position is
 * implicitly equal to the write position, and current/file/offset aren't
 * maintained.  This way we need not update all the read pointers each time
 * we write.
 */
typedef struct
{
	int			eflags;			/* capability flags (EXEC_FLAG_REWIND etc.) */
	bool		eof_reached;	/* read has reached EOF */
	int			current;		/* next memtuples[] index to read (INMEM) */
	int			file;			/* temp file# (file-based states) */
	off_t		offset;			/* byte offset in that file */
} TSReadPointer;
96
/*
 * Private state of a Tuplestore operation.
 */
struct Tuplestorestate
{
	TupStoreStatus status;		/* enumerated value as shown above */
	int			eflags;			/* capability flags (OR of pointers' flags) */
	bool		backward;		/* store extra length words in file? */
	bool		interXact;		/* keep open through transactions? */
	bool		truncated;		/* tuplestore_trim has removed tuples? */

	/*
	 * NOTE(review): these two counters are declared Size (unsigned), but the
	 * LACKMEM() macro below tests availMem < 0, which can never be true for
	 * an unsigned type.  Confirm whether these fields should instead be a
	 * signed type (long) so the negative-balance check works as intended.
	 */
	Size		availMem;		/* remaining memory available, in bytes */
	Size		allowedMem;		/* total memory allowed, in bytes */
	BufFile    *myfile;			/* underlying file, or NULL if none */
	MemoryContext context;		/* memory context for holding tuples */
	ResourceOwner resowner;		/* resowner for holding temp files */

	/*
	 * These function pointers decouple the routines that must know what kind
	 * of tuple we are handling from the routines that don't need to know it.
	 * They are set up by the tuplestore_begin_xxx routines.
	 *
	 * (Although tuplestore.c currently only supports heap tuples, I've copied
	 * this part of tuplesort.c so that extension to other kinds of objects
	 * will be easy if it's ever needed.)
	 *
	 * Function to copy a supplied input tuple into palloc'd space. (NB: we
	 * assume that a single pfree() is enough to release the tuple later, so
	 * the representation must be "flat" in one palloc chunk.) state->availMem
	 * must be decreased by the amount of space used.
	 */
	void	   *(*copytup) (Tuplestorestate *state, void *tup);

	/*
	 * Function to write a stored tuple onto tape.  The representation of the
	 * tuple on tape need not be the same as it is in memory; requirements on
	 * the tape representation are given below.  After writing the tuple,
	 * pfree() it, and increase state->availMem by the amount of memory space
	 * thereby released.
	 */
	void		(*writetup) (Tuplestorestate *state, void *tup);

	/*
	 * Function to read a stored tuple from tape back into memory. 'len' is
	 * the already-read length of the stored tuple.  Create and return a
	 * palloc'd copy, and decrease state->availMem by the amount of memory
	 * space consumed.
	 */
	void	   *(*readtup) (Tuplestorestate *state, unsigned int len);

	/*
	 * This array holds pointers to tuples in memory if we are in state INMEM.
	 * In states WRITEFILE and READFILE it's not used.
	 *
	 * When memtupdeleted > 0, the first memtupdeleted pointers are already
	 * released due to a tuplestore_trim() operation, but we haven't expended
	 * the effort to slide the remaining pointers down.  These unused pointers
	 * are set to NULL to catch any invalid accesses.  Note that memtupcount
	 * includes the deleted pointers.
	 */
	void	  **memtuples;		/* array of pointers to palloc'd tuples */
	int			memtupdeleted;	/* the first N slots are currently unused */
	int			memtupcount;	/* number of tuples currently present */
	int			memtupsize;		/* allocated length of memtuples array */
	bool		growmemtuples;	/* memtuples' growth still underway? */

	/*
	 * These variables are used to keep track of the current positions.
	 *
	 * In state WRITEFILE, the current file seek position is the write point;
	 * in state READFILE, the write position is remembered in writepos_xxx.
	 * (The write position is the same as EOF, but since BufFileSeek doesn't
	 * currently implement SEEK_END, we have to remember it explicitly.)
	 */
	TSReadPointer *readptrs;	/* array of read pointers */
	int			activeptr;		/* index of the active read pointer */
	int			readptrcount;	/* number of pointers currently valid */
	int			readptrsize;	/* allocated length of readptrs array */

	int			writepos_file;	/* file# (valid if READFILE state) */
	off_t		writepos_offset;	/* offset (valid if READFILE state) */
};
178
/* Dispatch through the per-tuple-kind routines installed at begin time */
#define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
#define WRITETUP(state,tup) ((*(state)->writetup) (state, tup))
#define READTUP(state,len)	((*(state)->readtup) (state, len))
/*
 * Memory-accounting helpers.  NOTE(review): LACKMEM tests availMem < 0,
 * but availMem is declared Size (unsigned) above, so as written the test
 * is always false -- confirm the intended signedness of availMem.
 */
#define LACKMEM(state)		((state)->availMem < 0)
#define USEMEM(state,amt)	((state)->availMem -= (amt))
#define FREEMEM(state,amt)	((state)->availMem += (amt))
185
186 /*--------------------
187  *
188  * NOTES about on-tape representation of tuples:
189  *
190  * We require the first "unsigned int" of a stored tuple to be the total size
191  * on-tape of the tuple, including itself (so it is never zero).
192  * The remainder of the stored tuple
193  * may or may not match the in-memory representation of the tuple ---
194  * any conversion needed is the job of the writetup and readtup routines.
195  *
196  * If state->backward is true, then the stored representation of
197  * the tuple must be followed by another "unsigned int" that is a copy of the
198  * length --- so the total tape space used is actually sizeof(unsigned int)
199  * more than the stored length value.  This allows read-backwards.      When
200  * state->backward is not set, the write/read routines may omit the extra
201  * length word.
202  *
203  * writetup is expected to write both length words as well as the tuple
204  * data.  When readtup is called, the tape is positioned just after the
205  * front length word; readtup must read the tuple data and advance past
206  * the back length word (if present).
207  *
208  * The write/read routines can make use of the tuple description data
209  * stored in the Tuplestorestate record, if needed. They are also expected
210  * to adjust state->availMem by the amount of memory space (not tape space!)
211  * released or consumed.  There is no error return from either writetup
212  * or readtup; they should ereport() on failure.
213  *
214  *
215  * NOTES about memory consumption calculations:
216  *
217  * We count space allocated for tuples against the maxKBytes limit,
218  * plus the space used by the variable-size array memtuples.
219  * Fixed-size space (primarily the BufFile I/O buffer) is not counted.
220  * We don't worry about the size of the read pointer array, either.
221  *
222  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
223  * rather than the originally-requested size.  This is important since
224  * palloc can add substantial overhead.  It's not a complete answer since
225  * we won't count any wasted space in palloc allocation blocks, but it's
226  * a lot better than what we were doing before 7.3.
227  *
228  *--------------------
229  */
230
231
/* Forward declarations of local routines (definitions follow below) */
static Tuplestorestate *tuplestore_begin_common(int eflags,
						bool interXact,
						int maxKBytes);
static void tuplestore_puttuple_common(Tuplestorestate *state, void *tuple);
static void dumptuples(Tuplestorestate *state);
static unsigned int getlen(Tuplestorestate *state, bool eofOK);
static void *copytup_heap(Tuplestorestate *state, void *tup);
static void writetup_heap(Tuplestorestate *state, void *tup);
static void *readtup_heap(Tuplestorestate *state, unsigned int len);
241
242
243 /*
244  *              tuplestore_begin_xxx
245  *
246  * Initialize for a tuple store operation.
247  */
248 static Tuplestorestate *
249 tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
250 {
251         Tuplestorestate *state;
252
253         state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));
254
255         state->status = TSS_INMEM;
256         state->eflags = eflags;
257         state->interXact = interXact;
258         state->truncated = false;
259         state->allowedMem = maxKBytes * 1024L;
260         state->availMem = state->allowedMem;
261         state->myfile = NULL;
262         state->context = CurrentMemoryContext;
263         state->resowner = CurrentResourceOwner;
264
265         state->memtupdeleted = 0;
266         state->memtupcount = 0;
267         state->memtupsize = 1024;       /* initial guess */
268         state->growmemtuples = true;
269         state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));
270
271         USEMEM(state, GetMemoryChunkSpace(state->memtuples));
272
273         state->activeptr = 0;
274         state->readptrcount = 1;
275         state->readptrsize = 8;         /* arbitrary */
276         state->readptrs = (TSReadPointer *)
277                 palloc(state->readptrsize * sizeof(TSReadPointer));
278
279         state->readptrs[0].eflags = eflags;
280         state->readptrs[0].eof_reached = false;
281         state->readptrs[0].current = 0;
282
283         return state;
284 }
285
286 /*
287  * tuplestore_begin_heap
288  *
289  * Create a new tuplestore; other types of tuple stores (other than
290  * "heap" tuple stores, for heap tuples) are possible, but not presently
291  * implemented.
292  *
293  * randomAccess: if true, both forward and backward accesses to the
294  * tuple store are allowed.
295  *
296  * interXact: if true, the files used for on-disk storage persist beyond the
297  * end of the current transaction.      NOTE: It's the caller's responsibility to
298  * create such a tuplestore in a memory context and resource owner that will
299  * also survive transaction boundaries, and to ensure the tuplestore is closed
300  * when it's no longer wanted.
301  *
302  * maxKBytes: how much data to store in memory (any data beyond this
303  * amount is paged to disk).  When in doubt, use work_mem.
304  */
305 Tuplestorestate *
306 tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
307 {
308         Tuplestorestate *state;
309         int                     eflags;
310
311         /*
312          * This interpretation of the meaning of randomAccess is compatible with
313          * the pre-8.3 behavior of tuplestores.
314          */
315         eflags = randomAccess ?
316                 (EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND) :
317                 (EXEC_FLAG_REWIND);
318
319         state = tuplestore_begin_common(eflags, interXact, maxKBytes);
320
321         state->copytup = copytup_heap;
322         state->writetup = writetup_heap;
323         state->readtup = readtup_heap;
324
325         return state;
326 }
327
328 /*
329  * tuplestore_set_eflags
330  *
331  * Set the capability flags for read pointer 0 at a finer grain than is
332  * allowed by tuplestore_begin_xxx.  This must be called before inserting
333  * any data into the tuplestore.
334  *
335  * eflags is a bitmask following the meanings used for executor node
336  * startup flags (see executor.h).      tuplestore pays attention to these bits:
337  *              EXEC_FLAG_REWIND                need rewind to start
338  *              EXEC_FLAG_BACKWARD              need backward fetch
339  * If tuplestore_set_eflags is not called, REWIND is allowed, and BACKWARD
340  * is set per "randomAccess" in the tuplestore_begin_xxx call.
341  *
342  * NOTE: setting BACKWARD without REWIND means the pointer can read backwards,
343  * but not further than the truncation point (the furthest-back read pointer
344  * position at the time of the last tuplestore_trim call).
345  */
346 void
347 tuplestore_set_eflags(Tuplestorestate *state, int eflags)
348 {
349         int                     i;
350
351         if (state->status != TSS_INMEM || state->memtupcount != 0)
352                 elog(ERROR, "too late to call tuplestore_set_eflags");
353
354         state->readptrs[0].eflags = eflags;
355         for (i = 1; i < state->readptrcount; i++)
356                 eflags |= state->readptrs[i].eflags;
357         state->eflags = eflags;
358 }
359
360 /*
361  * tuplestore_alloc_read_pointer - allocate another read pointer.
362  *
363  * Returns the pointer's index.
364  *
365  * The new pointer initially copies the position of read pointer 0.
366  * It can have its own eflags, but if any data has been inserted into
367  * the tuplestore, these eflags must not represent an increase in
368  * requirements.
369  */
370 int
371 tuplestore_alloc_read_pointer(Tuplestorestate *state, int eflags)
372 {
373         /* Check for possible increase of requirements */
374         if (state->status != TSS_INMEM || state->memtupcount != 0)
375         {
376                 if ((state->eflags | eflags) != state->eflags)
377                         elog(ERROR, "too late to require new tuplestore eflags");
378         }
379
380         /* Make room for another read pointer if needed */
381         if (state->readptrcount >= state->readptrsize)
382         {
383                 int                     newcnt = state->readptrsize * 2;
384
385                 state->readptrs = (TSReadPointer *)
386                         repalloc(state->readptrs, newcnt * sizeof(TSReadPointer));
387                 state->readptrsize = newcnt;
388         }
389
390         /* And set it up */
391         state->readptrs[state->readptrcount] = state->readptrs[0];
392         state->readptrs[state->readptrcount].eflags = eflags;
393
394         state->eflags |= eflags;
395
396         return state->readptrcount++;
397 }
398
399 /*
400  * tuplestore_clear
401  *
402  *      Delete all the contents of a tuplestore, and reset its read pointers
403  *      to the start.
404  */
405 void
406 tuplestore_clear(Tuplestorestate *state)
407 {
408         int                     i;
409         TSReadPointer *readptr;
410
411         if (state->myfile)
412                 BufFileClose(state->myfile);
413         state->myfile = NULL;
414         if (state->memtuples)
415         {
416                 for (i = state->memtupdeleted; i < state->memtupcount; i++)
417                 {
418                         FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
419                         pfree(state->memtuples[i]);
420                 }
421         }
422         state->status = TSS_INMEM;
423         state->truncated = false;
424         state->memtupdeleted = 0;
425         state->memtupcount = 0;
426         readptr = state->readptrs;
427         for (i = 0; i < state->readptrcount; readptr++, i++)
428         {
429                 readptr->eof_reached = false;
430                 readptr->current = 0;
431         }
432 }
433
434 /*
435  * tuplestore_end
436  *
437  *      Release resources and clean up.
438  */
439 void
440 tuplestore_end(Tuplestorestate *state)
441 {
442         int                     i;
443
444         if (state->myfile)
445                 BufFileClose(state->myfile);
446         if (state->memtuples)
447         {
448                 for (i = state->memtupdeleted; i < state->memtupcount; i++)
449                         pfree(state->memtuples[i]);
450                 pfree(state->memtuples);
451         }
452         pfree(state->readptrs);
453         pfree(state);
454 }
455
/*
 * tuplestore_select_read_pointer - make the specified read pointer active
 *
 * In TSS_READFILE state the active pointer's position lives in the temp
 * file's seek position (see file-header comments), so switching pointers
 * means saving the old position and seeking to the new one.  In the other
 * states all positions are kept explicitly, so no file work is needed.
 */
void
tuplestore_select_read_pointer(Tuplestorestate *state, int ptr)
{
	TSReadPointer *readptr;
	TSReadPointer *oldptr;

	Assert(ptr >= 0 && ptr < state->readptrcount);

	/* No work if already active */
	if (ptr == state->activeptr)
		return;

	readptr = &state->readptrs[ptr];
	oldptr = &state->readptrs[state->activeptr];

	switch (state->status)
	{
		case TSS_INMEM:
		case TSS_WRITEFILE:
			/* no work */
			break;
		case TSS_READFILE:

			/*
			 * First, save the current read position in the pointer about to
			 * become inactive.  (If it's at EOF its position is implicit, so
			 * there is nothing to save.)
			 */
			if (!oldptr->eof_reached)
				BufFileTell(state->myfile,
							&oldptr->file,
							&oldptr->offset);

			/*
			 * We have to make the temp file's seek position equal to the
			 * logical position of the new read pointer.  In eof_reached
			 * state, that's the EOF, which we have available from the saved
			 * write position.
			 */
			if (readptr->eof_reached)
			{
				if (BufFileSeek(state->myfile,
								state->writepos_file,
								state->writepos_offset,
								SEEK_SET) != 0)
					elog(ERROR, "tuplestore seek failed");
			}
			else
			{
				if (BufFileSeek(state->myfile,
								readptr->file,
								readptr->offset,
								SEEK_SET) != 0)
					elog(ERROR, "tuplestore seek failed");
			}
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}

	state->activeptr = ptr;
}
521
522 /*
523  * tuplestore_ateof
524  *
525  * Returns the active read pointer's eof_reached state.
526  */
527 bool
528 tuplestore_ateof(Tuplestorestate *state)
529 {
530         return state->readptrs[state->activeptr].eof_reached;
531 }
532
/*
 * Grow the memtuples[] array, if possible within our memory constraint.  We
 * must not exceed INT_MAX tuples in memory or the caller-provided memory
 * limit.  Return TRUE if we were able to enlarge the array, FALSE if not.
 *
 * Normally, at each increment we double the size of the array.  When doing
 * that would exceed a limit, we attempt one last, smaller increase (and then
 * clear the growmemtuples flag so we don't try any more).  That allows us to
 * use memory as fully as permitted; sticking to the pure doubling rule could
 * result in almost half going unused.  Because availMem moves around with
 * tuple addition/removal, we need some rule to prevent making repeated small
 * increases in memtupsize, which would just be useless thrashing.  The
 * growmemtuples flag accomplishes that and also prevents useless
 * recalculations in this function.
 */
static bool
grow_memtuples(Tuplestorestate *state)
{
	int			newmemtupsize;
	int			memtupsize = state->memtupsize;

	/*
	 * NOTE(review): with availMem/allowedMem declared Size (unsigned), this
	 * subtraction would wrap if availMem ever exceeded allowedMem; upstream
	 * uses a signed type for these counters -- confirm the intended types.
	 */
	Size		memNowUsed = state->allowedMem - state->availMem;

	/* Forget it if we've already maxed out memtuples, per comment above */
	if (!state->growmemtuples)
		return false;

	/* Select new value of memtupsize */
	if (memNowUsed <= state->availMem)
	{
		/*
		 * We've used no more than half of allowedMem; double our usage,
		 * clamping at INT_MAX.
		 */
		if (memtupsize < INT_MAX / 2)
			newmemtupsize = memtupsize * 2;
		else
		{
			newmemtupsize = INT_MAX;
			state->growmemtuples = false;
		}
	}
	else
	{
		/*
		 * This will be the last increment of memtupsize.  Abandon doubling
		 * strategy and instead increase as much as we safely can.
		 *
		 * To stay within allowedMem, we can't increase memtupsize by more
		 * than availMem / sizeof(void *) elements. In practice, we want to
		 * increase it by considerably less, because we need to leave some
		 * space for the tuples to which the new array slots will refer.  We
		 * assume the new tuples will be about the same size as the tuples
		 * we've already seen, and thus we can extrapolate from the space
		 * consumption so far to estimate an appropriate new size for the
		 * memtuples array.  The optimal value might be higher or lower than
		 * this estimate, but it's hard to know that in advance.  We again
		 * clamp at INT_MAX tuples.
		 *
		 * This calculation is safe against enlarging the array so much that
		 * LACKMEM becomes true, because the memory currently used includes
		 * the present array; thus, there would be enough allowedMem for the
		 * new array elements even if no other memory were currently used.
		 *
		 * We do the arithmetic in float8, because otherwise the product of
		 * memtupsize and allowedMem could overflow.  Any inaccuracy in the
		 * result should be insignificant; but even if we computed a
		 * completely insane result, the checks below will prevent anything
		 * really bad from happening.
		 */
		double		grow_ratio;

		grow_ratio = (double) state->allowedMem / (double) memNowUsed;
		if (memtupsize * grow_ratio < INT_MAX)
			newmemtupsize = (int) (memtupsize * grow_ratio);
		else
			newmemtupsize = INT_MAX;

		/* We won't make any further enlargement attempts */
		state->growmemtuples = false;
	}

	/* Must enlarge array by at least one element, else report failure */
	if (newmemtupsize <= memtupsize)
		goto noalloc;

	/*
	 * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize.  Clamp
	 * to ensure our request won't be rejected.  Note that we can easily
	 * exhaust address space before facing this outcome.
	 */
	if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(void *))
	{
		newmemtupsize = (int) (MaxAllocHugeSize / sizeof(void *));
		state->growmemtuples = false;	/* can't grow any more */
	}

	/*
	 * We need to be sure that we do not cause LACKMEM to become true, else
	 * the space management algorithm will go nuts.  The code above should
	 * never generate a dangerous request, but to be safe, check explicitly
	 * that the array growth fits within availMem.  (We could still cause
	 * LACKMEM if the memory chunk overhead associated with the memtuples
	 * array were to increase.  That shouldn't happen with any sane value of
	 * allowedMem, because at any array size large enough to risk LACKMEM,
	 * palloc would be treating both old and new arrays as separate chunks.
	 * But we'll check LACKMEM explicitly below just in case.)
	 */
	if (state->availMem < (Size) ((newmemtupsize - memtupsize) * sizeof(void *)))
		goto noalloc;

	/* OK, do it: swap accounting from the old array to the new one */
	FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
	state->memtupsize = newmemtupsize;
	state->memtuples = (void **)
		repalloc_huge(state->memtuples,
					  state->memtupsize * sizeof(void *));
	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
	if (LACKMEM(state))
		elog(ERROR, "unexpected out-of-memory situation during sort");
	return true;

noalloc:
	/* If for any reason we didn't realloc, shut off future attempts */
	state->growmemtuples = false;
	return false;
}
659
660 /*
661  * Accept one tuple and append it to the tuplestore.
662  *
663  * Note that the input tuple is always copied; the caller need not save it.
664  *
665  * If the active read pointer is currently "at EOF", it remains so (the read
666  * pointer implicitly advances along with the write pointer); otherwise the
667  * read pointer is unchanged.  Non-active read pointers do not move, which
668  * means they are certain to not be "at EOF" immediately after puttuple.
669  * This curious-seeming behavior is for the convenience of nodeMaterial.c and
670  * nodeCtescan.c, which would otherwise need to do extra pointer repositioning
671  * steps.
672  *
673  * tuplestore_puttupleslot() is a convenience routine to collect data from
674  * a TupleTableSlot without an extra copy operation.
675  */
676 void
677 tuplestore_puttupleslot(Tuplestorestate *state,
678                                                 TupleTableSlot *slot)
679 {
680         MinimalTuple tuple;
681         MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
682
683         /*
684          * Form a MinimalTuple in working memory
685          */
686         tuple = ExecCopySlotMinimalTuple(slot);
687         USEMEM(state, GetMemoryChunkSpace(tuple));
688
689         tuplestore_puttuple_common(state, (void *) tuple);
690
691         MemoryContextSwitchTo(oldcxt);
692 }
693
694 /*
695  * "Standard" case to copy from a HeapTuple.  This is actually now somewhat
696  * deprecated, but not worth getting rid of in view of the number of callers.
697  */
698 void
699 tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple)
700 {
701         MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
702
703         /*
704          * Copy the tuple.      (Must do this even in WRITEFILE case.  Note that
705          * COPYTUP includes USEMEM, so we needn't do that here.)
706          */
707         tuple = COPYTUP(state, tuple);
708
709         tuplestore_puttuple_common(state, (void *) tuple);
710
711         MemoryContextSwitchTo(oldcxt);
712 }
713
714 /*
715  * Similar to tuplestore_puttuple(), but work from values + nulls arrays.
716  * This avoids an extra tuple-construction operation.
717  */
718 void
719 tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
720                                          Datum *values, bool *isnull)
721 {
722         MinimalTuple tuple;
723         MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
724
725         tuple = heap_form_minimal_tuple(tdesc, values, isnull);
726         USEMEM(state, GetMemoryChunkSpace(tuple));
727
728         tuplestore_puttuple_common(state, (void *) tuple);
729
730         MemoryContextSwitchTo(oldcxt);
731 }
732
/*
 * tuplestore_puttuple_common - shared guts of the tuplestore_put* routines.
 *
 * "tuple" must already be a tuplestore-owned copy, allocated in
 * state->context and charged against the memory accounting (the callers
 * above all do that before calling here).  Ownership passes to the
 * tuplestore.
 *
 * Handles the three storage states: appending to the in-memory array
 * (possibly spilling to a temp file when memory runs out), appending to
 * the temp file, and switching the temp file from reading back to writing.
 */
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
	TSReadPointer *readptr;
	int			i;
	ResourceOwner oldowner;

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Update read pointers as needed; see API spec above.  Non-active
			 * pointers that were "at EOF" are pinned to the current tuple
			 * count so they do NOT advance past the tuple being added.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					readptr->current = state->memtupcount;
				}
			}

			/*
			 * Grow the array as needed.  Note that we try to grow the array
			 * when there is still one free slot remaining --- if we fail,
			 * there'll still be room to store the incoming tuple, and then
			 * we'll switch to tape-based operation.
			 */
			if (state->memtupcount >= state->memtupsize - 1)
			{
				(void) grow_memtuples(state);
				Assert(state->memtupcount < state->memtupsize);
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Nope; time to switch to tape-based operation.  Make sure that
			 * the temp file(s) are created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/*
			 * Associate the temp file with the store's resource owner, not
			 * the current one, so its lifetime tracks the tuplestore's.
			 * Restore CurrentResourceOwner immediately afterwards.
			 */
			oldowner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;

			state->myfile = BufFileCreateTemp(state->interXact);

			CurrentResourceOwner = oldowner;

			/*
			 * Freeze the decision about whether trailing length words will be
			 * used.  We can't change this choice once data is on tape, even
			 * though callers might drop the requirement.
			 */
			state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
			state->status = TSS_WRITEFILE;
			dumptuples(state);
			break;
		case TSS_WRITEFILE:

			/*
			 * Update read pointers as needed; see API spec above.  An
			 * eof_reached non-active pointer is converted to the current
			 * write position.  Note: BufFileTell is quite cheap, so not
			 * worth trying to avoid multiple calls.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					BufFileTell(state->myfile,
								&readptr->file,
								&readptr->offset);
				}
			}

			WRITETUP(state, tuple);
			break;
		case TSS_READFILE:

			/*
			 * Switch from reading to writing: save the active pointer's
			 * current seek position (unless it's at EOF), then seek to the
			 * remembered write position at end of data.
			 */
			if (!state->readptrs[state->activeptr].eof_reached)
				BufFileTell(state->myfile,
							&state->readptrs[state->activeptr].file,
							&state->readptrs[state->activeptr].offset);
			if (BufFileSeek(state->myfile,
							state->writepos_file, state->writepos_offset,
							SEEK_SET) != 0)
				elog(ERROR, "tuplestore seek to EOF failed");
			state->status = TSS_WRITEFILE;

			/*
			 * Update read pointers as needed; see API spec above.  Here the
			 * write position is already known, so eof_reached pointers can
			 * be set directly without a BufFileTell.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					readptr->file = state->writepos_file;
					readptr->offset = state->writepos_offset;
				}
			}

			WRITETUP(state, tuple);
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
858
/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 *
 * Backward scan is only allowed if randomAccess was set true or
 * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
 *
 * In the on-tape cases, backward motion relies on the trailing length
 * words written when state->backward is true: each stored tuple is
 * <len><body><len>, so we can seek back over the previous tuple.
 */
static void *
tuplestore_gettuple(Tuplestorestate *state, bool forward,
					bool *should_free)
{
	TSReadPointer *readptr = &state->readptrs[state->activeptr];
	unsigned int tuplen;
	void	   *tup;

	Assert(forward || (readptr->eflags & EXEC_FLAG_BACKWARD));

	switch (state->status)
	{
		case TSS_INMEM:
			/* in-memory tuples are returned by pointer; tuplestore owns them */
			*should_free = false;
			if (forward)
			{
				if (readptr->eof_reached)
					return NULL;
				if (readptr->current < state->memtupcount)
				{
					/* We have another tuple, so return it */
					return state->memtuples[readptr->current++];
				}
				readptr->eof_reached = true;
				return NULL;
			}
			else
			{
				/*
				 * if all tuples are fetched already then we return last
				 * tuple, else tuple before last returned.
				 */
				if (readptr->eof_reached)
				{
					readptr->current = state->memtupcount;
					readptr->eof_reached = false;
				}
				else
				{
					/* can't back up past trimmed-away tuples */
					if (readptr->current <= state->memtupdeleted)
					{
						Assert(!state->truncated);
						return NULL;
					}
					readptr->current--; /* last returned tuple */
				}
				/* re-check after the decrement above */
				if (readptr->current <= state->memtupdeleted)
				{
					Assert(!state->truncated);
					return NULL;
				}
				return state->memtuples[readptr->current - 1];
			}
			break;

		case TSS_WRITEFILE:
			/* Skip state change if we'll just return NULL */
			if (readptr->eof_reached && forward)
				return NULL;

			/*
			 * Switch from writing to reading: remember the write position,
			 * then seek to the read pointer's saved position (unless it's
			 * at EOF, in which case we're already there).
			 */
			BufFileTell(state->myfile,
						&state->writepos_file, &state->writepos_offset);
			if (!readptr->eof_reached)
				if (BufFileSeek(state->myfile,
								readptr->file, readptr->offset,
								SEEK_SET) != 0)
					elog(ERROR, "tuplestore seek failed");
			state->status = TSS_READFILE;
			/* FALL THRU into READFILE case */

		case TSS_READFILE:
			/* READTUP pallocs a fresh copy, so the caller must free it */
			*should_free = true;
			if (forward)
			{
				if ((tuplen = getlen(state, true)) != 0)
				{
					tup = READTUP(state, tuplen);
					return tup;
				}
				else
				{
					readptr->eof_reached = true;
					return NULL;
				}
			}

			/*
			 * Backward.
			 *
			 * if all tuples are fetched already then we return last tuple,
			 * else tuple before last returned.
			 *
			 * Back up to fetch previously-returned tuple's ending length
			 * word. If seek fails, assume we are at start of file.
			 */
			if (BufFileSeek(state->myfile, 0, -(long) sizeof(unsigned int),
							SEEK_CUR) != 0)
			{
				/* even a failed backwards fetch gets you out of eof state */
				readptr->eof_reached = false;
				Assert(!state->truncated);
				return NULL;
			}
			tuplen = getlen(state, false);

			if (readptr->eof_reached)
			{
				readptr->eof_reached = false;
				/* We will return the tuple returned before returning NULL */
			}
			else
			{
				/*
				 * Back up to get ending length word of tuple before it.
				 * The skip distance is the prior tuple's body plus its two
				 * surrounding length words.
				 */
				if (BufFileSeek(state->myfile, 0,
								-(long) (tuplen + 2 * sizeof(unsigned int)),
								SEEK_CUR) != 0)
				{
					/*
					 * If that fails, presumably the prev tuple is the first
					 * in the file.  Back up so that it becomes next to read
					 * in forward direction (not obviously right, but that is
					 * what in-memory case does).
					 */
					if (BufFileSeek(state->myfile, 0,
									-(long) (tuplen + sizeof(unsigned int)),
									SEEK_CUR) != 0)
						elog(ERROR, "bogus tuple length in backward scan");
					Assert(!state->truncated);
					return NULL;
				}
				tuplen = getlen(state, false);
			}

			/*
			 * Now we have the length of the prior tuple, back up and read it.
			 * Note: READTUP expects we are positioned after the initial
			 * length word of the tuple, so back up to that point.
			 */
			if (BufFileSeek(state->myfile, 0,
							-(long) tuplen,
							SEEK_CUR) != 0)
				elog(ERROR, "bogus tuple length in backward scan");
			tup = READTUP(state, tuplen);
			return tup;

		default:
			elog(ERROR, "invalid tuplestore state");
			return NULL;			/* keep compiler quiet */
	}
}
1022
1023 /*
1024  * tuplestore_gettupleslot - exported function to fetch a MinimalTuple
1025  *
1026  * If successful, put tuple in slot and return TRUE; else, clear the slot
1027  * and return FALSE.
1028  *
1029  * If copy is TRUE, the slot receives a copied tuple (allocated in current
1030  * memory context) that will stay valid regardless of future manipulations of
1031  * the tuplestore's state.  If copy is FALSE, the slot may just receive a
1032  * pointer to a tuple held within the tuplestore.  The latter is more
1033  * efficient but the slot contents may be corrupted if additional writes to
1034  * the tuplestore occur.  (If using tuplestore_trim, see comments therein.)
1035  */
1036 bool
1037 tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
1038                                                 bool copy, TupleTableSlot *slot)
1039 {
1040         MinimalTuple tuple;
1041         bool            should_free;
1042
1043         tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free);
1044
1045         if (tuple)
1046         {
1047                 if (copy && !should_free)
1048                 {
1049                         tuple = heap_copy_minimal_tuple(tuple);
1050                         should_free = true;
1051                 }
1052                 ExecStoreMinimalTuple(tuple, slot, should_free);
1053                 return true;
1054         }
1055         else
1056         {
1057                 ExecClearTuple(slot);
1058                 return false;
1059         }
1060 }
1061
1062 /*
1063  * tuplestore_advance - exported function to adjust position without fetching
1064  *
1065  * We could optimize this case to avoid palloc/pfree overhead, but for the
1066  * moment it doesn't seem worthwhile.  (XXX this probably needs to be
1067  * reconsidered given the needs of window functions.)
1068  */
1069 bool
1070 tuplestore_advance(Tuplestorestate *state, bool forward)
1071 {
1072         void       *tuple;
1073         bool            should_free;
1074
1075         tuple = tuplestore_gettuple(state, forward, &should_free);
1076
1077         if (tuple)
1078         {
1079                 if (should_free)
1080                         pfree(tuple);
1081                 return true;
1082         }
1083         else
1084         {
1085                 return false;
1086         }
1087 }
1088
/*
 * dumptuples - remove tuples from memory and write to tape
 *
 * As a side effect, we must convert each read pointer's position from
 * "current" to file/offset format.  But eof_reached pointers don't
 * need to change state.
 *
 * Note the loop deliberately iterates one index past memtupcount before
 * breaking: the BufFileTell check runs for i == memtupcount too, so a
 * read pointer positioned just past the last tuple gets the end-of-data
 * file position captured.
 */
static void
dumptuples(Tuplestorestate *state)
{
	int			i;

	for (i = state->memtupdeleted;; i++)
	{
		TSReadPointer *readptr = state->readptrs;
		int			j;

		/*
		 * Capture the current file position for any read pointer whose
		 * "current" index equals the tuple about to be written.  This must
		 * happen before the WRITETUP below advances the file position.
		 */
		for (j = 0; j < state->readptrcount; readptr++, j++)
		{
			if (i == readptr->current && !readptr->eof_reached)
				BufFileTell(state->myfile,
							&readptr->file, &readptr->offset);
		}
		if (i >= state->memtupcount)
			break;
		WRITETUP(state, state->memtuples[i]);
	}
	/* the in-memory array is now logically empty */
	state->memtupdeleted = 0;
	state->memtupcount = 0;
}
1119
1120 /*
1121  * tuplestore_rescan            - rewind the active read pointer to start
1122  */
1123 void
1124 tuplestore_rescan(Tuplestorestate *state)
1125 {
1126         TSReadPointer *readptr = &state->readptrs[state->activeptr];
1127
1128         Assert(readptr->eflags & EXEC_FLAG_REWIND);
1129         Assert(!state->truncated);
1130
1131         switch (state->status)
1132         {
1133                 case TSS_INMEM:
1134                         readptr->eof_reached = false;
1135                         readptr->current = 0;
1136                         break;
1137                 case TSS_WRITEFILE:
1138                         readptr->eof_reached = false;
1139                         readptr->file = 0;
1140                         readptr->offset = 0L;
1141                         break;
1142                 case TSS_READFILE:
1143                         readptr->eof_reached = false;
1144                         if (BufFileSeek(state->myfile, 0, 0L, SEEK_SET) != 0)
1145                                 elog(ERROR, "tuplestore seek to start failed");
1146                         break;
1147                 default:
1148                         elog(ERROR, "invalid tuplestore state");
1149                         break;
1150         }
1151 }
1152
/*
 * tuplestore_copy_read_pointer - copy a read pointer's state to another
 *
 * After this, destptr is positioned exactly where srcptr is.  Copying may
 * widen the set of eflags in use, so the store-wide eflags are recomputed
 * when the two pointers' flags differ.
 */
void
tuplestore_copy_read_pointer(Tuplestorestate *state,
							 int srcptr, int destptr)
{
	TSReadPointer *sptr = &state->readptrs[srcptr];
	TSReadPointer *dptr = &state->readptrs[destptr];

	Assert(srcptr >= 0 && srcptr < state->readptrcount);
	Assert(destptr >= 0 && destptr < state->readptrcount);

	/* Assigning to self is a no-op */
	if (srcptr == destptr)
		return;

	if (dptr->eflags != sptr->eflags)
	{
		/* Possible change of overall eflags, so copy and then recompute */
		int			eflags;
		int			i;

		*dptr = *sptr;
		/* state->eflags is the union of all pointers' eflags */
		eflags = state->readptrs[0].eflags;
		for (i = 1; i < state->readptrcount; i++)
			eflags |= state->readptrs[i].eflags;
		state->eflags = eflags;
	}
	else
		*dptr = *sptr;

	switch (state->status)
	{
		case TSS_INMEM:
		case TSS_WRITEFILE:
			/* no work */
			break;
		case TSS_READFILE:

			/*
			 * This case is a bit tricky since the active read pointer's
			 * position corresponds to the seek point, not what is in its
			 * variables.  Assigning to the active requires a seek, and
			 * assigning from the active requires a tell, except when
			 * eof_reached.
			 */
			if (destptr == state->activeptr)
			{
				/* dest is active: make the file position match dest's state */
				if (dptr->eof_reached)
				{
					/* "at EOF" corresponds to the saved write position */
					if (BufFileSeek(state->myfile,
									state->writepos_file,
									state->writepos_offset,
									SEEK_SET) != 0)
						elog(ERROR, "tuplestore seek failed");
				}
				else
				{
					if (BufFileSeek(state->myfile,
									dptr->file, dptr->offset,
									SEEK_SET) != 0)
						elog(ERROR, "tuplestore seek failed");
				}
			}
			else if (srcptr == state->activeptr)
			{
				/* src is active: its true position is the current seek point */
				if (!dptr->eof_reached)
					BufFileTell(state->myfile,
								&dptr->file,
								&dptr->offset);
			}
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
1231
/*
 * tuplestore_trim      - remove all no-longer-needed tuples
 *
 * Calling this function authorizes the tuplestore to delete all tuples
 * before the oldest read pointer, if no read pointer is marked as requiring
 * REWIND capability.
 *
 * Note: this is obviously safe if no pointer has BACKWARD capability either.
 * If a pointer is marked as BACKWARD but not REWIND capable, it means that
 * the pointer can be moved backward but not before the oldest other read
 * pointer.
 */
void
tuplestore_trim(Tuplestorestate *state)
{
	int			oldest;
	int			nremove;
	int			i;

	/*
	 * Truncation is disallowed if any read pointer requires rewind
	 * capability.
	 */
	if (state->eflags & EXEC_FLAG_REWIND)
		return;

	/*
	 * We don't bother trimming temp files since it usually would mean more
	 * work than just letting them sit in kernel buffers until they age out.
	 */
	if (state->status != TSS_INMEM)
		return;

	/* Find the oldest read pointer (eof_reached pointers count as newest) */
	oldest = state->memtupcount;
	for (i = 0; i < state->readptrcount; i++)
	{
		if (!state->readptrs[i].eof_reached)
			oldest = Min(oldest, state->readptrs[i].current);
	}

	/*
	 * Note: you might think we could remove all the tuples before the oldest
	 * "current", since that one is the next to be returned.  However, since
	 * tuplestore_gettuple returns a direct pointer to our internal copy of
	 * the tuple, it's likely that the caller has still got the tuple just
	 * before "current" referenced in a slot. So we keep one extra tuple
	 * before the oldest "current".  (Strictly speaking, we could require such
	 * callers to use the "copy" flag to tuplestore_gettupleslot, but for
	 * efficiency we allow this one case to not use "copy".)
	 */
	nremove = oldest - 1;
	if (nremove <= 0)
		return;					/* nothing to do */

	Assert(nremove >= state->memtupdeleted);
	Assert(nremove <= state->memtupcount);

	/* Release no-longer-needed tuples (NULL out slots for safety) */
	for (i = state->memtupdeleted; i < nremove; i++)
	{
		FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
		pfree(state->memtuples[i]);
		state->memtuples[i] = NULL;
	}
	state->memtupdeleted = nremove;

	/* mark tuplestore as truncated (used for Assert crosschecks only) */
	state->truncated = true;

	/*
	 * If nremove is less than 1/8th memtupcount, just stop here, leaving the
	 * "deleted" slots as NULL.  This prevents us from expending O(N^2) time
	 * repeatedly memmove-ing a large pointer array.  The worst case space
	 * wastage is pretty small, since it's just pointers and not whole tuples.
	 */
	if (nremove < state->memtupcount / 8)
		return;

	/*
	 * Slide the array down and readjust pointers.
	 *
	 * In mergejoin's current usage, it's demonstrable that there will always
	 * be exactly one non-removed tuple; so optimize that case.
	 */
	if (nremove + 1 == state->memtupcount)
		state->memtuples[0] = state->memtuples[nremove];
	else
		memmove(state->memtuples, state->memtuples + nremove,
				(state->memtupcount - nremove) * sizeof(void *));

	state->memtupdeleted = 0;
	state->memtupcount -= nremove;
	/* shift every live read pointer's index down by the amount removed */
	for (i = 0; i < state->readptrcount; i++)
	{
		if (!state->readptrs[i].eof_reached)
			state->readptrs[i].current -= nremove;
	}
}
1331
1332 /*
1333  * tuplestore_in_memory
1334  *
1335  * Returns true if the tuplestore has not spilled to disk.
1336  *
1337  * XXX exposing this is a violation of modularity ... should get rid of it.
1338  */
1339 bool
1340 tuplestore_in_memory(Tuplestorestate *state)
1341 {
1342         return (state->status == TSS_INMEM);
1343 }
1344
1345
1346 /*
1347  * Tape interface routines
1348  */
1349
1350 static unsigned int
1351 getlen(Tuplestorestate *state, bool eofOK)
1352 {
1353         unsigned int len;
1354         size_t          nbytes;
1355
1356         nbytes = BufFileRead(state->myfile, (void *) &len, sizeof(len));
1357         if (nbytes == sizeof(len))
1358                 return len;
1359         if (nbytes != 0)
1360                 elog(ERROR, "unexpected end of tape");
1361         if (!eofOK)
1362                 elog(ERROR, "unexpected end of data");
1363         return 0;
1364 }
1365
1366
1367 /*
1368  * Routines specialized for HeapTuple case
1369  *
1370  * The stored form is actually a MinimalTuple, but for largely historical
1371  * reasons we allow COPYTUP to work from a HeapTuple.
1372  *
1373  * Since MinimalTuple already has length in its first word, we don't need
1374  * to write that separately.
1375  */
1376
1377 static void *
1378 copytup_heap(Tuplestorestate *state, void *tup)
1379 {
1380         MinimalTuple tuple;
1381
1382         tuple = minimal_tuple_from_heap_tuple((HeapTuple) tup);
1383         USEMEM(state, GetMemoryChunkSpace(tuple));
1384         return (void *) tuple;
1385 }
1386
1387 static void
1388 writetup_heap(Tuplestorestate *state, void *tup)
1389 {
1390         MinimalTuple tuple = (MinimalTuple) tup;
1391
1392         /* the part of the MinimalTuple we'll write: */
1393         char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
1394         unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
1395
1396         /* total on-disk footprint: */
1397         unsigned int tuplen = tupbodylen + sizeof(int);
1398
1399         if (BufFileWrite(state->myfile, (void *) &tuplen,
1400                                          sizeof(tuplen)) != sizeof(tuplen))
1401                 elog(ERROR, "write failed");
1402         if (BufFileWrite(state->myfile, (void *) tupbody,
1403                                          tupbodylen) != (size_t) tupbodylen)
1404                 elog(ERROR, "write failed");
1405         if (state->backward)            /* need trailing length word? */
1406                 if (BufFileWrite(state->myfile, (void *) &tuplen,
1407                                                  sizeof(tuplen)) != sizeof(tuplen))
1408                         elog(ERROR, "write failed");
1409
1410         FREEMEM(state, GetMemoryChunkSpace(tuple));
1411         heap_free_minimal_tuple(tuple);
1412 }
1413
/*
 * Read back one tuple written by writetup_heap.
 *
 * "len" is the on-disk length word (body plus one length word); the file
 * is positioned just after that word.  Returns a freshly palloc'd
 * MinimalTuple, charged against the store's memory accounting; the caller
 * owns (and must free) it.
 */
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
	/* undo writetup_heap's length bookkeeping to size the palloc */
	unsigned int tupbodylen = len - sizeof(int);
	unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
	MinimalTuple tuple = (MinimalTuple) palloc(tuplen);
	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;

	USEMEM(state, GetMemoryChunkSpace(tuple));
	/* read in the tuple proper */
	tuple->t_len = tuplen;
	if (BufFileRead(state->myfile, (void *) tupbody,
					tupbodylen) != (size_t) tupbodylen)
		elog(ERROR, "unexpected end of data");
	/* consume (and discard) the trailing length word, if one was written */
	if (state->backward)		/* need trailing length word? */
		if (BufFileRead(state->myfile, (void *) &tuplen,
						sizeof(tuplen)) != sizeof(tuplen))
			elog(ERROR, "unexpected end of data");
	return (void *) tuple;
}