1 /*-------------------------------------------------------------------------
4 * Management of "logical tapes" within temporary files.
6 * This module exists to support sorting via multiple merge passes (see
7 * tuplesort.c). Merging is an ideal algorithm for tape devices, but if
8 * we implement it on disk by creating a separate file for each "tape",
9 * there is an annoying problem: the peak space usage is at least twice
10 * the volume of actual data to be sorted. (This must be so because each
11 * datum will appear in both the input and output tapes of the final
12 * merge pass. For seven-tape polyphase merge, which is otherwise a
13 * pretty good algorithm, peak usage is more like 4x actual data volume.)
15 * We can work around this problem by recognizing that any one tape
16 * dataset (with the possible exception of the final output) is written
17 * and read exactly once in a perfectly sequential manner. Therefore,
18 * a datum once read will not be required again, and we can recycle its
19 * space for use by the new tape dataset(s) being generated. In this way,
20 * the total space usage is essentially just the actual data volume, plus
21 * insignificant bookkeeping and start/stop overhead.
23 * Few OSes allow arbitrary parts of a file to be released back to the OS,
24 * so we have to implement this space-recycling ourselves within a single
25 * logical file. logtape.c exists to perform this bookkeeping and provide
26 * the illusion of N independent tape devices to tuplesort.c. Note that
27 * logtape.c itself depends on buffile.c to provide a "logical file" of
28 * larger size than the underlying OS may support.
30 * For simplicity, we allocate and release space in the underlying file
31 * in BLCKSZ-size blocks. Space allocation boils down to keeping track
32 * of which blocks in the underlying file belong to which logical tape,
33 * plus any blocks that are free (recycled and not yet reused). Normally
34 * there are not very many free blocks, so we just keep those in a list.
35 * The blocks in each logical tape are remembered using a method borrowed
36 * from the Unix HFS filesystem: we store data block numbers in an
37 * "indirect block". If an indirect block fills up, we write it out to
38 * the underlying file and remember its location in a second-level indirect
39 * block. In the same way second-level blocks are remembered in third-
40 * level blocks, and so on if necessary (of course we're talking huge
41 * amounts of data here). The topmost indirect block of a given logical
42 * tape is never actually written out to the physical file, but all lower-
43 * level indirect blocks will be.
45 * The initial write pass is guaranteed to fill the underlying file
46 * perfectly sequentially, no matter how data is divided into logical tapes.
47 * Once we begin merge passes, the access pattern becomes considerably
48 * less predictable --- but the seeking involved should be comparable to
49 * what would happen if we kept each logical tape in a separate file,
50 * so there's no serious performance penalty paid to obtain the space
51 * savings of recycling. We try to localize the write accesses by always
52 * writing to the lowest-numbered free block when we have a choice; it's
53 * not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
54 * policy for free blocks would be better?)
56 * Since all the bookkeeping and buffer memory is allocated with palloc(),
57 * and the underlying file(s) are made with OpenTemporaryFile, all resources
58 * for a logical tape set are certain to be cleaned up even if processing
59 * is aborted by ereport(ERROR). To avoid confusion, the caller should take
60 * care that all calls for a single LogicalTapeSet are made in the same
63 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
64 * Portions Copyright (c) 1994, Regents of the University of California
67 * $PostgreSQL: pgsql/src/backend/utils/sort/logtape.c,v 1.18 2006/02/19 05:58:36 tgl Exp $
69 *-------------------------------------------------------------------------
74 #include "storage/buffile.h"
75 #include "utils/logtape.h"
78 * Block indexes are "long"s, so we can fit this many per indirect block.
79 * NB: we assume this is an exact fit!
81 #define BLOCKS_PER_INDIR_BLOCK ((int) (BLCKSZ / sizeof(long)))
84 * We use a struct like this for each active indirection level of each
85 * logical tape. If the indirect block is not the highest level of its
86 * tape, the "nextup" link points to the next higher level. Only the
87 * "ptrs" array is written out if we have to dump the indirect block to
88 * disk. If "ptrs" is not completely full, we store -1L in the first
89 * unused slot at completion of the write phase for the logical tape.
91 typedef struct IndirectBlock
93 int nextSlot; /* next pointer slot to write or read */
94 struct IndirectBlock *nextup; /* parent indirect level, or NULL if
96 long ptrs[BLOCKS_PER_INDIR_BLOCK]; /* indexes of contained blocks */
100 * This data structure represents a single "logical tape" within the set
101 * of logical tapes stored in the same file. We must keep track of the
102 * current partially-read-or-written data block as well as the active
103 * indirect block level(s).
105 typedef struct LogicalTape
107 IndirectBlock *indirect; /* bottom of my indirect-block hierarchy */
108 bool writing; /* T while in write phase */
109 bool frozen; /* T if blocks should not be freed when read */
110 bool dirty; /* does buffer need to be written? */
113 * The total data volume in the logical tape is numFullBlocks * BLCKSZ +
114 * lastBlockBytes. BUT: we do not update lastBlockBytes during writing,
115 * only at completion of a write phase.
117 long numFullBlocks; /* number of complete blocks in log tape */
118 int lastBlockBytes; /* valid bytes in last (incomplete) block */
121 * Buffer for current data block. Note we don't bother to store the
122 * actual file block number of the data block (during the write phase it
123 * hasn't been assigned yet, and during read we don't care anymore). But
124 * we do need the relative block number so we can detect end-of-tape while
127 char *buffer; /* physical buffer (separately palloc'd) */
128 long curBlockNumber; /* this block's logical blk# within tape */
129 int pos; /* next read/write position in buffer */
130 int nbytes; /* total # of valid bytes in buffer */
134 * This data structure represents a set of related "logical tapes" sharing
135 * space in a single underlying file. (But that "file" may be multiple files
136 * if needed to escape OS limits on file size; buffile.c handles that for us.)
137 * The number of tapes is fixed at creation.
139 struct LogicalTapeSet
141 BufFile *pfile; /* underlying file for whole tape set */
142 long nFileBlocks; /* # of blocks used in underlying file */
145 * We store the numbers of recycled-and-available blocks in freeBlocks[].
146 * When there are no such blocks, we extend the underlying file. Note
147 * that the block numbers in freeBlocks are always in *decreasing* order,
148 * so that removing the last entry gives us the lowest free block.
150 long *freeBlocks; /* resizable array */
151 int nFreeBlocks; /* # of currently free blocks */
152 int freeBlocksLen; /* current allocated length of freeBlocks[] */
155 * tapes[] is declared size 1 since C wants a fixed size, but actually it
156 * is of length nTapes.
158 int nTapes; /* # of logical tapes in set */
159 LogicalTape tapes[1]; /* must be last in struct! */
162 static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
163 static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
164 static long ltsGetFreeBlock(LogicalTapeSet *lts);
165 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
166 static void ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
168 static long ltsRewindIndirectBlock(LogicalTapeSet *lts,
169 IndirectBlock *indirect,
171 static long ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts,
172 IndirectBlock *indirect);
173 static long ltsRecallNextBlockNum(LogicalTapeSet *lts,
174 IndirectBlock *indirect,
176 static long ltsRecallPrevBlockNum(LogicalTapeSet *lts,
177 IndirectBlock *indirect);
178 static void ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt);
182 * Write a block-sized buffer to the specified block of the underlying file.
184 * NB: should not attempt to write beyond current end of file (ie, create
185 * "holes" in file), since BufFile doesn't allow that. The first write pass
186 * must write blocks sequentially.
188 * No need for an error return convention; we ereport() on any error.
191 ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
193 if (BufFileSeekBlock(lts->pfile, blocknum) != 0 ||
194 BufFileWrite(lts->pfile, buffer, BLCKSZ) != BLCKSZ)
196 /* XXX is it okay to assume errno is correct? */
197 (errcode_for_file_access(),
198 errmsg("could not write block %ld of temporary file: %m",
200 errhint("Perhaps out of disk space?")));
204 * Read a block-sized buffer from the specified block of the underlying file.
206 * No need for an error return convention; we ereport() on any error. This
207 * module should never attempt to read a block it doesn't know is there.
210 ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
212 if (BufFileSeekBlock(lts->pfile, blocknum) != 0 ||
213 BufFileRead(lts->pfile, buffer, BLCKSZ) != BLCKSZ)
215 /* XXX is it okay to assume errno is correct? */
216 (errcode_for_file_access(),
217 errmsg("could not read block %ld of temporary file: %m",
222 * Select a currently unused block for writing to.
224 * NB: should only be called when writer is ready to write immediately,
225 * to ensure that first write pass is sequential.
228 ltsGetFreeBlock(LogicalTapeSet *lts)
231 * If there are multiple free blocks, we select the one appearing last in
232 * freeBlocks[]. If there are none, assign the next block at the end of
235 if (lts->nFreeBlocks > 0)
236 return lts->freeBlocks[--lts->nFreeBlocks];
238 return lts->nFileBlocks++;
242 * Return a block# to the freelist.
245 ltsReleaseBlock(LogicalTapeSet *lts, long blocknum)
251 * Enlarge freeBlocks array if full.
253 if (lts->nFreeBlocks >= lts->freeBlocksLen)
255 lts->freeBlocksLen *= 2;
256 lts->freeBlocks = (long *) repalloc(lts->freeBlocks,
257 lts->freeBlocksLen * sizeof(long));
261 * Insert blocknum into array, preserving decreasing order (so that
262 * ltsGetFreeBlock returns the lowest available block number). This could
263 * get fairly slow if there were many free blocks, but we don't expect
264 * there to be very many at one time.
266 ndx = lts->nFreeBlocks++;
267 ptr = lts->freeBlocks + ndx;
268 while (ndx > 0 && ptr[-1] < blocknum)
277 * These routines manipulate indirect-block hierarchies. All are recursive
278 * so that they don't have any specific limit on the depth of hierarchy.
282 * Record a data block number in a logical tape's lowest indirect block,
283 * or record an indirect block's number in the next higher indirect level.
286 ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
289 if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK)
292 * This indirect block is full, so dump it out and recursively save
293 * its address in the next indirection level. Create a new
294 * indirection level if there wasn't one before.
296 long indirblock = ltsGetFreeBlock(lts);
298 ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
299 if (indirect->nextup == NULL)
301 indirect->nextup = (IndirectBlock *) palloc(sizeof(IndirectBlock));
302 indirect->nextup->nextSlot = 0;
303 indirect->nextup->nextup = NULL;
305 ltsRecordBlockNum(lts, indirect->nextup, indirblock);
308 * Reset to fill another indirect block at this level.
310 indirect->nextSlot = 0;
312 indirect->ptrs[indirect->nextSlot++] = blocknum;
316 * Reset a logical tape's indirect-block hierarchy after a write pass
317 * to prepare for reading. We dump out partly-filled blocks except
318 * at the top of the hierarchy, and we rewind each level to the start.
319 * This call returns the first data block number, or -1L if the tape
322 * Unless 'freezing' is true, release indirect blocks to the free pool after
326 ltsRewindIndirectBlock(LogicalTapeSet *lts,
327 IndirectBlock *indirect,
330 /* Handle case of never-written-to tape */
331 if (indirect == NULL)
334 /* Insert sentinel if block is not full */
335 if (indirect->nextSlot < BLOCKS_PER_INDIR_BLOCK)
336 indirect->ptrs[indirect->nextSlot] = -1L;
339 * If block is not topmost, write it out, and recurse to obtain address of
340 * first block in this hierarchy level. Read that one in.
342 if (indirect->nextup != NULL)
344 long indirblock = ltsGetFreeBlock(lts);
346 ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
347 ltsRecordBlockNum(lts, indirect->nextup, indirblock);
348 indirblock = ltsRewindIndirectBlock(lts, indirect->nextup, freezing);
349 Assert(indirblock != -1L);
350 ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
352 ltsReleaseBlock(lts, indirblock);
356 * Reset my next-block pointer, and then fetch a block number if any.
358 indirect->nextSlot = 0;
359 if (indirect->ptrs[0] == -1L)
361 return indirect->ptrs[indirect->nextSlot++];
365 * Rewind a previously-frozen indirect-block hierarchy for another read pass.
366 * This call returns the first data block number, or -1L if the tape
370 ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts,
371 IndirectBlock *indirect)
373 /* Handle case of never-written-to tape */
374 if (indirect == NULL)
378 * If block is not topmost, recurse to obtain address of first block in
379 * this hierarchy level. Read that one in.
381 if (indirect->nextup != NULL)
385 indirblock = ltsRewindFrozenIndirectBlock(lts, indirect->nextup);
386 Assert(indirblock != -1L);
387 ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
391 * Reset my next-block pointer, and then fetch a block number if any.
393 indirect->nextSlot = 0;
394 if (indirect->ptrs[0] == -1L)
396 return indirect->ptrs[indirect->nextSlot++];
400 * Obtain next data block number in the forward direction, or -1L if no more.
402 * Unless 'frozen' is true, release indirect blocks to the free pool after
406 ltsRecallNextBlockNum(LogicalTapeSet *lts,
407 IndirectBlock *indirect,
410 /* Handle case of never-written-to tape */
411 if (indirect == NULL)
414 if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK ||
415 indirect->ptrs[indirect->nextSlot] == -1L)
419 if (indirect->nextup == NULL)
420 return -1L; /* nothing left at this level */
421 indirblock = ltsRecallNextBlockNum(lts, indirect->nextup, frozen);
422 if (indirblock == -1L)
423 return -1L; /* nothing left at this level */
424 ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
426 ltsReleaseBlock(lts, indirblock);
427 indirect->nextSlot = 0;
429 if (indirect->ptrs[indirect->nextSlot] == -1L)
431 return indirect->ptrs[indirect->nextSlot++];
435 * Obtain next data block number in the reverse direction, or -1L if no more.
437 * Note this fetches the block# before the one last returned, no matter which
438 * direction of call returned that one. If we fail, no change in state.
440 * This routine can only be used in 'frozen' state, so there's no need to
441 * pass a parameter telling whether to release blocks ... we never do.
444 ltsRecallPrevBlockNum(LogicalTapeSet *lts,
445 IndirectBlock *indirect)
447 /* Handle case of never-written-to tape */
448 if (indirect == NULL)
451 if (indirect->nextSlot <= 1)
455 if (indirect->nextup == NULL)
456 return -1L; /* nothing left at this level */
457 indirblock = ltsRecallPrevBlockNum(lts, indirect->nextup);
458 if (indirblock == -1L)
459 return -1L; /* nothing left at this level */
460 ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
463 * The previous block would only have been written out if full, so we
464 * need not search it for a -1 sentinel.
466 indirect->nextSlot = BLOCKS_PER_INDIR_BLOCK + 1;
468 indirect->nextSlot--;
469 return indirect->ptrs[indirect->nextSlot - 1];
474 * Create a set of logical tapes in a temporary underlying file.
476 * Each tape is initialized in write state.
479 LogicalTapeSetCreate(int ntapes)
486 * Create top-level struct including per-tape LogicalTape structs.
487 * First LogicalTape struct is already counted in sizeof(LogicalTapeSet).
490 lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet) +
491 (ntapes - 1) * sizeof(LogicalTape));
492 lts->pfile = BufFileCreateTemp(false);
493 lts->nFileBlocks = 0L;
494 lts->freeBlocksLen = 32; /* reasonable initial guess */
495 lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
496 lts->nFreeBlocks = 0;
497 lts->nTapes = ntapes;
500 * Initialize per-tape structs. Note we allocate the I/O buffer and
501 * first-level indirect block for a tape only when it is first actually
502 * written to. This avoids wasting memory space when tuplesort.c
503 * overestimates the number of tapes needed.
505 for (i = 0; i < ntapes; i++)
512 lt->numFullBlocks = 0L;
513 lt->lastBlockBytes = 0;
515 lt->curBlockNumber = 0L;
523 * Close a logical tape set and release all resources.
526 LogicalTapeSetClose(LogicalTapeSet *lts)
533 BufFileClose(lts->pfile);
534 for (i = 0; i < lts->nTapes; i++)
537 for (ib = lt->indirect; ib != NULL; ib = nextib)
545 pfree(lts->freeBlocks);
550 * Dump the dirty buffer of a logical tape.
553 ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt)
555 long datablock = ltsGetFreeBlock(lts);
558 ltsWriteBlock(lts, datablock, (void *) lt->buffer);
559 ltsRecordBlockNum(lts, lt->indirect, datablock);
561 /* Caller must do other state update as needed */
565 * Write to a logical tape.
567 * There are no error returns; we ereport() on failure.
570 LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
571 void *ptr, size_t size)
576 Assert(tapenum >= 0 && tapenum < lts->nTapes);
577 lt = <s->tapes[tapenum];
580 /* Allocate data buffer and first indirect block on first write */
581 if (lt->buffer == NULL)
582 lt->buffer = (char *) palloc(BLCKSZ);
583 if (lt->indirect == NULL)
585 lt->indirect = (IndirectBlock *) palloc(sizeof(IndirectBlock));
586 lt->indirect->nextSlot = 0;
587 lt->indirect->nextup = NULL;
592 if (lt->pos >= BLCKSZ)
594 /* Buffer full, dump it out */
596 ltsDumpBuffer(lts, lt);
599 /* Hmm, went directly from reading to writing? */
600 elog(ERROR, "invalid logtape state: should be dirty");
603 lt->curBlockNumber++;
608 nthistime = BLCKSZ - lt->pos;
609 if (nthistime > size)
611 Assert(nthistime > 0);
613 memcpy(lt->buffer + lt->pos, ptr, nthistime);
616 lt->pos += nthistime;
617 if (lt->nbytes < lt->pos)
618 lt->nbytes = lt->pos;
619 ptr = (void *) ((char *) ptr + nthistime);
625 * Rewind logical tape and switch from writing to reading or vice versa.
627 * Unless the tape has been "frozen" in read state, forWrite must be the
628 * opposite of the previous tape state.
631 LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
636 Assert(tapenum >= 0 && tapenum < lts->nTapes);
637 lt = <s->tapes[tapenum];
644 * Completion of a write phase. Flush last partial data block,
645 * flush any partial indirect blocks, rewind for normal
646 * (destructive) read.
649 ltsDumpBuffer(lts, lt);
650 lt->lastBlockBytes = lt->nbytes;
652 datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, false);
657 * This is only OK if tape is frozen; we rewind for (another) read
661 datablocknum = ltsRewindFrozenIndirectBlock(lts, lt->indirect);
663 /* Read the first block, or reset if tape is empty */
664 lt->curBlockNumber = 0L;
667 if (datablocknum != -1L)
669 ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
671 ltsReleaseBlock(lts, datablocknum);
672 lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
673 BLCKSZ : lt->lastBlockBytes;
679 * Completion of a read phase. Rewind and prepare for write.
681 * NOTE: we assume the caller has read the tape to the end; otherwise
682 * untouched data and indirect blocks will not have been freed. We
683 * could add more code to free any unread blocks, but in current usage
684 * of this module it'd be useless code.
689 Assert(!lt->writing && !lt->frozen);
690 /* Must truncate the indirect-block hierarchy down to one level. */
693 for (ib = lt->indirect->nextup; ib != NULL; ib = nextib)
698 lt->indirect->nextSlot = 0;
699 lt->indirect->nextup = NULL;
703 lt->numFullBlocks = 0L;
704 lt->lastBlockBytes = 0;
705 lt->curBlockNumber = 0L;
712 * Read from a logical tape.
714 * Early EOF is indicated by return value less than #bytes requested.
717 LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
718 void *ptr, size_t size)
724 Assert(tapenum >= 0 && tapenum < lts->nTapes);
725 lt = <s->tapes[tapenum];
726 Assert(!lt->writing);
730 if (lt->pos >= lt->nbytes)
732 /* Try to load more data into buffer. */
733 long datablocknum = ltsRecallNextBlockNum(lts, lt->indirect,
736 if (datablocknum == -1L)
738 lt->curBlockNumber++;
740 ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
742 ltsReleaseBlock(lts, datablocknum);
743 lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
744 BLCKSZ : lt->lastBlockBytes;
746 break; /* EOF (possible here?) */
749 nthistime = lt->nbytes - lt->pos;
750 if (nthistime > size)
752 Assert(nthistime > 0);
754 memcpy(ptr, lt->buffer + lt->pos, nthistime);
756 lt->pos += nthistime;
757 ptr = (void *) ((char *) ptr + nthistime);
766 * "Freeze" the contents of a tape so that it can be read multiple times
767 * and/or read backwards. Once a tape is frozen, its contents will not
768 * be released until the LogicalTapeSet is destroyed. This is expected
769 * to be used only for the final output pass of a merge.
771 * This *must* be called just at the end of a write pass, before the
772 * tape is rewound (after rewind is too late!). It performs a rewind
773 * and switch to read mode "for free". An immediately following rewind-
774 * for-read call is OK but not necessary.
777 LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
782 Assert(tapenum >= 0 && tapenum < lts->nTapes);
783 lt = <s->tapes[tapenum];
787 * Completion of a write phase. Flush last partial data block, flush any
788 * partial indirect blocks, rewind for nondestructive read.
791 ltsDumpBuffer(lts, lt);
792 lt->lastBlockBytes = lt->nbytes;
795 datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, true);
796 /* Read the first block, or reset if tape is empty */
797 lt->curBlockNumber = 0L;
800 if (datablocknum != -1L)
802 ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
803 lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
804 BLCKSZ : lt->lastBlockBytes;
809 * Backspace the tape a given number of bytes. (We also support a more
810 * general seek interface, see below.)
812 * *Only* a frozen-for-read tape can be backed up; we don't support
813 * random access during write, and an unfrozen read tape may have
814 * already discarded the desired data!
816 * Return value is TRUE if seek successful, FALSE if there isn't that much
817 * data before the current point (in which case there's no state change).
820 LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
826 Assert(tapenum >= 0 && tapenum < lts->nTapes);
827 lt = <s->tapes[tapenum];
831 * Easy case for seek within current block.
833 if (size <= (size_t) lt->pos)
835 lt->pos -= (int) size;
840 * Not-so-easy case. Figure out whether it's possible at all.
842 size -= (size_t) lt->pos; /* part within this block */
843 nblocks = size / BLCKSZ;
844 size = size % BLCKSZ;
848 newpos = (int) (BLCKSZ - size);
852 if (nblocks > lt->curBlockNumber)
853 return false; /* a seek too far... */
856 * OK, we need to back up nblocks blocks. This implementation would be
857 * pretty inefficient for long seeks, but we really aren't expecting that
858 * (a seek over one tuple is typical).
860 while (nblocks-- > 0)
862 long datablocknum = ltsRecallPrevBlockNum(lts, lt->indirect);
864 if (datablocknum == -1L)
865 elog(ERROR, "unexpected end of tape");
866 lt->curBlockNumber--;
869 ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
878 * Seek to an arbitrary position in a logical tape.
880 * *Only* a frozen-for-read tape can be seeked.
882 * Return value is TRUE if seek successful, FALSE if there isn't that much
883 * data in the tape (in which case there's no state change).
886 LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
887 long blocknum, int offset)
891 Assert(tapenum >= 0 && tapenum < lts->nTapes);
892 lt = <s->tapes[tapenum];
894 Assert(offset >= 0 && offset <= BLCKSZ);
897 * Easy case for seek within current block.
899 if (blocknum == lt->curBlockNumber && offset <= lt->nbytes)
906 * Not-so-easy case. Figure out whether it's possible at all.
908 if (blocknum < 0 || blocknum > lt->numFullBlocks ||
909 (blocknum == lt->numFullBlocks && offset > lt->lastBlockBytes))
913 * OK, advance or back up to the target block. This implementation would
914 * be pretty inefficient for long seeks, but we really aren't expecting
915 * that (a seek over one tuple is typical).
917 while (lt->curBlockNumber > blocknum)
919 long datablocknum = ltsRecallPrevBlockNum(lts, lt->indirect);
921 if (datablocknum == -1L)
922 elog(ERROR, "unexpected end of tape");
923 if (--lt->curBlockNumber == blocknum)
924 ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
926 while (lt->curBlockNumber < blocknum)
928 long datablocknum = ltsRecallNextBlockNum(lts, lt->indirect,
931 if (datablocknum == -1L)
932 elog(ERROR, "unexpected end of tape");
933 if (++lt->curBlockNumber == blocknum)
934 ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
936 lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
937 BLCKSZ : lt->lastBlockBytes;
943 * Obtain current position in a form suitable for a later LogicalTapeSeek.
945 * NOTE: it'd be OK to do this during write phase with intention of using
946 * the position for a seek after freezing. Not clear if anyone needs that.
949 LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
950 long *blocknum, int *offset)
954 Assert(tapenum >= 0 && tapenum < lts->nTapes);
955 lt = <s->tapes[tapenum];
956 *blocknum = lt->curBlockNumber;
961 * Obtain total disk space currently used by a LogicalTapeSet, in blocks.
964 LogicalTapeSetBlocks(LogicalTapeSet *lts)
966 return lts->nFileBlocks;