/*-------------------------------------------------------------------------
 *
 * freepage.c
 *	  Management of free memory pages.
 *
 * The intention of this code is to provide infrastructure for memory
 * allocators written specifically for PostgreSQL.  At least in the case
 * of dynamic shared memory, we can't simply use malloc() or even
 * relatively thin wrappers like palloc() which sit on top of it, because
 * no allocator built into the operating system will deal with relative
 * pointers.  In the future, we may find other cases in which greater
 * control over our own memory management seems desirable.
 *
 * A FreePageManager keeps track of which 4kB pages of memory are currently
 * unused from the point of view of some higher-level memory allocator.
 * Unlike a user-facing allocator such as palloc(), a FreePageManager can
 * only allocate and free in units of whole pages, and freeing an
 * allocation can only be done given knowledge of its length in pages.
 *
 * Since a free page manager has only a fixed amount of dedicated memory,
 * and since there is no underlying allocator, it uses the free pages
 * it is given to manage to store its bookkeeping data.  It keeps multiple
 * freelists of runs of pages, sorted by the size of the run; the head of
 * each freelist is stored in the FreePageManager itself, and the first
 * page of each run contains a relative pointer to the next run.  See
 * FreePageManagerGetInternal for more details on how the freelists are
 * managed.
 *
 * To avoid memory fragmentation, it's important to consolidate adjacent
 * spans of pages whenever possible; otherwise, large allocation requests
 * might not be satisfied even when sufficient contiguous space is
 * available.  Therefore, in addition to the freelists, we maintain an
 * in-memory btree of free page ranges ordered by page number.  If a
 * range being freed precedes or follows a range that is already free,
 * the existing range is extended; if it exactly bridges the gap between
 * free ranges, then the two existing ranges are consolidated with the
 * newly-freed range to form one great big range of free pages.
 *
 * When there is only one range of free pages, the btree is trivial and
 * is stored within the FreePageManager proper; otherwise, pages are
 * allocated from the area under management as needed.  Even in cases
 * where memory fragmentation is very severe, only a tiny fraction of
 * the pages under management are consumed by this btree.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mmgr/freepage.c
 *
 *-------------------------------------------------------------------------
 */
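
/*
 * Overview of the API, as a minimal usage sketch rather than code from this
 * file: the extent name "region" and its page count "region_pages" are
 * hypothetical.  The region is assumed FPM_PAGE_SIZE aligned; page 0 holds
 * the manager itself, which also keeps any relative pointer to a span from
 * colliding with the NULL encoding (offset zero).
 *
 *		FreePageManager *fpm = (FreePageManager *) region;
 *		Size		first_page;
 *
 *		FreePageManagerInitialize(fpm, region);
 *		FreePageManagerPut(fpm, 1, region_pages - 1);
 *		if (FreePageManagerGet(fpm, 4, &first_page))
 *			... pages first_page .. first_page + 3 now belong to the caller ...
 */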

#include "postgres.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"

#include "utils/freepage.h"
#include "utils/relptr.h"
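
/*
 * All cross-page links in the structures below are relative pointers (see
 * utils/relptr.h): what is stored is a byte offset from 'base' rather than a
 * raw address, so the same structure is valid in every process that maps the
 * segment, wherever the mapping lands.  An offset of zero represents a NULL
 * pointer.
 */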


/* Magic numbers to identify various page types */
#define FREE_PAGE_SPAN_LEADER_MAGIC		0xea4020f0
#define FREE_PAGE_LEAF_MAGIC			0x98eae728
#define FREE_PAGE_INTERNAL_MAGIC		0x19aa32c9

/* Doubly linked list of spans of free pages; stored in first page of span. */
struct FreePageSpanLeader
{
	int			magic;			/* always FREE_PAGE_SPAN_LEADER_MAGIC */
	Size		npages;			/* number of pages in span */
	RelptrFreePageSpanLeader prev;
	RelptrFreePageSpanLeader next;
};

/* Common header for btree leaf and internal pages. */
typedef struct FreePageBtreeHeader
{
	int			magic;			/* FREE_PAGE_LEAF_MAGIC or
								 * FREE_PAGE_INTERNAL_MAGIC */
	Size		nused;			/* number of items used */
	RelptrFreePageBtree parent; /* uplink */
} FreePageBtreeHeader;

/* Internal key; points to next level of btree. */
typedef struct FreePageBtreeInternalKey
{
	Size		first_page;		/* low bound for keys on child page */
	RelptrFreePageBtree child;	/* downlink */
} FreePageBtreeInternalKey;

/* Leaf key; no payload data. */
typedef struct FreePageBtreeLeafKey
{
	Size		first_page;		/* first page in span */
	Size		npages;			/* number of pages in span */
} FreePageBtreeLeafKey;

/* Work out how many keys will fit on a page. */
#define FPM_ITEMS_PER_INTERNAL_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
		sizeof(FreePageBtreeInternalKey))
#define FPM_ITEMS_PER_LEAF_PAGE \
	((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
		sizeof(FreePageBtreeLeafKey))
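
/*
 * For example, on a typical 64-bit build (8-byte Size, 8-byte relative
 * pointers, hence a 24-byte header and 16-byte keys of either kind), a 4kB
 * page holds (4096 - 24) / 16 = 254 keys.  The exact figures are
 * platform-dependent; only the macros above are authoritative.
 */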

/* A btree page of either sort */
struct FreePageBtree
{
	FreePageBtreeHeader hdr;
	union
	{
		FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
		FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
	}			u;
};

/* Results of a btree search */
typedef struct FreePageBtreeSearchResult
{
	FreePageBtree *page;
	Size		index;
	bool		found;
	unsigned	split_pages;
} FreePageBtreeSearchResult;

/* Helper functions */
static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
								FreePageBtree *btp);
static Size FreePageBtreeCleanup(FreePageManager *fpm);
static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
							 FreePageBtree *btp);
static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
							  FreePageBtree *btp);
static Size FreePageBtreeFirstKey(FreePageBtree *btp);
static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
						  Size index, Size first_page, FreePageBtree *child);
static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
						Size first_page, Size npages);
static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
					Size index);
static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
					FreePageBtreeSearchResult *result);
static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
					   FreePageBtree *btp);
static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
						 FreePageBtree *parent, int level, StringInfo buf);
static void FreePageManagerDumpSpans(FreePageManager *fpm,
						 FreePageSpanLeader *span, Size expected_pages,
						 StringInfo buf);
static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
						   Size *first_page);
static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
						   Size npages, bool soft);
static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
					   Size npages);
static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
static void FreePageManagerUpdateLargest(FreePageManager *fpm);

#ifdef FPM_EXTRA_ASSERTS
static Size sum_free_pages(FreePageManager *fpm);
#endif

/*
 * Initialize a new, empty free page manager.
 *
 * 'fpm' should reference caller-provided memory large enough to contain a
 * FreePageManager.  We'll initialize it here.
 *
 * 'base' is the address to which all pointers are relative.  When managing
 * a dynamic shared memory segment, it should normally be the base of the
 * segment.  When managing backend-private memory, it can be either NULL or,
 * if managing a single contiguous extent of memory, the start of that extent.
 */
void
FreePageManagerInitialize(FreePageManager *fpm, char *base)
{
	Size		f;

	relptr_store(base, fpm->self, fpm);
	relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
	relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
	fpm->btree_depth = 0;
	fpm->btree_recycle_count = 0;
	fpm->singleton_first_page = 0;
	fpm->singleton_npages = 0;
	fpm->contiguous_pages = 0;
	fpm->contiguous_pages_dirty = true;
#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages = 0;
#endif

	for (f = 0; f < FPM_NUM_FREELISTS; f++)
		relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
}

/*
 * Allocate a run of pages of the given length from the free page manager.
 * The return value indicates whether we were able to satisfy the request;
 * if true, the first page of the allocation is stored in *first_page.
 */
bool
FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
{
	bool		result;
	Size		contiguous_pages;

	result = FreePageManagerGetInternal(fpm, npages, first_page);

	/*
	 * It's a bit counterintuitive, but allocating pages can actually create
	 * opportunities for cleanup that create larger ranges.  We might pull a
	 * key out of the btree that enables the item at the head of the btree
	 * recycle list to be inserted; and then if there are more items behind it
	 * one of those might cause two currently-separated ranges to merge,
	 * creating a single range of contiguous pages larger than any that
	 * existed previously.  It might be worth trying to improve the cleanup
	 * algorithm to avoid such corner cases, but for now we just notice the
	 * condition and do the appropriate reporting.
	 */
	contiguous_pages = FreePageBtreeCleanup(fpm);
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * FreePageManagerGetInternal may have set contiguous_pages_dirty.
	 * Recompute contiguous_pages if so.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	if (result)
	{
		Assert(fpm->free_pages >= npages);
		fpm->free_pages -= npages;
	}
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
	return result;
}

#ifdef FPM_EXTRA_ASSERTS
static void
sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
{
	char	   *base = fpm_segment_base(fpm);

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
		   btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	++*sum;
	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		Size		index;

		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			sum_free_pages_recurse(fpm, child, sum);
		}
	}
}

static Size
sum_free_pages(FreePageManager *fpm)
{
	FreePageSpanLeader *recycle;
	char	   *base = fpm_segment_base(fpm);
	Size		sum = 0;
	int			list;

	/* Count the spans by scanning the freelists. */
	for (list = 0; list < FPM_NUM_FREELISTS; ++list)
	{
		if (!relptr_is_null(fpm->freelist[list]))
		{
			FreePageSpanLeader *candidate =
			relptr_access(base, fpm->freelist[list]);

			do
			{
				sum += candidate->npages;
				candidate = relptr_access(base, candidate->next);
			} while (candidate != NULL);
		}
	}

	/* Count btree internal pages. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		sum_free_pages_recurse(fpm, root, &sum);
	}

	/* Count the recycle list. */
	for (recycle = relptr_access(base, fpm->btree_recycle);
		 recycle != NULL;
		 recycle = relptr_access(base, recycle->next))
	{
		Assert(recycle->npages == 1);
		++sum;
	}

	return sum;
}
#endif

/*
 * Compute the size of the largest run of pages that the user could
 * successfully get.
 */
static Size
FreePageManagerLargestContiguous(FreePageManager *fpm)
{
	char	   *base;
	Size		largest;

	base = fpm_segment_base(fpm);
	largest = 0;
	if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
	{
		FreePageSpanLeader *candidate;

		candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
		do
		{
			if (candidate->npages > largest)
				largest = candidate->npages;
			candidate = relptr_access(base, candidate->next);
		} while (candidate != NULL);
	}
	else
	{
		Size		f = FPM_NUM_FREELISTS - 1;

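		/*
		 * Each fixed-size freelist f holds only spans of exactly f + 1
		 * pages, so the largest available run is f + 1 for the deepest
		 * nonempty list; e.g. if freelist[2] is the deepest nonempty list,
		 * the largest obtainable run is 3 pages.
		 */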
		do
		{
			--f;
			if (!relptr_is_null(fpm->freelist[f]))
			{
				largest = f + 1;
				break;
			}
		} while (f > 0);
	}

	return largest;
}

/*
 * Recompute the size of the largest run of pages that the user could
 * successfully get, if it has been marked dirty.
 */
static void
FreePageManagerUpdateLargest(FreePageManager *fpm)
{
	if (!fpm->contiguous_pages_dirty)
		return;

	fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
	fpm->contiguous_pages_dirty = false;
}

/*
 * Transfer a run of pages to the free page manager.
 */
void
FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
{
	Size		contiguous_pages;

	Assert(npages > 0);

	/* Record the new pages. */
	contiguous_pages =
		FreePageManagerPutInternal(fpm, first_page, npages, false);

	/*
	 * If the new range we inserted into the page manager was contiguous with
	 * an existing range, it may have opened up cleanup opportunities.
	 */
	if (contiguous_pages > npages)
	{
		Size		cleanup_contiguous_pages;

		cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
		if (cleanup_contiguous_pages > contiguous_pages)
			contiguous_pages = cleanup_contiguous_pages;
	}

	/* See if we now have a new largest chunk. */
	if (fpm->contiguous_pages < contiguous_pages)
		fpm->contiguous_pages = contiguous_pages;

	/*
	 * The earlier call to FreePageManagerPutInternal may have set
	 * contiguous_pages_dirty if it needed to allocate internal pages, so
	 * recompute contiguous_pages if necessary.
	 */
	FreePageManagerUpdateLargest(fpm);

#ifdef FPM_EXTRA_ASSERTS
	fpm->free_pages += npages;
	Assert(fpm->free_pages == sum_free_pages(fpm));
	Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
#endif
}

/*
 * Produce a debugging dump of the state of a free page manager.
 */
char *
FreePageManagerDump(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	StringInfoData buf;
	FreePageSpanLeader *recycle;
	bool		dumped_any_freelist = false;
	Size		f;

	/* Initialize output buffer. */
	initStringInfo(&buf);

	/* Dump general stuff. */
	appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
					 fpm->self.relptr_off, fpm->contiguous_pages);

	/* Dump btree. */
	if (fpm->btree_depth > 0)
	{
		FreePageBtree *root;

		appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
		root = relptr_access(base, fpm->btree_root);
		FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
	}
	else if (fpm->singleton_npages > 0)
	{
		appendStringInfo(&buf, "singleton: %zu(%zu)\n",
						 fpm->singleton_first_page, fpm->singleton_npages);
	}

	/* Dump btree recycle list. */
	recycle = relptr_access(base, fpm->btree_recycle);
	if (recycle != NULL)
	{
		appendStringInfo(&buf, "btree recycle:");
		FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
	}

	/* Dump free lists. */
	for (f = 0; f < FPM_NUM_FREELISTS; ++f)
	{
		FreePageSpanLeader *span;

		if (relptr_is_null(fpm->freelist[f]))
			continue;
		if (!dumped_any_freelist)
		{
			appendStringInfo(&buf, "freelists:\n");
			dumped_any_freelist = true;
		}
		appendStringInfo(&buf, "  %zu:", f + 1);
		span = relptr_access(base, fpm->freelist[f]);
		FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
	}

	/* And return result to caller. */
	return buf.data;
}
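
/*
 * Purely illustrative output shape for a hypothetical manager whose only
 * free memory is a single 5-page span starting at page 7:
 *
 *		metadata: self 0 max contiguous pages = 5
 *		singleton: 7(5)
 *		freelists:
 *		  5: 7
 */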


/*
 * The first_page value stored at index zero in any non-root page must match
 * the first_page value stored in its parent at the index which points to that
 * page.  So when the value stored at index zero in a btree page changes, we've
 * got to walk up the tree adjusting ancestor keys until we reach an ancestor
 * where that key isn't index zero.  This function should be called after
 * updating the first key on the target page; it will propagate the change
 * upward as far as needed.
 *
 * We assume here that the first key on the page has not changed enough to
 * require changes in the ordering of keys on its ancestor pages.  Thus,
 * if we search the parent page for the first key greater than or equal to
 * the first key on the current page, the downlink to this page will be either
 * the exact index returned by the search (if the first key decreased)
 * or one less (if the first key increased).
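 *
 * For example, if a leaf's first key changes from page 10 to page 12, the
 * parent slot that held 10 is rewritten to 12; if that slot is itself the
 * parent's slot zero, the same correction is applied one level further up,
 * and so on until we reach the root or a slot with a nonzero index.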
 */
static void
FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	Size		first_page;
	FreePageBtree *parent;
	FreePageBtree *child;

	/* This might be either a leaf or an internal page. */
	Assert(btp->hdr.nused > 0);
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
		first_page = btp->u.leaf_key[0].first_page;
	}
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
		first_page = btp->u.internal_key[0].first_page;
	}
	child = btp;

	/* Loop until we find an ancestor that does not require adjustment. */
	for (;;)
	{
		Size		s;

		parent = relptr_access(base, child->hdr.parent);
		if (parent == NULL)
			break;
		s = FreePageBtreeSearchInternal(parent, first_page);

		/* Key is either at index s or index s-1; figure out which. */
		if (s >= parent->hdr.nused)
		{
			Assert(s == parent->hdr.nused);
			--s;
		}
		else
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			if (check != child)
			{
				Assert(s > 0);
				--s;
			}
		}

#ifdef USE_ASSERT_CHECKING
		/* Debugging double-check. */
		{
			FreePageBtree *check;

			check = relptr_access(base, parent->u.internal_key[s].child);
			Assert(s < parent->hdr.nused);
			Assert(child == check);
		}
#endif

		/* Update the parent key. */
		parent->u.internal_key[s].first_page = first_page;

		/*
		 * If this is the first key in the parent, go up another level; else
		 * done.
		 */
		if (s > 0)
			break;
		child = parent;
	}
}

/*
 * Attempt to reclaim space from the free-page btree.  The return value is
 * the largest range of contiguous pages created by the cleanup operation.
 */
static Size
FreePageBtreeCleanup(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	Size		max_contiguous_pages = 0;

	/* Attempt to shrink the depth of the btree. */
	while (!relptr_is_null(fpm->btree_root))
	{
		FreePageBtree *root = relptr_access(base, fpm->btree_root);

		/* If the root contains only one key, reduce depth by one. */
		if (root->hdr.nused == 1)
		{
			/* Shrink depth of tree by one. */
			Assert(fpm->btree_depth > 0);
			--fpm->btree_depth;
			if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
			{
				/* If root is a leaf, convert only entry to singleton range. */
				relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
				fpm->singleton_first_page = root->u.leaf_key[0].first_page;
				fpm->singleton_npages = root->u.leaf_key[0].npages;
			}
			else
			{
				FreePageBtree *newroot;

				/* If root is an internal page, make only child the root. */
				Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
				relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
				newroot = relptr_access(base, fpm->btree_root);
				relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
			}
			FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
		}
		else if (root->hdr.nused == 2 &&
				 root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			Size		end_of_first;
			Size		start_of_second;

			end_of_first = root->u.leaf_key[0].first_page +
				root->u.leaf_key[0].npages;
			start_of_second = root->u.leaf_key[1].first_page;

			if (end_of_first + 1 == start_of_second)
			{
				Size		root_page = fpm_pointer_to_page(base, root);

				if (end_of_first == root_page)
				{
					FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
					FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
					fpm->singleton_first_page = root->u.leaf_key[0].first_page;
					fpm->singleton_npages = root->u.leaf_key[0].npages +
						root->u.leaf_key[1].npages + 1;
					fpm->btree_depth = 0;
					relptr_store(base, fpm->btree_root,
								 (FreePageBtree *) NULL);
					FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
										   fpm->singleton_npages);
					Assert(max_contiguous_pages == 0);
					max_contiguous_pages = fpm->singleton_npages;
				}
			}

			/* Whether it worked or not, it's time to stop. */
			break;
		}
		else
		{
			/* Nothing more to do.  Stop. */
			break;
		}
	}

	/*
	 * Attempt to free recycled btree pages.  We skip this if releasing the
	 * recycled page would require a btree page split, because the page we're
	 * trying to recycle would be consumed by the split, which would be
	 * counterproductive.
	 *
	 * We also currently only ever attempt to recycle the first page on the
	 * list; that could be made more aggressive, but it's not clear that the
	 * complexity would be worthwhile.
	 */
	while (fpm->btree_recycle_count > 0)
	{
		FreePageBtree *btp;
		Size		first_page;
		Size		contiguous_pages;

		btp = FreePageBtreeGetRecycled(fpm);
		first_page = fpm_pointer_to_page(base, btp);
		contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
		if (contiguous_pages == 0)
		{
			FreePageBtreeRecycle(fpm, first_page);
			break;
		}
		else
		{
			if (contiguous_pages > max_contiguous_pages)
				max_contiguous_pages = contiguous_pages;
		}
	}

	return max_contiguous_pages;
}

/*
 * Consider consolidating the given page with its left or right sibling,
 * if it's fairly empty.
 */
static void
FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *np;
	Size		max;

	/*
	 * We only try to consolidate pages that are less than a third full. We
	 * could be more aggressive about this, but that might risk performing
	 * consolidation only to end up splitting again shortly thereafter.  Since
	 * the btree should be very small compared to the space under management,
	 * our goal isn't so much to ensure that it always occupies the absolutely
	 * smallest possible number of pages as to reclaim pages before things get
	 * too egregiously out of hand.
	 */
	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		max = FPM_ITEMS_PER_LEAF_PAGE;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		max = FPM_ITEMS_PER_INTERNAL_PAGE;
	}
	if (btp->hdr.nused >= max / 3)
		return;

	/*
	 * If we can fit our right sibling's keys onto this page, consolidate.
	 */
	np = FreePageBtreeFindRightSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
		}
		else
		{
			memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
			btp->hdr.nused += np->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, btp);
		}
		FreePageBtreeRemovePage(fpm, np);
		return;
	}

	/*
	 * If we can fit our keys onto our left sibling's page, consolidate.  In
	 * this case, we move our keys onto the other page rather than vice
	 * versa, to avoid having to adjust ancestor keys.
	 */
	np = FreePageBtreeFindLeftSibling(base, btp);
	if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
	{
		if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		{
			memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
				   sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
		}
		else
		{
			memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
				   sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
			np->hdr.nused += btp->hdr.nused;
			FreePageBtreeUpdateParentPointers(base, np);
		}
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}
}

/*
 * Find the passed page's left sibling; that is, the page at the same level
 * of the tree whose keyspace immediately precedes ours.
 */
static FreePageBtree *
FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move left. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the leftmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index > 0)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index - 1].child);
			break;
		}
		Assert(index == 0);
		++levels;
	}

	/* Descend rightward. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Find the passed page's right sibling; that is, the page at the same level
 * of the tree whose keyspace immediately follows ours.
 */
static FreePageBtree *
FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
{
	FreePageBtree *p = btp;
	int			levels = 0;

	/* Move up until we can move right. */
	for (;;)
	{
		Size		first_page;
		Size		index;

		first_page = FreePageBtreeFirstKey(p);
		p = relptr_access(base, p->hdr.parent);

		if (p == NULL)
			return NULL;		/* we were passed the rightmost page */

		index = FreePageBtreeSearchInternal(p, first_page);
		if (index < p->hdr.nused - 1)
		{
			Assert(p->u.internal_key[index].first_page == first_page);
			p = relptr_access(base, p->u.internal_key[index + 1].child);
			break;
		}
		Assert(index == p->hdr.nused - 1);
		++levels;
	}

	/* Descend leftward. */
	while (levels > 0)
	{
		Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		p = relptr_access(base, p->u.internal_key[0].child);
		--levels;
	}
	Assert(p->hdr.magic == btp->hdr.magic);

	return p;
}

/*
 * Get the first key on a btree page.
 */
static Size
FreePageBtreeFirstKey(FreePageBtree *btp)
{
	Assert(btp->hdr.nused > 0);

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		return btp->u.leaf_key[0].first_page;
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		return btp->u.internal_key[0].first_page;
	}
}

/*
 * Get a page from the btree recycle list for use as a btree page.
 */
static FreePageBtree *
FreePageBtreeGetRecycled(FreePageManager *fpm)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *newhead;

	Assert(victim != NULL);
	newhead = relptr_access(base, victim->next);
	if (newhead != NULL)
		relptr_copy(newhead->prev, victim->prev);
	relptr_store(base, fpm->btree_recycle, newhead);
	Assert(fpm_pointer_is_page_aligned(base, victim));
	fpm->btree_recycle_count--;
	return (FreePageBtree *) victim;
}

/*
 * Insert an item into an internal page.
 */
static void
FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
							Size first_page, FreePageBtree *child)
{
	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
			sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
	btp->u.internal_key[index].first_page = first_page;
	relptr_store(base, btp->u.internal_key[index].child, child);
	++btp->hdr.nused;
}

/*
 * Insert an item into a leaf page.
 */
static void
FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
						Size npages)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
	Assert(index <= btp->hdr.nused);
	memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
			sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
	btp->u.leaf_key[index].first_page = first_page;
	btp->u.leaf_key[index].npages = npages;
	++btp->hdr.nused;
}

/*
 * Put a page on the btree recycle list.
 */
static void
FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
	FreePageSpanLeader *span;

	span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
	span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
	span->npages = 1;
	relptr_store(base, span->next, head);
	relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
	if (head != NULL)
		relptr_store(base, head->prev, span);
	relptr_store(base, fpm->btree_recycle, span);
	fpm->btree_recycle_count++;
}

/*
 * Remove an item from the btree at the given position on the given page.
 */
static void
FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
{
	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(index < btp->hdr.nused);

	/* When last item is removed, extirpate entire page from btree. */
	if (btp->hdr.nused == 1)
	{
		FreePageBtreeRemovePage(fpm, btp);
		return;
	}

	/* Physically remove the key from the page. */
	--btp->hdr.nused;
	if (index < btp->hdr.nused)
		memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
				sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));

	/* If we just removed the first key, adjust ancestor keys. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, btp);

	/* Consider whether to consolidate this page with a sibling. */
	FreePageBtreeConsolidate(fpm, btp);
}

/*
 * Remove a page from the btree.  Caller is responsible for having relocated
 * any keys from this page that are still wanted.  The page is placed on the
 * recycled list.
 */
static void
FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *parent;
	Size		index;
	Size		first_page;

	for (;;)
	{
		/* Find parent page. */
		parent = relptr_access(base, btp->hdr.parent);
		if (parent == NULL)
		{
			/* We are removing the root page. */
			relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
			fpm->btree_depth = 0;
			Assert(fpm->singleton_first_page == 0);
			Assert(fpm->singleton_npages == 0);
			return;
		}

		/*
		 * If the parent contains only one item, we need to remove it as well.
		 */
		if (parent->hdr.nused > 1)
			break;
		FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
		btp = parent;
	}

	/* Find and remove the downlink. */
	first_page = FreePageBtreeFirstKey(btp);
	if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
	{
		index = FreePageBtreeSearchLeaf(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.leaf_key[index],
					&parent->u.leaf_key[index + 1],
					sizeof(FreePageBtreeLeafKey)
					* (parent->hdr.nused - index - 1));
	}
	else
	{
		index = FreePageBtreeSearchInternal(parent, first_page);
		Assert(index < parent->hdr.nused);
		if (index < parent->hdr.nused - 1)
			memmove(&parent->u.internal_key[index],
					&parent->u.internal_key[index + 1],
					sizeof(FreePageBtreeInternalKey)
					* (parent->hdr.nused - index - 1));
	}
	parent->hdr.nused--;
	Assert(parent->hdr.nused > 0);

	/* Recycle the page. */
	FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));

	/* Adjust ancestor keys if needed. */
	if (index == 0)
		FreePageBtreeAdjustAncestorKeys(fpm, parent);

	/* Consider whether to consolidate the parent with a sibling. */
	FreePageBtreeConsolidate(fpm, parent);
}

/*
 * Search the btree for an entry for the given first page and initialize
 * *result with the results of the search.  result->page and result->index
 * indicate either the position of an exact match or the position at which
 * the new key should be inserted.  result->found is true for an exact match,
 * otherwise false.  result->split_pages will contain the number of additional
 * btree pages that will be needed when performing a split to insert a key.
 * Except as described above, the contents of fields in the result object are
 * undefined on return.
 */
static void
FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
					FreePageBtreeSearchResult *result)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageBtree *btp = relptr_access(base, fpm->btree_root);
	Size		index;

	result->split_pages = 1;

	/* If the btree is empty, there's nothing to find. */
	if (btp == NULL)
	{
		result->page = NULL;
		result->found = false;
		return;
	}

	/* Descend until we hit a leaf. */
	while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		FreePageBtree *child;
		bool		found_exact;

		index = FreePageBtreeSearchInternal(btp, first_page);
		found_exact = index < btp->hdr.nused &&
			btp->u.internal_key[index].first_page == first_page;

		/*
		 * If we found an exact match we descend directly.  Otherwise, we
		 * descend into the child to the left if possible so that we can find
		 * the insertion point at that child's high end.
		 */
		if (!found_exact && index > 0)
			--index;

		/* Track required split depth for leaf insert. */
		if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
		{
			Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
			result->split_pages++;
		}
		else
			result->split_pages = 0;

		/* Descend to appropriate child page. */
		Assert(index < btp->hdr.nused);
		child = relptr_access(base, btp->u.internal_key[index].child);
		Assert(relptr_access(base, child->hdr.parent) == btp);
		btp = child;
	}

	/* Track required split depth for leaf insert. */
	if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
	{
		Assert(btp->hdr.nused == FPM_ITEMS_PER_LEAF_PAGE);
		result->split_pages++;
	}
	else
		result->split_pages = 0;

	/* Search leaf page. */
	index = FreePageBtreeSearchLeaf(btp, first_page);

	/* Assemble results. */
	result->page = btp;
	result->index = index;
	result->found = index < btp->hdr.nused &&
		first_page == btp->u.leaf_key[index].first_page;
}

/*
 * Search an internal page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
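 *
 * For example, given keys {4, 10, 27}, a search for 10 returns index 1, a
 * search for 15 returns index 2 (the slot holding 27), and a search for 30
 * returns 3, one past the last key.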
 */
static Size
FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.internal_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}

/*
 * Search a leaf page for the first key greater than or equal to a given
 * page number.  Returns the index of that key, or one greater than the number
 * of keys on the page if none.
 */
static Size
FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
{
	Size		low = 0;
	Size		high = btp->hdr.nused;

	Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
	Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);

	while (low < high)
	{
		Size		mid = (low + high) / 2;
		Size		val = btp->u.leaf_key[mid].first_page;

		if (first_page == val)
			return mid;
		else if (first_page < val)
			high = mid;
		else
			low = mid + 1;
	}

	return low;
}

/*
 * Allocate a new btree page and move half the keys from the provided page
 * to the new page.  Caller is responsible for making sure that there's a
 * page available from fpm->btree_recycle.  Returns a pointer to the new page,
 * to which caller must add a downlink.
 */
static FreePageBtree *
FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
{
	FreePageBtree *newsibling;

	newsibling = FreePageBtreeGetRecycled(fpm);
	newsibling->hdr.magic = btp->hdr.magic;
	newsibling->hdr.nused = btp->hdr.nused / 2;
	relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
	btp->hdr.nused -= newsibling->hdr.nused;

	if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
		memcpy(&newsibling->u.leaf_key,
			   &btp->u.leaf_key[btp->hdr.nused],
			   sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
	else
	{
		Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
		memcpy(&newsibling->u.internal_key,
			   &btp->u.internal_key[btp->hdr.nused],
			   sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
		FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
	}

	return newsibling;
}

/*
 * When internal pages are split or merged, the parent pointers of their
 * children must be updated.
 */
static void
FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
{
	Size		i;

	Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
	for (i = 0; i < btp->hdr.nused; ++i)
	{
		FreePageBtree *child;

		child = relptr_access(base, btp->u.internal_key[i].child);
		relptr_store(base, child->hdr.parent, btp);
	}
}

/*
 * Debugging dump of btree data.
 */
static void
FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
						 FreePageBtree *parent, int level, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);
	Size		pageno = fpm_pointer_to_page(base, btp);
	Size		index;
	FreePageBtree *check_parent;

	check_stack_depth();
	check_parent = relptr_access(base, btp->hdr.parent);
	appendStringInfo(buf, "  %zu@%d %c", pageno, level,
					 btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
	if (parent != check_parent)
		appendStringInfo(buf, " [actual parent %zu, expected %zu]",
						 fpm_pointer_to_page(base, check_parent),
						 fpm_pointer_to_page(base, parent));
	appendStringInfoChar(buf, ':');
	for (index = 0; index < btp->hdr.nused; ++index)
	{
		if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
			appendStringInfo(buf, " %zu->%zu",
							 btp->u.internal_key[index].first_page,
				btp->u.internal_key[index].child.relptr_off / FPM_PAGE_SIZE);
		else
			appendStringInfo(buf, " %zu(%zu)",
							 btp->u.leaf_key[index].first_page,
							 btp->u.leaf_key[index].npages);
	}
	appendStringInfo(buf, "\n");

	if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
	{
		for (index = 0; index < btp->hdr.nused; ++index)
		{
			FreePageBtree *child;

			child = relptr_access(base, btp->u.internal_key[index].child);
			FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
		}
	}
}

/*
 * Debugging dump of free-span data.
 */
static void
FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
						 Size expected_pages, StringInfo buf)
{
	char	   *base = fpm_segment_base(fpm);

	while (span != NULL)
	{
		if (span->npages != expected_pages)
			appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
							 span->npages);
		else
			appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
		span = relptr_access(base, span->next);
	}

	appendStringInfo(buf, "\n");
}

/*
 * This function allocates a run of pages of the given length from the free
 * page manager.
 */
static bool
FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
{
	char	   *base = fpm_segment_base(fpm);
	FreePageSpanLeader *victim = NULL;
	FreePageSpanLeader *prev;
	FreePageSpanLeader *next;
	FreePageBtreeSearchResult result;
	Size		victim_page = 0;	/* placate compiler */
	Size		f;

	/*
	 * Search for a free span.
	 *
	 * Right now, we use a simple best-fit policy here, but it's possible for
	 * this to result in memory fragmentation if we're repeatedly asked to
	 * allocate chunks just a little smaller than what we have available.
	 * Hopefully, this is unlikely, because we expect most requests to be
	 * single pages or superblock-sized chunks -- but no policy can be optimal
	 * under all circumstances unless it has knowledge of future allocation
	 * patterns.
	 */
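	/*
	 * For example, a request for 3 pages starts the scan at freelist index
	 * 2, which holds spans of exactly 3 pages; a request for
	 * FPM_NUM_FREELISTS pages or more goes straight to the final, oversized
	 * list.
	 */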
    for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
    {
        /* Skip empty freelists. */
        if (relptr_is_null(fpm->freelist[f]))
            continue;

        /*
         * All of the freelists except the last one contain only items of a
         * single size, so we just take the first one.  But the final free
         * list contains everything too big for any of the other lists, so we
         * need to search the list.
         */
        if (f < FPM_NUM_FREELISTS - 1)
            victim = relptr_access(base, fpm->freelist[f]);
        else
        {
            FreePageSpanLeader *candidate;

            candidate = relptr_access(base, fpm->freelist[f]);
            do
            {
                if (candidate->npages >= npages && (victim == NULL ||
                                                    victim->npages > candidate->npages))
                {
                    victim = candidate;
                    if (victim->npages == npages)
                        break;
                }
                candidate = relptr_access(base, candidate->next);
            } while (candidate != NULL);
        }
        break;
    }
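
    /*
     * On the final list, the scan above is true best fit: given oversized
     * spans of, say, 300 and 500 pages, a request for 280 pages selects the
     * 300-page span, and an exact match ends the scan early.
     */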

    /* If we didn't find an allocatable span, return failure. */
    if (victim == NULL)
        return false;

    /* Remove span from free list. */
    Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
    prev = relptr_access(base, victim->prev);
    next = relptr_access(base, victim->next);
    if (prev != NULL)
        relptr_copy(prev->next, victim->next);
    else
        relptr_copy(fpm->freelist[f], victim->next);
    if (next != NULL)
        relptr_copy(next->prev, victim->prev);
    victim_page = fpm_pointer_to_page(base, victim);

    /* Decide whether we might be invalidating contiguous_pages. */
    if (f == FPM_NUM_FREELISTS - 1 &&
        victim->npages == fpm->contiguous_pages)
    {
        /*
         * The victim span came from the oversized freelist, and had the same
         * size as the longest span.  There may or may not be another one of
         * the same size, so contiguous_pages must be recomputed just to be
         * safe.
         */
        fpm->contiguous_pages_dirty = true;
    }
    else if (f + 1 == fpm->contiguous_pages &&
             relptr_is_null(fpm->freelist[f]))
    {
        /*
         * The victim span came from a fixed-size freelist, and it was the
         * list for spans of the same size as the current longest span, and
         * the list is now empty after removing the victim.  So
         * contiguous_pages must be recomputed without a doubt.
         */
        fpm->contiguous_pages_dirty = true;
    }
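
    /*
     * For example, if contiguous_pages is 7 and the victim was the last
     * 7-page span on freelist[6], the cached maximum may now be stale;
     * likewise when an oversized victim's length equals contiguous_pages.
     * Either way, the value must be recomputed before it is next relied on.
     */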

    /*
     * If we haven't initialized the btree yet, the victim must be the single
     * span stored within the FreePageManager itself.  Otherwise, we need to
     * update the btree.
     */
    if (relptr_is_null(fpm->btree_root))
    {
        Assert(victim_page == fpm->singleton_first_page);
        Assert(victim->npages == fpm->singleton_npages);
        Assert(victim->npages >= npages);
        fpm->singleton_first_page += npages;
        fpm->singleton_npages -= npages;
        if (fpm->singleton_npages > 0)
            FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                   fpm->singleton_npages);
    }
    else
    {
        /*
         * If the span we found is exactly the right size, remove it from the
         * btree completely.  Otherwise, adjust the btree entry to reflect the
         * still-unallocated portion of the span, and put that portion on the
         * appropriate free list.
         */
        FreePageBtreeSearch(fpm, victim_page, &result);
        Assert(result.found);
        if (victim->npages == npages)
            FreePageBtreeRemove(fpm, result.page, result.index);
        else
        {
            FreePageBtreeLeafKey *key;

            /* Adjust btree to reflect remaining pages. */
            Assert(victim->npages > npages);
            key = &result.page->u.leaf_key[result.index];
            Assert(key->npages == victim->npages);
            key->first_page += npages;
            key->npages -= npages;
            if (result.index == 0)
                FreePageBtreeAdjustAncestorKeys(fpm, result.page);

            /* Put the unallocated pages back on the appropriate free list. */
            FreePagePushSpanLeader(fpm, victim_page + npages,
                                   victim->npages - npages);
        }
    }

    /* Return results to caller. */
    *first_page = fpm_pointer_to_page(base, victim);
    return true;
}
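
/*
 * Callers normally reach the routine above through the public wrapper.  A
 * minimal sketch of the intended usage pattern, assuming a FreePageManager
 * that already has pages under management:
 *
 *      Size        first_page;
 *
 *      if (FreePageManagerGet(fpm, 4, &first_page))
 *      {
 *          char       *p = fpm_page_to_pointer(fpm_segment_base(fpm),
 *                                              first_page);
 *
 *          (use the 4 * FPM_PAGE_SIZE bytes starting at p, then return
 *           them with FreePageManagerPut(fpm, first_page, 4))
 *      }
 */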

/*
 * Put a range of pages into the btree and freelists, consolidating it with
 * existing free spans just before and/or after it.  If 'soft' is true,
 * only perform the insertion if it can be done without allocating new btree
 * pages; if false, do it always.  Returns 0 if the soft flag caused the
 * insertion to be skipped; otherwise, returns the size of the contiguous
 * span created by the insertion.  This may be larger than npages if we're
 * able to consolidate with an adjacent range; for example, putting back
 * pages 10..19 while pages 0..9 and 20..29 are already free yields a single
 * 30-page span, and 30 is returned.  Note that the insertion may consume
 * pages for the btree's own bookkeeping, which can invalidate the cached
 * size of the largest run (see the handling of contiguous_pages_dirty in
 * FreePageManagerGetInternal).
 */
static Size
FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
                           bool soft)
{
    char       *base = fpm_segment_base(fpm);
    FreePageBtreeSearchResult result;
    FreePageBtreeLeafKey *prevkey = NULL;
    FreePageBtreeLeafKey *nextkey = NULL;
    FreePageBtree *np;
    Size        nindex;

    Assert(npages > 0);

    /* We can store a single free span without initializing the btree. */
    if (fpm->btree_depth == 0)
    {
        if (fpm->singleton_npages == 0)
        {
            /* Don't have a span yet; store this one. */
            fpm->singleton_first_page = first_page;
            fpm->singleton_npages = npages;
            FreePagePushSpanLeader(fpm, first_page, npages);
            return fpm->singleton_npages;
        }
        else if (fpm->singleton_first_page + fpm->singleton_npages ==
                 first_page)
        {
            /* New span immediately follows sole existing span. */
            fpm->singleton_npages += npages;
            FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
            FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                   fpm->singleton_npages);
            return fpm->singleton_npages;
        }
        else if (first_page + npages == fpm->singleton_first_page)
        {
            /* New span immediately precedes sole existing span. */
            FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
            fpm->singleton_first_page = first_page;
            fpm->singleton_npages += npages;
            FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
                                   fpm->singleton_npages);
            return fpm->singleton_npages;
        }
        else
        {
            /* Not contiguous; we need to initialize the btree. */
            Size        root_page;
            FreePageBtree *root;

            if (!relptr_is_null(fpm->btree_recycle))
                root = FreePageBtreeGetRecycled(fpm);
            else if (FreePageManagerGetInternal(fpm, 1, &root_page))
                root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
            else
            {
                /* We'd better be able to get a page from the existing range. */
                elog(FATAL, "free page manager btree is corrupt");
            }

            /* Create the btree and move the preexisting range into it. */
            root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
            root->hdr.nused = 1;
            relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
            root->u.leaf_key[0].first_page = fpm->singleton_first_page;
            root->u.leaf_key[0].npages = fpm->singleton_npages;
            relptr_store(base, fpm->btree_root, root);
            fpm->singleton_first_page = 0;
            fpm->singleton_npages = 0;
            fpm->btree_depth = 1;

            /*
             * Corner case: it may be that the btree root took the very last
             * free page.  In that case, the sole btree entry covers a zero
             * page run, which is invalid.  Overwrite it with the entry we're
             * trying to insert and get out.
             */
            if (root->u.leaf_key[0].npages == 0)
            {
                root->u.leaf_key[0].first_page = first_page;
                root->u.leaf_key[0].npages = npages;
                FreePagePushSpanLeader(fpm, first_page, npages);
                return npages;
            }

            /* Fall through to insert the new key. */
        }
    }
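
    /*
     * So, for example, a singleton span covering pages 100..109 simply grows
     * to 100..119 when pages 110..119 are put back; only putting back a
     * non-adjacent range, such as pages 200..204, forces the btree into
     * existence above.
     */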

    /* Search the btree. */
    FreePageBtreeSearch(fpm, first_page, &result);
    Assert(!result.found);
    if (result.index > 0)
        prevkey = &result.page->u.leaf_key[result.index - 1];
    if (result.index < result.page->hdr.nused)
    {
        np = result.page;
        nindex = result.index;
        nextkey = &result.page->u.leaf_key[result.index];
    }
    else
    {
        np = FreePageBtreeFindRightSibling(base, result.page);
        nindex = 0;
        if (np != NULL)
            nextkey = &np->u.leaf_key[0];
    }

    /* Consolidate with the previous entry if possible. */
    if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
    {
        bool        remove_next = false;
        Size        result;

        Assert(prevkey->first_page + prevkey->npages == first_page);
        prevkey->npages = (first_page - prevkey->first_page) + npages;

        /* Check whether we can *also* consolidate with the following entry. */
        if (nextkey != NULL &&
            prevkey->first_page + prevkey->npages >= nextkey->first_page)
        {
            Assert(prevkey->first_page + prevkey->npages ==
                   nextkey->first_page);
            prevkey->npages = (nextkey->first_page - prevkey->first_page)
                + nextkey->npages;
            FreePagePopSpanLeader(fpm, nextkey->first_page);
            remove_next = true;
        }

        /* Put the span on the correct freelist and save size. */
        FreePagePopSpanLeader(fpm, prevkey->first_page);
        FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
        result = prevkey->npages;

        /*
         * If we consolidated with both the preceding and following entries,
         * we must remove the following entry.  We do this last, because
         * removing an element from the btree may invalidate pointers we hold
         * into the current data structure.
         *
         * NB: The btree is technically in an invalid state at this point
         * because we've already updated prevkey to cover the same key space
         * as nextkey.  FreePageBtreeRemove() shouldn't notice that, though.
         */
        if (remove_next)
            FreePageBtreeRemove(fpm, np, nindex);

        return result;
    }

    /* Consolidate with the next entry if possible. */
    if (nextkey != NULL && first_page + npages >= nextkey->first_page)
    {
        Size        newpages;

        /* Compute new size for span. */
        Assert(first_page + npages == nextkey->first_page);
        newpages = (nextkey->first_page - first_page) + nextkey->npages;

        /* Put span on correct free list. */
        FreePagePopSpanLeader(fpm, nextkey->first_page);
        FreePagePushSpanLeader(fpm, first_page, newpages);

        /* Update key in place. */
        nextkey->first_page = first_page;
        nextkey->npages = newpages;

        /* If reducing first key on page, ancestors might need adjustment. */
        if (nindex == 0)
            FreePageBtreeAdjustAncestorKeys(fpm, np);

        return nextkey->npages;
    }

    /* Split leaf page and as many of its ancestors as necessary. */
    if (result.split_pages > 0)
    {
        /*
         * NB: We could consider various coping strategies here to avoid a
         * split; most obviously, if np != result.page, we could target that
         * page instead.  More complicated shuffling strategies could be
         * possible as well; basically, unless every single leaf page is 100%
         * full, we can jam this key in there if we try hard enough.  It's
         * unlikely that trying that hard is worthwhile, but it's possible we
         * might need to make more than no effort.  For now, we just do the
         * easy thing, which is nothing.
         */

        /* If this is a soft insert, it's time to give up. */
        if (soft)
            return 0;

        /* Check whether we need to allocate more btree pages to split. */
        if (result.split_pages > fpm->btree_recycle_count)
        {
            Size        pages_needed;
            Size        recycle_page;
            Size        i;

            /*
             * Allocate the required number of pages and add each one to the
             * btree recycle list.  This should never fail, because if we've
             * got enough spans of free pages kicking around that we need
             * additional storage space just to remember them all, then we
             * should certainly have enough to expand the btree, which should
             * only ever use a tiny number of pages compared to the number
             * under management.  If it does fail, something's badly screwed
             * up.
             */
            pages_needed = result.split_pages - fpm->btree_recycle_count;
            for (i = 0; i < pages_needed; ++i)
            {
                if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
                    elog(FATAL, "free page manager btree is corrupt");
                FreePageBtreeRecycle(fpm, recycle_page);
            }

            /*
             * The act of allocating pages to recycle may have invalidated
             * the results of our previous btree search, so repeat it.  (We
             * could recheck whether any of the split-avoidance strategies
             * that weren't viable before have become so, but it hardly seems
             * worthwhile, so we don't bother.  Consolidation can't be
             * possible now if it wasn't previously.)
             */
            FreePageBtreeSearch(fpm, first_page, &result);

            /*
             * The act of allocating pages for use in constructing our btree
             * should never cause any page to become more full, so the new
             * split depth should be no greater than the old one, and perhaps
             * less if we fortuitously allocated a chunk that freed up a slot
             * on the page we need to update.
             */
            Assert(result.split_pages <= fpm->btree_recycle_count);
        }

        /* If we still need to perform a split, do it. */
        if (result.split_pages > 0)
        {
            FreePageBtree *split_target = result.page;
            FreePageBtree *child = NULL;
            Size        key = first_page;

            for (;;)
            {
                FreePageBtree *newsibling;
                FreePageBtree *parent;

                /* Identify parent page, which must receive downlink. */
                parent = relptr_access(base, split_target->hdr.parent);

                /* Split the page - downlink not added yet. */
                newsibling = FreePageBtreeSplitPage(fpm, split_target);

                /*
                 * At this point in the loop, we're always carrying a pending
                 * insertion.  On the first pass, it's the actual key we're
                 * trying to insert; on subsequent passes, it's the downlink
                 * that needs to be added as a result of the split performed
                 * during the previous loop iteration.  Since we've just split
                 * the page, there's definitely room on one of the two
                 * resulting pages.
                 */
                if (child == NULL)
                {
                    Size        index;
                    FreePageBtree *insert_into;

                    insert_into = key < newsibling->u.leaf_key[0].first_page ?
                        split_target : newsibling;
                    index = FreePageBtreeSearchLeaf(insert_into, key);
                    FreePageBtreeInsertLeaf(insert_into, index, key, npages);
                    if (index == 0 && insert_into == split_target)
                        FreePageBtreeAdjustAncestorKeys(fpm, split_target);
                }
                else
                {
                    Size        index;
                    FreePageBtree *insert_into;

                    insert_into =
                        key < newsibling->u.internal_key[0].first_page ?
                        split_target : newsibling;
                    index = FreePageBtreeSearchInternal(insert_into, key);
                    FreePageBtreeInsertInternal(base, insert_into, index,
                                                key, child);
                    relptr_store(base, child->hdr.parent, insert_into);
                    if (index == 0 && insert_into == split_target)
                        FreePageBtreeAdjustAncestorKeys(fpm, split_target);
                }

                /* If the page we just split has no parent, split the root. */
                if (parent == NULL)
                {
                    FreePageBtree *newroot;

                    newroot = FreePageBtreeGetRecycled(fpm);
                    newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
                    newroot->hdr.nused = 2;
                    relptr_store(base, newroot->hdr.parent,
                                 (FreePageBtree *) NULL);
                    newroot->u.internal_key[0].first_page =
                        FreePageBtreeFirstKey(split_target);
                    relptr_store(base, newroot->u.internal_key[0].child,
                                 split_target);
                    relptr_store(base, split_target->hdr.parent, newroot);
                    newroot->u.internal_key[1].first_page =
                        FreePageBtreeFirstKey(newsibling);
                    relptr_store(base, newroot->u.internal_key[1].child,
                                 newsibling);
                    relptr_store(base, newsibling->hdr.parent, newroot);
                    relptr_store(base, fpm->btree_root, newroot);
                    fpm->btree_depth++;

                    break;
                }
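
                /*
                 * To visualize the root split handled just above: splitting
                 * a full root T into T plus a new sibling S yields
                 *
                 *      [T]   becomes   [newroot]
                 *                       /      \
                 *                     [T]      [S]
                 *
                 * with each downlink carrying its child's first key and
                 * btree_depth increasing by one.
                 */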

                /* If the parent page isn't full, insert the downlink. */
                key = newsibling->u.internal_key[0].first_page;
                if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
                {
                    Size        index;

                    index = FreePageBtreeSearchInternal(parent, key);
                    FreePageBtreeInsertInternal(base, parent, index,
                                                key, newsibling);
                    relptr_store(base, newsibling->hdr.parent, parent);
                    if (index == 0)
                        FreePageBtreeAdjustAncestorKeys(fpm, parent);
                    break;
                }

                /* The parent also needs to be split, so loop around. */
                child = newsibling;
                split_target = parent;
            }

            /*
             * The loop above did the insert, so we just need to update the
             * free list, and we're done.
             */
            FreePagePushSpanLeader(fpm, first_page, npages);

            return npages;
        }
    }

    /* Physically add the key to the page. */
    Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
    FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);

    /* If new first key on page, ancestors might need adjustment. */
    if (result.index == 0)
        FreePageBtreeAdjustAncestorKeys(fpm, result.page);

    /* Put it on the free list. */
    FreePagePushSpanLeader(fpm, first_page, npages);

    return npages;
}

/*
 * Remove a FreePageSpanLeader from the linked list that contains it, either
 * because we're changing the size of the span, or because we're allocating it.
 */
static void
FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
{
    char       *base = fpm_segment_base(fpm);
    FreePageSpanLeader *span;
    FreePageSpanLeader *next;
    FreePageSpanLeader *prev;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);

    next = relptr_access(base, span->next);
    prev = relptr_access(base, span->prev);
    if (next != NULL)
        relptr_copy(next->prev, span->prev);
    if (prev != NULL)
        relptr_copy(prev->next, span->next);
    else
    {
        Size        f = Min(span->npages, FPM_NUM_FREELISTS) - 1;

        Assert(fpm->freelist[f].relptr_off == pageno * FPM_PAGE_SIZE);
        relptr_copy(fpm->freelist[f], span->next);
    }
}
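
/*
 * Note that the Assert above relies on relative pointers being byte offsets
 * from the segment base: the leader of a span beginning at page pageno lives
 * at byte offset pageno * FPM_PAGE_SIZE, so when the span has no predecessor,
 * the freelist head must point exactly there.
 */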

/*
 * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
 */
static void
FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
{
    char       *base = fpm_segment_base(fpm);
    Size        f = Min(npages, FPM_NUM_FREELISTS) - 1;
    FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
    FreePageSpanLeader *span;

    span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
    span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
    span->npages = npages;
    relptr_store(base, span->next, head);
    relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
    if (head != NULL)
        relptr_store(base, head->prev, span);
    relptr_store(base, fpm->freelist[f], span);
}
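
/*
 * A sketch of how the push and pop helpers cooperate (illustrative; base and
 * f are as in the functions above): resizing a free span amounts to popping
 * it and pushing it back at its new size,
 *
 *      FreePagePopSpanLeader(fpm, first_page);
 *      FreePagePushSpanLeader(fpm, first_page, new_npages);
 *
 * and walking one freelist follows the next pointers until they run out:
 *
 *      FreePageSpanLeader *span = relptr_access(base, fpm->freelist[f]);
 *
 *      while (span != NULL)
 *      {
 *          Assert(span->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
 *          span = relptr_access(base, span->next);
 *      }
 */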