/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *    Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "utils/rel.h"

/*
 *  _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *      This routine is called by the public interface routines, hashbuild
 *      and hashinsert.  By the time it is called, itup has been completely
 *      filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
    Buffer      buf;
    Buffer      metabuf;
    HashMetaPage metap;
    BlockNumber blkno;
    BlockNumber oldblkno = InvalidBlockNumber;
    bool        retry = false;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleDSize(*itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

    /* Read the metapage */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));
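
    /*
     * We keep our pin on the metapage for the rest of this function, so
     * that later steps can cheaply re-lock it to recheck the bucket
     * mapping and to bump the tuple count.
     */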

    /*
     * Check whether the item can fit on a hash page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize((Page) metap))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize((Page) metap)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    /*
     * Loop until we get a lock on the correct target bucket.
     */
    for (;;)
    {
        /*
         * Compute the target bucket number, and convert to block number.
         */
        bucket = _hash_hashkey2bucket(hashkey,
                                      metap->hashm_maxbucket,
                                      metap->hashm_highmask,
                                      metap->hashm_lowmask);

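        /*
         * BUCKET_TO_BLKNO maps the bucket number to the block number of
         * the bucket's primary page: one block for the metapage, plus the
         * bucket and overflow pages allocated before this bucket's
         * split-point group (accounted for by hashm_spares[]).  See hash.h
         * for the authoritative definition.
         */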
        blkno = BUCKET_TO_BLKNO(metap, bucket);

        /* Release metapage lock, but keep pin. */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

        /*
         * If the previous iteration of this loop locked what is still the
         * correct target bucket, we are done.  Otherwise, drop any old lock
         * and lock what now appears to be the correct bucket.
         */
        if (retry)
        {
            if (oldblkno == blkno)
                break;
            _hash_droplock(rel, oldblkno, HASH_SHARE);
        }
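
        /*
         * Note that this is a heavyweight lmgr lock on the bucket's
         * primary block number, not a buffer content lock.  Holding it in
         * share mode keeps a concurrent bucket split, which takes the same
         * lock in exclusive mode, from moving tuples around underneath us.
         */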
        _hash_getlock(rel, blkno, HASH_SHARE);

        /*
         * Reacquire metapage lock and check that no bucket split has taken
         * place while we were awaiting the bucket lock.
         */
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
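
        /*
         * The recheck happens implicitly at the top of the next loop
         * iteration, where the target bucket is recomputed from the
         * possibly-updated hashm_maxbucket and mask fields.
         */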
        oldblkno = blkno;
        retry = true;
    }

    /* Fetch the primary bucket page for the bucket */
    buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
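
    /*
     * Since we hold the bucket's heavyweight lock, no split can have
     * relocated the bucket since we computed blkno, so its primary page
     * must still carry the expected bucket number.
     */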
    Assert(pageopaque->hasho_bucket == bucket);

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        /*
         * no space on this page; check for an overflow page
         */
        BlockNumber nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * An overflow page exists; go get it.  If it doesn't have
             * room, we'll find out on the next pass through the loop test
             * above.
             */
            _hash_relbuf(rel, buf);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * We're at the end of the bucket chain and we haven't found a
             * page with enough room.  Allocate a new overflow page.
             */

            /*
             * Release our write lock without marking the buffer dirty:
             * passing HASH_READ (rather than HASH_WRITE) as the old access
             * level tells _hash_chgbufaccess that we did not modify the
             * page.
             */
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf);
            page = BufferGetPage(buf);
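
            /*
             * _hash_addovflpage (see hashovfl.c) released our pin on the
             * old tail page and returned the new, empty overflow page
             * pinned and write-locked.
             */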

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /* found page with enough space, so add the item here */
    (void) _hash_pgaddtup(rel, buf, itemsz, itup);

    /* write and release the modified page */
    _hash_wrtbuf(rel, buf);

    /* We can drop the bucket lock now */
    _hash_droplock(rel, blkno, HASH_SHARE);

    /*
     * Write-lock the metapage so we can increment the tuple count. After
     * incrementing it, check to see if it's time for a split.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
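
    /*
     * For example, with hashm_ffactor = 75 and hashm_maxbucket = 3 (four
     * buckets), the threshold is 300 tuples, so inserting the 301st tuple
     * triggers a split attempt.  (Illustrative numbers only; the actual
     * fill factor is fixed when the index is created.)
     */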

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /*
     * Attempt to split if a split is needed.  _hash_expandtable does this
     * on a best-effort basis: it gives up silently if it can't obtain the
     * locks it needs.
     */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}

/*
 *  _hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call pgaddtup() without a pin and write lock
 * on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    /* Find where to insert the tuple (preserving page's hashkey ordering) */
    hashkey = _hash_get_indextuple_hashkey(itup);
    itup_off = _hash_binsearch(page, hashkey);

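    /*
     * _hash_binsearch returned the offset of the first entry whose hash
     * key is >= ours (or one past the last entry if ours is the largest),
     * so inserting at that offset preserves the ascending hashkey order.
     *
     * Both boolean arguments to PageAddItem ("overwrite" and "is_heap")
     * are false: we are inserting a fresh index entry at itup_off,
     * shifting any later entries up by one position.
     */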
    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}