gist-fast-build-v0.8-lobotomized-1.patch
text/x-diff
Filename: gist-fast-build-v0.8-lobotomized-1.patch
Type: text/x-diff
Part: 0
Message:
Re: WIP: Fast GiST index build
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 4657425..d829243 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -30,6 +30,9 @@
#include "utils/memutils.h"
#include "utils/rel.h"
+
+static void validateBufferingOption(char *value);
+
/*
* Contents of pg_class.reloptions
*
@@ -66,6 +69,14 @@ static relopt_bool boolRelOpts[] =
},
true
},
+ {
+ {
+ "neighborrelocation",
+ "Enables relocation of index tuples into neighbor node buffers in GiST index buffering build",
+ RELOPT_KIND_GIST
+ },
+ true
+ },
/* list terminator */
{{NULL}}
};
@@ -159,6 +170,22 @@ static relopt_int intRelOpts[] =
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
}, -1, 0, 2000000000
},
+ {
+ {
+ "levelstep",
+ "Level step in GiST index buffering build",
+ RELOPT_KIND_GIST
+ },
+ -1, 1, 100
+ },
+ {
+ {
+ "buffersize",
+ "Buffer size in GiST index buffering build",
+ RELOPT_KIND_GIST
+ },
+ -1, 1, 1000000000
+ },
/* list terminator */
{{NULL}}
};
@@ -219,6 +246,17 @@ static relopt_real realRelOpts[] =
static relopt_string stringRelOpts[] =
{
+ {
+ {
+ "buffering",
+ "Enables buffering build for this GiST index",
+ RELOPT_KIND_GIST
+ },
+ 4,
+ false,
+ validateBufferingOption,
+ "auto"
+ },
/* list terminator */
{{NULL}}
};
@@ -1282,3 +1320,22 @@ tablespace_reloptions(Datum reloptions, bool validate)
return (bytea *) tsopts;
}
+
+/*
+ * Validator for "buffering" option of GiST indexed. Allows only "on", "off" and
+ * "auto" values.
+ */
+static void
+validateBufferingOption(char *value)
+{
+ if (!value ||
+ (
+ strcmp(value, "on") &&
+ strcmp(value, "off") &&
+ strcmp(value, "auto")
+ )
+ )
+ {
+ elog(ERROR, "Only \"on\", \"off\" and \"auto\" values are available for \"buffering\" option.");
+ }
+}
diff --git a/src/backend/access/gist/Makefile b/src/backend/access/gist/Makefile
index f8051a2..cc9468f 100644
--- a/src/backend/access/gist/Makefile
+++ b/src/backend/access/gist/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \
- gistproc.o gistsplit.o
+ gistproc.o gistsplit.o gistbuild.o gistbuildbuffers.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 2d78dcb..45f8fc9 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -24,6 +24,7 @@ The current implementation of GiST supports:
* provides NULL-safe interface to GiST core
* Concurrency
* Recovery support via WAL logging
+ * Buffering build algorithm
The support for concurrency implemented in PostgreSQL was developed based on
the paper "Access Methods for Next-Generation Database Systems" by
@@ -31,6 +32,12 @@ Marcel Kornaker:
http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
+Buffering build algorithm for GiST was developed based on the paper "Efficient
+Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold
+and Jeffrey Scott Vitter.
+
+ http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf
+
The original algorithms were modified in several ways:
* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
@@ -278,6 +285,113 @@ would complicate the insertion algorithm. So when an insertion sees a page
with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
crashed in the middle to completion by adding the downlink in the parent.
+Buffering build algorithm
+--------------------
+
+In buffering build algorithm levels are numbering upwards and leaf pages level
+has number zero. An example is given on the picture below. Such numbering is
+used in order to make pages save it's level number during all life-time even if
+root splits.
+
+Level Tree
+
+3 *
+ / \
+2 * *
+ / | \ / | \
+1 * * * * * *
+ / \ / \ / \ / \ / \ / \
+0 o o o o o o o o o o o o
+
+* - internal page
+o - leaf page
+
+In buffering build algorithm additional buffers are associated with pages. Leaf
+pages never have buffers. Internal pages have buffers with some level step.
+I.e. pages on levels level_step*i (i >=1) have buffers. If level_step = 1 then
+each internal page have buffer.
+
+Level Tree (level_step = 1) Tree (level_step = 2)
+
+3 *(b) *
+ / \ / \
+2 *(b) *(b) *(b) *(b)
+ / | \ / | \ / | \ / | \
+1 *(b) *(b) *(b) *(b) *(b) *(b) * * * * * *
+ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \
+0 o o o o o o o o o o o o o o o o o o o o o o o o
+
+(b) - buffer
+
+Each buffer can be in one of following states:
+1) Append. New tuples are being added to the buffer. Last page of the buffer is kept in main memory
+2) Flush. Tuples are being read from the buffer. Page containing last returned tuple is kept in main memory.
+
+Buffer goes from append mode to flush mode on the first call to popItupFromNodeBuffer(), and from flush mode to append mode when the buffer is empty.
+
+When index tuple is inserting, it's first path can end in following points:
+1) Leaf page if no levels has buffers, i.e. root level <= level_step.
+2) Buffer of root page if root page has a buffer.
+3) Buffer of topmost level page if root page doesn't have a buffer.
+
+New index tuples are processing until root level buffer (or buffer of topmost
+level page) will be filled at half. When some buffer is filled at halt then
+the process of it's emptying is starting.
+
+Buffer emptying process means that index tuples from buffer are moving into
+underlying buffers(if any) or leaf pages. For buffer emptying to another buffers
+following items should be loaded into main memory:
+1) Buffer itself should be completely loaded
+2) Underlying buffers should be loaded for append
+3) Page associated with buffer
+4) Pages between buffer and underlying buffers if level_step != 1 (note that
+ pages associated with underlying buffers aren't required to be loaded)
+For emptying to leaf pages list of those items is following
+1) Buffer itself should be completely loaded
+2) Page associated with buffer
+3) Pages between buffer and leaf pages if level_step != 1
+4) Leaf pages
+Illustration of this requirements is given below.
+
+ Buffer emptying to another buffers Buffer emptying to leaf pages
+
+ +(cb) +(cb)
+ / \ / \
+ + + + +
+ / \ / \ / \ / \
+ *(ab) *(ab) *(ab) *(ab) x x x x
+
++ - loaded into main memory internal page
+x - loaded into main memory leaf page
+* - not loaded into main memory internal page
+(cb) - completely loaded buffer
+(ab) - loaded for append buffer
+
+One buffer emptying process can trigger another buffer emptying processes.
+Buffer emptying stack data structure is the data structure responsible for
+sequence of buffer emptying. Each node buffer which is half filled should be
+inserted into buffer emptying stack.
+
+When we're moving from buffer emptying on higher level to the buffer emptying
+on lower level, loaded part of tree (only pages of tree not the buffers) are
+remained in the main memory. Tree parts stack is the data structure which
+represents hierarchy of loaded tree parts.
+
+If split occurs on the page which have a buffer then index tuples are
+relocating. When neighborrelocation is off index tuples are relocating between
+buffers of pages produces by split using penalty method. This method was
+proposed in the original paper. For for reasons of index quality improvement
+another method of relocation was implemented. When neighborrelocation is on
+index tuples are relocation into both buffers of pages produces by split and
+buffers on neighbor pages (pages with same parent). This method uses more CPU
+and IO but improves index quality.
+
+When all index tuples are inserted there are still some index tuples in buffers.
+At this moment final buffer emptying starts. Each level have a list of non-empty
+buffers. Final emptying contain loop over all tree levels starting from topmost.
+On each levels all it's buffers are sequentially emptying until all buffers of
+the level are empty. Since no index tuples move upwards during buffer emptying
+all the buffers are empty when final emptying are finished.
Authors:
Teodor Sigaev <teodor@sigaev.ru>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 4fc7a21..278de32 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -24,38 +24,16 @@
#include "utils/memutils.h"
#include "utils/rel.h"
-/* Working state for gistbuild and its callback */
-typedef struct
-{
- GISTSTATE giststate;
- int numindexattrs;
- double indtuples;
- MemoryContext tmpCtx;
-} GISTBuildState;
-
/* A List of these is used represent a split-in-progress. */
typedef struct
{
Buffer buf; /* the split page "half" */
IndexTuple downlink; /* downlink for this half. */
+ bool release;
} GISTPageSplitInfo;
/* non-export function prototypes */
-static void gistbuildCallback(Relation index,
- HeapTuple htup,
- Datum *values,
- bool *isnull,
- bool tupleIsAlive,
- void *state);
-static void gistdoinsert(Relation r,
- IndexTuple itup,
- Size freespace,
- GISTSTATE *GISTstate);
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
-static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
- GISTSTATE *giststate,
- IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
- Buffer leftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo);
@@ -89,138 +67,6 @@ createTempGistContext(void)
}
/*
- * Routine to build an index. Basically calls insert over and over.
- *
- * XXX: it would be nice to implement some sort of bulk-loading
- * algorithm, but it is not clear how to do that.
- */
-Datum
-gistbuild(PG_FUNCTION_ARGS)
-{
- Relation heap = (Relation) PG_GETARG_POINTER(0);
- Relation index = (Relation) PG_GETARG_POINTER(1);
- IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- IndexBuildResult *result;
- double reltuples;
- GISTBuildState buildstate;
- Buffer buffer;
- Page page;
-
- /*
- * We expect to be called exactly once for any index relation. If that's
- * not the case, big trouble's what we have.
- */
- if (RelationGetNumberOfBlocks(index) != 0)
- elog(ERROR, "index \"%s\" already contains data",
- RelationGetRelationName(index));
-
- /* no locking is needed */
- initGISTstate(&buildstate.giststate, index);
-
- /* initialize the root page */
- buffer = gistNewBuffer(index);
- Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
- page = BufferGetPage(buffer);
-
- START_CRIT_SECTION();
-
- GISTInitBuffer(buffer, F_LEAF);
-
- MarkBufferDirty(buffer);
-
- if (RelationNeedsWAL(index))
- {
- XLogRecPtr recptr;
- XLogRecData rdata;
-
- rdata.data = (char *) &(index->rd_node);
- rdata.len = sizeof(RelFileNode);
- rdata.buffer = InvalidBuffer;
- rdata.next = NULL;
-
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
- else
- PageSetLSN(page, GetXLogRecPtrForTemp());
-
- UnlockReleaseBuffer(buffer);
-
- END_CRIT_SECTION();
-
- /* build the index */
- buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
- buildstate.indtuples = 0;
-
- /*
- * create a temporary memory context that is reset once for each tuple
- * inserted into the index
- */
- buildstate.tmpCtx = createTempGistContext();
-
- /* do the heap scan */
- reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
- gistbuildCallback, (void *) &buildstate);
-
- /* okay, all heap tuples are indexed */
- MemoryContextDelete(buildstate.tmpCtx);
-
- freeGISTstate(&buildstate.giststate);
-
- /*
- * Return statistics
- */
- result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-
- result->heap_tuples = reltuples;
- result->index_tuples = buildstate.indtuples;
-
- PG_RETURN_POINTER(result);
-}
-
-/*
- * Per-tuple callback from IndexBuildHeapScan
- */
-static void
-gistbuildCallback(Relation index,
- HeapTuple htup,
- Datum *values,
- bool *isnull,
- bool tupleIsAlive,
- void *state)
-{
- GISTBuildState *buildstate = (GISTBuildState *) state;
- IndexTuple itup;
- MemoryContext oldCtx;
-
- oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
-
- /* form an index tuple and point it at the heap tuple */
- itup = gistFormTuple(&buildstate->giststate, index,
- values, isnull, true /* size is currently bogus */ );
- itup->t_tid = htup->t_self;
-
- /*
- * Since we already have the index relation locked, we call gistdoinsert
- * directly. Normal access method calls dispatch through gistinsert,
- * which locks the relation for write. This is the right thing to do if
- * you're inserting single tups, but not when you're initializing the
- * whole index at once.
- *
- * In this path we respect the fillfactor setting, whereas insertions
- * after initial build do not.
- */
- gistdoinsert(index, itup,
- RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
- &buildstate->giststate);
-
- buildstate->indtuples += 1;
- MemoryContextSwitchTo(oldCtx);
- MemoryContextReset(buildstate->tmpCtx);
-}
-
-/*
* gistbuildempty() -- build an empty gist index in the initialization fork
*/
Datum
@@ -275,7 +121,6 @@ gistinsert(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(false);
}
-
/*
* Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
* at that offset is atomically removed along with inserting the new tuples.
@@ -608,7 +453,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
* this routine assumes it is invoked in a short-lived memory context,
* so it does not bother releasing palloc'd allocations.
*/
-static void
+BlockNumber
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
ItemId iid;
@@ -617,6 +462,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
GISTInsertStack *stack;
GISTInsertState state;
bool xlocked = false;
+ BlockNumber leafBlocknum = InvalidBlockNumber;
memset(&state, 0, sizeof(GISTInsertState));
state.freespace = freespace;
@@ -841,7 +687,8 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
}
/* now state.stack->(page, buffer and blkno) points to leaf page */
-
+
+ leafBlocknum = stack->blkno;
gistinserttuples(&state, stack, giststate, &itup, 1,
InvalidOffsetNumber, InvalidBuffer);
LockBuffer(stack->buffer, GIST_UNLOCK);
@@ -852,6 +699,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
break;
}
}
+ return leafBlocknum;
}
/*
@@ -1183,7 +1031,7 @@ gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
* to hold an exclusive lock on state->stack->buffer, but if we had to split
* the page, it might not contain the tuple we just inserted/updated.
*/
-static bool
+bool
gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate,
IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
@@ -1414,6 +1262,7 @@ initGISTstate(GISTSTATE *giststate, Relation index)
else
giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
}
+ giststate->gfbb = NULL;
}
void
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
new file mode 100644
index 0000000..8543033
--- /dev/null
+++ b/src/backend/access/gist/gistbuild.c
@@ -0,0 +1,1173 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuild.c
+ * build algorithm for GiST indexes implementation.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "catalog/index.h"
+#include "catalog/pg_collation.h"
+#include "miscadmin.h"
+#include "optimizer/cost.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+#define LEAF_PAGES_STATS_STEP 128
+#define LEAF_PAGES_STATS_COUNT 16
+
+
+/* Working state for gistbuild and its callback */
+typedef struct
+{
+ GISTSTATE giststate;
+ int numindexattrs;
+ int64 indtuples;
+ /*
+ * Buffering build mode. Possible values:
+ * 'y' - we are in buffering build mode.
+ * 'a' - we are now in regular build mode, but can switch to buffering
+ * build mode when we decide to.
+ * 'n' - we are in regular build mode and aren't going to switch.
+ */
+ char bufferingMode;
+ MemoryContext tmpCtx;
+ /* Tracking statistics about last accessed leaf pages */
+ HTAB *leafPagesTab;
+ int nonHitLeafPagesStats[LEAF_PAGES_STATS_COUNT];
+ int nonHitLeafPagesStatsIndex;
+} GISTBuildState;
+
+typedef struct
+{
+ BlockNumber blockNumber;
+ int64 lastTupleNumber;
+} LeafPageInfo;
+
+
+static void gistBufferingBuildInsert(Relation index, IndexTuple itup,
+ GISTBuildState *buildstate);
+static void gistBuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
+
+static bool initBuffering(GISTBuildState *buildstate, Relation index);
+static bool bufferingbuildinsert(GISTInsertState *state,
+ GISTLoadedPartItem *item,
+ GISTSTATE *giststate,
+ IndexTuple *itup, int ntup,
+ OffsetNumber oldoffnum);
+static int bufferlevel_cmp(const void *a, const void *b);
+
+static void gistFindCorrectParent(GISTSTATE *giststate, Relation r, GISTLoadedPartItem *child);
+
+#ifdef GIST_DEBUG
+#include "utils/geo_decls.h"
+static void gist_dumptree(Relation r, int level, BlockNumber blk, OffsetNumber coff, BOX *downlink, StringInfo out, int maxlevel);
+static int gist_counttups(Relation r, BlockNumber blk);
+static int gist_count_tuples_in_buffers(GISTBuildBuffers *gfbb);
+#endif
+
+/*
+ * Index tuple insert function of buffering build algorithm. In simpler than
+ * regular insert function in the fact that it don't takes care about
+ * concurrency. It invokes buffer relocation function when it splits page. Also
+ * it take several oldoffnums as a parameter because buffer relocation can alter
+ * a number of parent index tuples.
+ */
+static bool
+bufferingbuildinsert(GISTInsertState *state,
+ GISTLoadedPartItem *path,
+ GISTSTATE *giststate,
+ IndexTuple *itup, int ntup,
+ OffsetNumber oldoffnum)
+{
+ GISTBuildBuffers *gfbb = giststate->gfbb;
+ Buffer buffer = ReadBuffer(state->r, path->blkno);
+ Page page = (Page) BufferGetPage(buffer);
+ bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
+ int i;
+ bool is_split;
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+
+ /* Remove old index tuple if needed */
+ if (OffsetNumberIsValid(oldoffnum))
+ PageIndexMultiDelete(page, &oldoffnum, 1);
+
+ /* Check if there is enough space for insertion */
+ is_split = gistnospace(page, itup, ntup,
+ InvalidOffsetNumber, state->freespace);
+
+ if (is_split)
+ {
+ /* no space for insertion */
+ IndexTuple *itvec;
+ int tlen;
+ SplitedPageLayout *dist = NULL,
+ *ptr;
+ SplitedPageLayout rootpg;
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ BlockNumber oldrlink = InvalidBlockNumber;
+ bool is_rootsplit;
+ XLogRecPtr recptr;
+
+ is_rootsplit = (blkno == GIST_ROOT_BLKNO);
+
+ /*
+ * Form index tuples vector to split. Old tuple was already removed
+ * from the vector.
+ */
+ itvec = gistextractpage(page, &tlen);
+ itvec = gistjoinvector(itvec, &tlen, itup, ntup);
+ dist = gistSplit(state->r, page, itvec, tlen, giststate);
+
+ /*
+ * Set up pages to work with. Allocate new buffers for all but the
+ * leftmost page. The original page becomes the new leftmost page, and
+ * is just replaced with the new contents.
+ *
+ * For a root-split, allocate new buffers for all child pages, the
+ * original page is overwritten with new root page containing
+ * downlinks to the new child pages.
+ */
+ ptr = dist;
+ if (!is_rootsplit)
+ {
+ oldrlink = GistPageGetOpaque(page)->rightlink;
+ dist->buffer = buffer;
+ dist->block.blkno = BufferGetBlockNumber(buffer);
+ dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
+
+ /* clean all flags except F_LEAF */
+ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
+
+ ptr = ptr->next;
+ }
+ for (; ptr; ptr = ptr->next)
+ {
+ /* Allocate new page */
+ ptr->buffer = gistNewBuffer(state->r);
+ GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
+ ptr->page = BufferGetPage(ptr->buffer);
+ ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
+ }
+
+ /*
+ * Now that we know which blocks the new pages go to, set up downlink
+ * tuples to point to them.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
+ GistTupleSetValid(ptr->itup);
+ }
+
+ {
+ StringInfoData sb;
+ initStringInfo(&sb);
+ for (ptr = dist; ptr; ptr = ptr->next)
+ appendStringInfo(&sb, "%u ", ptr->block.blkno);
+ }
+
+ if (is_rootsplit)
+ {
+ /*
+ * Adjust the top element in the insert stacks for the new root
+ * page.
+ */
+ GISTLoadedPartItem *oldroot = gfbb->rootitem;
+
+ gfbb->rootitem = (GISTLoadedPartItem *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTLoadedPartItem));
+ gfbb->rootitem->parent = NULL;
+ gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
+ gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
+ gfbb->rootitem->level = oldroot->level + 1;
+
+ oldroot->parent = gfbb->rootitem;
+ oldroot->blkno = dist->next->block.blkno;
+ oldroot->downlinkoffnum = InvalidOffsetNumber;
+ }
+
+ /* Maintain node buffers and loaded tree parts on split */
+ relocateBuildBuffersOnSplit(giststate->gfbb, giststate, state->r,
+ path, buffer, dist);
+
+ /*
+ * If this is a root split, we construct the new root page with the
+ * downlinks here directly, instead of do recursive call for their
+ * insertion. Add the new root page to the list along with the child
+ * pages.
+ */
+ if (is_rootsplit)
+ {
+ IndexTuple *downlinks;
+ int ndownlinks = 0;
+ int i;
+
+ rootpg.buffer = buffer;
+ rootpg.page = PageGetTempPageCopySpecial(
+ BufferGetPage(rootpg.buffer));
+ GistPageGetOpaque(rootpg.page)->flags = 0;
+
+ /* Prepare a vector of all the downlinks */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ ndownlinks++;
+ downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
+ for (i = 0, ptr = dist; ptr; ptr = ptr->next)
+ downlinks[i++] = ptr->itup;
+
+ rootpg.block.blkno = GIST_ROOT_BLKNO;
+ rootpg.block.num = ndownlinks;
+ rootpg.list = gistfillitupvec(downlinks, ndownlinks,
+ &(rootpg.lenlist));
+ rootpg.itup = NULL;
+
+ rootpg.next = dist;
+ dist = &rootpg;
+ }
+
+ /*
+ * Fill all pages. All the pages are new, ie. freshly allocated empty
+ * pages, or a temporary copy of the old page.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ char *data = (char *) (ptr->list);
+
+ for (i = 0; i < ptr->block.num; i++)
+ {
+ if (PageAddItem(ptr->page,
+ (Item) data, IndexTupleSize((IndexTuple) data),
+ i + FirstOffsetNumber, false, false) ==
+ InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"",
+ RelationGetRelationName(state->r));
+ data += IndexTupleSize((IndexTuple) data);
+ }
+
+ /* Set up rightlinks */
+ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
+ GistPageGetOpaque(ptr->page)->rightlink =
+ ptr->next->block.blkno;
+ else
+ GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
+ }
+
+ /* Mark buffers dirty */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ MarkBufferDirty(ptr->buffer);
+
+ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
+ dist->page = BufferGetPage(dist->buffer);
+
+ /* TODO: Write the WAL record */
+#ifdef NOT_IMPLEMENTED
+ if (RelationNeedsWAL(state->r))
+ recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
+ dist, oldrlink, oldnsn, leftchildbuf);
+ else
+ recptr = GetXLogRecPtrForTemp();
+#else
+ recptr.xlogid = 0;
+ recptr.xrecoff = 1;
+#endif
+
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ PageSetLSN(ptr->page, recptr);
+ PageSetTLI(ptr->page, ThisTimeLineID);
+ }
+
+ /* Release buffers, except the one holding the inserted/updated tuple */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ if (BufferIsValid(ptr->buffer))
+ UnlockReleaseBuffer(ptr->buffer);
+ }
+
+ /*
+ * If it wasn't root split, we have to insert downlinks to parent page.
+ */
+ if (!is_rootsplit)
+ {
+ IndexTuple *itups;
+ int cnt = 0, i;
+
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ cnt++;
+ }
+ itups = (IndexTuple *)palloc(sizeof(IndexTuple) * cnt);
+ i = 0;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ itups[i] = ptr->itup;
+ i++;
+ }
+
+ if (!is_rootsplit)
+ gistFindCorrectParent(giststate, state->r, path);
+
+ bufferingbuildinsert(state, path->parent, giststate, itups,
+ cnt,
+ is_rootsplit ? InvalidOffsetNumber : path->downlinkoffnum);
+ }
+ }
+ else
+ {
+ /*
+ * Enough of space. Just insert index tuples to the page.
+ */
+ gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+ }
+
+ return is_split;
+}
+
+/*
+ * Process index tuple. Run index tuple down until it meet leaf page or
+ * node buffer.
+ */
+static void
+processItup(GISTSTATE *giststate, GISTInsertState *state,
+ GISTBuildBuffers *gfbb, IndexTuple itup,
+ GISTLoadedPartItem *startparent)
+{
+ GISTLoadedPartItem *path;
+ BlockNumber childblkno;
+ Buffer buffer;
+
+ if (!startparent)
+ path = gfbb->rootitem;
+ else
+ path = startparent;
+
+ /*
+ * Loop until we are on leaf page (level == 0) or we reach level with
+ * buffers (if it wasn't level that we've start at).
+ */
+ for (;;)
+ {
+ ItemId iid;
+ IndexTuple idxtuple, newtup;
+ Page page;
+ OffsetNumber childoffnum;
+ GISTLoadedPartItem *parent;
+
+ if (path != startparent && LEVEL_HAS_BUFFERS(path->level, gfbb))
+ break;
+
+ if (path->level == 0)
+ break;
+
+ /* Choose child for insertion */
+ buffer = ReadBuffer(state->r, path->blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+
+ page = (Page) BufferGetPage(buffer);
+ childoffnum = gistchoose(state->r, page, itup, giststate);
+ iid = PageGetItemId(page, childoffnum);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ /* Adjust key representing child if needed */
+ newtup = gistgetadjusted(state->r, idxtuple, itup, giststate);
+
+ UnlockReleaseBuffer(buffer);
+
+ if (newtup)
+ bufferingbuildinsert(state, path, giststate,
+ &newtup, 1, childoffnum);
+
+ /* descend */
+ parent = path;
+ path = (GISTLoadedPartItem *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTLoadedPartItem));
+ path->parent = parent;
+ path->level = parent->level - 1;
+ path->blkno = childblkno;
+ path->downlinkoffnum = childoffnum;
+ }
+
+ if (LEVEL_HAS_BUFFERS(path->level, gfbb))
+ {
+ /*
+ * We've reached level with buffers. Now place index tuple to the
+ * buffer and add buffer emptying stack element if buffer overflows.
+ */
+ bool wasOverflowed;
+ NodeBuffer *childNodeBuffer;
+
+ childNodeBuffer = getNodeBuffer(gfbb, giststate, path->blkno, path->downlinkoffnum, path->parent, true);
+ wasOverflowed = BUFFER_IS_OVERLOW(childNodeBuffer, gfbb);
+ pushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
+ if (!wasOverflowed && BUFFER_IS_OVERLOW(childNodeBuffer, gfbb))
+ {
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+ gfbb->bufferEmptyingStack = lcons(childNodeBuffer, gfbb->bufferEmptyingStack);
+ MemoryContextSwitchTo(oldcxt);
+ path = NULL; /* don't free the allocated items */
+ }
+ }
+ else
+ {
+ /*
+ * We've reached leaf level. So, place index tuple here.
+ */
+ bufferingbuildinsert(state, path, giststate, &itup, 1, InvalidOffsetNumber);
+ }
+
+ if (path)
+ {
+ while(path != startparent && path != gfbb->rootitem)
+ {
+ GISTLoadedPartItem *parent = path->parent;
+ pfree(path);
+ path = parent;
+ }
+ }
+}
+
+
+static void
+gistFindCorrectParent(GISTSTATE *giststate, Relation r, GISTLoadedPartItem *child)
+{
+ GISTBuildBuffers *gfbb = giststate->gfbb;
+ GISTLoadedPartItem *parent = child->parent;
+ OffsetNumber i,
+ maxoff;
+ ItemId iid;
+ IndexTuple idxtuple;
+ Buffer buffer;
+ Page page;
+ bool copied = false;
+
+ buffer = ReadBuffer(r, parent->blkno);
+ page = BufferGetPage(buffer);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, buffer);
+
+ /* Check if it has not moved */
+ if (child->downlinkoffnum != InvalidOffsetNumber)
+ {
+ iid = PageGetItemId(page, child->downlinkoffnum);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ /* Still there */
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+ }
+
+ /* parent is changed, look child in right links until found */
+ while (true)
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(page, i);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ /* yes!!, found */
+ child->downlinkoffnum = i;
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+ }
+
+ if (!copied)
+ {
+ parent = (GISTLoadedPartItem *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTLoadedPartItem));
+ memcpy(parent, child->parent, sizeof(GISTLoadedPartItem));
+ copied = true;
+ }
+
+ parent->blkno = GistPageGetOpaque(page)->rightlink;
+ UnlockReleaseBuffer(buffer);
+
+ if (parent->blkno == InvalidBlockNumber)
+ {
+ /*
+ * End of chain and still didn't find parent. Should not happen
+ * during index build.
+ */
+ break;
+ }
+
+ /* Next page */
+
+ buffer = ReadBuffer(r, parent->blkno);
+ page = BufferGetPage(buffer);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, buffer);
+ }
+
+ elog(ERROR, "failed to re-find parent for block %u", child->blkno);
+}
+
+/*
+ * Process buffers emptying stack. Emptying of one buffer can cause emptying
+ * of other buffers. This function iterates until this cascading emptying
+ * process finished, e.g. until buffers emptying stack is empty.
+ */
+static void
+processEmptyingStack(GISTSTATE *giststate, GISTInsertState *state)
+{
+ GISTBuildBuffers *gfbb = giststate->gfbb;
+ int requiredSizeDecrease = gfbb->pagesPerBuffer * BLCKSZ;
+
+ /* Iterate while we have elements in buffers emptying stack. */
+ while (gfbb->bufferEmptyingStack != NIL)
+ {
+ int initialBusySize, busySize;
+ NodeBuffer *emptyingNodeBuffer;
+
+ /* Remove element from the stack and prepare for emptying. */
+ emptyingNodeBuffer = (NodeBuffer *) linitial(gfbb->bufferEmptyingStack);
+ gfbb->bufferEmptyingStack = list_delete_first(gfbb->bufferEmptyingStack);
+
+ initialBusySize = getNodeBufferBusySize(emptyingNodeBuffer);
+ gfbb->currentEmptyingBufferSplit = false;
+
+ while(true)
+ {
+ IndexTuple itup;
+
+ /* Get one index tuple from node buffer and process it */
+ if (!popItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
+ break;
+ processItup(giststate, state, gfbb, itup,
+ emptyingNodeBuffer->path);
+
+ /* Free all the memory allocated during index tuple processing */
+ MemoryContextReset(CurrentMemoryContext);
+
+ /*
+ * If current emptying node buffer split we should stop emptying
+ * just because there is just no such node buffer anymore.
+ */
+ if (gfbb->currentEmptyingBufferSplit)
+ break;
+
+ busySize = getNodeBufferBusySize(emptyingNodeBuffer);
+
+ /*
+ * If we've processed half of buffer size limit and buffer is not
+ * overflowed anymore we should stop in order to avoid exceeding
+ * of limit in lower buffers.
+ */
+ if (initialBusySize - busySize >= requiredSizeDecrease &&
+ !BUFFER_IS_OVERLOW(emptyingNodeBuffer, gfbb))
+ break;
+ }
+ }
+}
+
+/*
+ * Insert function for buffering index build.
+ */
+static void
+gistBufferingBuildInsert(Relation index, IndexTuple itup,
+ GISTBuildState *buildstate)
+{
+ GISTBuildBuffers *gfbb = buildstate->giststate.gfbb;
+ GISTInsertState insertstate;
+
+ memset(&insertstate, 0, sizeof(GISTInsertState));
+ insertstate.freespace = RelationGetTargetPageFreeSpace(index,
+ GIST_DEFAULT_FILLFACTOR);
+ insertstate.r = index;
+
+ /* We are ready for index tuple processing */
+ processItup(&buildstate->giststate, &insertstate, gfbb, itup, NULL);
+
+ /* Process buffer emptying stack if any */
+ processEmptyingStack(&buildstate->giststate, &insertstate);
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+gistBuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ GISTBuildState *buildstate = (GISTBuildState *) state;
+ IndexTuple itup;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = gistFormTuple(&buildstate->giststate, index,
+ values, isnull, true /* size is currently bogus */ );
+ itup->t_tid = htup->t_self;
+
+ if (buildstate->bufferingMode == 'y')
+ {
+ /* We've decided to use buffering. So let's use buffering insert. */
+ gistBufferingBuildInsert(index, itup, buildstate);
+ }
+ else
+ {
+ BlockNumber leafBlocknum;
+ LeafPageInfo *leafInfo;
+ bool found;
+
+ /* We didn't decide to use buffering yet or aren't goint to use it at
+ * all. Since we already have the index relation locked, we call
+ * gistdoinsert directly. Normal access method calls dispatch through
+ * gistinsert, which locks the relation for write. This is the right
+ * thing to do if you're inserting single tups, but not when you're
+ * initializing the whole index at once.
+ *
+ * In this path we respect the fillfactor setting, whereas insertions
+ * after initial build do not.
+ */
+ leafBlocknum = gistdoinsert(index, itup,
+ RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
+ &buildstate->giststate);
+
+ if (buildstate->bufferingMode == 'a')
+ {
+ /* Add information about leaf pages accesses */
+ leafInfo = (LeafPageInfo *) hash_search(buildstate->leafPagesTab,
+ (const void *) &leafBlocknum,
+ HASH_ENTER,
+ &found);
+ leafInfo->lastTupleNumber = buildstate->indtuples;
+ }
+
+ }
+
+ buildstate->indtuples += 1;
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->tmpCtx);
+
+ if (buildstate->bufferingMode == 'a' &&
+ buildstate->indtuples % LEAF_PAGES_STATS_STEP == 0)
+ {
+ HASH_SEQ_STATUS scan_status;
+ LeafPageInfo *leafInfo;
+ int removedItemsCount = 0;
+ BlockNumber indexSize;
+
+ /*
+ * Count leaf pages which was accessed by previous chunk of index
+ * tuples and wasn't accessed by last chunk of index tuples.
+ */
+ hash_seq_init(&scan_status, buildstate->leafPagesTab);
+ while((leafInfo = (LeafPageInfo *)hash_seq_search(&scan_status)) != NULL)
+ {
+ if (leafInfo->blockNumber < buildstate->indtuples - LEAF_PAGES_STATS_STEP)
+ {
+ if (hash_search(buildstate->leafPagesTab,
+ (const void *) &leafInfo->blockNumber,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "hash table corrupted");
+ removedItemsCount++;
+ }
+ }
+ buildstate->nonHitLeafPagesStats[buildstate->nonHitLeafPagesStatsIndex] =
+ removedItemsCount;
+ buildstate->nonHitLeafPagesStatsIndex =
+ (buildstate->nonHitLeafPagesStatsIndex + 1) % LEAF_PAGES_STATS_COUNT;
+
+ indexSize = smgrnblocks(index->rd_smgr, MAIN_FORKNUM);
+
+ /*
+ * Check if we are going to switch to buffering build.
+ */
+ if (buildstate->bufferingMode == 'a' &&
+ effective_cache_size < indexSize &&
+ indexSize >= 2 * LEAF_PAGES_STATS_STEP)
+ {
+ int i, nonHitLeafPages = 0;
+ double lambda, factor;
+ for (i = 0; i < LEAF_PAGES_STATS_COUNT; i++)
+ {
+ nonHitLeafPages += buildstate->nonHitLeafPagesStats[i];
+ }
+ /* Calculate parameter of exponential distribution. */
+ lambda = (-1.0 / (double)LEAF_PAGES_STATS_STEP) *
+ log((double)nonHitLeafPages / (double)(LEAF_PAGES_STATS_STEP * LEAF_PAGES_STATS_COUNT));
+
+ /* Estimate portion index tuples which can be processed without IO */
+ factor = (1 - exp(- (double)effective_cache_size * lambda)) /
+ (1 - exp(- (double)indexSize * lambda));
+
+ /* If estimated portion exceeds threshold then switch to buffering build */
+ if (factor < 0.95)
+ {
+ if (initBuffering(buildstate, index))
+ {
+ /*
+ * Buffering build is successfully initialized. Now we can
+ * set appropriate flag.
+ */
+ buildstate->bufferingMode = 'y';
+ elog(INFO, "switching to buffered mode");
+ }
+ else
+ {
+ /*
+ * Failed to switch to buffering build due to not enough
+ * memory settings. Mark that we aren't going to switch
+ * anymore.
+ */
+ buildstate->bufferingMode = 'n';
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Initial calculations for GiST buffering build.
+ */
+static bool
+initBuffering(GISTBuildState *buildstate, Relation index)
+{
+ int pagesPerBuffer = -1;
+ bool neighborRelocation = true;
+ Size pageFreeSpace;
+ Size itupMinSize;
+ int i, maxIndexTuplesCount;
+ int effectiveMemory;
+ int levelStep = 0;
+ GISTBuildBuffers *gfbb;
+
+ /* try to get user difened options */
+ if (index->rd_options)
+ {
+ GiSTOptions *options = (GiSTOptions *)index->rd_options;
+ levelStep = options->levelStep;
+ pagesPerBuffer = options->bufferSize;
+ neighborRelocation = options->neighborRelocation;
+ }
+
+ /* calc number of index tuples which fit in page */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData -
+ sizeof(GISTPageOpaqueData) - sizeof(ItemIdData);
+ itupMinSize = (Size)MAXALIGN(sizeof(IndexTupleData));
+ for (i = 0; i < index->rd_att->natts; i++)
+ {
+ if (index->rd_att->attrs[i]->attlen < 0)
+ itupMinSize += VARHDRSZ;
+ else
+ itupMinSize += index->rd_att->attrs[i]->attlen;
+ }
+ maxIndexTuplesCount = pageFreeSpace / itupMinSize;
+
+ /* calculate level step if it isn't specified by user */
+ effectiveMemory = Min(maintenance_work_mem * 1024 / BLCKSZ,
+ effective_cache_size);
+ if (levelStep <= 0)
+ {
+ levelStep = (int)log((double)effectiveMemory / 4.0) /
+ log((double)maxIndexTuplesCount);
+ if (levelStep < 0)
+ levelStep = 0;
+ }
+
+ /* calculate buffer size if it isn't specified by user */
+ if (pagesPerBuffer <= 0)
+ {
+ pagesPerBuffer = 2;
+ for (i = 0; i < levelStep; i++)
+ {
+ pagesPerBuffer *= maxIndexTuplesCount;
+ }
+ }
+
+ if (levelStep > 0)
+ {
+ /* Enough of memory for at least level_step == 1. */
+ gfbb = palloc(sizeof(GISTBuildBuffers));
+ gfbb->pagesPerBuffer = pagesPerBuffer;
+ gfbb->levelStep = levelStep;
+ gfbb->neighborRelocation = neighborRelocation;
+ initGiSTBuildBuffers(gfbb);
+ buildstate->giststate.gfbb = gfbb;
+ elog(INFO, "Level step = %d, pagesPerBuffer = %d", levelStep,
+ pagesPerBuffer);
+ return true;
+ }
+ else
+ {
+ /* Not enough memory for buffering build. */
+ return false;
+ }
+}
+
+/*
+ * Routine to build an index. Basically calls insert over and over.
+ *
+ * XXX: it would be nice to implement some sort of bulk-loading
+ * algorithm, but it is not clear how to do that.
+ */
+Datum
+gistbuild(PG_FUNCTION_ARGS)
+{
+ Relation heap = (Relation) PG_GETARG_POINTER(0);
+ Relation index = (Relation) PG_GETARG_POINTER(1);
+ IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+ IndexBuildResult *result;
+ double reltuples;
+ GISTBuildState buildstate;
+ Buffer buffer;
+ Page page;
+ HASHCTL hashCtl;
+ int i;
+ MemoryContext oldcxt = CurrentMemoryContext;
+
+ if (index->rd_options)
+ {
+ /* Get buffering mode from the options string */
+ GiSTOptions *options = (GiSTOptions *)index->rd_options;
+ char *bufferingMode = (char *)options + options->bufferingModeOffset;
+ if (!strcmp(bufferingMode, "on"))
+ buildstate.bufferingMode = 'y';
+ if (!strcmp(bufferingMode, "off"))
+ buildstate.bufferingMode = 'n';
+ if (!strcmp(bufferingMode, "auto"))
+ buildstate.bufferingMode = 'a';
+ }
+ else
+ {
+ /* Automatic buffering mode switching by default */
+ buildstate.bufferingMode = 'a';
+ }
+
+ if (buildstate.bufferingMode == 'a')
+ {
+ /* Init hash for tracking leaf pages accesses */
+ hashCtl.keysize = sizeof(BlockNumber);
+ hashCtl.entrysize = sizeof(LeafPageInfo);
+ hashCtl.hcxt = CurrentMemoryContext;
+ hashCtl.hash = tag_hash;
+ hashCtl.match = memcmp;
+ buildstate.leafPagesTab = hash_create(
+ "leafpagestab",
+ 2 * LEAF_PAGES_STATS_STEP,
+ &hashCtl,
+ HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION | HASH_COMPARE);
+ }
+ else
+ buildstate.leafPagesTab = NULL;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /* no locking is needed */
+ initGISTstate(&buildstate.giststate, index);
+ if (buildstate.bufferingMode == 'y')
+ {
+ if (!initBuffering(&buildstate, index))
+ buildstate.bufferingMode = 'n';
+ }
+
+ /* initialize the root page */
+ buffer = gistNewBuffer(index);
+ Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
+ page = BufferGetPage(buffer);
+
+ START_CRIT_SECTION();
+
+ GISTInitBuffer(buffer, F_LEAF);
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+
+ rdata.data = (char *) &(index->rd_node);
+ rdata.len = sizeof(RelFileNode);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+ else
+ PageSetLSN(page, GetXLogRecPtrForTemp());
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ /* build the index */
+ buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
+ buildstate.indtuples = 0;
+ buildstate.nonHitLeafPagesStatsIndex = 0;
+ for (i = 0; i < LEAF_PAGES_STATS_COUNT; i++)
+ buildstate.nonHitLeafPagesStats[i] = 0;
+
+ /*
+ * create a temporary memory context that is reset once for each tuple
+ * inserted into the index
+ */
+ buildstate.tmpCtx = createTempGistContext();
+
+ /*
+ * Do the heap scan.
+ */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+ gistBuildCallback, (void *) &buildstate);
+
+ /*
+ * If buffering build do final node buffers emptying.
+ */
+ if (buildstate.bufferingMode == 'y')
+ {
+ int i;
+ GISTInsertState insertstate;
+ MemoryContext oldCtx;
+ GISTBuildBuffers *gfbb = buildstate.giststate.gfbb;
+ NodeBuffer **buffers;
+ NodeBuffer *buf;
+ HASH_SEQ_STATUS scan_status;
+ int nbuffers;
+
+ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
+
+ memset(&insertstate, 0, sizeof(GISTInsertState));
+ insertstate.freespace = RelationGetTargetPageFreeSpace(index,
+ GIST_DEFAULT_FILLFACTOR);
+ insertstate.r = index;
+
+ for (;;)
+ {
+ /* Collect all buffers into array */
+ buffers = MemoryContextAlloc(gfbb->context,
+ hash_get_num_entries(gfbb->nodeBuffersTab) * sizeof(NodeBuffer *));
+ nbuffers = 0;
+ /* Sort the buffers by level */
+ hash_seq_init(&scan_status, gfbb->nodeBuffersTab);
+ while ((buf = hash_seq_search(&scan_status)) != NULL)
+ {
+ if (buf->tuplesCount > 0)
+ buffers[nbuffers++] = buf;
+ }
+ if (nbuffers == 0)
+ break;
+
+ /*
+ * Iterate through the buffers, from top to bottom
+ */
+ qsort(buffers, nbuffers, sizeof(NodeBuffer *), bufferlevel_cmp);
+ for (i = 0; i < nbuffers; i++)
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo(gfbb->context);
+ gfbb->bufferEmptyingStack = lcons(buffers[i], gfbb->bufferEmptyingStack);
+ MemoryContextSwitchTo(oldcxt);
+
+ processEmptyingStack(&buildstate.giststate, &insertstate);
+ }
+ /*
+ * Emptying these buffers might've created new buffers, so iterate
+ * until we're fully done
+ */
+ }
+ for (i = 0; i < nbuffers; i++)
+ {
+ Assert(buffers[i]->tuplesCount == 0);
+ }
+ }
+
+ /* okay, all heap tuples are indexed */
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(buildstate.tmpCtx);
+
+ freeGISTstate(&buildstate.giststate);
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = (double)buildstate.indtuples;
+
+ PG_RETURN_POINTER(result);
+}
+
+/*
+ * qsort comparator for sorting NodeBuffers by level.
+ */
+static int
+bufferlevel_cmp(const void *a, const void *b)
+{
+ NodeBuffer *abuf = *((NodeBuffer **) a);
+ NodeBuffer *bbuf = *((NodeBuffer **) b);
+
+ return (bbuf->level - abuf->level);
+}
+
+
+#ifdef GIST_DEBUG
+static void
+gist_dumptree(Relation r, int level, BlockNumber blk, OffsetNumber coff, BOX *downlink, StringInfo out, int maxlevel) {
+ Buffer buffer;
+ Page page;
+ IndexTuple which;
+ ItemId iid;
+ OffsetNumber i,
+ maxoff;
+ BlockNumber cblk;
+ char *pred;
+
+ pred = (char *) palloc(sizeof(char) * level * 4 + 1);
+ MemSet(pred, ' ', level*4);
+ pred[level*4] = '\0';
+
+ buffer = ReadBuffer(r, blk);
+ page = (Page) BufferGetPage(buffer);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ appendStringInfo(out, "%s%d(l:%d) blk: %d numTuple: %d free: %db(%.2f%%) rightlink:%u (%s) (%.5g, %.5g),(%.5g, %.5g)\n",
+ pred,
+ coff,
+ level,
+ (int) blk,
+ (int) maxoff,
+ PageGetFreeSpace(page),
+ 100.0*(((float)BLCKSZ)-(float)PageGetFreeSpace(page))/((float)BLCKSZ),
+ GistPageGetOpaque(page)->rightlink,
+ ( GistPageGetOpaque(page)->rightlink == InvalidBlockNumber ) ? "InvalidBlockNumber" : "OK",
+ downlink ? downlink->low.x : 0,
+ downlink ? downlink->low.y : 0,
+ downlink ? downlink->high.x : 0,
+ downlink ? downlink->high.y :0 );
+
+ if (maxlevel<0 || level<maxlevel)
+ {
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ bool isnull;
+ BOX *box;
+
+ iid = PageGetItemId(page, i);
+ which = (IndexTuple) PageGetItem(page, iid);
+
+ box = (BOX *)index_getattr(which, 1, RelationGetDescr(r), &isnull);
+
+ if (!GistPageIsLeaf(page))
+ {
+ cblk = ItemPointerGetBlockNumber(&(which->t_tid));
+ gist_dumptree(r, level + 1, cblk, i, box, out, maxlevel);
+ }
+ else
+ {
+ bool invalid = false;
+ if (downlink)
+ {
+ if (box->high.x > downlink->high.x)
+ invalid = true;
+ if (box->high.y > downlink->high.y)
+ invalid = true;
+ if (box->high.x < downlink->low.x)
+ invalid = true;
+ if (box->high.y < downlink->low.y)
+ invalid = true;
+ }
+
+ if (invalid)
+ appendStringInfo(out, "%s leaf item %d points to %u/%u: (%.5g, %.5g)%s\n",
+ pred, i,
+ ItemPointerGetBlockNumber(&(which->t_tid)),
+ ItemPointerGetOffsetNumber(&(which->t_tid)),
+ box->high.x, box->high.y, invalid ? " INVALID" : "");
+ }
+ }
+ }
+ ReleaseBuffer(buffer);
+ pfree(pred);
+}
+
+static int
+gist_counttups(Relation r, BlockNumber blk)
+{
+ Buffer buffer;
+ Page page;
+ IndexTuple which;
+ ItemId iid;
+ OffsetNumber i,
+ maxoff;
+ BlockNumber cblk;
+ int ntuples;
+
+ buffer = ReadBuffer(r, blk);
+ page = (Page) BufferGetPage(buffer);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ ntuples = 0;
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(page, i);
+ which = (IndexTuple) PageGetItem(page, iid);
+
+ if (!GistPageIsLeaf(page))
+ {
+ cblk = ItemPointerGetBlockNumber(&(which->t_tid));
+ ntuples += gist_counttups(r, cblk);
+ }
+ else
+ ntuples++;
+ }
+ ReleaseBuffer(buffer);
+
+ return ntuples;
+}
+
+static int
+gist_count_tuples_in_buffers(GISTBuildBuffers *gfbb)
+{
+ HASH_SEQ_STATUS scan_status;
+ NodeBuffer *buf;
+ int ntuples = 0;
+
+ hash_seq_init(&scan_status, gfbb->nodeBuffersTab);
+ while ((buf = hash_seq_search(&scan_status)) != NULL)
+ ntuples += buf->tuplesCount;
+ return ntuples;
+}
+#endif
diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
new file mode 100644
index 0000000..02c91e7
--- /dev/null
+++ b/src/backend/access/gist/gistbuildbuffers.c
@@ -0,0 +1,600 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuildbuffers.c
+ * buffers management functions for GiST buffering build algorithm.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuildbuffers.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "catalog/index.h"
+#include "catalog/pg_collation.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/buffile.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static NodeBufferPage *allocateNewPageBuffer(GISTBuildBuffers *gfbb);
+static void placeItupToPage(NodeBufferPage *pageBuffer, IndexTuple item);
+static void getItupFromPage(NodeBufferPage *pageBuffer, IndexTuple *item);
+static int freeBlocks_cmp(const void *a, const void *b);
+static long getFreeBlock(GISTBuildBuffers *gfbb);
+static void releaseBlock(GISTBuildBuffers *gfbb, long blocknum);
+
+/*
+ * Initialize GiST buffering build structure.
+ */
+void
+initGiSTBuildBuffers(GISTBuildBuffers * gfbb)
+{
+ HASHCTL hashCtl;
+
+ /*
+ * Create temporary file initialize data structures for free pages
+ * management.
+ */
+ gfbb->pfile = BufFileCreateTemp(true);
+ gfbb->nFileBlocks = 0;
+ gfbb->nFreeBlocks = 0;
+ gfbb->blocksSorted = false;
+ gfbb->freeBlocksLen = 32;
+ gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
+
+ /*
+ * Current memory context will be used for all in-memory data structures
+ * of buffers.
+ */
+ gfbb->context = CurrentMemoryContext;
+
+ /*
+ * nodeBuffersTab hash is association between index blocks and it's buffer.
+ */
+ hashCtl.keysize = sizeof(BlockNumber);
+ hashCtl.entrysize = sizeof(NodeBuffer);
+ hashCtl.hcxt = CurrentMemoryContext;
+ hashCtl.hash = tag_hash;
+ hashCtl.match = memcmp;
+ gfbb->nodeBuffersTab = hash_create(
+ "gistbuildbuffers",
+ 1024,
+ &hashCtl,
+ HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION | HASH_COMPARE);
+
+ /*
+ * Stack of node buffers which was planned for emptying.
+ */
+ gfbb->bufferEmptyingStack = NIL;
+
+ gfbb->currentEmptyingBufferBlockNumber = InvalidBlockNumber;
+ gfbb->currentEmptyingBufferSplit = false;
+
+ gfbb->rootitem = (GISTLoadedPartItem *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTLoadedPartItem));
+ gfbb->rootitem->parent = NULL;
+ gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
+ gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
+ gfbb->rootitem->level = 0;
+}
+
+/*
+ * Return NodeBuffer structure by it's block number. If createNew flag is
+ * specified then new NodeBuffer structure will be created on it's absence.
+ */
+NodeBuffer *
+getNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
+ BlockNumber nodeBlocknum,
+ OffsetNumber downlinkoffnum,
+ GISTLoadedPartItem *parent,
+ bool createNew)
+{
+ NodeBuffer *nodeBuffer;
+ bool found;
+
+ /* Find nodeBuffer in hash table */
+ nodeBuffer = (NodeBuffer *) hash_search(gfbb->nodeBuffersTab,
+ (const void *) &nodeBlocknum,
+ createNew ? HASH_ENTER : HASH_FIND,
+ &found);
+ if (!found)
+ {
+ GISTLoadedPartItem *path;
+
+ /*
+ * Node buffer wasn't found. Create new if required.
+ */
+ if (!createNew)
+ return NULL;
+
+ if (nodeBlocknum != GIST_ROOT_BLKNO)
+ {
+ path = (GISTLoadedPartItem *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTLoadedPartItem));
+ path->parent = parent;
+ path->blkno = nodeBlocknum;
+ path->downlinkoffnum = downlinkoffnum;
+ path->level = parent->level - 1;
+ Assert(path->level > 0);
+ }
+ else
+ path = gfbb->rootitem;
+
+ /*
+ * New node buffer. Fill data structure with default values.
+ */
+ nodeBuffer->pageBuffer = NULL;
+ nodeBuffer->blocksCount = 0;
+ nodeBuffer->tuplesCount = 0;
+ nodeBuffer->level = path->level;
+ nodeBuffer->path = path;
+ }
+ else
+ {
+ Assert(nodeBuffer->path->parent == parent);
+ }
+
+ return nodeBuffer;
+}
+
+/*
+ * Allocate memory for buffer page.
+ */
+static NodeBufferPage *
+allocateNewPageBuffer(GISTBuildBuffers *gfbb)
+{
+ NodeBufferPage *pageBuffer;
+ /* Allocate memory for page in appropriate context. */
+ pageBuffer = (NodeBufferPage *) MemoryContextAlloc(gfbb->context, BLCKSZ);
+ /* Set page free space */
+ PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - MAXALIGN(sizeof(uint32));
+ return pageBuffer;
+}
+
+/*
+ * Add item to buffer page.
+ */
+static void
+placeItupToPage(NodeBufferPage *pageBuffer, IndexTuple itup)
+{
+ /* Get pointer to page free space start */
+ char *ptr = (char *)pageBuffer + PAGE_FREE_SPACE(pageBuffer)
+ - MAXALIGN(IndexTupleSize(itup));
+ /* There should be enough of space */
+ Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(IndexTupleSize(itup)));
+ /* Copy index tuple to free space */
+ memcpy(ptr, itup, IndexTupleSize(itup));
+ /* Reduce free space value of page */
+ PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(IndexTupleSize(itup));
+}
+
+/*
+ * Get last item from buffer page and remove it from page.
+ */
+static void
+getItupFromPage(NodeBufferPage *pageBuffer, IndexTuple *itup)
+{
+ /* Get pointer to last index tuple */
+ IndexTuple ptr = (IndexTuple)((char *)pageBuffer +
+ PAGE_FREE_SPACE(pageBuffer));
+ /* Page shouldn't be empty */
+ Assert(!PAGE_IS_EMPTY(pageBuffer));
+ /* Allocate memory for returned index tuple copy */
+ *itup = (IndexTuple)palloc(IndexTupleSize(ptr));
+ /* Copy data */
+ memcpy(*itup, ptr, IndexTupleSize(ptr));
+ /* Increase free space value of page */
+ PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(IndexTupleSize(*itup));
+}
+
+/*
+ * Push new index tuple to node buffer.
+ */
+void
+pushItupToNodeBuffer(GISTBuildBuffers *gfbb, NodeBuffer *nodeBuffer,
+ IndexTuple itup)
+{
+ /*
+ * Most part of memory operations will be in buffering build persistent
+ * context. So, let's switch to it.
+ */
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ /*
+ * If there wasn't any data in buffer before then we should add this node
+ * buffer to nonempty buffers list.
+ * Allocate a page buffer if we don't have one yet.
+ */
+ if (nodeBuffer->blocksCount == 0)
+ {
+ nodeBuffer->pageBuffer = allocateNewPageBuffer(gfbb);
+ nodeBuffer->pageBuffer->prev = InvalidBlockNumber;
+ nodeBuffer->blocksCount = 1;
+ }
+
+ /* Check if there is enough space on the last page for the tuple */
+ if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
+ {
+ /* Swap previous block to disk and allocate new one */
+ BlockNumber blkno;
+
+ nodeBuffer->blocksCount++;
+
+ blkno = getFreeBlock(gfbb);
+ BufFileSeekBlock(gfbb->pfile, blkno);
+ BufFileWrite(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ);
+
+ nodeBuffer->pageBuffer = allocateNewPageBuffer(gfbb);
+ nodeBuffer->pageBuffer->prev = blkno;
+ }
+
+ placeItupToPage(nodeBuffer->pageBuffer, itup);
+
+ /* Increase tuples counter of node buffer */
+ nodeBuffer->tuplesCount++;
+
+ /* Restore memory context */
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * Removes one index tuple from node buffer. Returns true if success and false
+ * if node buffer is empty.
+ */
+bool
+popItupFromNodeBuffer(GISTBuildBuffers *gfbb, NodeBuffer *nodeBuffer,
+ IndexTuple *itup)
+{
+ /* If node buffer is empty then return false. */
+ if (nodeBuffer->blocksCount <= 0)
+ return false;
+
+ /* Get index tuple from last non-empty page and mark it as dirty. */
+ getItupFromPage(nodeBuffer->pageBuffer, itup);
+
+ /* Check if the page which the index tuple was got from is now empty */
+ if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
+ {
+ BlockNumber prevblkno;
+ /*
+ * If it's empty then we need to release buffer file block and free
+ * page buffer.
+ */
+ nodeBuffer->blocksCount--;
+
+ /* If there's more pages, fetch previous one */
+ prevblkno = nodeBuffer->pageBuffer->prev;
+ if (prevblkno != InvalidBlockNumber)
+ {
+ Assert(nodeBuffer->blocksCount > 0);
+ BufFileSeekBlock(gfbb->pfile, prevblkno);
+ BufFileRead(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ);
+
+ releaseBlock(gfbb, prevblkno);
+ }
+ else
+ {
+ Assert(nodeBuffer->blocksCount == 0);
+ pfree(nodeBuffer->pageBuffer);
+ nodeBuffer->pageBuffer = NULL;
+ }
+ }
+
+ /* Decrease node buffer tuples counter. */
+ nodeBuffer->tuplesCount--;
+ return true;
+}
+
+/*
+ * qsort comparator for sorting freeBlocks[] into decreasing order.
+ */
+static int
+freeBlocks_cmp(const void *a, const void *b)
+{
+ long ablk = *((const long *) a);
+ long bblk = *((const long *) b);
+
+ /* can't just subtract because long might be wider than int */
+ if (ablk < bblk)
+ return 1;
+ if (ablk > bblk)
+ return -1;
+ return 0;
+}
+
+/*
+ * Select a currently unused block for writing to.
+ *
+ * NB: should only be called when writer is ready to write immediately,
+ * to ensure that first write pass is sequential.
+ */
+static long
+getFreeBlock(GISTBuildBuffers *gfbb)
+{
+ /*
+ * If there are multiple free blocks, we select the one appearing last in
+ * freeBlocks[] (after sorting the array if needed). If there are none,
+ * assign the next block at the end of the file.
+ */
+ if (gfbb->nFreeBlocks > 0)
+ {
+ if (!gfbb->blocksSorted)
+ {
+ qsort((void *) gfbb->freeBlocks, gfbb->nFreeBlocks,
+ sizeof(long), freeBlocks_cmp);
+ gfbb->blocksSorted = true;
+ }
+ return gfbb->freeBlocks[--gfbb->nFreeBlocks];
+ }
+ else
+ return gfbb->nFileBlocks++;
+}
+
+/*
+ * Return a block# to the freelist.
+ */
+static void
+releaseBlock(GISTBuildBuffers *gfbb, long blocknum)
+{
+ int ndx;
+
+ /*
+ * Enlarge freeBlocks array if full.
+ */
+ if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
+ {
+ gfbb->freeBlocksLen *= 2;
+ gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
+ gfbb->freeBlocksLen * sizeof(long));
+ }
+
+ /*
+ * Add blocknum to array, and mark the array unsorted if it's no longer in
+ * decreasing order.
+ */
+ ndx = gfbb->nFreeBlocks++;
+ gfbb->freeBlocks[ndx] = blocknum;
+ if (ndx > 0 && gfbb->freeBlocks[ndx - 1] < blocknum)
+ gfbb->blocksSorted = false;
+}
+
+/*
+ * Free buffering build data structure.
+ */
+void
+freeGiSTBuildBuffers(GISTBuildBuffers *gfbb)
+{
+ /* Close buffers file. */
+ BufFileClose(gfbb->pfile);
+ /* All other things will be free on memory context release */
+}
+
+/*
+ * Data structure representing information about node buffer for index tuples
+ * relocation from splitted node buffer.
+ */
+typedef struct
+{
+ GISTENTRY entry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ SplitedPageLayout *dist;
+ NodeBuffer *nodeBuffer;
+ OffsetNumber offnum;
+} RelocationBufferInfo;
+
+/*
+ * Maintain data structures on page split.
+ */
+void
+relocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
+ Relation r, GISTLoadedPartItem *path,
+ Buffer buffer, SplitedPageLayout *dist)
+{
+ RelocationBufferInfo *relocationBuffersInfos;
+ bool found;
+ NodeBuffer *nodeBuffer;
+ BlockNumber blocknum;
+ IndexTuple itup;
+ int splitPagesCount = 0, i;
+ GISTENTRY entry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ SplitedPageLayout *ptr;
+ int level = path->level;
+ NodeBuffer nodebuf;
+ SplitedPageLayout *last = NULL;
+
+ blocknum = BufferGetBlockNumber(buffer);
+
+ /*
+ * If splitted page level doesn't have buffers, then everything is done.
+ * Otherwise we also need to relocate index tuples of buffer of splitted
+ * page.
+ */
+ if (!LEVEL_HAS_BUFFERS(level, gfbb))
+ return;
+
+ /*
+ * Get pointer of node buffer of splitted page and remove it from the hash.
+ */
+ nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
+ HASH_FIND, &found);
+ if (!found)
+ {
+ /*
+ * Node buffer should anyway be created at this moment. Either by
+ * index tuples insertion or page split.
+ */
+ elog(ERROR, "node buffer of splitting page (%u) doesn't exists while it should.", blocknum);
+ }
+
+ /*
+ * We are going to perform some operations with node buffers hash. Thus,
+ * it unsafe to operate with already removed hash item. Let's save it.
+ */
+ memcpy(&nodebuf, nodeBuffer, sizeof(NodeBuffer));
+ nodeBuffer = &nodebuf;
+
+ hash_search(gfbb->nodeBuffersTab, &blocknum, HASH_REMOVE, &found);
+ Assert(found);
+
+ /*
+ * Count pages produced by split and save pointer data structure of
+ * the last one.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ last = ptr;
+ splitPagesCount++;
+ }
+
+ /* Allocate memory for information about relocation buffers. */
+ relocationBuffersInfos = (RelocationBufferInfo *)palloc(
+ sizeof(RelocationBufferInfo) * splitPagesCount);
+
+ /*
+ * Fill relocation buffers information for node buffers of pages
+ * produced by split.
+ */
+ i = 0;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ /* Decompress parent index tuple of node buffer page. */
+ gistDeCompressAtt(giststate, r,
+ ptr->itup, NULL, (OffsetNumber) 0,
+ relocationBuffersInfos[i].entry,
+ relocationBuffersInfos[i].isnull);
+
+ /* Fill relocation information */
+ relocationBuffersInfos[i].nodeBuffer =
+ getNodeBuffer(gfbb, giststate, ptr->block.blkno,
+ path->downlinkoffnum, path->parent, true);
+
+ /* Fill node buffer structure */
+ relocationBuffersInfos[i].dist = ptr;
+
+ i++;
+ }
+
+ /*
+ * Loop of index tuples relocation.
+ */
+ while (popItupFromNodeBuffer(gfbb, nodeBuffer, &itup))
+ {
+ float sum_grow, which_grow[INDEX_MAX_KEYS];
+ int i, which;
+ IndexTuple newtup;
+ bool wasOverflow;
+
+ /* Choose node buffer for index tuple insert. */
+
+ gistDeCompressAtt(giststate, r,
+ itup, NULL, (OffsetNumber) 0,
+ entry, isnull);
+
+ which = -1;
+ *which_grow = -1.0f;
+ sum_grow = 1.0f;
+
+ for (i = 0; i < splitPagesCount && sum_grow; i++)
+ {
+ int j;
+ RelocationBufferInfo *splitPageInfo = & relocationBuffersInfos[i];
+
+ sum_grow = 0.0f;
+ for (j = 0; j < r->rd_att->natts; j++)
+ {
+ float usize;
+
+ usize = gistpenalty(giststate, j,
+ &splitPageInfo->entry[j],
+ splitPageInfo->isnull[j],
+ &entry[j], isnull[j]);
+
+ if (which_grow[j] < 0 || usize < which_grow[j])
+ {
+ which = i;
+ which_grow[j] = usize;
+ if (j < r->rd_att->natts - 1 && i == 0)
+ which_grow[j + 1] = -1;
+ sum_grow += which_grow[j];
+ }
+ else if (which_grow[j] == usize)
+ sum_grow += usize;
+ else
+ {
+ sum_grow = 1;
+ break;
+ }
+ }
+ }
+
+ wasOverflow = BUFFER_IS_OVERLOW(
+ relocationBuffersInfos[which].nodeBuffer, gfbb);
+
+ /* push item to selected node buffer */
+ pushItupToNodeBuffer(gfbb, relocationBuffersInfos[which].nodeBuffer,
+ itup);
+
+ /*
+ * If node buffer was just overflowed then we should add it to the
+ * emptying stack.
+ */
+ if (BUFFER_IS_OVERLOW(relocationBuffersInfos[which].nodeBuffer, gfbb) &&
+ !wasOverflow)
+ {
+ MemoryContext oldcxt = CurrentMemoryContext;
+ MemoryContextSwitchTo(gfbb->context);
+ gfbb->bufferEmptyingStack = lcons(relocationBuffersInfos[which].nodeBuffer, gfbb->bufferEmptyingStack);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ /* adjust tuple of parent page */
+ newtup = gistgetadjusted(r, relocationBuffersInfos[which].dist->itup,
+ itup, giststate);
+ if (newtup)
+ {
+ /*
+ * Parent page index tuple expands. We need to update old
+ * index tuple with the new one.
+ */
+ gistDeCompressAtt(giststate, r,
+ newtup, NULL, (OffsetNumber) 0,
+ relocationBuffersInfos[which].entry,
+ relocationBuffersInfos[which].isnull);
+
+ relocationBuffersInfos[which].dist->itup = newtup;
+ }
+ }
+
+ if (blocknum == gfbb->currentEmptyingBufferBlockNumber)
+ gfbb->currentEmptyingBufferSplit = true;
+
+ pfree(relocationBuffersInfos);
+}
+
+/*
+ * Return size of node buffer occupied by stored index tuples.
+ */
+int
+getNodeBufferBusySize(NodeBuffer *nodeBuffer)
+{
+ int size;
+
+ /* No occupied buffer file blocks means that node buffer is empty. */
+ if (nodeBuffer->blocksCount == 0)
+ return 0;
+
+ /* We assume only the last page to be not fully filled. */
+ size = (BLCKSZ - MAXALIGN(sizeof(uint32))) * nodeBuffer->blocksCount;
+ size -= PAGE_FREE_SPACE(nodeBuffer->pageBuffer);
+ return size;
+}
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 1754a10..ca1a0a3 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -670,13 +670,33 @@ gistoptions(PG_FUNCTION_ARGS)
{
Datum reloptions = PG_GETARG_DATUM(0);
bool validate = PG_GETARG_BOOL(1);
- bytea *result;
+ relopt_value *options;
+ GiSTOptions *rdopts;
+ int numoptions;
+ static const relopt_parse_elt tab[] = {
+ {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
+ {"levelstep", RELOPT_TYPE_INT, offsetof(GiSTOptions, levelStep)},
+ {"buffersize", RELOPT_TYPE_INT, offsetof(GiSTOptions, bufferSize)},
+ {"neighborrelocation", RELOPT_TYPE_BOOL, offsetof(GiSTOptions, neighborRelocation)},
+ {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
+ };
- result = default_reloptions(reloptions, validate, RELOPT_KIND_GIST);
+ options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
+ &numoptions);
+
+ /* if none set, we're done */
+ if (numoptions == 0)
+ PG_RETURN_NULL();
+
+ rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
+
+ fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
+ validate, tab, lengthof(tab));
+
+ pfree(options);
+
+ PG_RETURN_BYTEA_P(rdopts);
- if (result)
- PG_RETURN_BYTEA_P(result);
- PG_RETURN_NULL();
}
/*
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 9fb20a6..b7f2bf8 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -17,13 +17,56 @@
#include "access/gist.h"
#include "access/itup.h"
#include "storage/bufmgr.h"
+#include "storage/buffile.h"
#include "utils/rbtree.h"
+#include "utils/hsearch.h"
+
+/* Has specified level buffers? */
+#define LEVEL_HAS_BUFFERS(level,gfbb) ((level) != 0 && (level) % (gfbb)->levelStep == 0)
+/* Is specified buffer at least halt-filled (should be planned for emptying)?*/
+#define BUFFER_IS_OVERLOW(nodeBuffer,gfbb) ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
/* Buffer lock modes */
#define GIST_SHARE BUFFER_LOCK_SHARE
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
+typedef struct
+{
+ BlockNumber prev;
+ uint32 freespace;
+ char tupledata[1];
+} NodeBufferPage;
+
+/* Returns free space in node buffer page */
+#define PAGE_FREE_SPACE(nbp) (nbp->freespace)
+/* Checks if node buffer page is empty */
+#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - offsetof(NodeBufferPage, tupledata))
+/* Checks if node buffers page don't contain sufficient space for index tuple */
+#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
+ MAXALIGN(IndexTupleSize(itup)))
+
+/* Buffer of tree node data structure */
+typedef struct NodeBuffer
+{
+ /* number of page containing node */
+ BlockNumber nodeBlocknum;
+
+ /* count of blocks occupied by buffer */
+ int blocksCount;
+
+ NodeBufferPage *pageBuffer;
+
+ /* corresponding downlink number in parent page */
+ OffsetNumber downlinkoffnum;
+ /* level of corresponding node */
+ int level;
+ /* number of tuples in buffer */
+ int tuplesCount;
+
+ struct GISTLoadedPartItem *path;
+} NodeBuffer;
+
/*
* GISTSTATE: information needed for any GiST index operation
*
@@ -43,6 +86,8 @@ typedef struct GISTSTATE
/* Collations to pass to the support functions */
Oid supportCollation[INDEX_MAX_KEYS];
+
+ struct GISTBuildBuffers *gfbb;
TupleDesc tupdesc;
} GISTSTATE;
@@ -225,6 +270,86 @@ typedef struct GISTInsertStack
struct GISTInsertStack *parent;
} GISTInsertStack;
+/*
+ * Extended GISTInsertStack for buffering GiST index build. It additionally hold
+ * level number of page.
+ */
+typedef struct GISTLoadedPartItem
+{
+ /* current page */
+ BlockNumber blkno;
+
+ /* offset of the downlink in the parent page, that points to this page */
+ OffsetNumber downlinkoffnum;
+
+ /* pointer to parent */
+ struct GISTLoadedPartItem *parent;
+
+ /* level number */
+ int level;
+} GISTLoadedPartItem;
+
+/* List of non-empty node buffers on specific level */
+typedef struct
+{
+ NodeBuffer *first, *last;
+} NodeBuffersOnLevel;
+
+/*
+ * Data structure with general information about build buffers.
+ */
+typedef struct GISTBuildBuffers
+{
+ /* memory context which is persistent during buffering build */
+ MemoryContext context;
+ /* underlying files */
+ BufFile *pfile;
+ /* # of blocks used in underlying files */
+ long nFileBlocks;
+ /* is freeBlocks[] currently in order? */
+ bool blocksSorted;
+ /* resizable array of free blocks */
+ long *freeBlocks;
+ /* # of currently free blocks */
+ int nFreeBlocks;
+ /* current allocated length of freeBlocks[] */
+ int freeBlocksLen;
+
+ /* hash for buffers by block number*/
+ HTAB *nodeBuffersTab;
+
+ /* stack of buffers for emptying */
+ List *bufferEmptyingStack;
+ /* number of currently emptying buffer */
+ BlockNumber currentEmptyingBufferBlockNumber;
+ /* whether currently emptying buffer was split - a signal to stop emptying */
+ bool currentEmptyingBufferSplit;
+
+ /* whether to use neighbor relocation of buffers when split */
+ bool neighborRelocation;
+ /* step of levels for buffers location */
+ int levelStep;
+ /* maximal number of pages occupied by buffer */
+ int pagesPerBuffer;
+
+ GISTLoadedPartItem *rootitem;
+} GISTBuildBuffers;
+
+/*
+ * Information about sub-tree level planned for load.
+ */
+typedef struct
+{
+ /* pages of sub-tree level */
+ GISTLoadedPartItem **items;
+ /* lenght of items array */
+ int itemsLen;
+ /* number of pages */
+ int itemsCount;
+ /* level number in whole tree */
+ int level;
+} SubtreeLevelInfo;
+
typedef struct GistSplitVector
{
GIST_SPLITVEC splitVector; /* to/from PickSplit method */
@@ -286,6 +411,15 @@ extern Datum gistinsert(PG_FUNCTION_ARGS);
extern MemoryContext createTempGistContext(void);
extern void initGISTstate(GISTSTATE *giststate, Relation index);
extern void freeGISTstate(GISTSTATE *giststate);
+BlockNumber gistdoinsert(Relation r,
+ IndexTuple itup,
+ Size freespace,
+ GISTSTATE *GISTstate);
+bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
+ GISTSTATE *giststate,
+ IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchild);
+
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
@@ -313,6 +447,19 @@ extern Datum gistgetbitmap(PG_FUNCTION_ARGS);
/* gistutil.c */
+/*
+ * Storage type for GiST's reloptions
+ */
+typedef struct GiSTOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ int fillfactor; /* page fill factor in percent (0..100) */
+ int bufferingModeOffset; /* use buffering build? */
+ bool neighborRelocation; /* use neighbor buffer relocation? */
+ int levelStep; /* level step in buffering build */
+ int bufferSize; /* buffer size in buffering build */
+} GiSTOptions;
+
#define GiSTPageSize \
( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
@@ -380,4 +527,14 @@ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
GistSplitVector *v, GistEntryVector *entryvec,
int attno);
+/* gistbuildbuffers.c */
+
+void initGiSTBuildBuffers(GISTBuildBuffers * gfbb);
+extern NodeBuffer *getNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, BlockNumber blkno, OffsetNumber downlinkoffnu, GISTLoadedPartItem *parent, bool createNew);
+void pushItupToNodeBuffer(GISTBuildBuffers *gfbb, NodeBuffer *nodeBuffer, IndexTuple item);
+bool popItupFromNodeBuffer(GISTBuildBuffers *gfbb, NodeBuffer *nodeBuffer, IndexTuple *item);
+void freeGiSTBuildBuffers(GISTBuildBuffers *gfbb);
+extern void relocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, Relation r, GISTLoadedPartItem *path, Buffer buffer, SplitedPageLayout *dist);
+int getNodeBufferBusySize(NodeBuffer *nodeBuffer);
+
#endif /* GIST_PRIVATE_H */