v20251223-0003-VCI-main-part2.patch
application/octet-stream
Filename: v20251223-0003-VCI-main-part2.patch
Type: application/octet-stream
Part: 1
From 80f8ded3ae8b8bc80d31079df91367d4a35f660b Mon Sep 17 00:00:00 2001
From: Peter Smith <peter.b.smith@fujitsu.com>
Date: Tue, 23 Dec 2025 15:24:33 +1100
Subject: [PATCH v20251223] VCI - main - part2
---
contrib/vci/include/postgresql_copy.h | 176 ++
contrib/vci/include/vci_chunk.h | 114 +
contrib/vci/include/vci_columns.h | 319 +++
contrib/vci/include/vci_columns_data.h | 33 +
contrib/vci/include/vci_fetch.h | 1007 ++++++++
contrib/vci/include/vci_freelist.h | 75 +
contrib/vci/include/vci_mem.h | 177 ++
contrib/vci/include/vci_memory_entry.h | 118 +
contrib/vci/include/vci_ros.h | 1085 +++++++++
contrib/vci/include/vci_ros_command.h | 214 ++
contrib/vci/include/vci_ros_daemon.h | 69 +
contrib/vci/include/vci_tidcrid.h | 344 +++
contrib/vci/include/vci_wos.h | 29 +
contrib/vci/include/vci_xact.h | 39 +
contrib/vci/storage/Makefile | 34 +
contrib/vci/storage/meson.build | 19 +
contrib/vci/storage/vci_ros.c | 1659 +++++++++++++
contrib/vci/storage/vci_ros_command.c | 4131 ++++++++++++++++++++++++++++++++
contrib/vci/storage/vci_ros_daemon.c | 859 +++++++
19 files changed, 10501 insertions(+)
create mode 100644 contrib/vci/include/postgresql_copy.h
create mode 100644 contrib/vci/include/vci_chunk.h
create mode 100644 contrib/vci/include/vci_columns.h
create mode 100644 contrib/vci/include/vci_columns_data.h
create mode 100644 contrib/vci/include/vci_fetch.h
create mode 100644 contrib/vci/include/vci_freelist.h
create mode 100644 contrib/vci/include/vci_mem.h
create mode 100644 contrib/vci/include/vci_memory_entry.h
create mode 100644 contrib/vci/include/vci_ros.h
create mode 100644 contrib/vci/include/vci_ros_command.h
create mode 100644 contrib/vci/include/vci_ros_daemon.h
create mode 100644 contrib/vci/include/vci_tidcrid.h
create mode 100644 contrib/vci/include/vci_wos.h
create mode 100644 contrib/vci/include/vci_xact.h
create mode 100644 contrib/vci/storage/Makefile
create mode 100644 contrib/vci/storage/meson.build
create mode 100644 contrib/vci/storage/vci_ros.c
create mode 100644 contrib/vci/storage/vci_ros_command.c
create mode 100644 contrib/vci/storage/vci_ros_daemon.c
diff --git a/contrib/vci/include/postgresql_copy.h b/contrib/vci/include/postgresql_copy.h
new file mode 100644
index 0000000..f302232
--- /dev/null
+++ b/contrib/vci/include/postgresql_copy.h
@@ -0,0 +1,176 @@
+/*-------------------------------------------------------------------------
+ *
+ * postgresql_copy.h
+ * Definitions copied from PostgreSQL core
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/postgresql_copy.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef POSTGRESQL_COPY_H
+#define POSTGRESQL_COPY_H
+
+/*
+ * src/backend/utils/adt/float.c
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "catalog/pg_type.h"
+#include "datatype/timestamp.h"
+#include "utils/array.h"
+#include "utils/date.h"
+#include "utils/elog.h"
+#include "utils/errcodes.h"
+
+/*
+ * check to see if a float4/8 val has underflowed or overflowed
+ */
+#define CHECKFLOATVAL(val, inf_is_valid, zero_is_valid) \
+do { \
+ if (isinf(val) && !(inf_is_valid)) \
+ ereport(ERROR, \
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \
+ errmsg("value out of range: overflow"))); \
+ \
+ if ((val) == 0.0 && !(zero_is_valid)) \
+ ereport(ERROR, \
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \
+ errmsg("value out of range: underflow"))); \
+} while(0)
+
+/*
+ * From src/backend/utils/adt/float.c: verify float8[n], return its data.
+ */
+static inline float8 *
+check_float8_array(ArrayType *transarray, const char *caller, int n)
+{
+ /*
+ * We expect the input to be an N-element float array; verify that. We
+ * don't need to use deconstruct_array() since the array data is just
+ * going to look like a C array of N float8 values.
+ */
+ if (ARR_NDIM(transarray) != 1 ||
+ ARR_DIMS(transarray)[0] != n ||
+ ARR_HASNULL(transarray) ||
+ ARR_ELEMTYPE(transarray) != FLOAT8OID)
+ elog(ERROR, "%s: expected %d-element float8 array", caller, n);
+ return (float8 *) ARR_DATA_PTR(transarray);
+}
+
+typedef struct Int8TransTypeData
+{
+ int64 count;
+ int64 sum;
+} Int8TransTypeData;
+
+#ifdef VCI_USE_CMP_FUNC
+/*
+ * interval_cmp_value - flatten an interval into one TimeOffset for comparison
+ *
+ * A month counts as 30 days and a day as 24 hours.
+ */
+static inline TimeOffset
+interval_cmp_value(const Interval *interval)
+{
+ TimeOffset span;
+
+ span = interval->time;
+
+#ifdef HAVE_INT64_TIMESTAMP
+ span += interval->month * INT64CONST(30) * USECS_PER_DAY;
+ span += interval->day * INT64CONST(24) * USECS_PER_HOUR;
+#else
+ span += interval->month * ((double) DAYS_PER_MONTH * SECS_PER_DAY);
+ span += interval->day * ((double) HOURS_PER_DAY * SECS_PER_HOUR);
+#endif
+
+ return span;
+}
+
+/* Three-way compare (-1/0/1) of two intervals by interval_cmp_value(). */
+static inline int
+interval_cmp_internal(Interval *interval1, Interval *interval2)
+{
+ TimeOffset span1 = interval_cmp_value(interval1);
+ TimeOffset span2 = interval_cmp_value(interval2);
+ return ((span1 < span2) ? -1 : (span1 > span2) ? 1 : 0);
+}
+
+/*
+ * Three-way compare of two timetz: by zone-adjusted time, then by zone.
+ */
+static inline int
+timetz_cmp_internal(TimeTzADT *time1, TimeTzADT *time2)
+{
+ TimeOffset t1,
+ t2;
+
+ /* Primary sort is by true (GMT-equivalent) time */
+#ifdef HAVE_INT64_TIMESTAMP
+ t1 = time1->time + (time1->zone * USECS_PER_SEC);
+ t2 = time2->time + (time2->zone * USECS_PER_SEC);
+#else
+ t1 = time1->time + time1->zone;
+ t2 = time2->time + time2->zone;
+#endif
+
+ if (t1 > t2)
+ return 1;
+ if (t1 < t2)
+ return -1;
+
+ /* Same GMT time: order by zone, so equal means time and zone both match */
+ if (time1->zone > time2->zone)
+ return 1;
+ if (time1->zone < time2->zone)
+ return -1;
+
+ return 0;
+}
+#endif
+
+/* taken from numeric.c */
+
+typedef int16 NumericDigit;
+struct NumericShort
+{
+ uint16 n_header; /* Sign + display scale + weight */
+ NumericDigit n_data[1]; /* Digits */
+};
+
+struct NumericLong
+{
+ uint16 n_sign_dscale; /* Sign + display scale */
+ int16 n_weight; /* Weight of 1st digit */
+ NumericDigit n_data[1]; /* Digits */
+};
+
+union NumericChoice
+{
+ uint16 n_header; /* Header word */
+ struct NumericLong n_long; /* Long form (4-byte header) */
+ struct NumericShort n_short; /* Short form (2-byte header) */
+};
+
+struct NumericData
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ union NumericChoice choice; /* choice of format */
+};
+
+typedef struct NumericVar
+{
+ int ndigits; /* # of digits in digits[] - can be 0! */
+ int weight; /* weight of first digit */
+ int sign; /* NUMERIC_POS, NUMERIC_NEG, or NUMERIC_NAN */
+ int dscale; /* display scale */
+ NumericDigit *buf; /* start of palloc'd space for digits[] */
+ NumericDigit *digits; /* base-NBASE digits */
+} NumericVar;
+
+#endif /* POSTGRESQL_COPY_H */
diff --git a/contrib/vci/include/vci_chunk.h b/contrib/vci/include/vci_chunk.h
new file mode 100644
index 0000000..9c4f628
--- /dev/null
+++ b/contrib/vci/include/vci_chunk.h
@@ -0,0 +1,114 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_chunk.h
+ * Definitions and Declarations of ROS chunk buffer storage.
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_chunk.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_CHUNK_H
+#define VCI_CHUNK_H
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+#include "utils/uuid.h"
+
+#include "vci.h"
+#include "vci_ros.h"
+
+/**
+ * @brief RosChunkBuffer is a buffer to store one chunk.
+ *
+ * We use RosChunkBuffer for two purposes. One is to store data obtained
+ * directly from PostgreSQL relation. For this purpose, we prepare this
+ * buffer to have enough space to store data even when all the attributes
+ * have the size of worst case, that never happens. Once the chunk is
+ * stored in this buffer, we inspect the size of each column in the chunk.
+ * Afterward, we copy all the chunk data into RosChunkStorage with removing
+ * unused spaces. Here, we use RosChunkBuffer for each chunk, but this
+ * time we prepare the buffer with the size suitable for each chunk. ROS
+ * without compression is built from RosChunkStorage directly.
+ */
+typedef struct RosChunkBuffer
+{
+ int16 numColumns; /* number of columns */
+ int16 numNullableColumns; /* number of nullable columns */
+
+ /** number of columns which need offset data for each entry because they
+ * have variable-length fields or fields longer than eight bytes, say,
+ * reference Datum.
+ */
+ int16 numColumnsWithIndex;
+
+ int nullWidthInByte; /* The byte width of null bit vector. */
+ int numRowsAtOnce; /* the maximum number of rows in the chunk */
+ int numFilled; /* the number of rows actually contained here */
+ vcis_compression_type_t *compType; /* Array of compression type for
+ * columns. */
+ int16 *nullBitId; /* -1 for NOT NULLABLE */
+ int16 *columnSizeList; /* the sizes of columns in the worst case */
+ void *dataAllocPtr; /* pointer keeping allocated area */
+ char **data; /* buffer for each column */
+ vci_offset_in_extent_t **dataOffset; /* offset to each datum */
+ char *nullData; /* pointer to array of null bit vector. */
+ char *tidData; /* pointer to array of TID. */
+ char *deleteData; /* pointer to array of delete information */
+} RosChunkBuffer;
+
+/**
+ * @brief Structure to keep buffers that keep column-wise data built from WOS.
+ */
+typedef struct RosChunkStorage
+{
+ int numChunks; /* The length of allocated chunk. */
+ int numFilled; /* The number of chunk actually used. */
+ int numTotalRows; /* The sum of rows in registered chunks. */
+ bool forAppending; /* True to append data to the shrunken extent. */
+
+ /** Array of pointers to RosChunkBuffer, which is copied in a compact
+ * manner to reduce the memory.
+ */
+ RosChunkBuffer **chunk;
+} RosChunkStorage;
+
+extern void
+ vci_InitOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer,
+ int numRowsAtOnce,
+ int16 *columnSizeList,
+ int numColumns,
+ bool useDeleteVector,
+ vci_MainRelHeaderInfo *info);
+extern void
+ vci_InitRosChunkStorage(RosChunkStorage *rosChunkStorage,
+ int numRowsAtOnce,
+ bool forAppending);
+extern void
+ vci_DestroyOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer);
+extern void
+ vci_DestroyRosChunkStorage(RosChunkStorage *rosChunkStorage);
+extern PGDLLEXPORT void
+ vci_ResetRosChunkStorage(RosChunkStorage *rosChunkStorage);
+extern void
+ vci_FillOneRowInRosChunkBuffer(RosChunkBuffer *rosChunkBuffer,
+ vci_MainRelHeaderInfo *info,
+ ItemPointer tid,
+ HeapTuple tuple,
+ int16 *dstColumnIdList,
+ AttrNumber *heapAttrNumList,
+ TupleDesc tupleDesc);
+extern void
+ vci_ResetRosChunkBufferCounter(RosChunkBuffer *buffer);
+extern void
+ vci_RegisterChunkBuffer(RosChunkStorage *rosChunkStorage, RosChunkBuffer *src);
+extern Size
+ vci_GetDataSizeInChunkStorage(RosChunkStorage *src, int columnId, bool asFixed);
+
+#endif /* #ifndef VCI_CHUNK_H */
diff --git a/contrib/vci/include/vci_columns.h b/contrib/vci/include/vci_columns.h
new file mode 100644
index 0000000..18d6e9a
--- /dev/null
+++ b/contrib/vci/include/vci_columns.h
@@ -0,0 +1,319 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_columns.h
+ * Definitions and declarations of VCI column store and extents
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_columns.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_COLUMNS_H
+#define VCI_COLUMNS_H
+
+#include "postgres.h"
+
+#include "vci.h"
+#include "vci_chunk.h"
+#include "vci_ros.h"
+#include "vci_tidcrid.h"
+
+/** header page ID of column meta data */
+#define VCI_COLUMN_META_HEADER_PAGE_ID (0)
+
+/** First page of Column data relations */
+#define VCI_COLUMN_DATA_FIRST_PAGE_ID (0)
+
+/** Column number of Column meta header page */
+#define VCI_NUM_COLUMN_META_HEADER_PAGE (1)
+
+/** Column ID of first Normal Column */
+#define VCI_FIRST_NORMALCOLUMN_ID (0)
+
+/** Column ID of special column */
+#define VCI_COLUMN_ID_TID (-1)
+#define VCI_COLUMN_ID_NULL (-2)
+#define VCI_COLUMN_ID_DELETE (-3)
+#define VCI_COLUMN_ID_CRID (-4) /** @todo what is this? */
+
+/** The data below are not column-stored data.
+ * We prepare them for convenience.
+ */
+#define VCI_COLUMN_ID_TID_CRID (-5)
+#define VCI_COLUMN_ID_TID_CRID_UPDATE (-6)
+#define VCI_COLUMN_ID_TID_CRID_WRITE (-7)
+#define VCI_COLUMN_ID_TID_CRID_CDR (-8)
+#define VCI_COLUMN_ID_DATA_WOS (-9)
+#define VCI_COLUMN_ID_WHITEOUT_WOS (-10)
+
+#define VCI_INVALID_COLUMN_ID ((int16) -11)
+
+/** Vector bit count in one item (tuple) for delete vector */
+#define VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE (1024)
+
+/** Item number in page for delete vector */
+#define VCI_ITEMS_IN_PAGE_FOR_DELETE (52)
+
+/** Page number in extent for delete vector */
+#define VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE (5)
+
+static inline BlockNumber
+vci_CalcBlockNumberFromCrid64ForDelete(uint64 crid64)
+{
+ return (vci_CalcExtentIdFromCrid64(crid64) *
+ VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE) +
+ (vci_CalcRowIdInExtentFromCrid64(crid64) /
+ (VCI_ITEMS_IN_PAGE_FOR_DELETE *
+ VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE));
+}
+
+static inline OffsetNumber
+vci_CalcOffsetNumberFromCrid64ForDelete(uint64 crid64)
+{
+ return ((vci_CalcRowIdInExtentFromCrid64(crid64) /
+ VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE) %
+ VCI_ITEMS_IN_PAGE_FOR_DELETE) + FirstOffsetNumber;
+}
+
+static inline uint32
+vci_CalcByteFromCrid64ForDelete(uint64 crid64)
+{
+ return (crid64 % VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE) / BITS_PER_BYTE;
+}
+
+static inline uint32
+vci_CalcBitFromCrid64ForDelete(uint64 crid64)
+{
+ return crid64 & (BITS_PER_BYTE - 1);
+}
+
+/**
+ * Pointing extent position of each column in BlockNumber.
+ *
+ * @description
+ * This is used in vcis_column_meta_t.block_number_extent.
+ * The field is not defined in the definition of the structure, because
+ * we have the other variable length field "common_dict_info".
+ * This block_number_extent follows the field.
+ *
+ * @note
+ * unused entries have InvalidBlockNumber in block_number and
+ * zero in num_blocks.
+ */
+typedef struct vcis_c_extent
+{
+ BlockNumber block_number; /* the position in the column data relation */
+ BlockNumber num_blocks; /* the length in DB page unit */
+
+ bool enabled; /* block_number is enabled if true */
+
+ /* FIXME */ /* fill me */
+ bool valid_min_max; /* size of min is
+ * vcis_column_meta_t.min_max_element_size */
+ char min[1]; /* max follows min. */
+} vcis_c_extent_t;
+
+/**
+ * common dictionary info of each column
+ *
+ * @description
+ * This is used in vcis_column_meta_t.common_dict_info.
+ *
+ * @note
+ * unused entries have InvalidBlockNumber in block_number and
+ * zero in num_blocks.
+ */
+typedef struct vcis_c_common_dict
+{
+ BlockNumber block_number; /* the position in the column data relation */
+ BlockNumber num_blocks; /* the length in DB page unit */
+} vcis_c_common_dict_t;
+
+typedef struct vcis_column_meta
+{
+ vcis_attribute_type_t vcis_attr_type; /* Attribute type */
+
+ Oid pgsql_atttypid; /* taken from FormData_pg_attribute.atttypid */
+ int16 pgsql_attnum; /* taken from FormData_pg_attribute.attnum */
+ int16 pgsql_attlen; /* taken from FormData_pg_attribute.attlen */
+ int32 pgsql_atttypmod; /* taken from
+ * FormData_pg_attribute.atttypmod */
+ uint32 num_extents; /* number of extents (for debug) */
+ uint32 num_extents_old; /* previous number of extents (for
+ * recovery) */
+
+ BlockNumber free_page_begin_id; /* page ID of the first free area */
+
+ BlockNumber free_page_end_id; /* page ID of the last free area */
+
+ /**
+ * The DB page ID of free area that located in front of the added or
+ * deleted extent by the ROS command. (for recovery)
+ * This is used to recover free area list.
+ */
+ BlockNumber free_page_prev_id;
+
+ /**
+ * Same as free_page_prev_id, but just behind the added or deleted extent.
+ */
+ BlockNumber free_page_next_id;
+
+ /**
+ * The freespace size of added or deleted extent by the ROS command (for recovery)
+ */
+ uint32 free_page_old_size;
+
+ /**
+ * The freespace position of added or deleted extent in BlockNumber
+ * by the ROS command (for recovery)
+ */
+ BlockNumber new_data_head;
+
+ BlockNumber num_free_pages; /* number of free DB pages in the listed free
+ * area */
+ BlockNumber num_free_pages_old; /* for recovery */
+ BlockNumber num_free_page_blocks; /* number of free areas, not number of
+ * free DB pages */
+ BlockNumber num_free_page_blocks_old; /* for recovery */
+
+ /*--- Above must be same as vcis_tidcrid_meta_t ---*/
+
+ uint32 common_flag_0; /* vcis_column_meta_flag */
+
+ uint32 min_max_field_size; /* size of min_max field size */
+ uint32 min_max_content_size; /* size of min_max content size */
+ uint16 num_common_dicts; /* Number of common dictionaries */
+ int16 latest_common_dict_id; /* Id of the latest common dictionary */
+ uint32 common_dict_info_offset; /* offset of common_dict_info[0] */
+ uint32 block_number_extent_offset; /* offset of extent_pointer[0] */
+
+ vcis_c_common_dict_t common_dict_info[1]; /* common dictionary
+ * information */
+ /* block_number_extent follows common_dict_info[num_common_dicts - 1] */
+} vcis_column_meta_t;
+
+/**
+ * @brief Get pointer to vcis_extent_t in the given DB page.
+ */
+#define vci_GetExtentT(page) \
+ ((vcis_extent_t *) &((page)[VCI_MIN_PAGE_HEADER]))
+
+/*
+ * Extent headers
+ */
+typedef struct vcis_extent
+{
+ uint32 size; /* Size of extent */
+ vcis_extent_type_t type;
+ uint32 id; /* Extent id */
+ vcis_compression_type_t comp_type; /* Compression method */
+ uint32 offset_offset; /* Offset to the offset */
+ uint32 offset_size; /* Size of the offset size */
+ uint32 data_offset; /* Offset to the data */
+ uint32 data_size; /* Data size */
+ uint16 compressed; /* 0 for not compressed, 1 for compressed */
+ int16 dict_offset; /* or common dictionary ID (>= -1) when
+ * dict_size == 0 */
+ uint32 dict_size; /* Size of the dictionary data */
+ vcis_dict_type_t dict_type; /* The type of dictionary */
+ char dict_body[1]; /* the mainbody of the dictionary */
+ /* offset_body and data_body follows dict_body */
+} vcis_extent_t;
+
+typedef vci_RelationPair vci_ColumnRelations;
+
+extern PGDLLEXPORT vcis_column_meta_t *vci_GetColumnMeta(Buffer *buffer, Relation rel);
+extern PGDLLEXPORT vcis_c_extent_t *vci_GetColumnExtent(Buffer *buffer,
+ BlockNumber *blockNumber,
+ Relation rel,
+ int32 extentId);
+
+extern PGDLLEXPORT void vci_OpenColumnRelations(vci_ColumnRelations *rel,
+ vci_MainRelHeaderInfo *info,
+ int16 columnId,
+ LOCKMODE lockmode);
+
+extern void vci_CloseColumnRelations(vci_ColumnRelations *rel,
+ LOCKMODE lockmode);
+
+extern void vci_InitializeColumnRelations(vci_MainRelHeaderInfo *info,
+ TupleDesc tupdesc,
+ Relation heapRel);
+
+extern void vci_WriteRawDataExtentInfo(Relation rel,
+ int32 extentId,
+ uint32 startPageID,
+ uint32 numBlocks,
+ char *minData,
+ char *maxData,
+ bool validMinMax,
+ bool checkOverwrite);
+
+extern void vci_WriteOneExtent(vci_MainRelHeaderInfo *info,
+ RosChunkStorage *src,
+ int extentId,
+ TransactionId xgen, /* xgen in extent info */
+ TransactionId xdel, /* xdel in extent info */
+ TransactionId xid); /* in tuple header */
+
+/* columns to fetcher Interface */
+extern void vci_GetElementPosition(uint32 *offset,
+ BlockNumber *blockNumberBase,
+ uint32 *dataOffset,
+ vci_ColumnRelations *rel,
+ int32 extentId,
+ uint32 rowIdInExtent,
+ Form_pg_attribute attr);
+
+extern PGDLLEXPORT void vci_GetChunkPositionAndSize(uint32 *offset,
+ Size *totalSize,
+ BlockNumber *blockNumberBase,
+ uint32 *dataOffset,
+ vci_ColumnRelations *rel,
+ int32 extentId,
+ uint32 rowIdInExtent,
+ int32 numUnit,
+ Form_pg_attribute attr);
+
+extern uint16
+ vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId);
+extern void
+ vci_GetPositionForFixedColumn(BlockNumber *blockNumber,
+ uint32 *offset,
+ vci_MainRelHeaderInfo *info,
+ int16 columnId,
+ int32 extentId,
+ uint32 rowIdInExtent,
+ bool atEnd);
+
+extern PGDLLEXPORT void
+ vci_InitializeDictInfo(vci_DictInfo *dictInfo);
+
+/* ***************************
+ * Min-Max info
+ * ***************************
+ */
+
+static inline void
+vci_Initvci_ColumnRelations(vci_ColumnRelations *rel)
+{
+ rel->meta = NULL;
+ rel->data = NULL;
+}
+
+/*
+ * Write the column meta data header.
+ * Thin wrapper: forwards both arguments unchanged to vci_WriteOneItemPage().
+ *   relMeta - meta data relation;  buffer - buffer handed to the writer
+ */
+static inline void
+vci_WriteColumnMetaDataHeader(Relation relMeta,
+ Buffer buffer)
+{
+ vci_WriteOneItemPage(relMeta, buffer);
+}
+
+#endif /* VCI_COLUMNS_H */
diff --git a/contrib/vci/include/vci_columns_data.h b/contrib/vci/include/vci_columns_data.h
new file mode 100644
index 0000000..d21c0a1
--- /dev/null
+++ b/contrib/vci/include/vci_columns_data.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_columns_data.h
+ * Declarations of functions to check which columns are indexed.
+ *
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_columns_data.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VCI_COLUMNS_DATA_H
+#define VCI_COLUMNS_DATA_H
+
+#include "access/tupdesc.h"
+#include "access/attnum.h"
+#include "nodes/bitmapset.h"
+#include "storage/lock.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+
+#include "vci_ros.h"
+
+extern TupleDesc vci_ExtractColumnDataUsingIds(const char *vci_column_ids, Relation heapRel);
+extern PGDLLEXPORT TupleDesc vci_GetTupleDescr(vci_MainRelHeaderInfo *info);
+extern Bitmapset *vci_MakeIndexedColumnBitmap(Oid mainRelationOid, MemoryContext sharedMemCtx, LOCKMODE lockmode);
+extern Bitmapset *vci_MakeDroppedColumnBitmap(Relation indexRel);
+extern char *vci_ConvertAttidBitmap2String(Bitmapset *attid_bitmap);
+extern AttrNumber vci_GetAttNum(TupleDesc desc, const char *name);
+
+#endif /* VCI_COLUMNS_DATA_H */
diff --git a/contrib/vci/include/vci_fetch.h b/contrib/vci/include/vci_fetch.h
new file mode 100644
index 0000000..60326ee
--- /dev/null
+++ b/contrib/vci/include/vci_fetch.h
@@ -0,0 +1,1007 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_fetch.h
+ * Definitions and declarations of Column store fetch
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_fetch.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VCI_FETCH_H
+#define VCI_FETCH_H
+
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "utils/guc.h"
+
+#include "vci.h"
+#include "vci_columns.h"
+
+#include "vci_mem.h"
+#include "vci_ros.h"
+
+/* The get functions of virtual tuples may be used just to get the storage
+ * area, in which case no rows may be stored yet, so the range assertion is
+ * meant to be optional: it is used when CHECK_VTUPLE_GET_RANGE is defined.
+ * NOTE: the macro is defined right below, so the check is currently on.
+ */
+#define CHECK_VTUPLE_GET_RANGE
+
+/*
+ * memory image of data loaded by vci_CSFetchVirtualTuples().
+ * The area is allocated by vci_CSCreateVirtualTuples(), and the addresses
+ * are fixed except for each data in "column N area"s.
+ *
+ * ADDRESS CONTENT
+ * allocated (palloc-ed address)
+ * (no padding)
+ * flags (or skip) uint8 of tuple[0]
+ * (bit 0 is a copy of uint8 of tuple[1]
+ * delete vector) .
+ * .
+ * uint8 of tuple[num_rows_read_at_once-1]
+ * uint8 of tuple[num_rows_read_at_once] (extra element)
+ * (no padding)
+ * isnull bool[0]--bool[num_columns-1] of tuple[0]
+ * bool[0]--bool[num_columns-1] of tuple[1]
+ * .
+ * .
+ * bool[0]--bool[num_columns-1] of tuple[num_..._once-1]
+ * (padding if necessary)
+ * crid (aligned) int64 of tuple[0]
+ * (when need_crid is int64 of tuple[1]
+ * true) .
+ * .
+ * int64 of tuple[num_rows_read_at_once-1]
+ * (no padding)
+ * tid (aligned) int64 of tuple[0]
+ * (when need_tid is int64 of tuple[1]
+ * true) .
+ * .
+ * .
+ * int64 of tuple[num_rows_read_at_once-1]
+ * (no padding)
+ * values (aligned) Datum[0]--Datum[num_columns-1] of tuple[0]
+ * Datum[0]--Datum[num_columns-1] of tuple[1]
+ * .
+ * .
+ * Datum[0]--Datum[num_columns-1] of tuple[num_..._once-1]
+ * (padding if necessary)
+ * column 0 area aligned data are stored when the element size is
+ * (aligned) larger than sizeof(Datum). Each datum are pointed
+ * by Datum[0] of tuples in the upper "data" area.
+ * The size for the area is calculated using worst
+ * case size.
+ * (free space)
+ * (padding if necessary)
+ * column 1 area aligned data are stored when the element size is
+ * (aligned) larger than sizeof(Datum). Each datum are pointed
+ * by Datum[1] of tuples in the upper "data" area.
+ * The size for the area is calculated using worst
+ * case size.
+ * .
+ * .
+ * (free space)
+ * (padding if necessary)
+ * column (num_columns-1) area
+ * (aligned)
+ *
+ *
+ * usage:
+ *
+ * ---- in backend process ----
+ * vci_CSQueryContext queryContext = vci_CSCreateQueryContext( mainRelationOid,
+ * numReadColumns, attrNum, sharedMemCtx);
+ *
+ * Size localRosSize = vci_CSEstimateLocalRosSize(queryContext);
+ * if (limitLocalRos <= localRosSize)
+ * goto PostgreSQLQueryExecution;
+ * vci_local_ros_t *localRos = vci_CSGenerateLocalRos(queryContext);
+ *
+ * vci_CSFetchContext fetchContext = vci_CSCreateFetchContext( queryContext,
+ * numRowsReadAtOnce,
+ * useColumnStore,
+ * numReadColumns, attrNum,
+ * returnTid, returnCrid);
+ * Size fetchContextSize = vci_CSGetFetchContextSize(fetchContext);
+ * if (limitFetchContext <= sumOfFetchContextSize)
+ * goto PostgreSQLQueryExecution;
+ *
+ * ---- in background worker ----
+ * int lenVector = vci_CSGetActualNumRowsReadAtOnce(fetchContext);
+ * vci_CSFetchContext localContext = vci_CSLocalizeFetchContext(fetchContext);
+ * vci_virtual_tuples_t *vTuples = vci_CSCreateVirtualTuples(localContext);
+ *
+ * ** here you can make pointers to vTuples from PostgreSQL virtual tuples. **
+ *
+ * vci_extent_status_t *status = vci_CSCreateCheckExtent(fetchContext);
+ *
+ * for (extentID)
+ * {
+ * vci_CSCheckExtent(status, localContext, extentId, readMinMax);
+ * if (status->existence && status->visible)
+ * {
+ * ** loop of vectors and rows **
+ * ** number of rows in the extent is in status->num_rows **
+ * for (vectorID)
+ * {
+ * int readableRows = vci_CSFetchVirtualTuples(vTuples,
+ * vectorID * lenVector,
+ * lenVector);
+ * for (idInVector = 0; idInVector < readableRows; ++ idInVector)
+ * {
+ *
+ * ** normal style from here **
+ * int8 *flags = vci_CSGetSkipOfVirtualTuple(vTuples);
+ * if ((* flags) & vcivtf_delete)
+ * continue;
+ *
+ * ** Row wise **
+ * Datum *values = vci_CSGetValuesOfVirtualTuple(vTuples,
+ * idInVector);
+ * bool *isnull = vci_CSGetIsNullOfVirtualTuple(vTuples,
+ * idInVector);
+ *
+ * ** Column wise **
+ * Datum *values = vci_CSGetValuesOfVirtualTupleColumnar(vTuples,
+ * columnId);
+ * bool *isnull = vci_CSGetIsNullOfVirtualTupleColumnar(vTuples,
+ * columnId);
+ *
+ * int64 *crid = vci_CSGetCridOfVirtualTuple(vTuples,
+ * idInVector);
+ * int64 *tid = vci_CSGetTidOfVirtualTuple(vTuples,
+ * idInVector);
+ * UpdateVirtualTupleLinks();
+ * EvaluateQualsEtc();
+ * ** normal style to here **
+ *
+ * ** if you use fixed linked virtual tuples from here **
+ * SelectPostgreSQLVirtualTuple();
+ * EvaluateQualsEtc();
+ * ** if you use fixed linked virtual tuples to here **
+ *
+ * }
+ * }
+ * }
+ * }
+ *
+ * vci_CSDestroyCheckExtent(status)
+ * vci_CSDestroyVirtualTuples(vTuples);
+ * vci_CSDestroyFetchContext(localContext);
+ *
+ * ---- in backend process ----
+ * vci_CSDestroyFetchContext(fetchContext);
+ * vci_CSDestroyLocalRos(localRos);
+ * vci_CSDestroyQueryContext(queryContext);
+ */
+
+/**
+ * @brief Information to fetch data from one relation used in a query.
+ *
+ * When multiple relations are used in one query,
+ * multiple vci_CSQueryContextData should be created.
+ */
+typedef struct vci_CSQueryContextData
+{
+ /** Number of columns of the relation used in the query. */
+ int num_columns;
+
+ /** Attribute number in original PostgreSQL relation. */
+ AttrNumber *volatile attr_num;
+
+ /** Column ID in VCI main relation. */
+ int16 *volatile column_id;
+
+ /* Number of maximum WOS entries */
+ int64 num_data_wos_entries;
+
+ /* Number of maximum whiteout WOS entries */
+ int64 num_whiteout_wos_entries;
+
+ /**
+ * Number of entries in delete_list, just a copy of
+ * vci_local_ros_t.local_delete_list->num_entry.
+ */
+ int num_delete;
+
+ /**
+ * Local delete list, containing whiteout WOS.
+ * CAUTION : THIS POINTER VALUE IS JUST A COPY OF
+ * vci_local_ros_t.local_delete_list->crid_list.
+ * NEVER pfree().
+ */
+ uint64 *delete_list;
+
+ /**
+ * Number of extents of local ROS.
+ * To keep the extents of local ROS at reasonable size,
+ * they may contain fewer rows than 262,144 rows.
+ */
+ int num_local_ros_extents;
+
+ vci_local_ros_t *local_ros; /* pointer to the local ROS. */
+
+ /** Number of extents in ROS. */
+ int num_ros_extents;
+
+ /**
+ * Pointer to main relation information.
+ * The object is allocated in shared_memory_context,
+ * but info->rel cannot be accessed from any process other than the one
+ * that created vci_CSFetchContext.
+ * In order to access main relation, open using main_relation_oid.
+ */
+ vci_MainRelHeaderInfo *volatile info;
+
+ /** Heap relation indexed by VCI to keep shared lock. */
+ volatile Relation heap_rel;
+
+ /** Oid of VCI main relation. */
+ Oid main_relation_oid;
+
+ uint32 num_nullable_columns; /* Number of nullable columns */
+ uint32 null_width_in_byte; /* Size of null bit vector per row */
+
+ /**
+ * ROS version taken from current ROS version or last ROS version.
+ */
+ TransactionId ros_version;
+
+ /**
+ * @see inclusiveXid of struct vci_RosCommandContext
+ */
+ TransactionId inclusive_xid;
+
+ /**
+ * @see exclusiveXid of struct vci_RosCommandContext
+ */
+ TransactionId exclusive_xid;
+
+ uint32 tid_crid_diff_sel; /* Selection of TID CRID difference. */
+
+ /**
+ * Memory context where all the shared data are allocated,
+ * including the elements in this structure.
+ */
+ MemoryContext shared_memory_context;
+
+ /** lockmode of index relation (main relation) */
+ LOCKMODE lockmode;
+
+} vci_CSQueryContextData;
+typedef vci_CSQueryContextData *vci_CSQueryContext;
+
+/**
+ * @brief Buffer for decompression,
+ *
+ * and concatenate data separated into multiple pages.
+ */
+typedef struct vci_seq_scan_buffer
+{
+ int num_buffers;
+} vci_seq_scan_buffer_t;
+
+/**
+ * @brief Context to fetch vectors.
+ *
+ * Vector itself is in vci_virtual_tuples_t,
+ * and the running parameters are kept in it.
+ * A master instance of vci_CSFetchContextData is created by backend process,
+ * then background workers copy to have locally.
+ * Some member variables in the local copy are overwritten, marked as
+ * \b LOCALIZED \b VARIABLE .
+ */
+typedef struct vci_CSFetchContextData
+{
+ uint32 size; /* Size of this structure. */
+
+ int32 extent_id; /* The extent ID of stored virtual tuples. */
+ uint16 num_rows; /* Number of stored virtual tuples. */
+
+ int16 num_columns; /* Number of columns to fetch in this context. */
+
+ /**
+ * Number of rows for the context to read at once.
+ * The fetcher read multiple lines at once and store them into the
+ * virtual tuple storage.
+ */
+ uint32 num_rows_read_at_once;
+
+ bool use_column_store; /* Store data in columnar style (true) or
+ * not. */
+
+ bool need_crid; /* Fetch CRID or not. */
+ bool need_tid; /* Fetch TID or not. */
+
+ /** Used in decompression or data concatenation. */
+ vci_seq_scan_buffer_t *buffer;
+
+ /** \b LOCALIZED \b VARIABLE \n
+ * The ROS data fetched are stored in this context.
+ * virtual tuple storage is located here.
+ */
+ MemoryContext local_memory_context;
+
+ /** The size of virtual Tuple storage.
+ * This is sum of size_values, size_flags, and sizes of area pointed by
+ * vci_virtual_tuples_t->column_info[columnId].al_area.
+ */
+ Size size_vector_memory_context;
+
+ /** area where Datum and pointers are stores */
+ Size size_values;
+
+ /** The area where nulls, skip information, local skip information,
+ * TIDs, CRIDs, dictionaries, compression workarea and temporay
+ * area for wor-wise mode are placed.
+ * The amount of dictionary sizes is in size_dictionary_area.
+ * The workarea size for compression and decompression is in
+ * size_decompression_area.
+ */
+ Size size_flags;
+
+ /** The memory size for dictionaries
+ * This is included in size_flags.
+ */
+ Size size_dictionary_area;
+
+ /** Workarea size to decompress one VCI_COMPACTION_UNIT_ROW.
+ * The size is calculated as
+ * MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW)
+ * when size_dictionary_area != 0, or zero.
+ * This is included in size_flags.
+ */
+ Size size_decompression_area;
+
+ /** The query context this fetch context belongs to. */
+ vci_CSQueryContext query_context;
+
+ /** \b LOCALIZED \b VARIABLE \n
+ * VCI main relation information used in localized fetch.
+ * Since the file discriptor or Relation structure must be obtained
+ * in each process, the main relation information also calculated in
+ * each process.
+ */
+ vci_MainRelHeaderInfo *info;
+
+ /** \b LOCALIZED \b VARIABLE \n
+ * Relations of the delete vector.
+ */
+ vci_ColumnRelations rel_delete;
+
+ /** \b LOCALIZED \b VARIABLE \n
+ * Relations of the null bit vector.
+ */
+ vci_ColumnRelations rel_null;
+
+ /** \b LOCALIZED \b VARIABLE \n
+ * Relations of the TID vector.
+ */
+ vci_ColumnRelations rel_tid;
+
+ /** \b LOCALIZED \b VARIABLE \n
+ * Pointer to the array of relations of normal columns.
+ */
+ vci_ColumnRelations *rel_column;
+
+ /**
+ * The column ID translation table.
+ * Since the column IDs in fetch vector are differ from those of
+ * VCI main relations,
+ * we have the translation table from the former to the latter here.
+ */
+ int16 column_link[1]; /* VARIABLE LENGTH ARRAY */
+} vci_CSFetchContextData; /* VARIABLE LENGTH STRUCT */
+typedef vci_CSFetchContextData *vci_CSFetchContext;
+
+/**
+ * @brief Structure to keep the minimum and maximum values of a column.
+ */
+typedef struct vci_minmax
+{
+ bool valid; /* min and max are meaningful (true) or not
+ * (false). */
+ char min[VCI_MAX_MIN_MAX_SIZE]; /* Minimum value. */
+ char max[VCI_MAX_MIN_MAX_SIZE]; /* Maximum value. */
+} vci_minmax_t;
+
+/**
+ * @brief The extent information which is obtained before fetching the
+ * extent itself.
+ *
+ * It holds the existence and visibility of the extent,
+ * the number of rows in the extent,
+ * and the minimum and maximum values of the extent.
+ */
+typedef struct vci_extent_status
+{
+ uint32 size; /* Size of this structure. */
+ uint32 num_rows; /* Number of rows in the extent. */
+ bool existence; /* Existence of the extent. */
+ bool visible; /* Visibility of the extent. */
+
+ /** The minimum and the maximum values of columns to be fetched. */
+ vci_minmax_t minmax[1]; /* VARIABLE LENGTH ARRAY */
+} vci_extent_status_t; /* VARIABLE LENGTH STRUCT */
+
+/**
+ * @brief The status after reading a vector.
+ */
+typedef enum vci_read_vector_status_t
+{
+ vcirvs_read_whole, /* All of the required data have been
+ * read. */
+ vcirvs_out_of_memory, /* Partially read since out of memory. */
+ vcirvs_end_of_extent, /* Reached the end of the extent. */
+
+ /** Failed to read since the parameter is out of range. */
+ vcirvs_out_of_range,
+
+ vcirvs_not_visible, /* Failed to read since the extent is
+ * invisible. */
+ vcirvs_not_exist, /* The specified extent does not exist. */
+} vci_read_vector_status_t;
+
+/**
+ * @brief Information of a fetched column in a virtual tuple.
+ */
+typedef struct vci_virtual_tuples_column_info
+{
+ char *area; /* Aligned pointer into al_area. NEVER pfree() this. */
+
+ /** Allocated pointer; the actual palloc()'ed address is kept here. */
+ char *al_area;
+
+ int32 null_bit_id; /* Null bit ID in null bit vector. */
+ uint32 max_column_size; /* The maximum size of data in the column. */
+
+ /** true when the value is passed by the pointer (datum by reference).
+ * false when the value itself is contained in Datum (datum by value).
+ */
+ bool strict_datum_type;
+
+ vcis_compression_type_t comp_type; /* Compression method used. */
+ Oid atttypid; /* Type ID of attribute. */
+ bool *isnull; /* Pointer to the isnull flag area. */
+ Datum *values; /* Pointer to the Datum array area. */
+
+ /** The information of the dictionary of LZVF compression. */
+ vci_DictInfo *dict_info;
+} vci_virtual_tuples_column_info_t;
+
+/**
+ * @brief Information of virtual tuple, a set of fetched data.
+ *
+ * Both column-wise and row-wise forms are supported.
+ */
+typedef struct vci_virtual_tuples
+{
+ uint32 size; /* Size of this instance. */
+ uint16 num_columns; /* Number of columns to store. */
+ int32 extent_id; /* The extent ID of stored data. */
+
+ /** Physically recorded number of rows in the target extent. */
+ uint32 num_rows_in_extent;
+
+ /** The row ID in extent of the stored first datum. */
+ uint32 row_id_in_extent;
+
+ uint32 num_rows; /* Number of stored rows in this structure. */
+
+ uint32 buffer_capacity; /* Capacity in unit of rows in this
+ * structure. */
+
+ vci_read_vector_status_t status; /* Read status. */
+
+ /**
+ * This keeps the position of the first tuple of the vector,
+ * since the first virtual tuple of the vector is not always the first
+ * entry of stored data.
+ * At present, the upstream users require the first datum to always be
+ * placed at the same address, so this member variable is always
+ * set to zero.
+ */
+ uint32 offset_of_first_tuple_of_vector;
+
+ /**
+ * Number of rows for the context to read at once.
+ * The fetcher reads multiple rows at once and stores them into the
+ * virtual tuple storage.
+ */
+ uint32 num_rows_read_at_once;
+
+ /** The fetch context for this virtual tuple. */
+ vci_CSFetchContext fetch_context;
+
+ /** True for store in column-wise style. False for row-wise. */
+ bool use_column_store;
+
+ /**
+ * The size of the virtual tuple storage.
+ * This is the sum of size_values, size_flags, and the sizes of the areas
+ * pointed to by vci_virtual_tuples_t->column_info[columnId].al_area.
+ */
+ Size size_vector_memory_context;
+
+ /** The size of the area where Datum and pointers are stored. */
+ Size size_values;
+
+ /**
+ * The size of the area where nulls, skip information,
+ * local skip information, TIDs, CRIDs, dictionaries,
+ * compression workarea and temporary area for row-wise mode are placed.
+ * The amount of dictionary sizes is in size_dictionary_area.
+ * The workarea size for compression / decompression is in
+ * size_decompression_area.
+ */
+ Size size_flags;
+
+ /**
+ * The memory size for dictionaries.
+ * This is included in size_flags.
+ */
+ Size size_dictionary_area;
+
+ /**
+ * Workarea size to decompress one VCI_COMPACTION_UNIT_ROW.
+ * The size is calculated as
+ * MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW)
+ * when size_dictionary_area != 0, or zero.
+ * This is included in size_flags.
+ */
+ Size size_decompression_area;
+
+ int64 *crid; /* Aligned pointer to CRID list in al_flags */
+
+ /** Aligned pointer to TID list in al_flags.
+ * ItemPointerData are written.
+ */
+ int64 *tid;
+
+ /** Aligned pointer to skip list. */
+ uint16 *skip;
+
+ /** Aligned pointer to skip list for local ROS. */
+ uint16 *local_skip;
+
+ /** Aligned pointer to the area for isnull of all columns. */
+ bool *isnull;
+
+ /**
+ * In row-wise mode, the vector in local ROS is once built here.
+ * The area is allocated in local_memory_context.
+ * The size is
+ * num_rows_read_at_once * num_columns * (sizeof(Datum) + sizeof(bool))
+ */
+ char *row_wise_local_ros;
+
+ /**
+ * Workarea to decompress data.
+ * Dictionaries follow work_decompression.
+ */
+ char *work_decompression;
+
+ /** Aligned pointer to the area for values of all columns in al_values. */
+ Datum *values;
+
+ /** Aligned pointer to the area for meta information like skip, TID,
+ * NULL, and so on.
+ */
+ char *flags;
+
+ char *al_values; /* Allocated pointer for values. */
+ char *al_flags; /* Allocated pointer for flags. */
+
+ /** Array of column information. */
+ vci_virtual_tuples_column_info_t column_info[1]; /* VARIABLE LENGTH ARRAY */
+} vci_virtual_tuples_t; /* VARIABLE LENGTH STRUCT */
+
+extern PGDLLEXPORT vci_CSQueryContext vci_CSCreateQueryContextWLockMode(Oid mainRelationOid,
+ int numReadColumns,
+ /* attribute number in original relation */
+ AttrNumber *attrNum,
+ MemoryContext sharedMemCtx,
+ LOCKMODE lockmode);
+
+/**
+ * @brief Create a query context, deriving the lock mode from the flags.
+ *
+ * @param[in] mainRelationOid Oid of VCI main relation.
+ * @param[in] numReadColumns The number of read columns in the part of query.
+ * @param[in] attrNum The attribute numbers in the original heap relation,
+ * not those of the VCI main relation.
+ * @param[in] sharedMemCtx The shared memory context to keep elements of
+ * query context, fetch context, local ROS.
+ * @param[in] recoveryInProgress true if recovery is still in progress.
+ * @param[in] estimatingLocalROSSize true also selects AccessShareLock.
+ * @return The pointer to the allocated vci_CSQueryContext.
+ */
+static inline vci_CSQueryContext
+vci_CSCreateQueryContext(Oid mainRelationOid,
+ int numReadColumns,
+ AttrNumber *attrNum,
+ /* attribute number in original relation */
+ MemoryContext sharedMemCtx,
+ bool recoveryInProgress,
+ bool estimatingLocalROSSize)
+{
+ /* Default: ShareUpdateExclusiveLock, which is used for creating local ROS. */
+ LOCKMODE lockmode = ShareUpdateExclusiveLock;
+
+ /* Queries on the standby can only take RowExclusiveLock or weaker locks. */
+ if (recoveryInProgress || estimatingLocalROSSize)
+ lockmode = AccessShareLock;
+
+ return vci_CSCreateQueryContextWLockMode(mainRelationOid, numReadColumns,
+ attrNum, sharedMemCtx, lockmode);
+}
+
+extern PGDLLEXPORT void vci_CSDestroyQueryContext(vci_CSQueryContext queryContext);
+
+/* Obtain the estimated worst-case size of the local ROS. */
+extern Size vci_CSEstimateLocalRosSize(vci_CSQueryContext queryContext);
+
+extern PGDLLEXPORT vci_local_ros_t *vci_CSGenerateLocalRos(vci_CSQueryContextData *queryContext);
+
+/**
+ * @brief Entry point to destroy a local ROS; forwards to vci_DestroyLocalRos().
+ *
+ * @param[in] localRos Local ROS to be destroyed.
+ */
+static inline void
+vci_CSDestroyLocalRos(vci_local_ros_t *localRos)
+{
+ vci_DestroyLocalRos(localRos);
+}
+
+extern PGDLLEXPORT vci_CSFetchContext vci_CSCreateFetchContextBase(
+ vci_CSQueryContext queryContext,
+ uint32 numRowsReadAtOnce,
+ int16 numReadColumns,
+ /* attribute number in original relation */
+ AttrNumber *attrNum,
+ bool useColumnStore,
+ bool returnTid,
+ bool returnCrid,
+ bool useCompression);
+
+#define VCI_MAX_NUM_ROW_TO_FETCH (65536 - VCI_COMPACTION_UNIT_ROW) /* documented upper bound for numRowsReadAtOnce */
+
+/**
+ * @brief The entry point to the function creating fetch context.
+ *
+ * The actual number of rows read at once is quantized
+ * by VCI_COMPACTION_UNIT_ROW by the formula,
+ * actualNumRowsReadAtOnce
+ * = TYPEALIGN(VCI_COMPACTION_UNIT_ROW, numRowsReadAtOnce).
+ * Since numRowsReadAtOnce is an unsigned 16-bit integer, it must be smaller
+ * than or equal to VCI_MAX_NUM_ROW_TO_FETCH; otherwise NULL is returned.
+ *
+ * @param[in] queryContext The query context.
+ * @param[in] numRowsReadAtOnce The number of rows which are read at once and
+ * stored in the virtual tuples.
+ * @param[in] numReadColumns The number of columns to be read.
+ * @param[in] attrNum The pointer to the array which has the attribute numbers
+ * of the original heap relation, not VCI main relation.
+ * @param[in] useColumnStore True for column-wise store. False for row-wise.
+ * @param[in] returnTid True to get TID in virtual tuples.
+ * @param[in] returnCrid True to get CRID in virtual tuples.
+ * @return The pointer to the created fetch context.
+ * NULL if some parameters are invalid, so that no fetch context is created.
+ */
+static inline vci_CSFetchContext
+vci_CSCreateFetchContext(vci_CSQueryContext queryContext,
+ uint16 numRowsReadAtOnce,
+ int16 numReadColumns,
+ /* attribute number in original relation */
+ AttrNumber *attrNum,
+ bool useColumnStore,
+ bool returnTid,
+ bool returnCrid)
+{
+ return vci_CSCreateFetchContextBase(queryContext,
+ numRowsReadAtOnce,
+ numReadColumns,
+ attrNum,
+ useColumnStore,
+ returnTid,
+ returnCrid,
+ false); /* useCompression */
+}
+
+extern PGDLLEXPORT void vci_CSDestroyFetchContext(vci_CSFetchContext fetchContext);
+extern PGDLLEXPORT vci_CSFetchContext vci_CSLocalizeFetchContext(
+ vci_CSFetchContext fetchContext,
+ MemoryContext memoryContext);
+extern PGDLLEXPORT vci_extent_status_t *vci_CSCreateCheckExtent(
+ vci_CSFetchContext localContext);
+extern PGDLLEXPORT void vci_CSDestroyCheckExtent(vci_extent_status_t *status);
+extern PGDLLEXPORT void vci_CSCheckExtent(vci_extent_status_t *status,
+ vci_CSFetchContext fetchContext,
+ int32 extentId,
+ bool readMinMax);
+
+extern PGDLLEXPORT vci_virtual_tuples_t *vci_CSCreateVirtualTuplesWithNumRows(vci_CSFetchContext fetchContext, uint32 numRows);
+
+/**
+ * @brief Create virtual tuples sized by localContext->num_rows_read_at_once.
+ *
+ * @param[in] localContext The localized fetch context.
+ * @return The created virtual tuples.
+ */
+static inline vci_virtual_tuples_t *
+vci_CSCreateVirtualTuples(vci_CSFetchContext localContext)
+{
+ return vci_CSCreateVirtualTuplesWithNumRows(localContext,
+ localContext->num_rows_read_at_once);
+}
+
+extern PGDLLEXPORT void vci_CSDestroyVirtualTuples(vci_virtual_tuples_t *vTuples);
+
+/**
+ * @brief Get the address of the area where the Datum of the specified column
+ * is stored.
+ *
+ * At present, the upstream requester requires the start address to be fixed.
+ * For better performance, it would be better if the start address were
+ * modifiable, to fetch many rows at once, or to use local ROS directly.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId Target column ID.
+ * @return The pointer to the Datum array (column_info[columnId].values).
+ */
+static inline Datum *
+vci_CSGetValueAddrFromVirtualTuplesColumnwise(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+ return vTuples->column_info[columnId].values;
+}
+
+/**
+ * @brief Get the address of the area where the isnull flags of the specified
+ * column are stored.
+ *
+ * At present, the upstream requester requires the start address to be fixed.
+ * For better performance, it would be better if the start address were
+ * modifiable, to fetch many rows at once, or to use local ROS directly.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId Target column ID.
+ * @return The pointer to the bool array (column_info[columnId].isnull).
+ */
+static inline bool *
+vci_CSGetIsNullAddrFromVirtualTuplesColumnwise(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+ return vTuples->column_info[columnId].isnull;
+}
+
+/**
+ * @brief Get the address of the area where the skip information
+ * is stored.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @return The pointer to the skip information array (vTuples->skip).
+ */
+static inline uint16 *
+vci_CSGetSkipAddrFromVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+ return vTuples->skip;
+}
+
+/**
+ * @brief Get the vector of skip information.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @return The pointer to the skip entry of the first tuple of the vector.
+ *
+ * @note The instruction is the same as
+ * vci_CSGetValuesOfVirtualTupleColumnar().
+ */
+static inline uint16 *
+vci_CSGetSkipFromVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+ /* The offset is unsigned (uint32), so only the upper bound is checked. */
+ Assert(vTuples->offset_of_first_tuple_of_vector < vTuples->num_rows);
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+ return &(vTuples->skip[vTuples->offset_of_first_tuple_of_vector]);
+}
+
+/**
+ * @brief Get the vector of TID.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @return The pointer to the array of TID information in int64* form.
+ *
+ * @note This function is available when the fetch context is created
+ * with the option returnTid set to true.
+ * This function is available independently of the useColumnStore option.
+ */
+/* Callers cast the result as needed, e.g. to ItemPointer. */
+static inline int64 *
+vci_CSGetTidFromVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+ /* The offset is unsigned (uint32), so only the upper bound is checked. */
+ Assert(vTuples->offset_of_first_tuple_of_vector < vTuples->num_rows);
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+ return &(vTuples->tid[vTuples->offset_of_first_tuple_of_vector]);
+}
+
+/**
+ * @brief Get the TID of the specified tuple.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] offsetInVector offset in the vector.
+ * @return TID; by value (ItemPointerData) on s390x, else as ItemPointer.
+ *
+ * @note The instruction is the same as vci_CSGetTidFromVirtualTuples().
+ */
+#ifdef __s390x__
+static inline ItemPointerData
+vci_CSGetTidInItemPointerFromVirtualTuples(vci_virtual_tuples_t *vTuples,
+ int offsetInVector)
+{
+ ItemPointerData ipd;
+ int64 result = (vci_CSGetTidFromVirtualTuples(vTuples))[offsetInVector];
+#ifdef WORDS_BIGENDIAN
+ result = result << 16; /* big-endian: move the low 48 bits to the leading bytes read below */
+#else /* !WORDS_BIGENDIAN: the value is used as-is */
+#endif /* WORDS_BIGENDIAN */
+ ipd = *((ItemPointer) &result); /* reinterpret the leading bytes of the int64 copy */
+ return ipd;
+}
+#else /* !__s390x__ */
+static inline ItemPointer
+vci_CSGetTidInItemPointerFromVirtualTuples(vci_virtual_tuples_t *vTuples,
+ int offsetInVector)
+{
+ return (ItemPointer) &(vci_CSGetTidFromVirtualTuples(vTuples)
+ [offsetInVector]); /* points into the TID array itself, no copy */
+}
+#endif /* __s390x__ */
+
+extern PGDLLEXPORT int vci_CSFetchVirtualTuples(vci_virtual_tuples_t *vTuples,
+ int64 cridStart,
+ uint32 numReadRows);
+
+/**
+ * @brief Get the tuple specified.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] offsetInVector offset in the vector.
+ * @return The pointer to the array of Datum.
+ *
+ * @note This function can be used when the fetch context is created in
+ * row-wise mode, i.e. useColumnStore = false.
+ * The column fetcher reads rows in units of VCI_COMPACTION_UNIT_ROW.
+ * Therefore, the start address of the buffer does not always hold
+ * the specified data.
+ * The specified data is actually located at the offset
+ * vTuples->offset_of_first_tuple_of_vector.
+ * To have the data at the start address, always read rows in multiples
+ * of VCI_COMPACTION_UNIT_ROW at once.
+ * For example, when VCI_COMPACTION_UNIT_ROW = 128, then
+ * read 128 rows at once from the row ID in the extent, 0, 128, 256, 384, ....
+ * Or, read 256 rows at once from the row ID in the extent, 0, 256, 512, ...
+ */
+static inline Datum *
+vci_CSGetValuesOfVirtualTuple(vci_virtual_tuples_t *vTuples,
+ uint32 offsetInVector)
+{
+ offsetInVector += vTuples->offset_of_first_tuple_of_vector;
+
+#ifdef CHECK_VTUPLE_GET_RANGE
+ Assert(!vTuples->use_column_store);
+ Assert(offsetInVector < vTuples->num_rows); /* uint32: no lower bound to check */
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+ return &(vTuples->values[vTuples->num_columns * offsetInVector]);
+}
+
+/**
+ * @brief Get the isnull flags of the specified tuple.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] offsetInVector offset in the vector (signed in this accessor).
+ * @return The pointer to the array of bool for that tuple.
+ *
+ * @note See the instruction of vci_CSGetValuesOfVirtualTuple().
+ */
+static inline bool *
+vci_CSGetIsNullOfVirtualTuple(vci_virtual_tuples_t *vTuples,
+ int32 offsetInVector)
+{
+ offsetInVector = offsetInVector + vTuples->offset_of_first_tuple_of_vector;
+
+#ifdef CHECK_VTUPLE_GET_RANGE
+ Assert(!vTuples->use_column_store);
+ Assert((0 <= offsetInVector) && ((uint32) offsetInVector < vTuples->num_rows));
+#endif /* CHECK_VTUPLE_GET_RANGE */
+
+ return vTuples->isnull + vTuples->num_columns * offsetInVector;
+}
+
+/**
+ * @brief Get the Datum vector of the specified column.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId The column ID.
+ * @return The pointer to the array of Datum.
+ *
+ * @note This function can be used when the fetch context is created in
+ * column-wise mode, i.e. useColumnStore = true.
+ * The other instructions are the same as vci_CSGetValuesOfVirtualTuple().
+ */
+static inline Datum *
+vci_CSGetValuesOfVirtualTupleColumnar(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+ Assert(vTuples->use_column_store);
+ Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns));
+#endif /* CHECK_VTUPLE_GET_RANGE */
+
+ return vTuples->column_info[columnId].values +
+ vTuples->offset_of_first_tuple_of_vector;
+}
+
+/**
+ * @brief Get the isnull vector of the specified column.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId The column ID.
+ * @return The pointer to the array of bool.
+ *
+ * @note The instruction is the same as
+ * vci_CSGetValuesOfVirtualTupleColumnar().
+ */
+static inline bool *
+vci_CSGetIsNullOfVirtualTupleColumnar(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+ Assert(vTuples->use_column_store);
+ Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns));
+#endif /* CHECK_VTUPLE_GET_RANGE */
+
+ return vTuples->column_info[columnId].isnull + vTuples->offset_of_first_tuple_of_vector;
+}
+
+/**
+ * @brief Obtains the column ID in the VCI main relation from the serial number
+ * in the set of read columns listed in vci_CSFetchContext.
+ *
+ * @param[in] fetchContext The fetch context.
+ * @param[in] serialNumber The serial number in the set of read columns.
+ * @return The column ID: query_context->column_id[column_link[serialNumber]].
+ */
+static inline int16
+vci_GetColumnIdFromFetchContext(vci_CSFetchContext fetchContext,
+ int16 serialNumber)
+{
+ int cId;
+
+ Assert((0 <= serialNumber) && (serialNumber < fetchContext->num_columns));
+ cId = fetchContext->column_link[serialNumber]; /* index into query_context->column_id[] */
+ Assert((0 <= cId) && (cId < fetchContext->query_context->num_columns));
+
+ return fetchContext->query_context->column_id[cId];
+}
+
+extern void vci_FillCridInVirtualTuples(vci_virtual_tuples_t *vTuples);
+extern void
+ vci_FillFixedWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+ int16 columnId,
+ RosChunkStorage *rosChunkStorage);
+extern void
+ vci_FillVariableWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+ int16 columnId,
+ RosChunkStorage *rosChunkStorage);
+extern int16 *vci_GetNullableColumnIds(vci_virtual_tuples_t *vTuples);
+
+#endif /* VCI_FETCH_H */
diff --git a/contrib/vci/include/vci_freelist.h b/contrib/vci/include/vci_freelist.h
new file mode 100644
index 0000000..8cdfec7
--- /dev/null
+++ b/contrib/vci/include/vci_freelist.h
@@ -0,0 +1,75 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_freelist.h
+ * Definitions and declarations of Free space link list
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_freelist.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_FREELIST_H
+#define VCI_FREELIST_H
+
+#include "postgres.h"
+
+#include "vci.h"
+#include "vci_columns.h"
+#include "vci_ros.h"
+
+#define VCI_FREESPACE_ITEM_ID FirstOffsetNumber
+
+typedef struct vcis_free_space
+{
+ uint32 size; /* NOTE(review): unit is not stated here; presumably blocks -- confirm */
+
+ vcis_extent_type_t type; /* type tag; inspected by vci_hasFreeLinkNode() */
+
+ BlockNumber prev_pos; /* presumably the block of the previous free-list node -- confirm */
+
+ BlockNumber next_pos; /* presumably the block of the next free-list node -- confirm */
+} vcis_free_space_t;
+
+#define vci_hasFreeLinkNode(freespace) \
+ ((vcis_free_space == (freespace)->type) \
+ || (vcis_tidcrid_type_pagetag == (freespace)->type)) /* whole expansion parenthesized so it is safe under !, && etc.; evaluates freespace twice */
+
+extern PGDLLEXPORT vcis_free_space_t *vci_GetFreeSpace(vci_RelationPair *relPair, BlockNumber blk);
+
+extern int32 vci_MakeFreeSpace(vci_RelationPair *relPair,
+ BlockNumber startBlockNumber,
+ BlockNumber *newFSBlockNumber,
+ vcis_free_space_t *newFS,
+ bool coalesce);
+
+extern void vci_AppendFreeSpaceToLinkList(vci_RelationPair *relPair,
+ BlockNumber startBlockNumber,
+ BlockNumber prevFreeBlockNumber,
+ BlockNumber nextFreeBlockNumber,
+ BlockNumber size);
+
+extern BlockNumber vci_FindFreeSpaceForExtent(vci_RelationPair *relPair,
+ BlockNumber requiredSize);
+
+extern void vci_RemoveFreeSpaceFromLinkList(vci_RelationPair *relPair,
+ BlockNumber startBlockNumber,
+ BlockNumber numExtentPages);
+
+/* *************** */
+/* Recovery */
+/* *************** */
+
+extern void vci_InitRecoveryRecordForFreeSpace(vci_MainRelHeaderInfo *info);
+
+extern void vci_WriteRecoveryRecordForFreeSpace(vci_RelationPair *relPair,
+ int16 colId,
+ int16 dictId,
+ BlockNumber StartBlockNumber,
+ vcis_free_space_t *FS);
+
+extern void vci_RecoveryFreeSpace(vci_MainRelHeaderInfo *info, vci_ros_command_t command);
+
+#endif /* VCI_FREELIST_H */
diff --git a/contrib/vci/include/vci_mem.h b/contrib/vci/include/vci_mem.h
new file mode 100644
index 0000000..3f455d5
--- /dev/null
+++ b/contrib/vci/include/vci_mem.h
@@ -0,0 +1,177 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_mem.h
+ * Definitions of on-memory structures
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_mem.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VCI_MEM_H
+#define VCI_MEM_H
+
+#include "postgres.h"
+
+#include <float.h>
+
+#include "lib/ilist.h"
+#include "portability/instr_time.h"
+#include "storage/lwlock.h"
+#include "utils/palloc.h"
+
+#include "vci.h"
+#include "vci_ros.h"
+#include "vci_memory_entry.h"
+
+/*-------------------------------------------------------------------------
+ * START: Copied from include/vci_port.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_PORT_H
+#define VCI_PORT_H
+
+/*
+ * key for vci_devload_t
+ */
+#define VCI_PSEUDO_UNMONITORED_DEVICE ""
+
+#ifndef WIN32
+#define VCI_PATH_MAX PATH_MAX
+#else
+#define VCI_PATH_MAX MAX_PATH
+#endif
+
+/*
+ * Memory entries on each device
+ *
+ * - head is the actual list; link is used to track unused entries
+ */
+typedef struct
+{
+ dlist_head head;
+ dlist_node link;
+} vci_memory_entry_list_t;
+
+/*
+ * IO statistics, mount information, etc. for each device
+ */
+typedef struct
+{
+ char devname[VCI_PATH_MAX];
+
+ vci_memory_entry_list_t *memory_entry_queue;
+
+ /*
+ * Next position at which the memory entries will be traced. NULL means
+ * there are no entries to be seen.
+ */
+ dlist_node *memory_entry_pos;
+} vci_devload_t;
+
+#endif /* VCI_PORT_H */
+
+/*-------------------------------------------------------------------------
+ * END: Copied from include/vci_port.h
+ *-------------------------------------------------------------------------
+ */
+
+typedef struct VciGucStruct
+{
+ bool have_loaded_postgresql_conf;
+
+ bool enable;
+
+ bool log_query;
+
+ int cost_threshold;
+
+ int table_scan_policy;
+
+ /* GUC parameters read from postgresql.conf */
+ int maintenance_work_mem;
+ int max_devices; /* max device num for storage */
+
+ /* ROS control worker/daemon */
+ int control_max_workers;
+ int control_naptime;
+
+ /* command thresholds */
+ int wosros_conv_threshold;
+ int cdr_threshold;
+
+ /* for custom plan execution */
+ int max_local_ros_size;
+
+ /* for parallel processing */
+ int table_rows_threshold;
+
+ bool enable_seqscan;
+ bool enable_indexscan;
+ bool enable_bitmapheapscan;
+ bool enable_sort;
+ bool enable_hashagg;
+ bool enable_sortagg;
+ bool enable_plainagg;
+ bool enable_hashjoin;
+ bool enable_nestloop;
+
+ /* GUC parameters for internal use */
+ bool enable_ros_control_daemon;
+
+} VciGucStruct;
+
+extern PGDLLEXPORT VciGucStruct VciGuc;
+
+/*
+ * Data structure on shared memory
+ *
+ * The instance is allocated in shared memory and can be accessed via
+ * VciShmemAddr.
+ */
+typedef struct VciShmemStruct
+{
+ /* --- ROS Control Daemon --- */
+
+ /* Arguments to be passed to workers */
+
+ vci_wosros_conv_worker_arg_t *worker_args_array;
+
+ /** vci_memory_entries_t is defined in vci_memory_entry.h.
+ * It keeps information of the VCI indices kept in memory.
+ * Its lifetime is the same as that of the PostgreSQL instance.
+ */
+ vci_memory_entries_t *memory_entries;
+
+ dlist_head memory_entry_device_unknown_list;
+
+ /* Standby server controller */
+ LWLock *standby_exec_loc;
+ int num_standby_exec_queries;
+
+ /* IO statistics */
+
+ vci_devload_t *devload_array;
+
+ vci_memory_entry_list_t *memory_entry_queue_array;
+
+ dlist_head free_memory_entry_queue_list; /* free list of memory_entry_queue_array */
+ int num_devload_info; /* number of monitored devices + 1 (for
+ * unmonitored devices) */
+ int max_devices; /* max device num for storage */
+ int translated_dev_pos; /* index of the device whose VCIs are to
+ * be translated */
+ LWLock *io_load_lock;
+
+ /* Additional LWLocks used by various modules */
+ LWLock *vci_memory_entries_lock;
+ LWLock *vci_query_context_lock;
+ LWLock *vci_mnt_point2dev_lock;
+} VciShmemStruct;
+
+extern PGDLLEXPORT VciShmemStruct *VciShmemAddr;
+
+#endif /* VCI_MEM_H */
diff --git a/contrib/vci/include/vci_memory_entry.h b/contrib/vci/include/vci_memory_entry.h
new file mode 100644
index 0000000..7aba17e
--- /dev/null
+++ b/contrib/vci/include/vci_memory_entry.h
@@ -0,0 +1,118 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_memory_entry.h
+ * Definitions and declarations of on-memory structures per VCI index
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_memory_entry.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_MEMORY_ENTRY_H
+#define VCI_MEMORY_ENTRY_H
+
+#include "lib/ilist.h"
+#include "storage/lwlock.h"
+
+#include "vci_ros.h"
+
+/**
+ * The key used when searching a set for a vci_memory_entry_t value.
+ */
+typedef struct
+{
+ Oid oid; /* Oid of VCI main relation */
+ Oid dbid; /* Oid of the database to which the VCI main
+ * relation belongs */
+} vci_id_t;
+
+/**
+ * VCI index placeholder to determine the target of ROS command by ROS daemon
+ */
+typedef struct
+{
+ vci_id_t id; /* identifier of vci_memory_entry_t */
+ Oid tsid; /* Oid of the tablespace to which a VCI main
+ * relation belongs */
+
+ /**
+ * If tsid is equal to InvalidOid, this is the Oid of the default
+ * tablespace. Otherwise, this is equal to tsid.
+ */
+ Oid real_tsid;
+
+ /**
+ * Timestamp used for the least-recently-updated method.
+ * We do nothing for the wraparound effect, aka "wraparound failures" in
+ * the PostgreSQL manual.
+ */
+ int32 time_stamp;
+
+ /**
+ * Flag to force the ROS control daemon to do WOS->ROS conversion
+ * at the next WOS->ROS conversion stage regardless of the WOS size.
+ *
+ * This flag is set to true when a local WOS->ROS conversion fails
+ * on account of an out-of-memory error. This flag is set to false when
+ * WOS->ROS conversion is done.
+ */
+ bool force_next_wosros_conv;
+
+ dlist_node link; /* links VCI indexes on the same device */
+
+} vci_memory_entry_t;
+
+/**
+ * @brief Contains the array of vci_memory_entry_t (data[]),
+ * and a pointer to a lock.
+ *
+ * The lock must be used when the array is exclusively accessed, say
+ * add / remove entries to / from the array, or so.
+ *
+ * The instance of vci_memory_entries_t and the array of entries must
+ * be allocated in shared memory living throughout the PostgreSQL instance.
+ */
+typedef struct
+{
+ /**
+ * Lock to update member variables of vci_memory_entries_t.
+ */
+ LWLock *lock;
+
+ /**
+ * Number of vci_memory_entry_t elements allocated in data[].
+ */
+ uint32 capacity_hash_entries;
+
+ /**
+ * Current time stamp value, used for the least-recently-updated method.
+ * Instances of vci_memory_entry_t have the timestamp of last access,
+ * for which we do not care about the wraparound effect, aka "wraparound
+ * failures" in the PostgreSQL manual.
+ */
+ int32 time_stamp;
+
+ /**
+ * The array of vci_memory_entry_t (declared with one element).
+ */
+ vci_memory_entry_t data[1]; /* VARIABLE LENGTH ARRAY */
+
+} vci_memory_entries_t;
+
+extern Size vci_GetSizeOfMemoryEntries(void);
+extern void vci_InitMemoryEntries(void);
+
+extern void vci_TouchMemoryEntry(vci_id_t *vciid, Oid tsid);
+extern bool vci_GetWosRosConvertingVCI(vci_wosros_conv_worker_arg_t *vci_info);
+extern void vci_freeMemoryEntry(vci_id_t *vciid);
+
+extern void vci_update_memoryentry_in_devloadinfo(void);
+extern void vci_MoveTranslatedVCI2Tail(void);
+extern void vci_ResetDevloadCurrentPos(void);
+extern void vci_RemoveMemoryEntryOnDroppedDatabase(void);
+extern void vci_SetForceNextWosRosConvFlag(vci_id_t *vciid, bool value);
+
+#endif /* VCI_MEMORY_ENTRY_H */
diff --git a/contrib/vci/include/vci_ros.h b/contrib/vci/include/vci_ros.h
new file mode 100644
index 0000000..f40e840
--- /dev/null
+++ b/contrib/vci/include/vci_ros.h
@@ -0,0 +1,1085 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros.h
+ * Definitions and declarations of VCI main relation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_ros.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/****************************************************************************
+ * ** CAUTION: THE STRUCTURES DEFINED IN THIS HEADER FILE WITH THE PREFIX **
+ * ** OF "vcis_" AND vci_MainRelVar, vcis_Crid DEFINE THE FORMAT OF THE ROS **
+ * ** DATA. ANY MODIFICATION ON THEM MAY CAUSE FORMAT INCOMPATIBILITY. **
+ * ** PLEASE BE SURE TO CHANGE THE VALUE OF EITHER MACRO **
+ * ** VCI_ROS_VERSION_MAJOR OR VCI_ROS_VERSION_MINOR, TO DETECT FORMAT **
+ * ** INCOMPATIBILITY. **
+ * **************************************************************************
+ */
+
+#ifndef VCI_ROS_H
+#define VCI_ROS_H
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "c.h"
+#include "catalog/pg_attribute.h"
+#include "catalog/pg_class.h"
+#include "nodes/execnodes.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/itemptr.h"
+#include "storage/lock.h"
+#include "storage/off.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+#include "vci.h"
+
+#include "vci_utils.h"
+
+#if (!defined(WIN32))
+#define UINT uint
+#endif
+
+#define VCI_ROS_VERSION_MAJOR ((uint32) 0x00000000)
+#define VCI_ROS_VERSION_MINOR ((uint32) 0x0000000D)
+
+/**
+ * @brief IDs of ROS commands.
+ */
+typedef enum vci_ros_command
+{
+ vci_rc_invalid = -11, /* Invalid case. */
+
+ /** For vacuum with vci_mrlm_read_write_exclusive. */
+ vci_rc_vacuum = -10,
+
+ /** For normal query with vci_mrlm_read_share. */
+ vci_rc_query = -9,
+
+ /** For DROP command with vci_mrlm_read_write_exclusive. */
+ vci_rc_drop_index = -8,
+
+ /** For DELETE or UPDATE commands with vci_mrlm_read_share. */
+ vci_rc_wos_delete = -7,
+
+ /** For INSERT or UPDATE commands with vci_mrlm_read_share. */
+ vci_rc_wos_insert = -6,
+
+ /** For recovering ROS with vci_mrlm_read_share, assumed that this command
+ * is used in vci_mrlm_write_exclusive lock of ROS commands. */
+ vci_rc_recovery = -5,
+
+ /** For collecting VCI information with vci_mrlm_read_share.
+ * This is also used by vci_KeepMainRelHeader() and
+ * vci_KeepMainRelHeaderWOVersionCheck() automatically.
+ * */
+ vci_rc_probe = -4,
+
+ /** For building ROS in initial index building with
+ * vci_mrlm_read_write_exclusive. */
+ vci_rc_wos_ros_conv_build = -3,
+
+ /** For building local ROS with vci_mrlm_read_write_exclusive, to serialize
+ * ROS commands.
+ */
+ vci_rc_generate_local_ros = -2,
+
+ /** For COPY command with vci_mrlm_write_share. */
+ vci_rc_copy_command = -1,
+
+ /** For WOS -> ROS conversion with vci_mrlm_write_exclusive */
+ vci_rc_wos_ros_conv = 0,
+
+ /** For updating delete vector with vci_mrlm_write_exclusive */
+ vci_rc_update_del_vec,
+
+ /** For collecting deleted rows with vci_mrlm_write_exclusive */
+ vci_rc_collect_deleted,
+
+ /** For collecting deleted extents, unable to access anymore,
+ * with vci_mrlm_write_exclusive
+ */
+ vci_rc_collect_extent,
+
+ /** For updating TID -> CRID relations with vci_mrlm_write_exclusive */
+ vci_rc_update_tid_crid,
+
+ /** For compaction with vci_mrlm_write_exclusive */
+ /* vci_rc_compaction, */
+
+ num_vci_rc, /* anchor */
+} vci_ros_command_t;
+
+/**
+ * @brief Obtain the size of a varlena header.
+ * @param[in] ptr Pointer to the varlena.
+ * @return VARHDRSZ_EXTERNAL, VARHDRSZ_SHORT or VARHDRSZ, by header kind.
+ */
+static inline int32
+vci_VARHDSZ_ANY(void *ptr)
+{
+	if (VARATT_IS_1B_E(ptr))
+		return VARHDRSZ_EXTERNAL;
+	return VARATT_IS_1B(ptr) ? VARHDRSZ_SHORT : VARHDRSZ;
+}
+
+/** taken from src/backend/utils/adt/tid.c */
+#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X))
+/** taken from src/backend/utils/adt/tid.c */
+#define ItemPointerGetDatum(X) PointerGetDatum(X)
+
+typedef uint32 vci_offset_in_extent_t; /* offset to data */
+
+/** bit width of maximum number of row ID in an extent */
+#define VCI_CRID_ROW_ID_BIT_WIDTH (18)
+
+/** Calculate CRID in int64 format; the shift is done unsigned as extentId may be negative */
+static inline int64
+vci_CalcCrid64(int32 extentId, uint32 rowIdInExtent)
+{
+	return (int64) (((uint64) (int64) extentId << VCI_CRID_ROW_ID_BIT_WIDTH) |
+					(rowIdInExtent & ((UINT64CONST(1) << VCI_CRID_ROW_ID_BIT_WIDTH) - 1)));
+}
+
+/** Calculate extentID from CRID in int64 format */
+static inline int32
+vci_CalcExtentIdFromCrid64(int64 crid64)
+{
+ return (int32) (crid64 >> VCI_CRID_ROW_ID_BIT_WIDTH);
+}
+
+/** Calculate rowID in extent from CRID in int64 format */
+static inline uint32
+vci_CalcRowIdInExtentFromCrid64(int64 crid64)
+{
+ return (uint32) (crid64 & ((UINT64CONST(1) << VCI_CRID_ROW_ID_BIT_WIDTH) - 1));
+}
+
+/** Maximum number of rows in an extent. (256 * 1024) for 18 bits */
+#define VCI_NUM_ROWS_IN_EXTENT (1 << VCI_CRID_ROW_ID_BIT_WIDTH)
+
+#define VCI_MAX_NUMBER_UNCONVERTED_ROS (128)
+
+#define VCI_INVALID_CRID_IN_48_BIT (UINT64CONST(0xFFFF800000000000))
+#define VCI_INVALID_CRID VCI_INVALID_CRID_IN_48_BIT
+
+#define VCI_MOVED_CRID_IN_48_BIT (UINT64CONST(0xFFFFC00000000000))
+#define VCI_MOVED_CRID VCI_MOVED_CRID_IN_48_BIT
+
+/** Value indicating invalid extent. The value is 0xE0000000 */
+#define VCI_INVALID_EXTENT_ID \
+ ((int32) (VCI_INVALID_CRID_IN_48_BIT >> VCI_CRID_ROW_ID_BIT_WIDTH))
+
+/** ID of the first extent stored in the storage. */
+#define VCI_FIRST_NORMAL_EXTENT_ID (0)
+
+/** Value indicating invalid dictionary. The value is -1 */
+#define VCI_INVALID_DICTIONARY_ID (-1)
+
+/** The number of rows converted at once by WOS->ROS converter.
+ * Offset is assigned every VCI_COMPACTION_UNIT_ROW rows.
+ */
+#define VCI_COMPACTION_UNIT_ROW (128)
+
+/** The ratio to keep usage of work area in safe level */
+#define VCI_WOS_ROS_WORKAREA_SAFE_RATIO (0.5)
+
+/** Base alignment in storage.
+ * In the storage, normally VCI uses four-byte integers.
+ * Thus, we align the data in the storage by four bytes.
+ */
+#define VCI_DATA_ALIGNMENT_IN_STORAGE (4)
+
+/** Aligned values, rounded up */
+#define vci_RoundUpValue(value, unit) \
+ ((((value) + (unit) - 1) / (unit)) * (unit))
+/** Aligned values, rounded down */
+#define vci_RoundDownValue(value, unit) \
+ (((value) / (unit)) * (unit))
+
+/** Byte size of data per item, rounded down to VCI_DATA_ALIGNMENT_IN_STORAGE.
+ * @param[in] numItem Number of items in a page; non-zero, evaluated twice.
+ * @return The size of data in an item in bytes.
+ */
+#define VCI_ITEM_SPACE(numItem) \
+	((((BLCKSZ - offsetof(PageHeaderData, pd_linp) \
+		- ((numItem) * (sizeof(HeapTupleHeaderData) + sizeof(ItemIdData)))) \
+	   / (numItem)) / VCI_DATA_ALIGNMENT_IN_STORAGE) \
+	 * VCI_DATA_ALIGNMENT_IN_STORAGE)
+
+/** Get byte size of an item include item header,
+ * when a page contains multiple items.
+ * @param[in] numItem Number of items in a page.
+ * @return The size of an item in byte.
+ */
+#define VCI_ITEM_SIZE(numItem) \
+ (VCI_ITEM_SPACE(numItem) + sizeof(HeapTupleHeaderData))
+
+/** Minimum header space in DB page with one item, normally 52 bytes */
+#define VCI_MIN_PAGE_HEADER \
+ (SizeOfPageHeaderData + sizeof(HeapTupleHeaderData) \
+ + sizeof(ItemIdData))
+
+/** Available area in DB page with one item, normally 8140 bytes */
+#define VCI_MAX_PAGE_SPACE (BLCKSZ - VCI_MIN_PAGE_HEADER)
+
+/**
+ * @brief Split a byte position into a block number and an offset within
+ * that block.
+ *
+ * Both position and offsetInPage are measured in the data area of DB pages,
+ * VCI_MAX_PAGE_SPACE bytes per page; page headers are not counted.
+ *
+ * @param[out] blockNumber Block number for the given position.
+ * @param[out] offsetInPage Offset in page in bytes, ignoring page header,
+ * for the given position.
+ * @param[in] position Byte offset in area formed by multiple DB pages.
+ */
+static inline void
+vci_GetBlockNumberAndOffsetInPage(BlockNumber *blockNumber,
+								  uint32 *offsetInPage,
+								  uint32 position)
+{
+	*blockNumber = position / VCI_MAX_PAGE_SPACE;
+	*offsetInPage = position % VCI_MAX_PAGE_SPACE;
+}
+
+/**
+ * @brief Number of pages needed to hold the given amount of data.
+ *
+ * @param[in] size The data size.
+ * @return size divided by VCI_MAX_PAGE_SPACE, rounded up.
+ */
+static inline uint32
+vci_GetNumBlocks(Size size)
+{
+	/* MaxBlockNumber is passed through unchanged rather than divided. */
+	if (size != MaxBlockNumber)
+		return (size + VCI_MAX_PAGE_SPACE - 1) / VCI_MAX_PAGE_SPACE;
+	return MaxBlockNumber;
+}
+
+/** Maximum data size of maximum and minimum values in extents. */
+#define VCI_MAX_MIN_MAX_SIZE (16)
+
+/* Accessing VCI main relation header
+ * Because the header of VCI main relation spans several pages, we cannot
+ * simply map one C structure onto the header pages.
+ * Instead, we use access functions.
+ *
+ * To do so, first prepare a variable to keep page info and call the
+ * initialization function, with the relation already opened.
+ * vci_InitMainRelHeaderInfo(info, rel, command)
+ *
+ * Then use one of these two functions.
+ * vci_KeepReadingMainRelHeader()
+ * Read header pages for reading, pin and lock them.
+ * vci_KeepWritingMainRelHeader()
+ * Read header pages for writing, pin and lock them.
+ *
+ * We have to repair all VCI relation, if some of them are broken.
+ * Just call the next for the purpose.
+ * vci_RecoverOneVCIIfNecessary()
+ *
+ * Then, use the following two functions,
+ *
+ * vci_SetMainRelVar()
+ * To set the value to the field.
+ * vci_GetMainRelVar()
+ * To get the value of the field.
+ *
+ * Or, if you access column_info, use
+ * vci_GetMColumn()
+ * which gives the pointer to the vcis_m_column_t on the DB buffer directly.
+ *
+ * The fields are defined in enum vci_MainRelVar.
+ *
+ *
+ * To write the updated data, use the function
+ * vci_WriteMainRelVar()
+ *
+ * After accessing the header, release the DB pages with the following
+ * function.
+ *
+ * vci_ReleaseMainRelHeader()
+ * Release header pages, pins and locks.
+ */
+
+/**
+ * @brief Field names and addresses of VCI main relation.
+ *
+ * These enum values have the page ID in the upper 16 bits, and the offset of
+ * the field in the lower 16 bits.
+ * The offset is measured from the top of DB page, not after the page header.
+ *
+ * This is for struct vcis_main_t.
+ * Because the header of VCI main relation spans several pages, we cannot map
+ * one C structure onto the header pages.
+ *
+ * Minimum header in DB page is 52 bytes (0x34)
+ */
+typedef enum vci_MainRelVar
+{
+ /* page 0 */
+ vcimrv_data_wos_oid = 0x00000034,
+ vcimrv_whiteout_wos_oid = 0x00000038,
+ /* vcimrv_cdr_tid_crid_data_oid = 0x0000003C, //reserved */
+ vcimrv_tid_crid_meta_oid = 0x00000040,
+ vcimrv_tid_crid_data_oid = 0x00000044,
+ vcimrv_tid_crid_update_oid_0 = 0x00000048,
+ vcimrv_tid_crid_update_oid_1 = 0x0000004C,
+ /* vcimrv_tid_crid_write_oid = 0x00000050, //reserved */
+ vcimrv_delete_meta_oid = 0x00000054,
+ vcimrv_delete_data_oid = 0x00000058,
+ vcimrv_null_meta_oid = 0x0000005C,
+ vcimrv_null_data_oid = 0x00000060,
+ vcimrv_tid_meta_oid = 0x00000064,
+ vcimrv_tid_data_oid = 0x00000068,
+ vcimrv_ros_version_major = 0x0000006C, /** MUST BE 0x0000006C */
+ vcimrv_ros_version_minor = 0x00000070, /** MUST BE 0x00000070 */
+ vcimrv_num_nullable_columns = 0x00000074,
+ vcimrv_null_width_in_byte = 0x00000078, /** byte size of null bit vector for one row. */
+ vcimrv_column_info_offset = 0x0000007C,
+ vcimrv_num_columns = 0x00000080,
+ vcimrv_extent_info_offset = 0x00000084,
+ /* page 0 to 2 */
+ vcimrv_column_info = 0x00000088,
+ /* page 3 */
+ vcimrv_size_mr = 0x00030034, /** @todo Maybe not needed */
+ vcimrv_size_mr_old = 0x00030038, /** @todo Maybe not needed */
+ vcimrv_current_ros_version = 0x0003003C,
+ vcimrv_last_ros_version = 0x00030040,
+ vcimrv_tid_crid_diff_sel = 0x00030044,
+ vcimrv_tid_crid_diff_sel_old = 0x00030048,
+ vcimrv_xid_generation = 0x0003004C,
+ vcimrv_xid_gen_update_xid = 0x00030050,
+ /* vcimrv_xgen_tid_crid_write = 0x00030054, //reserved */
+ /* vcimrv_num_tid_crid_update_oid_0 = 0x00030058, //reserved */
+ /* vcimrv_num_tid_crid_update_oid_1 = 0x0003005C, //reserved */
+ vcimrv_ros_command = 0x00030060,
+ /* vcimrv_ros_conv_extent_id = 0x00030064, //reserved */
+ /* vcimrv_ros_conv_common_dict_id = 0x00030068, //reserved */
+ vcimrv_old_extent_id = 0x0003006C,
+ vcimrv_new_extent_id = 0x00030070,
+ vcimrv_working_column_id = 0x00030074,
+ vcimrv_working_dictionary_id = 0x00030078,
+ vcimrv_tid_crid_operation = 0x0003007C,
+ vcimrv_tid_crid_target_blocknumber = 0x00030080,
+ vcimrv_tid_crid_target_info = 0x00030084,
+ vcimrv_tid_crid_free_blocknumber = 0x00030088,
+ /* vcimrv_compaction_colmn_id = 0x0003007C, //reserved */
+ /* vcimrv_compaction_extent_id = 0x00030080, //reserved */
+ /* vcimrv_compaction_old_block_number = 0x00030084, //reserved */
+ /* vcimrv_compaction_new_block_number = 0x00030088, //reserved */
+ vcimrv_num_unterminated_copy_cmd = 0x0003008C,
+ vcimrv_tid_crid_tag_bitmap = 0x00030090,
+ /* vcimrv_num_request_cdr = 0x00030090, //reserved */
+ /* vcimrv_num_appendable_extents = 0x00030094, //reserved */
+ /* vcimrv_num_compaction = 0x00030098, //reserved */
+ /* vcimrv_extent_id_to_write = 0x0003009C, //reserved */
+ vcimrv_num_extents = 0x000300A0,
+ vcimrv_num_extents_old = 0x000300A4,
+ vcimrv_extent_info = 0x000300A8,
+
+ /* error code */
+ vcimrv_invalid = 0xFFFFFFFF,
+} vci_MainRelVar;
+
+/** mask data to get offset for fields in VCI main relation header in DB page */
+#define VCI_MRV_MASK_OFFSET (0xFFFF)
+/** bit to shift to get DB page ID for fields in VCI main relation header */
+#define VCI_MRV_PAGE_SHIFT (16)
+
+/**
+ * @brief Get block number for given field of main relation header.
+ *
+ * @param[in] value value defined in vci_MainRelVar.
+ * @return Block number containing given field.
+ */
+#define vci_MRVGetBlockNumber(value) ((value) >> VCI_MRV_PAGE_SHIFT)
+
+/**
+ * @brief Get offset in DB page for given field of main relation header.
+ *
+ * @param[in] value value defined in vci_MainRelVar.
+ * @return Offset for containing given field from page top including header.
+ */
+#define vci_MRVGetOffset(value) ((value) & VCI_MRV_MASK_OFFSET)
+
+/** Number of header pages of VCI main relation */
+#define VCI_NUM_MAIN_REL_HEADER_PAGES (4)
+
+/** Struct to keep pointers to the header pages of VCI main relation */
+typedef struct vci_MainRelHeaderInfo
+{
+ Relation rel; /* Relation of VCI main relation */
+
+ /*
+ * VCI mainrelation header pages should be initialized with InvalidBuffer
+ */
+ Buffer buffer[VCI_NUM_MAIN_REL_HEADER_PAGES]; /* Buffers for the main
+ * relation header
+ * pages. */
+ vci_ros_command_t command; /* Command using this structure. */
+
+ /** number of extents that have the area to store their vcis_m_extent_t
+ * in main relation.
+ * This field is used in query execution, otherwise it has "-1".
+ */
+ int32 num_extents_allocated;
+ /** To create VCI on more than 32 columns, creating TupleDesc by copying table's
+ * one is required. However, it is too heavy to repeat. So cache the created
+ * one to cached_tupledesc in initctx context.
+ */
+ MemoryContext initctx;
+ TupleDesc cached_tupledesc;
+} vci_MainRelHeaderInfo;
+
+/** Minimum size of an extent
+ * The extents of fixed field length columns has the size.
+ * The extents of the other types have larger size.
+ * Use vci_GetExtentFixedLengthRawDataHeaderSize() or something to obtain
+ * the size actually.
+ */
+#define VCI_EXTENT_HEADER_SIZE (offsetof(vcis_extent_t, dict_body))
+
+/** This function returns the size of header of extent for fixed field length
+ * data. The size can be calculated from the format and the number of rows
+ * in an extent. Actually, it is independent of the number of rows, but that
+ * of variable length depends.
+ * @param[in] numRowsInExtent The number of rows in the extent.
+ * @return The size of extent header.
+ */
+#define vci_GetExtentFixedLengthRawDataHeaderSize(numRowsInExtent) \
+ VCI_EXTENT_HEADER_SIZE
+
+/** Function to calculate necessary number of offset data to the chunks
+ * of VCI_COMPACTION_UNIT_ROW in ROS.
+ * @param[in] numRowsInExtent Number of rows in the extent.
+ * @return Number of necessary offsets.
+ */
+#define vci_GetOffsetArrayLength(numRowsInExtent) \
+ (1 + (((numRowsInExtent) + VCI_COMPACTION_UNIT_ROW - 1) \
+ / VCI_COMPACTION_UNIT_ROW))
+
+/** Macro to calculate the byte size of the offset array for the chunks
+ * of VCI_COMPACTION_UNIT_ROW in ROS.
+ * @param[in] numRowsInExtent Number of rows in the extent.
+ * @return Necessary data size in bytes.
+ */
+#define vci_GetOffsetArraySize(numRowsInExtent) \
+	(vci_GetOffsetArrayLength(numRowsInExtent) \
+	 * sizeof(vci_offset_in_extent_t))
+
+/** This function returns the size of header of extent for variable field
+ * length data, and compressed data.
+ * The size can be calculated from the format and the number of rows
+ * in an extent. Actually, it is independent of the number of rows, but that
+ * of variable length depends.
+ * @param[in] numRowsInExtent The number of rows in the extent.
+ * @return The size of extent header.
+ */
+#define vci_GetExtentVariableLengthRawDataHeaderSize(numRowsInExtent) \
+ (VCI_EXTENT_HEADER_SIZE + vci_GetOffsetArraySize(numRowsInExtent))
+
+/** One entry of column_info in VCI main relation
+ */
+typedef struct vcis_m_column
+{
+ Oid meta_oid; /** OID of metadata relation */
+ Oid data_oid; /** OID of data relation */
+
+ /*
+ * int16 max_columns_size;
+ */
+ /** AttrNumber original_attribute_number; */
+ int16 max_columns_size;
+ int16 comp_type; /** vcis_compression_type_t */
+} vcis_m_column_t;
+
+/** One entry of extent_info in VCI main relation
+ */
+typedef struct vcis_m_extent
+{
+ /** number of rows recorded, including marked as deleted. */
+ uint32 num_rows;
+ uint32 num_deleted_rows; /* number of rows marked as deleted. */
+ uint32 num_deleted_rows_old; /* num_deleted_rows for recovery */
+ TransactionId xgen; /* like xmin */
+ TransactionId xdel; /* like xmax */
+
+ uint16 flags;
+ uint16 recovered_colid;
+} vcis_m_extent_t;
+
+#define VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID (0x0001)
+
+/**
+ * @brief VCI main relation header area to store by vci_WriteMainRelVar().
+ *
+ * vci_wmrv_all is used when the VCI relation is built, since the first two or
+ * three pages are defined at build time and not modified afterwards.
+ * The last page has the ROS command, current ROS version, and extent
+ * information, so it is updated after creation. vci_wmrv_update is used when
+ * the last page is updated.
+ */
+typedef enum vci_wmrv_t
+{
+ vci_wmrv_update, /** Only the last header page will be written to storage */
+ vci_wmrv_all, /** All the header pages will be written to storage */
+} vci_wmrv_t;
+
+/** I categorized ROS data like TID, NULL bit vector, normal column data
+ * as shown below.
+ */
+typedef enum vcis_attribute_type_t
+{
+ vcis_attribute_type_main = 0, /* data only */
+ vcis_attribute_type_data_wos, /* data only */
+ vcis_attribute_type_whiteout_wos, /* data only */
+ vcis_attribute_type_tid_crid, /* special type, meta and data */
+ vcis_attribute_type_tid_crid_update, /* data only */ /* two elements */
+ vcis_attribute_type_delete_vec, /* normal column type */
+ vcis_attribute_type_null_vec, /* normal column type */
+ vcis_attribute_type_tid, /* normal column type */
+ vcis_attribute_type_pgsql, /* normal column type */
+ /* number of indexed columns */
+ num_vcis_attribute_type,
+} vcis_attribute_type_t;
+
+/**
+ * @brief Gives how many columns or data belong to the given category.
+ *
+ * Some categories of vcis_attribute_type_t have multiple elements:
+ * vcis_attribute_type_pgsql has numColumns, vcis_attribute_type_tid_crid_update
+ * has two, every other valid category has one. Out-of-range values give zero.
+ *
+ * @param[in] attrType Attribute type defined in vcis_attribute_type_t.
+ * For normal columns, it takes vcis_attribute_type_pgsql.
+ * @param[in] numColumns The number of columns, which is returned when
+ * attrType is vcis_attribute_type_pgsql.
+ */
+static inline int
+vci_GetNumIndexForAttributeType(vcis_attribute_type_t attrType,
+								int16 numColumns)
+{
+	if (attrType == vcis_attribute_type_pgsql)
+		return numColumns;
+	if (attrType == vcis_attribute_type_tid_crid_update)
+		return 2;
+	return ((0 <= attrType) && (attrType < num_vcis_attribute_type)) ? 1 : 0;
+}
+
+extern PGDLLEXPORT int vci_GetSumOfAttributeIndices(int16 numColumns);
+extern PGDLLEXPORT void vci_GetAttrTypeAndIndexFromSumOfIndices(
+ vcis_attribute_type_t *attrType,
+ int *index,
+ int16 numColumns,
+ int sumOfIndex);
+
+typedef enum vcis_compression_type_t
+{
+ vcis_compression_type_invalid = -1,
+ vcis_compression_type_fixed_raw = 0,
+ vcis_compression_type_variable_raw,
+ vcis_compression_type_fixed_comp, /* reserved */
+ vcis_compression_type_auto, /* reserved */
+ num_vcis_compression_type,
+} vcis_compression_type_t;
+
+typedef enum vcis_extent_type_t
+{
+ /** initial value is zero, since newly created DB page is filled with zero.
+ */
+ vcis_undef_space = 0,
+
+ vcis_extent_type_data,
+ vcis_extent_type_dict,
+ vcis_free_space,
+
+ vcis_tidcrid_type_leaf,
+ vcis_tidcrid_type_trunk,
+ vcis_tidcrid_type_pagetag,
+
+ num_vcis_extent_type,
+} vcis_extent_type_t ,
+vcis_tidcrid_item_type_t;
+
+/** Type(s) of dictionary.
+ */
+typedef enum vcis_dict_type_t
+{
+ /** initial value is zero, since newly created DB page is filled with zero.
+ */
+ vcis_dict_type_none = 0,
+ vcis_dict_type_lzvf,
+ num_vcis_dict_type,
+} vcis_dict_type_t;
+
+/** Type(s) of operations in updating TID-CRID tree.
+ */
+typedef enum
+{
+ vcis_tid_crid_op_none = 0,
+ vcis_tid_crid_op_trunk,
+ vcis_tid_crid_op_leaf_add,
+ vcis_tid_crid_op_leaf_remove,
+} vcis_tid_crid_op_type_t;
+
+#define vci_GetBlockNumberFromUint64(tId) \
+ ((tId) >> (BITS_PER_BYTE * sizeof(OffsetNumber)))
+#define vci_GetOffsetFromUint64(tId) \
+ ((tId) & ((1U << (BITS_PER_BYTE * sizeof(OffsetNumber))) - 1))
+#define vci_MakeUint64FromBlockNumberAndOffset(blockNumber, offset) \
+ (((uint64) (blockNumber) << (BITS_PER_BYTE * sizeof(OffsetNumber))) | (offset))
+
+/** Local delete list */
+typedef struct vci_local_delete_list
+{
+ uint32 num_entry; /* the number of CRID stored */
+ uint32 length; /* capacity of crid_list */
+ uint64 *crid_list; /* actual values taken from whiteout WOS */
+} vci_local_delete_list;
+
+struct vci_CSFetchContextData;
+
+/** Local ROS */
+typedef struct vci_local_ros
+{
+ vci_local_delete_list local_delete_list;
+
+ /** Number of extents of local ROS.
+ * The minimum extent ID of the local ROS is (-num_local_extents).
+ */
+ uint32 num_local_extents;
+
+ /** Pointer of the array of pointers to extent data.
+ * When release the data, first pfree(extent[i]) where i is from zero
+ * to (num_local_extents - 1), then pfree(extent).
+ */
+ struct vci_virtual_tuples **extent;
+
+ /* Memory context to store local ROS data */
+ MemoryContext memory_context;
+
+ /* not localized one */
+ /** this fetch_context is allocated in shared memory context created
+ * in vci_GenerateLocalRos(), and destructed in vci_DestroyLocalRos().
+ * In the latter function, the fetch_context is freed automatically.
+ */
+ struct vci_CSFetchContextData *fetch_context;
+} vci_local_ros_t;
+
+typedef struct vci_RelationPair
+{
+ vci_MainRelHeaderInfo *info;
+
+ Relation meta;
+ Relation data;
+
+ Buffer bufMeta;
+ Buffer bufData;
+} vci_RelationPair;
+
+extern PGDLLEXPORT void vci_InitMainRelHeaderInfo(vci_MainRelHeaderInfo *info,
+ Relation rel,
+ vci_ros_command_t command);
+extern void vci_KeepMainRelHeaderWithoutVersionCheck(vci_MainRelHeaderInfo *info);
+extern PGDLLEXPORT void vci_KeepMainRelHeader(vci_MainRelHeaderInfo *info);
+extern void vci_ChangeCommand(vci_MainRelHeaderInfo *info, vci_ros_command_t command);
+
+extern PGDLLEXPORT void vci_ReleaseMainRelHeader(vci_MainRelHeaderInfo *info);
+
+extern void vci_SetMainRelVar(vci_MainRelHeaderInfo *info,
+ vci_MainRelVar var,
+ int elemId,
+ uint32 value);
+extern PGDLLEXPORT uint32 vci_GetMainRelVar(vci_MainRelHeaderInfo *info,
+ vci_MainRelVar var,
+ int elemId);
+extern void vci_WriteMainRelVar(vci_MainRelHeaderInfo *info,
+ vci_wmrv_t writeArea);
+
+extern void vci_InitPageCore(Buffer buffer, int16 numItem, bool locked);
+extern void vci_InitPage(Relation rel, BlockNumber blockNumber, int16 numItem);
+
+extern Buffer vci_ReadBufferWithPageInit(Relation reln, BlockNumber blockNumber);
+extern Buffer vci_ReadBufferWithPageInitDelVec(Relation reln, BlockNumber blockNumber);
+
+/*
+ * In order to keep the heap tuple plane, set 'p' to attstorage in
+ * FormData_pg_attribute.
+ */
+
+extern PGDLLEXPORT vci_MainRelVar vci_GetMColumnPosition(int16 columnId);
+extern PGDLLEXPORT vcis_m_column_t *vci_GetMColumn(vci_MainRelHeaderInfo *info, int16 columnId);
+extern PGDLLEXPORT vcis_m_extent_t *vci_GetMExtent(Buffer *buffer, vci_MainRelHeaderInfo *info, int32 extentId);
+
+extern void vci_GetExtentInfoPosition(BlockNumber *blockNumber,
+ OffsetNumber *offset,
+ int32 extentId);
+extern bool vci_ExtentInfoExists(vci_MainRelHeaderInfo *info, int32 extentId);
+extern bool vci_ExtentIsVisible(vcis_m_extent_t *mExtent, TransactionId xid);
+extern bool vci_ExtentIsCollectable(vcis_m_extent_t *mExtent, TransactionId wos2rosXid);
+extern bool vci_ExtentIsFree(vcis_m_extent_t *extentInfo);
+
+extern uint32 vci_GetFreeExtentId(vci_MainRelHeaderInfo *info);
+extern PGDLLEXPORT int16 vci_GetColumnWorstSize(Form_pg_attribute attr);
+
+/* **************************************
+ * ** CAUTION: AttrNumber is 1 origin. **
+ * **************************************
+ */
+extern Size vci_GetColumnIdsAndSizes(AttrNumber *heapAttrNumList,
+ int16 *indxColumnIdList,
+ int16 *columnSizeList,
+ int numColumn,
+ vci_MainRelHeaderInfo *info,
+ Oid heapOid);
+extern void vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict(
+ vci_MainRelHeaderInfo *info,
+ int32 extentId,
+ int32 dictionaryId,
+ TransactionId xid,
+ vci_ros_command_t command);
+
+static inline void
+vci_WriteExtentInfoInMainRosForWriteExtent(vci_MainRelHeaderInfo *info,
+ int32 extentId,
+ TransactionId xid,
+ vci_ros_command_t command)
+{
+ vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict(info, extentId,
+ VCI_INVALID_DICTIONARY_ID,
+ xid, command);
+}
+
+static inline void
+vci_SetItemPointerFromTid64(ItemPointer item, uint64 tId)
+{
+ ItemPointerSet(item,
+ vci_GetBlockNumberFromUint64(tId),
+ vci_GetOffsetFromUint64(tId));
+}
+
+static inline uint64
+vci_GetTid64FromItemPointer(ItemPointer item)
+{
+ uint64 blockNumber;
+
+ Assert(NULL != item);
+ blockNumber = BlockIdGetBlockNumber(&(item->ip_blkid));
+
+ return vci_MakeUint64FromBlockNumberAndOffset(blockNumber, item->ip_posid);
+}
+
+/* **************************************
+ * ** CAUTION: AttrNumber is 1 origin. **
+ * **************************************
+ */
+extern Buffer vci_WriteOnePageIfNecessaryAndGetBuffer(Relation relation,
+ BlockNumber blockNumber,
+ BlockNumber blockNumberOld,
+ Buffer buffer);
+extern void vci_WriteExtentInfo(vci_MainRelHeaderInfo *info,
+ int32 extentId,
+ uint32 numRows,
+ uint32 numDeletedRows,
+ uint32 numDeletedRowsOld,
+ TransactionId xgen,
+ TransactionId xdel);
+
+/*
+ * *********************************************************
+ * functions to recover ROS
+ * *********************************************************
+ */
+extern void vci_RecoverOneVCIIfNecessary(vci_MainRelHeaderInfo *info);
+
+extern PGDLLEXPORT void
+ vci_PreparePagesIfNecessaryCore(Relation rel,
+ BlockNumber blockNumber,
+ uint16 numItems,
+ bool forceInit,
+ bool logItems);
+
+/**
+ * @brief Format the DB page blockNumber of rel for numItems items per page.
+ *
+ * Thin wrapper around vci_PreparePagesIfNecessaryCore() with forceInit = true
+ * and logItems = false. NOTE(review): presumably the relation is extended
+ * when the page does not exist yet -- confirm against the Core function.
+ *
+ * @param[in] rel The relation.
+ * @param[in] blockNumber The block number to be examined.
+ * @param[in] numItems The number of items the page is initialized with.
+ */
+static inline void
+vci_FormatPageWithItems(Relation rel, BlockNumber blockNumber, int16 numItems)
+{
+ vci_PreparePagesIfNecessaryCore(rel, blockNumber, numItems, true, false);
+}
+
+static inline void
+vci_PreparePagesIfNecessary(Relation rel, BlockNumber blockNumber, uint16 numItems)
+{
+ vci_PreparePagesIfNecessaryCore(rel, blockNumber, numItems, false, false);
+}
+
+extern PGDLLEXPORT void vci_WriteItem(Relation rel,
+ Buffer buffer,
+ OffsetNumber itemId);
+
+extern void
+ vci_UpdateOldFieldsInMetaHeader(Relation rel, TransactionId xId);
+extern PGDLLEXPORT uint16
+ vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId);
+extern PGDLLEXPORT void
+ vci_GetPositionForFixedColumn(BlockNumber *blockNumber,
+ uint32 *offset,
+ vci_MainRelHeaderInfo *info,
+ int16 columnId,
+ int32 extentId,
+ uint32 rowIdInExtent,
+ bool atEnd);
+
+extern int vci_GetNumberOfNullableColumn(TupleDesc tupleDesc);
+extern PGDLLEXPORT int16 vci_GetBitIdInNullBits(TupleDesc tupleDesc, int16 columnId);
+
+extern PGDLLEXPORT Snapshot vci_GetCurrentSnapshot(void);
+extern void vci_FinalizeCopyCommand(void);
+
+struct vci_CSQueryContextData;
+extern struct vci_local_ros *vci_GenerateLocalRos(
+ struct vci_CSQueryContextData *queryContext,
+
+ /* maximum memory size to generate and keep local ROS */
+ Size workareaSize,
+
+ /* the number of rows from data WOS to local ROS */
+ int64 numDataWosRows,
+
+ /* the number of rows from whiteout WOS to local delete list */
+ int64 numWhiteoutWosRows);
+
+static inline unsigned int
+vci_GetNumRowsInLocalRosExtent(int numColumns)
+{
+	/*
+	 * Per-row size of the area holding, for every column, either a small
+	 * fixed-length value or a pointer to larger data (each at most
+	 * sizeof(Datum)). One area covers all columns to support both row-wise
+	 * and column-wise access.
+	 */
+	Size		datumBytesPerRow = sizeof(Datum) * numColumns;
+
+	/*
+	 * Per-row size of the separately allocated area for data larger than
+	 * sizeof(Datum), pointed to from the area above.
+	 */
+	Size		largeBytesPerRow = MaxHeapTupleSize;
+
+	/* Rows that fit in one allocation of the larger of the two areas. */
+	unsigned int numRowsInExtent = MaxAllocSize / Max(datumBytesPerRow, largeBytesPerRow);
+
+	return 1U << vci_GetHighestBit(Min(numRowsInExtent, VCI_NUM_ROWS_IN_EXTENT));
+}
+
+extern void vci_DestroyLocalRos(vci_local_ros_t *localRos);
+
+#define vci_WriteExtentInfoInMainRosForWosRosConvInit(info, extentId, xid) \
+ vci_WriteExtentInfoInMainRosForWriteExtent((info), \
+ (extentId), \
+ (xid), \
+ vci_rc_wos_ros_conv)
+
+#define vci_WriteExtentInfoInMainRosForCopyInit(info, extentId, xid) \
+ vci_WriteExtentInfoInMainRosForWriteExtent((info), \
+ (extentId), \
+ (xid), \
+ vci_rc_copy_command)
+
+/*
+ *
+ */
+static inline void
+vci_PreparePagesWithOneItemIfNecessary(Relation relation,
+ BlockNumber blockNumber)
+{
+ vci_PreparePagesIfNecessary(relation, blockNumber, 1);
+}
+
+/* this function set the dirty bit, and write all the items in the page
+ * to the WAL.
+ * arguments
+ * Relation rel
+ * Buffer buffer
+ */
+static inline void
+vci_WriteOneItemPage(Relation rel,
+ Buffer buffer)
+{
+ vci_WriteItem(rel, buffer, FirstOffsetNumber);
+}
+
+/* Initialize a DB page in one-item format (vci_InitPage() with numItem = 1)
+ * arguments
+ * Relation relation
+ * BlockNumber blockNumber
+ */
+static inline void
+vci_InitOneItemPage(Relation relation, BlockNumber blockNumber)
+{
+ vci_InitPage(relation, blockNumber, 1);
+}
+
+static inline void
+vci_FormatPageWithOneItem(Relation rel, BlockNumber blockNumber)
+{
+ vci_FormatPageWithItems(rel, blockNumber, 1);
+}
+
+/* Total size of the varlena at ptr; a 4-byte header is read via a local copy. */
+static inline uint32
+vci_VarSizeAny(char *ptr)
+{
+	if (!VARATT_IS_1B(ptr))
+	{
+		varattrib_4b tmp;	/* automatic, no shared static state needed */
+
+		/* Copy only the length word: the datum may be shorter than tmp. */
+		MemCpy(&tmp, ptr, sizeof(tmp.va_4byte.va_header));
+		return VARSIZE_4B(&tmp);
+	}
+	return VARSIZE_ANY(ptr);
+}
+
+static inline bool
+vci_PassByRefForFixed(Form_pg_attribute attr)
+{
+#ifndef USE_FLOAT8_BYVAL
+ if (8 == attr->attlen)
+ return true;
+#endif /* #ifndef USE_FLOAT8_BYVAL */
+
+ return sizeof(Datum) < (unsigned long) attr->attlen;
+}
+
+static inline void *
+vci_repalloc(void *ptr, size_t size)
+{
+ return ptr ? repalloc_array(ptr, char, size) : palloc_array(char, size);
+}
+
+static inline bool
+vci_GetBit(uint8 *bitArray, int bitId)
+{
+ return (bitArray[bitId >> 3] >> (bitId & 7)) & 1;
+}
+
+typedef struct vci_DictInfo
+{
+ /*
+ * Memory area to read dictionary. This is not used when create new
+ * dictionaries.
+ */
+ unsigned char *dictionary_storage;
+
+ Size storage_size; /* byte size of dictionary_storage */
+
+ /*
+ * The extent ID for individual dictionary. VCI_INVALID_EXTENT_ID for
+ * common dictionaries.
+ */
+ int32 extent_id;
+
+ /* VCI_INVALID_DICTIONARY_ID for individual dictionary */
+ int16 common_dict_id;
+
+ vcis_dict_type_t dict_type;
+
+} vci_DictInfo;
+
+Buffer
+ vci_WriteDataIntoMultiplePages(Relation rel,
+ BlockNumber *blockNumber,
+ BlockNumber *blockNumberOld,
+ uint32 *offsetInPage,
+ Buffer buffer,
+ const void *data_,
+ Size size);
+
+typedef struct vci_meta_item_scanner
+{
+ bool inited;
+
+ Relation rel;
+ int index;
+
+ BlockNumber end_block; /* inclusive */
+ BlockNumber start_block;
+
+ Buffer buffer;
+ BlockNumber current_block;
+
+ int max_item;
+ int max_item_in_page;
+ int item_size;
+
+ int buf_lockmode;
+
+} vci_meta_item_scanner_t;
+
+typedef struct
+{
+ Oid oid; /* Oid of VCI main relation */
+ Oid dbid; /* Oid of database to which a VCI main
+ * relation belongs */
+ bool force_next_wosros_conv; /* flag to force WOS->ROS conversion
+ * on next time */
+} vci_wosros_conv_worker_arg_t;
+
+extern vcis_m_extent_t *vci_GetMExtentNext(vci_MainRelHeaderInfo *info, vci_meta_item_scanner_t *scan);
+extern vci_meta_item_scanner_t *vci_BeginMetaItemScan(Relation rel, int buf_lock);
+extern void vci_EndMetaItemScan(vci_meta_item_scanner_t *scan);
+
+/* recovery functions for command */
+extern void vci_UpdateLastRosVersionAndOthers(vci_MainRelHeaderInfo *info);
+extern void vci_RecoveryDone(vci_MainRelHeaderInfo *info);
+extern void vci_WriteRecoveryRecordDone(vci_MainRelHeaderInfo *info, vci_ros_command_t command, TransactionId xid);
+
+extern void vci_WriteRecoveryRecordForExtentInfo(vci_MainRelHeaderInfo *info,
+ int32 newExtentId, int32 oldExtentId);
+extern void vci_RecoveryExtentInfo(vci_MainRelHeaderInfo *info, vci_ros_command_t command);
+
+extern void vci_WriteRecoveryRecordForUpdateDelVec(vci_MainRelHeaderInfo *info);
+extern void vci_RecoveryUpdateDelVec(vci_MainRelHeaderInfo *info);
+extern const char *vci_GetRosCommandName(vci_ros_command_t command);
+
+/* ----------------
+ * vci_index.c
+ * ----------------
+ */
+
+extern bool vci_isVciAdditionalRelation(Relation rel);
+extern bool vci_isVciAdditionalRelationTuple(Oid reloid, Form_pg_class reltuple);
+
+/* ----------------
+ * vci_internal_view.c
+ * ----------------
+ */
+
+extern void vci_check_prohibited_operation(Node *parseTree, bool *creating_vci_extension);
+
+#endif /* VCI_ROS_H */
diff --git a/contrib/vci/include/vci_ros_command.h b/contrib/vci/include/vci_ros_command.h
new file mode 100644
index 0000000..8c2cb5c
--- /dev/null
+++ b/contrib/vci/include/vci_ros_command.h
@@ -0,0 +1,214 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_command.h
+ * Definitions and declarations of ROS control commands
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_ros_command.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_ROS_COMMAND_H
+#define VCI_ROS_COMMAND_H
+
+#include "postgres.h"
+#include "c.h"
+#include "utils/tuplesort.h"
+#include "access/genam.h"
+
+#include "vci_ros.h"
+#include "vci_chunk.h"
+
+typedef struct
+{
+ ItemPointerData *orig_tids;
+
+ ItemPointerData *wos_tids;
+
+ int max;
+
+ int num;
+
+ int offset;
+} vci_tid_array_t;
+
+typedef struct
+{
+ BlockNumber *orig_blknos;
+
+ int max;
+
+ int num;
+
+} vci_blk_array_t;
+
+/**
+ * @brief Context for ROS commands, containing TID list read from data WOS or
+ * whiteout WOS, data read from the PostgreSQL heap relation or from ROS,
+ * related attribute numbers, OIDs, number of rows, and so on.
+ */
+typedef struct vci_RosCommandContext
+{
+ vci_ros_command_t command; /* command using this context */
+
+ RosChunkBuffer buffer; /* data are stored primary here */
+ RosChunkStorage storage; /* data are compacted and copied here */
+ vci_MainRelHeaderInfo info; /* VCI main relation header */
+
+ /** numRowsToConvert is something tricky.
+ * set VCI_NUM_ROWS_IN_EXTENT in index building phase.
+ * set number of rows (up to VCI_NUM_ROWS_IN_EXTENT) to convert after
+ * building.
+ */
+ int numRowsToConvert;
+
+ int numRowsAtOnce; /* maximum number of rows in a chunk */
+ Relation heapRel; /* the original relation indexed by VCI */
+ Oid heapOid; /* the original relation indexed by VCI */
+ Oid indexOid; /* the VCI indexed relation */
+
+ int numColumns; /* number of columns in VCI index */
+
+ /** the processing extent ID. negative IDs for local ROSes */
+ int32 extentId;
+ int32 extentIdSrc; /* source extentId in copy operation (wos2ros,
+ * cdr) */
+
+ struct vci_local_ros *local_ros; /* local ROS */
+
+ /** list of worst case column size */
+ int16 *columnSizeList;
+
+ /** attribute number (1-origin) in the original relation */
+ AttrNumber *heapAttrNumList;
+
+ /** index ID (0-origin) in the VCI relation */
+ int16 *indxColumnIdList;
+
+ /** transaction ID using this context */
+ TransactionId xid;
+
+ TransactionId oldestXmin;
+
+ TransactionId wos2rosXid;
+
+ TransactionId inclusiveXid;
+
+ TransactionId exclusiveXid;
+
+ vci_tid_array_t wos2ros_array;
+
+ vci_tid_array_t delvec_array;
+
+ vci_blk_array_t utility_array;
+
+ /**
+ * TID on "WOS Relation" list to convert in Item Pointer format
+ */
+
+ bool done; /* true if all records are read */
+
+ /**
+ * Number of rows in the relation estimated by analyze or vacuum command.
+ * This is used to build ROS in CREATE INDEX command.
+ */
+ double estimatedNumRows;
+
+ /**
+ * Number of converted rows.
+ * This is used to build ROS in CREATE INDEX command.
+ */
+ uint64 numConvertedRows;
+
+ /**
+ * The name of index relation built.
+ * This is used to build ROS in CREATE INDEX command.
+ */
+ char relName[NAMEDATALEN];
+
+ /**
+ * scan context.
+ * This is used only in initial building to scan the original relation
+ * sequentially.
+ */
+ HeapScanDesc scan;
+
+ TupleDesc tid_tid_tupdesc;
+
+ TupleTableSlot *tid_tid_slot;
+
+ /**
+ * a sorted TID list to be converted into ROS extents
+ */
+ Tuplesortstate *wos2ros_tid_list;
+ int64 num_wos2ros_tids;
+
+ /**
+ * a sorted TID list to be converted into a delete vector
+ */
+ Tuplesortstate *delvec_tid_list;
+ int64 num_delvec_tids;
+
+ Tuplesortstate *data_wos_del_list;
+
+ Tuplesortstate *whiteout_wos_del_list;
+
+} vci_RosCommandContext;
+
+typedef struct
+{
+ int32 num_fit_extents;
+ int32 best_extent_id;
+} vci_target_extent_info_t;
+
+/*
+ * *********************************************************
+ * Conversion Context operation
+ * *********************************************************
+ */
+extern void vci_InitRosCommandContext0(vci_RosCommandContext *context,
+ Relation rel, vci_ros_command_t command);
+extern void vci_InitRosCommandContext1(vci_RosCommandContext *comContext,
+ Size workareaSize,
+ int numInsertRows,
+ int numDeleteRows,
+ bool readOriginalData);
+extern void vci_InitRosCommandContext2(vci_RosCommandContext *comContext, Size workareaSize);
+
+extern void vci_InitRosChunkStroageAndBuffer(vci_RosCommandContext *comContext, bool forAppending);
+
+extern void vci_CleanRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite);
+extern void vci_FinRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite);
+
+extern void vci_ReleaseMainRelInCommandContext(vci_RosCommandContext *comContext);
+extern void vci_CloseHeapRelInCommandContext(vci_RosCommandContext *comContext);
+
+/*
+ * *********************************************************
+ * Functions for ROS command
+ * *********************************************************
+ */
+extern PGDLLEXPORT int vci_ConvertWos2Ros(Relation mainRel, Size workareaSize, int numRows);
+extern double vci_ConvertWos2RosForBuild(Relation mainRel, Size workarea, IndexInfo *indexInfo);
+extern PGDLLEXPORT int vci_UpdateDelVec(Relation mainRel, Size workareaSize, int numRows);
+extern PGDLLEXPORT int vci_CollectDeletedRows(Relation mainRel, Size workareaSize, int32 extentId);
+extern PGDLLEXPORT int vci_UpdateTidCrid(Relation mainRel, Size workareaSize, int numPages);
+extern PGDLLEXPORT int vci_CollectUnusedExtent(Relation mainRel, Size workareaSize);
+
+extern void vci_VacuumRos(Relation mainRel, IndexVacuumInfo *vacuumInfo);
+
+/*
+ * *********************************************************
+ * Probing functions to decide whether to execute the command
+ * *********************************************************
+ */
+extern PGDLLEXPORT uint32 vci_CountFreezedInDataWos(Relation mainRel, Size workarea);
+extern PGDLLEXPORT uint32 vci_CountFreezedInWhiteoutWos(Relation mainRel, Size workarea);
+extern PGDLLEXPORT vci_target_extent_info_t vci_CountDeletedRowsInROS(Relation mainRel, uint32 threshold);
+extern vci_target_extent_info_t vci_CountUnusedExtents(Relation mainRel);
+extern int32 vci_CountTidCridUpdateListLength(Relation mainRel, Size workarea);
+
+#endif /* #ifndef VCI_ROS_COMMAND_H */
diff --git a/contrib/vci/include/vci_ros_daemon.h b/contrib/vci/include/vci_ros_daemon.h
new file mode 100644
index 0000000..8def778
--- /dev/null
+++ b/contrib/vci/include/vci_ros_daemon.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_daemon.h
+ * Definitions and declarations of ROS Control Daemon and Worker
+ *
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_ros_daemon.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VCI_ROS_DAEMON_H
+#define VCI_ROS_DAEMON_H
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "postmaster/bgworker.h"
+#include "utils/relcache.h"
+
+#include "vci_ros.h"
+
+/**
+ * The threshold of tid->crid update list item counts to execute tid->crid update
+ */
+#define VCI_UPDATE_TIDCRID_THRESHOLD (1024)
+
+/**
+ * The threshold of Whiteout WOS rows to update Delete Vector
+ */
+#define VCI_UPDATE_DELVEC_THRESHOLD (256 * 1024)
+
+/**
+ * Mirrors the struct defined in src/backend/postmaster/bgworker.c (keep in sync)
+ */
+struct BackgroundWorkerHandle
+{
+ int slot;
+ uint64 generation;
+};
+
+typedef struct vci_workerslot
+{
+ pid_t pid;
+
+ BackgroundWorkerHandle handle;
+
+ Oid dbid;
+ Oid oid;
+} vci_workerslot_t;
+
+/* ************************* */
+/* daemon functions */
+/* ************************* */
+
+extern void vci_ROS_control_daemon_setup(void);
+PGDLLEXPORT void vci_ROS_control_daemon_main(Datum main_arg);
+
+extern PGDLLEXPORT vci_workerslot_t vci_LaunchROSControlWorker(vci_wosros_conv_worker_arg_t *vciinfo, int slot_id);
+PGDLLEXPORT void vci_ROS_control_worker_main(Datum main_arg);
+
+extern BackgroundWorkerHandle vci_LaunchROSControlMaintainer(int mode);
+extern void vci_ROS_control_maintainer_main(Datum main_arg);
+
+extern void vci_InitDbPriorityList(void);
+
+#endif /* VCI_ROS_DAEMON_H */
diff --git a/contrib/vci/include/vci_tidcrid.h b/contrib/vci/include/vci_tidcrid.h
new file mode 100644
index 0000000..6728a60
--- /dev/null
+++ b/contrib/vci/include/vci_tidcrid.h
@@ -0,0 +1,344 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_tidcrid.h
+ * Definitions and Declarations of TIDCRID update list and
+ * TIDCRID Tree relation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_tidcrid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_TIDCRID_H
+#define VCI_TIDCRID_H
+
+#include "postgres.h"
+
+#include "utils/tuplesort.h"
+
+#include "vci.h"
+#include "vci_ros.h"
+#include "vci_chunk.h"
+
+/** header page ID of TID->CRID update (differential) list */
+#define VCI_TID_CRID_UPDATE_HEADER_PAGE_ID (0)
+
+/** first body page ID of TID->CRID update (differential) list */
+#define VCI_TID_CRID_UPDATE_BODY_PAGE_ID (1)
+
+/** First page of tidcrid tree meta relation */
+#define VCI_TID_CRID_META_FIRST_PAGE_ID (0)
+
+/** First page of tidcrid tree data relation */
+#define VCI_TID_CRID_DATA_FIRST_PAGE_ID (0)
+
+/** Item number in page for tidcrid tree relation */
+#define VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE (18)
+
+/** Offset number of page tag */
+#define VCI_TID_CRID_PAGETAG_ITEM_ID (VCI_FREESPACE_ITEM_ID)
+
+/** Capacity of tidcrid leaf node in bit*/
+#define VCI_TID_CRID_LEAF_CAPACITY_BITS (6)
+
+/** Capacity of tidcrid leaf node in number of slots */
+#define VCI_TID_CRID_LEAF_CAPACITY (1 << VCI_TID_CRID_LEAF_CAPACITY_BITS)
+
+/** Capacity of tidcrid trunk node in bit*/
+#define VCI_TID_CRID_TRUNK_CAPACITY_BITS (6)
+
+/** Capacity of tidcrid trunk node in number of slots */
+#define VCI_TID_CRID_TRUNK_CAPACITY (1 << VCI_TID_CRID_TRUNK_CAPACITY_BITS)
+
+/** Index of trunk node */
+#define VCI_TID_CRID_TRUNKNODE (-1)
+
+/** The number of items in DB page of TID-CRID Update List, normally 678 */
+#define VCI_TID_CRID_UPDATE_PAGE_ITEMS (VCI_MAX_PAGE_SPACE / sizeof(vcis_tidcrid_pair_item_t))
+
+/** Available area in DB page of TID-CRID Update List, normally 8136 bytes */
+#define VCI_TID_CRID_UPDATE_PAGE_SPACE (VCI_TID_CRID_UPDATE_PAGE_ITEMS * sizeof(vcis_tidcrid_pair_item_t))
+
+#define VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES (1353)
+
+/*
+ * On-disk data structure for CRID
+ *
+ * vci_GetUint64FromCrid() can be used to convert to uint64
+ *
+ * Sometimes v2 has special meanings, it represents special CRID.
+ */
+typedef struct vcis_Crid
+{
+ uint16 v0;
+ uint16 v1;
+ uint16 v2;
+}
+#ifdef __arm__
+ __attribute__((packed))
+#endif
+vcis_Crid;
+
+/*
+ * Expand an on-disk vcis_Crid into the in-memory uint64 form
+ */
+static inline uint64
+vci_GetUint64FromCrid(vcis_Crid crid)
+{
+ const uint64 high = crid.v2;
+
+ if (high == 0x8000) /* v2 marker mapped to VCI_INVALID_CRID */
+ return VCI_INVALID_CRID;
+ if (high == 0xc000) /* v2 marker mapped to VCI_MOVED_CRID */
+ return VCI_MOVED_CRID;
+ return (high << 32) | ((uint64) crid.v1 << 16) | (uint64) crid.v0;
+}
+
+/*
+ * Pack an in-memory uint64 CRID into the on-disk vcis_Crid form
+ */
+static inline vcis_Crid
+vci_GetCridFromUint64(uint64 crid_uint64)
+{
+ vcis_Crid packed;
+
+ packed.v2 = (uint16) (crid_uint64 >> 32); /* bits 32..47 */
+ packed.v1 = (uint16) (crid_uint64 >> 16); /* bits 16..31 */
+ packed.v0 = (uint16) crid_uint64; /* bits 0..15 */
+
+ return packed;
+}
+
+/*
+ * TID-CRID tree relation
+ *
+ * The relation for the TID-CRID tree adds 18 tuples per page. In more detail,
+ * each tuple can use only 424 bytes.
+ *
+ * Each node of the tree has 64 slots, and each slot has 6 bytes, so 384 bytes
+ * are used to represent the tree. The remaining part is used for maintenance.
+ * Also, the initial tuple of each page is used for maintaining the page.
+ */
+
+/*
+ * Entries of flexible array in vcis_tidcrid_meta
+ */
+typedef struct vcis_tidcrid_meta_item
+{
+ BlockNumber block_number; /* block number in TID-CRID tree relation */
+ BlockNumber block_number_old; /* previous block_number, used for
+ * recovery purpose */
+ int16 item_id; /* item id on TID-CRID tree relation */
+ int16 item_id_old; /* previous item_id, used for recovery purpose */
+} vcis_tidcrid_meta_item_t;
+
+/*
+ * Meta relation for TID-CRID tree
+ *
+ * XXX: Several attributes are not used but retained, to be consistent with
+ * Column Meta Relation.
+ */
+typedef struct vcis_tidcrid_meta
+{
+ vcis_attribute_type_t vcis_attr_type; /* Attribute type */
+
+ Oid pgsql_atttypid; /* taken from FormData_pg_attribute.atttypid */
+ int16 pgsql_attnum; /* taken from FormData_pg_attribute.attnum */
+ int16 pgsql_attlen; /* taken from FormData_pg_attribute.attlen */
+ int32 pgsql_atttypmod; /* taken from
+ * FormData_pg_attribute.atttypmod */
+ uint32 num_extents; /* number of extents (for debug) */
+ uint32 num_extents_old; /* previous number of extents (for
+ * recovery) */
+
+ BlockNumber free_page_begin_id; /* page ID of the first free area */
+ BlockNumber free_page_begin_id_old; /* previous free_page_begin_id (for
+ * recovery) */
+
+ BlockNumber free_page_end_id; /* page ID of the last free area */
+ BlockNumber free_page_end_id_old; /* previous free_page_end_id (for
+ * recovery) */
+
+ /**
+ * The DB page ID of free area that located in front of the added or
+ * deleted extent by the ROS command. (for recovery)
+ * This is used to recover free area list.
+ */
+ BlockNumber free_page_prev_id;
+
+ /**
+ * Same as free_page_prev_id, but just behind the added or deleted extent.
+ */
+ BlockNumber free_page_next_id;
+
+ /**
+ * The freespace size of added or deleted extent by the ROS command (for recovery)
+ */
+ uint32 free_page_old_size;
+
+ /**
+ * The freespace position of added or deleted extent in BlockNumber
+ * by the ROS command (for recovery)
+ */
+ BlockNumber new_data_head;
+ BlockNumber new_freespace_head; /* @todo unused field */
+
+ BlockNumber num_free_pages; /* number of free DB pages in the listed free
+ * area */
+ BlockNumber num_free_pages_old; /* for recovery */
+ BlockNumber num_free_page_blocks; /* number of free areas, not number of
+ * free DB pages */
+ BlockNumber num_free_page_blocks_old; /* for recovery */
+
+ /*--- Above must be same as column Meta ---*/
+
+ BlockNumber num; /* number of Stored items */
+ BlockNumber num_old; /* previous num, used for recovery purpose */
+ BlockNumber free_block_number; /* number of free blocks */
+ int32 offset; /* Offset from the head */
+ vcis_tidcrid_meta_item_t body[1]; /* Flexible array of
+ * vcis_tidcrid_meta_item_t */
+} vcis_tidcrid_meta_t;
+
+/*
+ * Metadata at the initial tuple
+ */
+typedef struct vcis_tidcrid_pagetag
+{
+ uint32 size;
+ vcis_extent_type_t type;
+ BlockNumber prev_pos;
+ BlockNumber next_pos;
+
+ uint32 num;
+ uint32 free_size;
+ uint32 bitmap;
+ char rsv[4];
+} vcis_tidcrid_pagetag_t;
+
+/*
+ * Leaf in the TID-CRID tree
+ */
+typedef struct vcis_tidcrid_leaf
+{
+ uint32 size;
+ vcis_tidcrid_item_type_t type;
+
+ uint64 bitmap;
+ uint64 unused;
+
+ /* Sum of above must be less than 40 bytes */
+
+ vcis_Crid crid[VCI_TID_CRID_LEAF_CAPACITY]; /* CRIDs related with TID */
+} vcis_tidcrid_leaf_t;
+
+/*
+ * Intermediate (trunk) node in TID-CRID tree
+ */
+typedef struct vcis_tidcrid_trunk
+{
+ uint32 size;
+ vcis_tidcrid_item_type_t type;
+
+ uint64 bitmap;
+ uint64 unused;
+
+ /* Sum of above must be less than 40 bytes */
+
+ ItemPointerData leaf_item[VCI_TID_CRID_TRUNK_CAPACITY]; /* Pointer to the leaf */
+} vcis_tidcrid_trunk_t;
+
+/*
+ * TID-CRID pair used for TIDCRID update list
+ */
+typedef struct vcis_tidcrid_pair_item
+{
+ ItemPointerData page_item_id; /* TID on the original relation */
+ vcis_Crid crid; /* CRID */
+} vcis_tidcrid_pair_item_t;
+
+/*
+ * TID-CRID Update List
+ */
+typedef struct vcis_tidcrid_pair_list
+{
+ uint64 num; /* Number of items in the list */
+
+ uint16 blocks_per_samp; /* Number of blocks that each entry in
+ * sample_tids[] handles */
+ uint16 num_samples; /* Number of entries in sample_tids[] */
+
+ /*
+ * TID samples from update list. Sampling condition:
+ *
+ * 1. Initial entries in each group of blocks_per_samp blocks; 2. Final entry
+ */
+ ItemPointerData sample_tids[VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES + 1];
+
+ vcis_tidcrid_pair_item_t body[1]; /* Flexible array of
+ * vcis_tidcrid_pair_item_t */
+} vcis_tidcrid_pair_list_t;
+
+typedef struct vci_TidCridUpdateListContext
+{
+ vci_MainRelHeaderInfo *info; /* Parent VCI main relation */
+
+ Relation rel;
+
+ /* Number of vcis_tidcrid_pair_item_t entries in the rel */
+ uint64 count;
+
+ /* Number of blocks of the rel */
+ BlockNumber nblocks;
+
+ /* Head pointer to the TID-CRID Update List */
+ vcis_tidcrid_pair_list_t header;
+
+} vci_TidCridUpdateListContext;
+
+typedef vci_RelationPair vci_TidCridRelations;
+
+/* initialize function */
+extern void vci_InitializeTidCridUpdateLists(vci_MainRelHeaderInfo *info);
+extern void vci_InitializeTidCridTree(vci_MainRelHeaderInfo *info);
+
+/* TIDCRID Update List access functions */
+
+extern PGDLLEXPORT vci_TidCridUpdateListContext *vci_OpenTidCridUpdateList(vci_MainRelHeaderInfo *info, int sel);
+extern PGDLLEXPORT void vci_CloseTidCridUpdateList(vci_TidCridUpdateListContext *context);
+
+extern PGDLLEXPORT void vci_ReadOneBlockFromTidCridUpdateList(vci_TidCridUpdateListContext *context, BlockNumber blkno, vcis_tidcrid_pair_item_t *array);
+
+extern int32 vci_GetTidCridUpdateListLength(vci_MainRelHeaderInfo *info, int sel);
+extern void vci_MergeAndWriteTidCridUpdateList(vci_MainRelHeaderInfo *info, int newSel, int oldSel, Tuplesortstate *newList, vcis_Crid crid);
+
+/* TIDCRID Tree access functions */
+extern void vci_OpenTidCridRelations(vci_TidCridRelations *rel,
+ vci_MainRelHeaderInfo *info,
+ LOCKMODE lockmode);
+extern void vci_CloseTidCridRelations(vci_TidCridRelations *rel, LOCKMODE lockmode);
+
+extern void vci_GetTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig,
+ ItemPointer retPtr);
+extern void vci_CreateTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig,
+ ItemPointer retPtr);
+extern void vci_UpdateTidCridSubTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr,
+ vcis_tidcrid_pair_list_t *newItems);
+
+/* TID->CRID Conversion */
+extern PGDLLEXPORT uint64 vci_GetCridFromTid(vci_TidCridUpdateListContext *context, ItemPointer tId, bool *fromTree);
+
+/* Recovery functions */
+
+extern void vci_RecoveryFreeSpaceForTidCrid(vci_MainRelHeaderInfo *info);
+extern void vci_RecoveryTidCrid(vci_MainRelHeaderInfo *info);
+extern void vci_InitRecoveryRecordForTidCrid(vci_MainRelHeaderInfo *info);
+
+extern void vci_AddTidCridUpdateList(vci_MainRelHeaderInfo *info,
+ RosChunkStorage *src,
+ int32 extentId);
+
+#endif /* VCI_TIDCRID_H */
diff --git a/contrib/vci/include/vci_wos.h b/contrib/vci/include/vci_wos.h
new file mode 100644
index 0000000..7bc302b
--- /dev/null
+++ b/contrib/vci/include/vci_wos.h
@@ -0,0 +1,29 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_wos.h
+ * Declarations of WOS functions
+ *
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_wos.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VCI_WOS_H
+#define VCI_WOS_H
+
+#include "postgres.h"
+
+#include "storage/itemptr.h"
+#include "lib/rbtree.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+extern Snapshot vci_GetSnapshotForWos2Ros(void);
+extern Snapshot vci_GetSnapshotForLocalRos(TransactionId inclusive_xid, TransactionId exclusive_xid);
+
+extern PGDLLEXPORT uint64 vci_EstimateNumEntriesInHeapRelation(Oid oid);
+
+#endif /* VCI_WOS_H */
diff --git a/contrib/vci/include/vci_xact.h b/contrib/vci/include/vci_xact.h
new file mode 100644
index 0000000..67fe1e4
--- /dev/null
+++ b/contrib/vci/include/vci_xact.h
@@ -0,0 +1,39 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_xact.h
+ * Transaction control
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_xact.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_XACT_H
+#define VCI_XACT_H
+
+#include "access/xact.h"
+
+struct vci_MainRelHeaderInfo;
+
+/*
+ * States of transactions
+ */
+enum vci_xact_status_kind
+{
+ VCI_XACT_INVALID, /* invalid transaction ID */
+ VCI_XACT_SELF, /* my transaction */
+ VCI_XACT_IN_PROGRESS, /* in-progress transaction (not mine) */
+ VCI_XACT_DID_COMMIT, /* committed transaction */
+ VCI_XACT_DID_ABORT, /* aborted transaction */
+ VCI_XACT_DID_CRASH /* a crash happened during the transaction */
+};
+
+extern enum vci_xact_status_kind vci_transaction_get_type(TransactionId xid);
+
+extern int64 vci_GenerateXid64(TransactionId target_xid, struct vci_MainRelHeaderInfo *info);
+extern void vci_UpdateXidGeneration(struct vci_MainRelHeaderInfo *info);
+
+#endif /* VCI_XACT_H */
diff --git a/contrib/vci/storage/Makefile b/contrib/vci/storage/Makefile
new file mode 100644
index 0000000..2ea8365
--- /dev/null
+++ b/contrib/vci/storage/Makefile
@@ -0,0 +1,34 @@
+# contrib/vci/storage/Makefile
+
+SUBOBJS = \
+ vci_ros.o \
+ vci_ros_command.o \
+ vci_ros_daemon.o
+# Not built yet. Kept outside the continued list: a '#' inside it would
+# comment out every following entry, leaving SUBOBJS empty.
+# vci_chunk.o
+# vci_columns.o
+# vci_columns_data.o
+# vci_fetch.o
+# vci_freelist.o
+# vci_index.o
+# vci_internal_view.o
+# vci_low_utils.o
+# vci_memory_entry.o
+# vci_tidcrid.o vci_wos.o vci_xact.o
+
+EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
+
+PG_CPPFLAGS = -I $(top_srcdir)/contrib/vci/include
+
+ifdef USE_PGXS
+PGXS := $(shell pg_config --pgxs)
+include $(PGXS)
+else
+subdir = contrib/vci/storage
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+override CFLAGS += $(CFLAGS_SL)
diff --git a/contrib/vci/storage/meson.build b/contrib/vci/storage/meson.build
new file mode 100644
index 0000000..fefe15b
--- /dev/null
+++ b/contrib/vci/storage/meson.build
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+vci_storage_sources = files(
+# 'vci_chunk.c',
+# 'vci_columns.c',
+# 'vci_columns_data.c',
+# 'vci_fetch.c',
+# 'vci_freelist.c',
+# 'vci_index.c',
+# 'vci_internal_view.c',
+# 'vci_low_utils.c',
+# 'vci_memory_entry.c',
+ 'vci_ros.c',
+ 'vci_ros_command.c',
+ 'vci_ros_daemon.c',
+# 'vci_tidcrid.c',
+# 'vci_wos.c',
+# 'vci_xact.c',
+)
diff --git a/contrib/vci/storage/vci_ros.c b/contrib/vci/storage/vci_ros.c
new file mode 100644
index 0000000..49cccf1
--- /dev/null
+++ b/contrib/vci/storage/vci_ros.c
@@ -0,0 +1,1659 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros.c
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/storage/vci_ros.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <stdint.h>
+
+#include "access/heapam_xlog.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "catalog/pg_type.h"
+#include "mb/pg_wchar.h" /* for MAX_MULTIBYTE_CHAR_LEN */
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/procarray.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/varbit.h"
+
+#include "vci.h"
+#include "vci_columns.h"
+#include "vci_freelist.h"
+#include "vci_ros.h"
+#include "vci_mem.h"
+#include "vci_wos.h"
+
+/*
+ * This file has four parts.
+ * 1. Accessing VCI main relation header
+ * 2. Relation and buffer control
+ * 3. Attributes (columns)
+ * 4. VCI "columns"
+ */
+
+/*
+ * *********************************************************
+ * Accessing VCI main relation header
+ * *********************************************************
+ */
+/* Accessing VCI main relation header
+ * Because the header of VCI main relation has three pages, we can not map
+ * one structure of C on the header pages simply.
+ * Instead, we use access functions.
+ *
+ * To access the header, first use one of these two functions,
+ *
+ * vci_KeepReadingMainRelHeader()
+ * Read header pages for reading, and pin them.
+ * vci_KeepWritingMainRelHeader()
+ * Read header pages for writing, and pin them.
+ *
+ * Then, use the following two functions,
+ *
+ * vci_SetMainRelVar()
+ * To set the value to the field.
+ * vci_GetMainRelVar()
+ * To get the value of the field.
+ *
+ * The field is defined in enum vci_MainRelVar.
+ * The format is, page ID is in upper 16 bits, and offset from
+ * the page top is in lower 16 bits.
+ *
+ * To write the header pages out to storage, use the next function.
+ *
+ * vci_WriteMainRelVar()
+ *
+ * After accessing the header, release the DB pages with the following
+ * function.
+ *
+ * vci_ReleaseMainRelHeader()
+ * Release header pages.
+ *
+ * Other helper functions.
+ *
+ * vci_GetMColumnPosition()
+ * Gives the position of vcis_m_column_t.
+ *
+ * vci_GetMColumn()
+ * Gives vcis_m_column_t.
+ *
+ * vci_GetExtentInfoPosition()
+ * Get the position of vcis_m_extent_t structure for the target
+ * extentId.
+ *
+ * FIXME Lock check function necessary?
+ * Memo: I think the functions to check the lock status of the VCI main
+ * relation may be convenient, in order to determine if it is possible to
+ * start a ROS command. It will be used to avoid conflict between building
+ * local ROS, the vacuum operation, and other ROS commands. For other ROS
+ * commands, we do not need to use such functions, just try to lock and
+ * wait. Vacuum, too. For local ROS conversion, we have to determine if
+ * other ROS command is running when we evaluate the cost of plans.
+ */
+
+/**
+ * @brief Initialize the structure info to access the header of VCI main
+ * relation.
+ *
+ * This function "just" initializes the given object.
+ * To access the information in the header, keep the DB pages in buffer
+ * using vci_KeepMainRelHeader().
+ * The accessors are vci_GetMainRelVar() and vci_SetMainRelVar().
+ * After modifying the information, call vci_WriteMainRelVar() to write
+ * the page back to the storage.
+ * Finally to release the buffer, call vci_ReleaseMainRelHeader().
+ *
+ * @param[out] info Pointer to the target vci_MainRelHeaderInfo,
+ * which will be initialized
+ * @param[in] rel VCI main relation.
+ * @param[in] command ROS command which uses this structure.
+ */
+void
+vci_InitMainRelHeaderInfo(vci_MainRelHeaderInfo *info,
+ Relation rel,
+ vci_ros_command_t command)
+{
+ Assert(info != NULL);
+ info->rel = rel;
+ info->command = command;
+ info->initctx = CurrentMemoryContext; /* context current at init time */
+ info->cached_tupledesc = NULL;
+ info->num_extents_allocated = -1;
+ for (int slot = 0; slot < lengthof(info->buffer); slot++)
+ info->buffer[slot] = InvalidBuffer; /* no header buffer held yet */
+}
+
+static void
+KeepMainRelHeader(vci_MainRelHeaderInfo *info)
+{
+ Assert(info != NULL);
+ Assert(info->rel != NULL);
+ for (int blkno = 0; blkno < lengthof(info->buffer); blkno++) /* one buffer per header block */
+ info->buffer[blkno] = vci_ReadBufferWithPageInit(info->rel, blkno);
+}
+
+static void
+CheckRosVersion(vci_MainRelHeaderInfo *info)
+{
+ uint32 major = vci_GetMainRelVar(info, vcimrv_ros_version_major, 0);
+ uint32 minor = vci_GetMainRelVar(info, vcimrv_ros_version_minor, 0);
+
+ if ((major == 0) && (minor == 0)) /* both stored version fields are zero */
+ ereport(ERROR, (errmsg("ROS has not been formatted yet."),
+ errhint("This might happen when CREATE INDEX fails. "
+ "\"DROP INDEX %s;\" and CREATE INDEX again may help.",
+ RelationGetRelationName(info->rel))));
+ /* The stored version must equal this module's; uint32 values are printed with %u. */
+ if ((VCI_ROS_VERSION_MAJOR != major) || (VCI_ROS_VERSION_MINOR != minor))
+ ereport(ERROR, (errmsg("incompatible VCI version: expected (%d, %d), stored (%u, %u).", VCI_ROS_VERSION_MAJOR, VCI_ROS_VERSION_MINOR, major, minor),
+ errhint("This can happen when accessing old database with newer VCI modules. DROP and CREATE INDEX may help.")));
+}
+
+static int32
+GetNumberOfExtentsFromSizeOfMainRelation(Relation rel)
+{
+ const int headerBlockNumber = vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT; /* page part of the vcimrv_extent_info address */
+ const int maxExtentInfoInFirstPage = (BLCKSZ -
+ (vcimrv_extent_info & VCI_MRV_MASK_OFFSET)) /
+ sizeof(vcis_m_extent_t); /* entries that fit from that offset to the end of the block */
+ const int maxExtentInfoInPage = VCI_MAX_PAGE_SPACE /
+ sizeof(vcis_m_extent_t); /* entries that fit in VCI_MAX_PAGE_SPACE */
+ int numBlocks = RelationGetNumberOfBlocks(rel);
+
+ if (numBlocks <= headerBlockNumber) /* relation does not reach the extent-info block */
+ return -1;
+
+ return ((numBlocks - (headerBlockNumber + 1)) * maxExtentInfoInPage)
+ + maxExtentInfoInFirstPage; /* later blocks at full capacity, plus the first extent-info block */
+}
+
+static void
+UpdateNumberOfExtentsInMainRelHeader(vci_MainRelHeaderInfo *info)
+{
+ int32 numExtents = -1; /* value used for every command except vci_rc_query */
+
+ if (info->command == vci_rc_query)
+ numExtents = GetNumberOfExtentsFromSizeOfMainRelation(info->rel);
+ info->num_extents_allocated = numExtents;
+}
+
+/**
+ * @brief Keep DB pages of VCI header in buffer.
+ *
+ * This function acquires one read lock with AccessShareLock.
+ * This is called only by vci_inner_build().
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ */
+void
+vci_KeepMainRelHeaderWithoutVersionCheck(vci_MainRelHeaderInfo *info)
+{
+ Assert(info);
+ Assert(RelationIsValid(info->rel));
+ elog(DEBUG3, "open VCI \"%s\" ignoring ROS version",
+ RelationGetRelationName(info->rel));
+ KeepMainRelHeader(info); /* fills info->buffer[]; CheckRosVersion() is not called and num_extents_allocated is not refreshed */
+}
+
+/**
+ * @brief Change command ID stored in vci_MainRelHeaderInfo.
+ *
+ * @param[in] info pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] command new command ID.
+ */
+void
+vci_ChangeCommand(vci_MainRelHeaderInfo *info, vci_ros_command_t command)
+{
+ Assert(info);
+ info->command = command;
+ UpdateNumberOfExtentsInMainRelHeader(info); /* num_extents_allocated depends on the command, so recompute it */
+}
+
+/**
+ * @brief Keep DB pages of VCI header in buffer after checking the ROS version.
+ *
+ * This function acquires one read lock with AccessShareLock.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ */
+void
+vci_KeepMainRelHeader(vci_MainRelHeaderInfo *info)
+{
+ Assert(info);
+ Assert(RelationIsValid(info->rel));
+ elog(DEBUG3, "open VCI \"%s\"",
+ RelationGetRelationName(info->rel));
+ KeepMainRelHeader(info); /* read the header blocks into info->buffer[] */
+ CheckRosVersion(info); /* raises ERROR on an unformatted or mismatched ROS version */
+ UpdateNumberOfExtentsInMainRelHeader(info); /* refresh info->num_extents_allocated */
+}
+
+/**
+ * @brief Write header pages of VCI main relation.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] writeArea Give vci_wmrv_update for updating the pages for
+ * recovery, or vci_wmrv_all for all pages. The latter should only be used in
+ * building the index.
+ */
+void
+vci_WriteMainRelVar(vci_MainRelHeaderInfo *info,
+ vci_wmrv_t writeArea)
+{
+ int start = 0; /* index of the first header buffer to write */
+
+ Assert(NULL != info);
+ Assert(NULL != info->rel);
+
+ elog(DEBUG3, "flush header pages of VCI \"%s\" main relation",
+ RelationGetRelationName(info->rel));
+
+ switch (writeArea)
+ {
+ case vci_wmrv_update:
+ start = lengthof(info->buffer) - 1; /* last header page only */
+ break;
+ case vci_wmrv_all:
+ start = 0; /* all header pages */
+ break;
+ default: /* any other value is rejected */
+ ereport(ERROR, (errmsg("internal error. unsupported parameter."), errhint("Disable VCI by 'SELECT vci_disable();'")));
+ }
+
+ for (int blockNum = start; blockNum < lengthof(info->buffer); ++blockNum) /* each page is written under an exclusive buffer lock */
+ {
+ LockBuffer(info->buffer[blockNum], BUFFER_LOCK_EXCLUSIVE);
+ MarkBufferDirty(info->buffer[blockNum]);
+ vci_WriteOneItemPage(info->rel, info->buffer[blockNum]);
+ LockBuffer(info->buffer[blockNum], BUFFER_LOCK_UNLOCK);
+ }
+}
+
+/**
+ * @brief Release buffer for the VCI header.
+ *
+ * This function releases one read lock with AccessShareLock.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ */
+void
+vci_ReleaseMainRelHeader(vci_MainRelHeaderInfo *info)
+{
+ Assert(NULL != info);
+ Assert(NULL != info->rel);
+
+ elog(DEBUG3, "release VCI \"%s\"",
+ RelationGetRelationName(info->rel));
+ for (int blockNum = 0; blockNum < lengthof(info->buffer); ++blockNum)
+ {
+ ReleaseBuffer(info->buffer[blockNum]); /* NOTE(review): assumes every header buffer is currently held -- confirm callers always pair this with a Keep call */
+ info->buffer[blockNum] = InvalidBuffer;
+ }
+ info->rel = NULL; /* the pointer is only cleared; the relation is not closed here */
+ info->cached_tupledesc = NULL; /* the cached pointer is only cleared, not freed here */
+}
+
+/**
+ * @brief Set values in the header part of VCI main relation.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] var "virtual address" of the variable, defined in
+ * enum vci_MainRelVar.
+ * @param[in] elemId Give 0 normally.
+ * When the target variable has multiple of elements, say an array,
+ * the element ID should be placed.
+ * @param[in] value The value to write.
+ */
+void
+vci_SetMainRelVar(vci_MainRelHeaderInfo *info,
+				  vci_MainRelVar var,
+				  int elemId,
+				  uint32 value)
+{
+	unsigned int pageNo = vci_MRVGetBlockNumber(var);
+	unsigned int byteOff = vci_MRVGetOffset(var);
+	char	   *pageBytes;
+	uint32	   *slot;
+
+	Assert(pageNo < lengthof(info->buffer));
+	Assert(byteOff < BLCKSZ);
+
+	/*
+	 * Treat the bytes at byteOff as a uint32 array and store into element
+	 * elemId.  Only the in-memory page is changed here; nothing is marked
+	 * dirty or written by this function.
+	 */
+	pageBytes = (char *) BufferGetPage(info->buffer[pageNo]);
+	slot = (uint32 *) (pageBytes + byteOff);
+	slot[elemId] = value;
+}
+
+/**
+ * @brief Get values in the header part of VCI main relation.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] var "virtual address" of the variable, defined in
+ * enum vci_MainRelVar.
+ * @param[in] elemId Give 0 normally.
+ * When the target variable has multiple of elements, say an array,
+ * the element ID should be placed.
+ * @return The gotten value.
+ */
+uint32
+vci_GetMainRelVar(vci_MainRelHeaderInfo *info,
+				  vci_MainRelVar var,
+				  int elemId)
+{
+	unsigned int pageNo = vci_MRVGetBlockNumber(var);
+	unsigned int byteOff = vci_MRVGetOffset(var);
+	char	   *pageBytes;
+	uint32	   *slot;
+
+	Assert(pageNo < lengthof(info->buffer));
+	Assert(byteOff < BLCKSZ);
+
+	/* Read element elemId of the uint32 array that starts at byteOff. */
+	pageBytes = (char *) BufferGetPage(info->buffer[pageNo]);
+	slot = (uint32 *) (pageBytes + byteOff);
+
+	return slot[elemId];
+}
+
+/**
+ * @brief Get the position of column information in the VCI main relation.
+ *
+ * @param[in] columnId The column ID in the VCI index.
+ * @return The position of the column entry (block number and in-page offset,
+ * DB page header included), packed as a vci_MainRelVar.
+ */
+vci_MainRelVar
+vci_GetMColumnPosition(int16 columnId)
+{
+	const int	firstBlock = vci_MRVGetBlockNumber(vcimrv_column_info);
+	const int	firstPageCapacity =
+		(BLCKSZ - vci_MRVGetOffset(vcimrv_column_info)) / sizeof(vcis_m_column_t);
+	const int	pageCapacity = VCI_MAX_PAGE_SPACE / sizeof(vcis_m_column_t);
+	int			rest;
+	int			block;
+	int			slot;
+
+	Assert(VCI_FIRST_NORMALCOLUMN_ID <= columnId);
+
+	/* Entries that still fit behind vcimrv_column_info on its own page. */
+	if (columnId < firstPageCapacity)
+	{
+		return (firstBlock << VCI_MRV_PAGE_SHIFT) +
+			vci_MRVGetOffset(vcimrv_column_info) +
+			(columnId * sizeof(vcis_m_column_t));
+	}
+
+	/*
+	 * Remaining entries are laid out on the following pages, pageCapacity
+	 * per page, each page starting right after VCI_MIN_PAGE_HEADER.
+	 */
+	rest = columnId - firstPageCapacity;
+	block = firstBlock + 1 + (rest / pageCapacity);
+	slot = rest % pageCapacity;
+	Assert(block < (VCI_NUM_MAIN_REL_HEADER_PAGES - 1));
+
+	return (block << VCI_MRV_PAGE_SHIFT) +
+		VCI_MIN_PAGE_HEADER +
+		(slot * sizeof(vcis_m_column_t));
+}
+
+/**
+ * @brief Get the column information in the VCI main relation.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] columnId The column ID in the VCI index.
+ * @return The pointer to the column information in the header page of
+ * VCI main relation.
+ *
+ * @note
+ * The pointer refers into a page of info->buffer; no buffer is handed out here.
+ */
+vcis_m_column_t *
+vci_GetMColumn(vci_MainRelHeaderInfo *info, int16 columnId)
+{
+	vci_MainRelVar where = vci_GetMColumnPosition(columnId);
+	char	   *pageBytes;
+
+	/* The entry lives in one of the header pages kept in info->buffer. */
+	pageBytes = (char *) BufferGetPage(info->buffer[vci_MRVGetBlockNumber(where)]);
+
+	return (vcis_m_column_t *) (pageBytes + vci_MRVGetOffset(where));
+}
+
+/**
+ * @brief Obtain the position of vcis_m_extent_t structure for
+ * the target extentId.
+ *
+ * vcis_m_extent_t is the information of extents in VCI main relation.
+ *
+ * @param[out] blockNumber The number of the block that holds the information
+ * is written to *blockNumber.
+ * @param[out] offset The byte offset of the information within that block is
+ * written to *offset.
+ * @param[in] extentId The target extent ID.
+ */
+void
+vci_GetExtentInfoPosition(BlockNumber *blockNumber,
+						  OffsetNumber *offset,
+						  int32 extentId)
+{
+	const int	firstPageCapacity = (BLCKSZ -
+									 (vcimrv_extent_info & VCI_MRV_MASK_OFFSET)) /
+		sizeof(vcis_m_extent_t);
+	const int	pageCapacity = VCI_MAX_PAGE_SPACE /
+		sizeof(vcis_m_extent_t);
+	int32		rest;
+
+	Assert(blockNumber);
+	Assert(offset);
+
+	/* Entries that fit behind vcimrv_extent_info on its own page. */
+	if (extentId < firstPageCapacity)
+	{
+		*blockNumber = vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT;
+		*offset = (vcimrv_extent_info & VCI_MRV_MASK_OFFSET) +
+			(extentId * sizeof(vcis_m_extent_t));
+		return;
+	}
+
+	/*
+	 * Later entries fill the following pages, pageCapacity per page, each
+	 * page starting right after VCI_MIN_PAGE_HEADER.
+	 */
+	rest = extentId - firstPageCapacity;
+	*blockNumber = (rest / pageCapacity) + 1 +
+		(vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT);
+	*offset = VCI_MIN_PAGE_HEADER +
+		((rest % pageCapacity) * sizeof(vcis_m_extent_t));
+}
+
+/*
+ * Call vci_WriteItem() for each of the first numItems items of the page in
+ * buffer, starting at FirstOffsetNumber.
+ */
+static void
+WriteAllItemsInPage(Relation rel,
+					Buffer buffer,
+					uint16 numItems)
+{
+	uint16		done = 0;
+
+	while (done < numItems)
+	{
+		vci_WriteItem(rel, buffer, done + FirstOffsetNumber);
+		++done;
+	}
+}
+
+/*
+ * *********************************************************
+ * Relation and buffer control
+ * *********************************************************
+ */
+/*
+ * vci_PreparePagesWithOneItemIfNecessary()
+ * This function checks if the relation has the DB page pointed
+ * by an argument. If it does not exists, the function extends
+ * the relation and initialize extended pages with one item per
+ * page. Mind that this function does not touch existing pages.
+ * If you need to format existing pages, use vci_InitPage().
+ *
+ * vci_InitPage()
+ * Low level function.
+ *
+ * This function formats the existing DB page, pointed by
+ * relation and page ID (block number), with empty items.
+ * The number of items are also passed by an argument.
+ *
+ * vci_PreparePagesWithOneItemIfNecessary() is more convenient.
+ * For pages with one item, the macro vci_InitOneItemPage() is
+ * defined.
+ *
+ * vci_WriteItem()
+ * Mark the buffer dirty, and write out WAL from the pointed
+ * item in the buffer.
+ *
+ * vci_WriteOnePageIfNecessaryAndNext()
+ * A utility function.
+ * This function takes new page ID and old page ID in the
+ * arguments. If they are different, write out the old page,
+ * assumed which is loaded in the given buffer, and read
+ * the new page.
+ * If the page IDs are same, do nothing.
+ *
+ */
+
+/**
+ * @brief This function checks if the relation has the DB page with the page ID
+ * blockNumber.
+ *
+ * When it does not exists, the function extends the relation and initialize
+ * extended pages with one item per page.
+ *
+ * @param[in] rel The relation.
+ * @param[in] blockNumber The block number to be examined.
+ * @param[in] numItems The number of items the page is initialized with.
+ * @param[in] forceInit If true, the block is initialized anyway.
+ * @param[in] logItems If true, write all items in the pages into WAL.
+ */
+void
+vci_PreparePagesIfNecessaryCore(Relation rel,
+								BlockNumber blockNumber,
+								uint16 numItems,
+								bool forceInit,
+								bool logItems)
+{
+	BlockNumber existingPages = RelationGetNumberOfBlocks(rel);
+	Buffer		buffer;
+
+	Assert(0 < numItems);
+
+	if (!BlockNumberIsValid(blockNumber))
+		ereport(ERROR, (errmsg("data relation full"), errhint("Normally relations of VCI index are smaller than the table relation, therefore this error must not happen. Disable VCI by 'SELECT vci_disable();'")));
+
+	if (blockNumber < existingPages)
+	{
+		/*
+		 * The page already exists.  It is (re)formatted only when it is still
+		 * new or the caller forces it; otherwise it is left untouched.
+		 */
+		buffer = ReadBuffer(rel, blockNumber);
+
+		if (PageIsNew(BufferGetPage(buffer)) || forceInit)
+		{
+			/* vci_InitPageCore() takes and drops the lock by itself here. */
+			vci_InitPageCore(buffer, numItems, false);
+
+			if (logItems)
+			{
+				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+				WriteAllItemsInPage(rel, buffer, numItems);
+				UnlockReleaseBuffer(buffer);
+				return;
+			}
+		}
+
+		ReleaseBuffer(buffer);
+		return;
+	}
+
+	/*
+	 * The page does not exist yet: append pages one at a time until
+	 * blockNumber is covered.  Each appended page comes back locked
+	 * (RBM_ZERO_AND_LOCK), hence locked = true for vci_InitPageCore().
+	 */
+	for (BlockNumber missing = blockNumber - existingPages + 1; missing > 0; --missing)
+	{
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM,
+									P_NEW, RBM_ZERO_AND_LOCK, NULL);
+
+		vci_InitPageCore(buffer, numItems, true);
+		if (logItems)
+			WriteAllItemsInPage(rel, buffer, numItems);
+		UnlockReleaseBuffer(buffer);
+	}
+}
+
+/**
+ * @brief This function writes a given number of items in the buffer.
+ *
+ * @param[in] buffer Postgres DB buffer to be initialized.
+ * @param[in] numItems The number of items the page is initialized with.
+ * @param[in] locked true if the buffer is locked, false otherwise.
+ */
+void
+vci_InitPageCore(Buffer buffer, int16 numItems, bool locked)
+{
+	Page		page;
+	PageHeader	header;
+	uint32		freeBytes;
+	uint32		perItem;
+	int32		slot;
+
+	/* Take the exclusive lock ourselves unless the caller already holds it. */
+	if (!locked)
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+	page = BufferGetPage(buffer);
+	header = (PageHeader) page;
+
+	/* Start from an empty page and reserve numItems line pointers. */
+	PageInit(page, BLCKSZ, 0);
+	header->pd_lower += sizeof(ItemIdData) * numItems;
+
+	/* Split the remaining space evenly, rounded down to the VCI alignment. */
+	freeBytes = header->pd_upper - header->pd_lower;
+	perItem = vci_RoundDownValue(freeBytes / numItems,
+								 VCI_DATA_ALIGNMENT_IN_STORAGE);
+
+	/* Lay the items out from the last line pointer down to the first. */
+	slot = numItems;
+	while (slot--)
+	{
+		ItemId		lp = &(header->pd_linp[slot]);
+		HeapTupleHeader tuple;
+
+		header->pd_upper -= perItem;
+		lp->lp_off = header->pd_upper;
+		lp->lp_len = perItem;
+		lp->lp_flags = LP_NORMAL;
+
+		tuple = (HeapTupleHeader) PageGetItem(page, lp);
+		tuple->t_infomask2 = 0;
+		tuple->t_infomask = HEAP_XMIN_FROZEN | HEAP_XMAX_INVALID;
+		tuple->t_hoff = vci_RoundUpValue(offsetof(HeapTupleHeaderData, t_bits),
+										 VCI_DATA_ALIGNMENT_IN_STORAGE);
+	}
+
+	MarkBufferDirty(buffer);
+	Assert(header->pd_lower <= header->pd_upper);
+
+	if (!locked)
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+}
+
+/**
+ * @brief This function get or newly create a DB buffer page, and put the
+ * header information that only one item is in the page, and the size of
+ * item is 8140 bytes, and the data type is bytea.
+ *
+ * @param[in] rel The relation.
+ * @param[in] blockNumber The block number to be initialized.
+ * @param[in] numItems The number of items the page is initialized with.
+ */
+/*
+ * dead code
+ * LCOV_EXCL_START
+ */
+void
+vci_InitPage(Relation rel, BlockNumber blockNumber, int16 numItems)
+{
+	Buffer		target;
+
+	Assert(BlockNumberIsValid(blockNumber));
+
+	/* Pin the page, let the core routine lock and format it, then unpin. */
+	target = ReadBuffer(rel, blockNumber);
+	vci_InitPageCore(target, numItems, false);
+	ReleaseBuffer(target);
+}
+
+/* LCOV_EXCL_STOP */
+
+/**
+ * @brief This function mark the buffer dirty, and make WAL from the item
+ * in the buffer.
+ *
+ * We assume that the relation is only modified by ROS command exclusively.
+ * So, we do not put strict lock here.
+ *
+ * @param[in] rel The relation.
+ * @param[in] buffer PostgreSQL DB buffer having the page data.
+ * @param[in] offsetNumber The offset number of the item to be written.
+ */
+void
+vci_WriteItem(Relation rel,
+ Buffer buffer,
+ OffsetNumber offsetNumber)
+{
+ /* Locate the item; note these lookups run before the assertions below. */
+ Page page = BufferGetPage(buffer);
+ ItemId tup = PageGetItemId(page, offsetNumber);
+ HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, tup);
+
+ Assert(BufferIsValid(buffer));
+ Assert(OffsetNumberIsValid(offsetNumber));
+
+ /* The buffer is dirtied unconditionally, even when no WAL is written. */
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(rel))
+ {
+ xl_heap_inplace xlrec;
+ XLogRecPtr recptr;
+ /* info stays 0, so the record type is plain XLOG_HEAP_INPLACE. */
+ uint8 info = 0;
+ uint32 newlen;
+
+ /* Fill the in-place record header; no invalidation messages attached. */
+ xlrec.offnum = offsetNumber;
+ xlrec.dbId = MyDatabaseId;
+ xlrec.tsId = MyDatabaseTableSpace;
+ xlrec.relcacheInitFileInval = false;
+ xlrec.nmsgs = 0;
+
+ /*
+ * originally taken from heap_inplace_update() in
+ * src/backend/access/heap/heapam.c
+ */
+ XLogBeginInsert();
+ XLogRegisterData(&xlrec, MinSizeOfHeapInplace);
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ /*
+ * The logged payload starts right after the tuple header (t_hoff) and
+ * its length comes from VCI_ITEM_SPACE() of the page's item count.
+ */
+ newlen = VCI_ITEM_SPACE(PageGetMaxOffsetNumber(page));
+ XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+
+ /* Only the WAL insertion and the LSN stamp are in the critical section. */
+ START_CRIT_SECTION();
+ recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE | info);
+
+ PageSetLSN(page, recptr);
+
+ END_CRIT_SECTION();
+ }
+}
+
+/**
+ * @brief This function first compares blockNumber and blockNumberOld.
+ *
+ * If they differ from each other, write out the buffer in the DB page of
+ * blockNumberOld, and read the DB page of blockNumber.
+ * If they are the same, do nothing.
+ *
+ * @param[in] relation The relation.
+ * @param[in] blockNumber New page ID.
+ * @param[in] blockNumberOld Old page ID. The data is in buffer.
+ * @param[in] buffer The buffer contains the old page.
+ * @return buffer contains new page, exclusively locked.
+ */
+Buffer
+vci_WriteOnePageIfNecessaryAndGetBuffer(Relation relation,
+										BlockNumber blockNumber,
+										BlockNumber blockNumberOld,
+										Buffer buffer)
+{
+	if (blockNumber != blockNumberOld)
+	{
+		/* Write out and let go of the old page, if there was one. */
+		if (BlockNumberIsValid(blockNumberOld))
+		{
+			vci_WriteOneItemPage(relation, buffer);
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/* Switch to the new page and hand it back exclusively locked. */
+		buffer = vci_ReadBufferWithPageInit(relation, blockNumber);
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	}
+
+	/* Same page as before: the caller's buffer is returned unchanged. */
+	return buffer;
+}
+
+/*
+ * *********************************************************
+ * PostgreSQL Attributes (columns)
+ * *********************************************************
+ */
+
+/*
+ * *********************************************************
+ * VCI "columns"
+ * Here, a "column" may have only one data relation,
+ * or a pair of meta data relation and data relation.
+ * It includes delete vector, null vector, TID relation,
+ *
+ * *********************************************************
+ */
+/*
+ * vci_GetSumOfAttributeIndices()
+ * This function counts up all the VCI "columns" defined
+ * in num_vcis_attribute_type.
+ *
+ * vci_GetAttrTypeAndIndexFromSumOfIndices()
+ * Get vcis_attribute_type_t and index from given
+ * sequential index.
+ */
+
+/**
+ * @brief This function counts up all the VCI "columns" defined
+ * in num_vcis_attribute_type.
+ *
+ * @param[in] numColumns Number of normal columns in VCI index.
+ * @return number of total columns, not only of indexed columns, but also
+ * auxiliary columns.
+ */
+int
+vci_GetSumOfAttributeIndices(int16 numColumns)
+{
+	int			total = 0;
+	int			attrType = 0;
+
+	/* Add up the per-type index counts over every attribute type. */
+	while (attrType < num_vcis_attribute_type)
+	{
+		total += vci_GetNumIndexForAttributeType(attrType, numColumns);
+		attrType++;
+	}
+
+	return total;
+}
+
+/**
+ * @brief Get Attribute type defined in vcis_attribute_type_t and
+ * index of the target category.
+ *
+ * @param[out] attrType The attribute type is written in *attrType.
+ * @param[out] index The index is written in *index.
+ * If no corresponding attribute exists, *index is set to -1.
+ * @param[in] numColumns The number of normal columns in VCI index.
+ * @param[in] sumOfIndex The sequential index of target column.
+ */
+void
+vci_GetAttrTypeAndIndexFromSumOfIndices(vcis_attribute_type_t *attrType,
+										int *index,
+										int16 numColumns,
+										int sumOfIndex)
+{
+	int			base = 0;
+
+	*index = 0;
+	*attrType = 0;
+
+	/*
+	 * Walk the attribute types in order.  Each type owns the half-open range
+	 * [base, limit) of sequential indices; stop at the range that contains
+	 * sumOfIndex.
+	 */
+	while (*attrType < num_vcis_attribute_type)
+	{
+		int			count = vci_GetNumIndexForAttributeType(*attrType, numColumns);
+		int			limit = base + count;
+
+		if ((base <= sumOfIndex) && (sumOfIndex < limit))
+		{
+			*index = sumOfIndex - base;
+			return;
+		}
+
+		base = limit;
+		++*attrType;
+	}
+
+	/* No range matched; *attrType is left at num_vcis_attribute_type. */
+	*index = -1;
+}
+
+/**
+ * @brief Calculate the bit ID in the null bit vector for the given column ID.
+ *
+ * @param[in] tupleDesc The tuple descriptor of VCI main relation.
+ * Not consulted by the current implementation.
+ * @param[in] columnId Target column ID.
+ * @return The bit ID in the null bit vector. The current implementation
+ * returns columnId unchanged for every column; it never returns -1.
+ */
+int16
+vci_GetBitIdInNullBits(TupleDesc tupleDesc, int16 columnId)
+{
+ /* Identity mapping: bit i of the null vector belongs to column i. */
+ return columnId;
+}
+
+/**
+ * @brief Get the column widths in the worst case.
+ *
+ * @param attr Attribute information of the columns.
+ * @return The width in the worst case.
+ */
+int16
+vci_GetColumnWorstSize(Form_pg_attribute attr)
+{
+	int32		columnSize;
+
+	/* Fixed-length data: the declared length is the answer. */
+	if (0 <= attr->attlen)
+		return attr->attlen;
+
+	/*
+	 * Variable-length data without a type modifier has no declared bound.
+	 *
+	 * Large data are externally toasted and the size of tuple including the
+	 * large attribute is limited to TOAST_TUPLE_TARGET, which is BLCKSZ / 4
+	 * normally. But, UN-TOASTED -> MaxHeapTupleSize.
+	 */
+	if (attr->atttypmod < 0)
+		return MaxHeapTupleSize;
+
+	switch (attr->atttypid)
+	{
+			/* for bit(n), varbit(n). */
+		case BITOID:
+		case VARBITOID:
+			columnSize = VARBITTOTALLEN(attr->atttypmod);
+			break;
+
+			/* for numeric(p,q): upper 16 bits of the typmod plus the header. */
+		case NUMERICOID:
+			columnSize = (attr->atttypmod >> 16) + VARHDRSZ;
+			break;
+
+		case BPCHAROID:
+		case VARCHAROID:
+
+			/*
+			 * A typmod below VARHDRSZ cannot encode a character limit.  The
+			 * earlier formula for this case, (typmod - VARHDRSZ) *
+			 * MAX_MULTIBYTE_CHAR_LEN + VARHDRSZ, evaluated to zero or a
+			 * negative width, so treat the column as unbounded instead.
+			 */
+			if (attr->atttypmod < VARHDRSZ)
+				columnSize = MaxHeapTupleSize;
+			else
+				columnSize = attr->atttypmod * MAX_MULTIBYTE_CHAR_LEN;
+			break;
+
+		default:
+#ifdef VCI_USE_COMPACT_VARLENA
+			if (attr->atttypmod < VARATT_SHORT_MAX)
+				columnSize = attr->atttypmod - VARHDRSZ + VARHDRSZ_SHORT;
+			else
+				columnSize = attr->atttypmod;
+#else
+			columnSize = attr->atttypmod;
+#endif
+			break;
+	}
+
+	/*
+	 * Never hand a negative width to the caller (the compact-varlena formula
+	 * above can go below zero for tiny typmods); anything not strictly below
+	 * MaxHeapTupleSize is capped to it.
+	 */
+	if ((0 <= columnSize) && (columnSize < MaxHeapTupleSize))
+		return (int16) columnSize;
+
+	/* worst size -> MaxHeapTupleSize(8k) */
+	return MaxHeapTupleSize;
+}
+
+/**
+ * @brief Collect, from vci_MainRelHeaderInfo, the column IDs in the original
+ * heap relation and in the VCI index relation.
+ *
+ * This function also collects the worst-case sizes of the columns
+ * (attributes), just packed.
+ *
+ * @param[out] heapAttrNumList Pointer to an array of AttrNumber.
+ * The attribute numbers (column ID) in the heap relation are stored here.
+ * The AttrNumber is one-origin.
+ * The length of array must be larger than numColumns.
+ *
+ * @param[out] indxColumnIdList Pointer to an array of int16.
+ * The column IDs in the VCI main relation are stored here.
+ * This is zero-origin.
+ * The length of array must be larger than numColumns.
+ *
+ * @param[out] columnSizeList Pointer to an array of int16.
+ * The worst-case widths are stored here.
+ * The length of array must be larger than numColumns.
+ *
+ * @param[in] numColumn Number of columns defined in VCI index.
+ * @param[in] info VCI main relation header information.
+ * @param[in] heapOid OID of original PostgreSQL tables.
+ * @return sum of columnSizeList.
+ */
+Size
+vci_GetColumnIdsAndSizes(AttrNumber *heapAttrNumList,
+						 int16 *indxColumnIdList,
+						 int16 *columnSizeList,
+						 int numColumn,
+						 vci_MainRelHeaderInfo *info,
+						 Oid heapOid)
+{
+	LOCKMODE	lockmode = AccessShareLock;
+
+	/*
+	 * The heap table is taken from the index entry of info->rel; the heapOid
+	 * argument is not consulted.
+	 */
+	Oid			tableOid = info->rel->rd_index->indrelid;
+	Relation	tableRel;
+	TupleDesc	tupleDesc;
+	Size		result = 0;
+
+	tableRel = table_open(tableOid, lockmode);
+	tupleDesc = RelationGetDescr(tableRel);
+
+	for (int colId = VCI_FIRST_NORMALCOLUMN_ID; colId < numColumn; ++colId)
+	{
+		vcis_m_column_t *mColumn = vci_GetMColumn(info, colId);
+		Buffer		buffer;
+		Relation	rel = table_open(mColumn->meta_oid, lockmode);
+		vcis_column_meta_t *metaHeader = vci_GetColumnMeta(&buffer, rel);
+		int16		attnum = metaHeader->pgsql_attnum;
+
+		/* attnum has been copied out; the meta page is no longer needed. */
+		ReleaseBuffer(buffer);
+		table_close(rel, lockmode);
+
+		/*
+		 * Validate the stored attribute number before it is used to index the
+		 * heap tuple descriptor.  Anything outside 1..natts would address
+		 * memory outside the descriptor's attribute array.
+		 */
+		if (attnum <= 0 || attnum > tupleDesc->natts)
+			elog(ERROR, "column not found.");	/* FIXME */
+
+		heapAttrNumList[colId] = attnum;
+
+		/*
+		 * Previously, "attr->attnum - 1" was used for the right value instead
+		 * of the simple sequential number, colId (The attr is extracted from
+		 * indexRel). This was for future expanding to enable to add columns
+		 * to or delete ones from VCI after creating. But this is not
+		 * implemented. And then, the attr is no longer reliable because real
+		 * columns information is stored in the vci_column_ids option not in
+		 * indexRel when using vci_create().
+		 */
+		indxColumnIdList[colId] = colId;
+
+		columnSizeList[colId] =
+			vci_GetColumnWorstSize(TupleDescAttr(tupleDesc, attnum - 1));
+		result += columnSizeList[colId];
+	}
+
+	table_close(tableRel, lockmode);
+
+	return result;
+}
+
+/**
+ * @brief Count number of nullable columns in a tuple descriptor.
+ *
+ * @param[in] tupleDesc tuple descriptor
+ * @return Number of nullable columns in the relation.
+ */
+int
+vci_GetNumberOfNullableColumn(TupleDesc tupleDesc)
+{
+	int			counted = 0;
+
+	/*
+	 * Every attribute is counted; assert-enabled builds additionally verify
+	 * that none of them is declared NOT NULL.
+	 */
+	while (counted < tupleDesc->natts)
+	{
+		Assert(!((TupleDescAttr(tupleDesc, counted)->attnotnull)));
+		counted++;
+	}
+
+	return counted;
+}
+
+/**
+ * @brief Search for a free extent and return the extent ID.
+ *
+ * This function reads extent information in the ROS main relation and checks
+ * if the extent has both its xgen and xdel set to InvalidTransactionId.
+ * The check is done in vci_ExtentIsFree().
+ */
+static uint32
+SearchFreeExtent(vci_MainRelHeaderInfo *info)
+{
+	int32		numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+	int32		candidate = numExtents;
+	vcis_m_extent_t *entry;
+	vci_meta_item_scanner_t *scan;
+
+	/* Pass 1: look for a free slot among the extents already recorded. */
+	scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE);
+	for (entry = vci_GetMExtentNext(info, scan);
+		 entry != NULL;
+		 entry = vci_GetMExtentNext(info, scan))
+	{
+		if (vci_ExtentIsFree(entry))
+		{
+			candidate = scan->index;
+			break;
+		}
+	}
+	vci_EndMetaItemScan(scan);
+
+	if (candidate != numExtents)
+		return candidate;
+
+	/*
+	 * Pass 2: nothing reusable.  Probe the slots from numExtents upwards,
+	 * preparing pages as needed, until a free one shows up.
+	 */
+	for (;; ++candidate)
+	{
+		BlockNumber blk;
+		OffsetNumber off;
+		Buffer		buf;
+		vcis_m_extent_t *slot;
+		bool		slotIsFree;
+
+		vci_GetExtentInfoPosition(&blk, &off, candidate);
+		vci_PreparePagesWithOneItemIfNecessary(info->rel, blk);
+		buf = ReadBuffer(info->rel, blk);
+
+		LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+		slot = (vcis_m_extent_t *) ((char *) BufferGetPage(buf) + off);
+		Assert(slot->xgen == InvalidTransactionId);
+		Assert((slot->xdel == InvalidTransactionId) || (slot->xdel == FrozenTransactionId));
+		slotIsFree = vci_ExtentIsFree(slot);
+
+		UnlockReleaseBuffer(buf);
+
+		if (slotIsFree)
+			break;
+	}
+
+	return candidate;
+}
+
+/**
+ * @brief Get free extent Id.
+ *
+ * This function first checks the pointer in main relation to one free extent.
+ * If it is not a free extent, then scan the main relation to find a free one.
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @return ID of a free extent.
+ */
+uint32
+vci_GetFreeExtentId(vci_MainRelHeaderInfo *info)
+{
+	const int32 firstCandidate = 0;
+	Buffer		buf;
+	vcis_m_extent_t *entry;
+	bool		firstIsFree;
+	int32		found;
+
+	/* Cheap path: extent 0 is examined before any scan is started. */
+	entry = vci_GetMExtent(&buf, info, firstCandidate);
+	LockBuffer(buf, BUFFER_LOCK_SHARE);
+	firstIsFree = vci_ExtentIsFree(entry);
+	UnlockReleaseBuffer(buf);
+
+	if (firstIsFree)
+		return firstCandidate;
+
+	/* Slow path: scan the VCI main relation for a free extent. */
+	found = SearchFreeExtent(info);
+
+	/* Re-read the chosen entry; assert-enabled builds verify it is free. */
+	entry = vci_GetMExtent(&buf, info, found);
+	LockBuffer(buf, BUFFER_LOCK_SHARE);
+	Assert(vci_ExtentIsFree(entry));
+	UnlockReleaseBuffer(buf);
+
+	return found;
+}
+
+/*
+ * *************
+ * ** CAUTION **
+ * *************
+ * USE vci_WriteExtentInfoInMainRosForWosRosConvInit() IN SOME TRANSACTION.
+ * GetCurrentTransactionId() IS USED.
+ */
+
+/**
+ * @brief The function to call before starting WOS -> ROS conversion to write
+ * recovery information.
+ *
+ * This function write new current ROS ID to the header area of ROS main
+ * relation, ROS command, and target extent ID. It also write
+ * InvalidTransactionId at the target extent info.
+ *
+ * @param[in] info pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] extentId target extent ID.
+ * @param[in] dictionaryId target common dictionary ID.
+ * @param[in] xid transaction ID of this write operation.
+ * @param[in] command command of this operation.
+ */
+void
+vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict(
+													   vci_MainRelHeaderInfo *info,
+													   int32 extentId,
+													   int32 dictionaryId,
+													   TransactionId xid,
+													   vci_ros_command_t command)
+{
+	int32		recorded = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+
+	Assert(0 <= recorded);
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId);
+
+	/*
+	 * The target lies beyond the recorded extents: make sure its page exists
+	 * and grow the recorded count so that it covers extentId.
+	 */
+	if (extentId >= recorded)
+	{
+		BlockNumber blk;
+		OffsetNumber off;
+
+		vci_GetExtentInfoPosition(&blk, &off, extentId);
+		vci_PreparePagesWithOneItemIfNecessary(info->rel, blk);
+		recorded = extentId + 1;
+	}
+
+	/* Store the header variables, then write out the update area. */
+	vci_SetMainRelVar(info, vcimrv_num_extents, 0, recorded);
+	vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, xid);
+	vci_SetMainRelVar(info, vcimrv_ros_command, 0, command);
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+
+/**
+ * @brief Locate the extent-information entry of extentId in the VCI main
+ * relation and return a pointer to it.
+ *
+ * @param[out] buffer Receives the buffer holding the entry. Callers in this
+ * file take the content lock themselves after the call and release the
+ * buffer when they are done.
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] extentId The target extent ID.
+ * @return Pointer to the entry inside the page of *buffer.
+ */
+vcis_m_extent_t *
+vci_GetMExtent(Buffer *buffer, vci_MainRelHeaderInfo *info, int32 extentId)
+{
+	BlockNumber blk;
+	OffsetNumber off;
+	char	   *pageBytes;
+
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId);
+	vci_GetExtentInfoPosition(&blk, &off, extentId);
+
+	/*
+	 * info->num_extents_allocated is normally -1. When vci_rc_query ==
+	 * info->command, it has the expected number of extents calculated from
+	 * number of blocks in VCI main relation.  Page preparation is skipped
+	 * only when extentId is below that number.
+	 */
+	if (extentId >= info->num_extents_allocated)
+		vci_PreparePagesWithOneItemIfNecessary(info->rel, blk);
+
+	*buffer = vci_ReadBufferWithPageInit(info->rel, blk);
+	pageBytes = (char *) BufferGetPage(*buffer);
+
+	return (vcis_m_extent_t *) (pageBytes + off);
+}
+
+/**
+ * @brief Advance a meta item scan to the next extent-information entry.
+ *
+ * The first call initializes the scan from the number of extents recorded in
+ * the main relation header.  The page holding the current entry stays pinned
+ * and locked (with scan->buf_lockmode) in scan->buffer between calls.  When
+ * the scan moves to another page, the previous one is written out first if
+ * the scan holds exclusive locks.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in,out] scan Scan state created by vci_BeginMetaItemScan().
+ * @return Pointer to the next entry, or NULL when the recorded extents are
+ * exhausted or a page that is still new is reached. After NULL the scan must
+ * only be passed to vci_EndMetaItemScan().
+ */
+vcis_m_extent_t *
+vci_GetMExtentNext(vci_MainRelHeaderInfo *info, vci_meta_item_scanner_t *scan)
+{
+	OffsetNumber offset;
+	BlockNumber block;
+
+	if (!scan->inited)
+	{
+		scan->max_item = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+		vci_GetExtentInfoPosition(&scan->start_block, &offset, 0);
+		vci_GetExtentInfoPosition(&scan->end_block, &offset, scan->max_item);
+		scan->item_size = sizeof(vcis_m_extent_t);
+		scan->current_block = scan->start_block;
+
+		scan->buffer = ReadBuffer(scan->rel, scan->current_block);
+		LockBuffer(scan->buffer, scan->buf_lockmode);
+
+		if (PageIsNew(BufferGetPage(scan->buffer)))
+		{
+			/*
+			 * Forget the buffer once it is released; otherwise
+			 * vci_EndMetaItemScan() would unlock and release it again.
+			 */
+			UnlockReleaseBuffer(scan->buffer);
+			scan->buffer = InvalidBuffer;
+			return NULL;
+		}
+
+		Assert(scan->index == -1);
+		Assert(scan->max_item >= 0);
+
+		scan->inited = true;
+	}
+	else if (!BufferIsValid(scan->buffer))
+	{
+		/* The scan already ended on a new page; stay at end-of-scan. */
+		return NULL;
+	}
+
+	scan->index++;
+
+	if (scan->index >= scan->max_item)
+		return NULL;
+
+	vci_GetExtentInfoPosition(&block, &offset, scan->index);
+
+	if (scan->current_block != block)
+	{
+		Assert(BufferIsValid(scan->buffer));
+
+		/* Leaving a page: write it out first when it was locked for update. */
+		if (scan->buf_lockmode == BUFFER_LOCK_EXCLUSIVE)
+			vci_WriteOneItemPage(scan->rel, scan->buffer);
+
+		UnlockReleaseBuffer(scan->buffer);
+
+		scan->buffer = ReadBuffer(scan->rel, block);
+		scan->current_block = block;
+
+		LockBuffer(scan->buffer, scan->buf_lockmode);
+
+		if (PageIsNew(BufferGetPage(scan->buffer)))
+		{
+			/* Same as above: do not leave a stale buffer id in the scan. */
+			UnlockReleaseBuffer(scan->buffer);
+			scan->buffer = InvalidBuffer;
+			return NULL;
+		}
+	}
+
+	return (vcis_m_extent_t *) &(((char *) BufferGetPage(scan->buffer))[offset]);
+}
+
+/**
+ * @brief Allocate and initialize the state of a meta item scan.
+ *
+ * No page is read here; the first vci_GetMExtentNext() call does that.
+ *
+ * @param[in] rel The relation to scan.
+ * @param[in] buf_lock BUFFER_LOCK_SHARE or BUFFER_LOCK_EXCLUSIVE; the lock
+ * mode the scan uses on the pages it visits.
+ * @return The palloc'ed scan state, to be freed by vci_EndMetaItemScan().
+ */
+vci_meta_item_scanner_t *
+vci_BeginMetaItemScan(Relation rel, int buf_lock)
+{
+	vci_meta_item_scanner_t *state = palloc0_object(vci_meta_item_scanner_t);
+
+	Assert((buf_lock == BUFFER_LOCK_SHARE) || (buf_lock == BUFFER_LOCK_EXCLUSIVE));
+
+	/* What to scan and how to lock it. */
+	state->rel = rel;
+	state->buf_lockmode = buf_lock;
+
+	/* Position: before the first item, nothing loaded yet. */
+	state->inited = false;
+	state->index = -1;
+	state->buffer = InvalidBuffer;
+	state->current_block = InvalidBlockNumber;
+	state->start_block = InvalidBlockNumber;
+	state->end_block = InvalidBlockNumber;
+
+	/* Sizes are filled in later. */
+	state->max_item = 0;
+	state->max_item_in_page = 0;
+	state->item_size = 0;
+
+	return state;
+}
+
+/**
+ * @brief Finish a meta item scan and free its state.
+ *
+ * If the scan still refers to a buffer, the page is written out first when
+ * the scan used exclusive locks, then the buffer is unlocked and released.
+ *
+ * @param[in] scan Scan state created by vci_BeginMetaItemScan().
+ */
+void
+vci_EndMetaItemScan(vci_meta_item_scanner_t *scan)
+{
+	bool		wroteThroughScan;
+
+	Assert(scan);
+
+	wroteThroughScan = (scan->buf_lockmode == BUFFER_LOCK_EXCLUSIVE);
+
+	if (BufferIsValid(scan->buffer))
+	{
+		if (wroteThroughScan)
+			vci_WriteOneItemPage(scan->rel, scan->buffer);
+
+		UnlockReleaseBuffer(scan->buffer);
+	}
+
+	pfree(scan);
+}
+
+/**
+ * @brief Overwrite the extent-information entry of extentId and write its
+ * page.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] extentId The target extent ID.
+ * @param[in] numRows Value stored in num_rows.
+ * @param[in] numDeletedRows Value stored in num_deleted_rows.
+ * @param[in] numDeletedRowsOld Value stored in num_deleted_rows_old.
+ * @param[in] xgen Value stored in xgen.
+ * @param[in] xdel Value stored in xdel.
+ */
+void
+vci_WriteExtentInfo(vci_MainRelHeaderInfo *info,
+					int32 extentId,
+					uint32 numRows,
+					uint32 numDeletedRows,
+					uint32 numDeletedRowsOld,
+					TransactionId xgen,
+					TransactionId xdel)
+{
+	Buffer		buf;
+	vcis_m_extent_t *entry;
+
+	entry = vci_GetMExtent(&buf, info, extentId);
+
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId);
+
+	/* All fields are replaced under the exclusive lock; flags is reset. */
+	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+	entry->num_rows = numRows;
+	entry->num_deleted_rows = numDeletedRows;
+	entry->num_deleted_rows_old = numDeletedRowsOld;
+	entry->xgen = xgen;
+	entry->xdel = xdel;
+	entry->flags = 0;
+
+	vci_WriteOneItemPage(info->rel, buf);
+	UnlockReleaseBuffer(buf);
+}
+
+/**
+ * @brief This function checks if the extentID is 0 <= extentID and
+ * extentID < numExtents written in header part of main relation.
+ *
+ * If it passes, check the existence of the DB page where the extent ID
+ * information is written.
+ * It might happen that the page has vanished in some trouble...?
+ * In recovery process, the record of the number of extents should be
+ * corrected. If so, elog is better...
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] extentId The target extent ID.
+ * @retval true The DB page is allocated for the information with given
+ * extent ID.
+ * @retval false Need to allocate new DB page for the information.
+ */
+bool
+vci_ExtentInfoExists(vci_MainRelHeaderInfo *info, int32 extentId)
+{
+	int32		recorded = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+	int32		allocated;
+	BlockNumber blk;
+	OffsetNumber off;
+
+	Assert(0 <= recorded);
+
+	/* Beyond the recorded extents: no entry by definition. */
+	if (extentId >= recorded)
+		return false;
+
+	/* A non-negative allocation count decides without touching the relation. */
+	allocated = info->num_extents_allocated;
+	if (allocated >= 0)
+		return extentId < allocated;
+
+	/* Otherwise check that the page holding the entry really exists. */
+	vci_GetExtentInfoPosition(&blk, &off, extentId);
+
+	return blk < RelationGetNumberOfBlocks(info->rel);
+}
+
+/*
+ * Decide whether an object generated by objectXidMin and deleted by
+ * objectXidMax (InvalidTransactionId when not deleted) is visible to
+ * readerXid.
+ */
+static bool
+VisibilityCheck(TransactionId objectXidMin,
+				TransactionId objectXidMax,
+				TransactionId readerXid)
+{
+	/* Never generated: invisible. */
+	if (!TransactionIdIsValid(objectXidMin))
+		return false;
+
+	/* Generation must be frozen or must not follow the reader. */
+	if (!TransactionIdEquals(objectXidMin, FrozenTransactionId) &&
+		!TransactionIdPrecedesOrEquals(objectXidMin, readerXid))
+		return false;
+
+	/* Generated and never deleted: visible. */
+	if (!TransactionIdIsValid(objectXidMax))
+		return true;
+
+	/* A deletion xid that is not a normal xid hides the object. */
+	if (!TransactionIdIsNormal(objectXidMax))
+		return false;
+
+	/* Still visible only while the reader precedes the deletion. */
+	return NormalTransactionIdPrecedes(readerXid, objectXidMax);
+}
+
+/**
+ * @brief Test if the extent is visible.
+ *
+ * @param[in] mExtent Pointer to the extent information.
+ * @param[in] xid The transaction ID to access the information.
+ * @retval true Visible.
+ * @retval false Invisible.
+ */
+bool
+vci_ExtentIsVisible(vcis_m_extent_t *mExtent, TransactionId xid)
+{
+	TransactionId generatedBy = mExtent->xgen;
+	TransactionId deletedBy = mExtent->xdel;
+
+	/* The extent's xgen/xdel pair is judged against the reader's xid. */
+	return VisibilityCheck(generatedBy, deletedBy, xid);
+}
+
+/**
+ * @brief Test whether an extent may be collected.
+ *
+ * @param[in] mExtent Pointer to the extent information.
+ * @param[in] wos2rosXid The transaction ID the deletion is compared with.
+ * @retval true xdel is valid and is either FrozenTransactionId or precedes
+ * wos2rosXid.
+ * @retval false Otherwise.
+ */
+bool
+vci_ExtentIsCollectable(vcis_m_extent_t *mExtent, TransactionId wos2rosXid)
+{
+	/* An extent without a deletion xid is never collectable. */
+	if (!TransactionIdIsValid(mExtent->xdel))
+		return false;
+
+	if (TransactionIdEquals(mExtent->xdel, FrozenTransactionId))
+		return true;
+
+	/* mExtent->xdel < wos2rosXid */
+	return TransactionIdPrecedes(mExtent->xdel, wos2rosXid);
+}
+
+/**
+ * @brief Test whether an extent entry is free, that is, neither its xdel nor
+ * its xgen is a valid transaction ID.
+ *
+ * @param[in] extentInfo Pointer to the extent information.
+ */
+bool
+vci_ExtentIsFree(vcis_m_extent_t *extentInfo)
+{
+	return !(TransactionIdIsValid(extentInfo->xdel) ||
+			 TransactionIdIsValid(extentInfo->xgen));
+}
+
+/* -------------------------------------------------- */
+/* Recovery function around VCI Main Relation */
+/* -------------------------------------------------- */
+
+/*
+ * Copy the current values of the ROS version, the main relation size and
+ * tid_crid_diff_sel into their "last"/"old" counterparts, then write out the
+ * update area of the header.
+ */
+void
+vci_UpdateLastRosVersionAndOthers(vci_MainRelHeaderInfo *info)
+{
+	vci_SetMainRelVar(info, vcimrv_last_ros_version, 0,
+					  vci_GetMainRelVar(info, vcimrv_current_ros_version, 0));
+	vci_SetMainRelVar(info, vcimrv_size_mr_old, 0,
+					  vci_GetMainRelVar(info, vcimrv_size_mr, 0));
+	vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel_old, 0,
+					  vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0));
+
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+
+/*
+ * Copy the "last"/"old" values of the ROS version, the main relation size
+ * and tid_crid_diff_sel back into the current ones, then write out the
+ * update area of the header.  This is the reverse of
+ * vci_UpdateLastRosVersionAndOthers().
+ */
+void
+vci_RecoveryDone(vci_MainRelHeaderInfo *info)
+{
+	vci_SetMainRelVar(info, vcimrv_current_ros_version, 0,
+					  vci_GetMainRelVar(info, vcimrv_last_ros_version, 0));
+	vci_SetMainRelVar(info, vcimrv_size_mr, 0,
+					  vci_GetMainRelVar(info, vcimrv_size_mr_old, 0));
+	vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0,
+					  vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel_old, 0));
+
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+
+/*
+ * Store xid as the current ROS version and command as the ROS command in the
+ * main relation header, then write out the update area of the header.
+ */
+void
+vci_WriteRecoveryRecordDone(vci_MainRelHeaderInfo *info, vci_ros_command_t command,
+							TransactionId xid)
+{
+	/* In-memory header variables first ... */
+	vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, xid);
+	vci_SetMainRelVar(info, vcimrv_ros_command, 0, command);
+
+	/* ... then the write of the update area. */
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+
+/*
+ * Remember the extents touched by the running ROS command so that
+ * vci_RecoveryExtentInfo() can find them again.
+ *
+ * Only sets the old_extent_id / new_extent_id main relation variables; this
+ * function itself does not call vci_WriteMainRelVar().
+ */
+void
+vci_WriteRecoveryRecordForExtentInfo(vci_MainRelHeaderInfo *info, int32 newExtentId, int32 oldExtentId)
+{
+ /*
+ * Meaning of the two IDs per command:
+ *
+ * ConvertWos2Ros: oldExtentId = VCI_INVALID_EXTENT_ID,
+ * newExtentId = new extent
+ *
+ * CollectDeletedRows: oldExtentId = source extent (becomes unused),
+ * newExtentId = new extent
+ *
+ * CollectUnusedExtent: oldExtentId = unused extent,
+ * newExtentId = VCI_INVALID_EXTENT_ID
+ */
+ vci_SetMainRelVar(info, vcimrv_old_extent_id, 0, oldExtentId);
+ vci_SetMainRelVar(info, vcimrv_new_extent_id, 0, newExtentId);
+}
+
+/*
+ * Roll back the extent-info changes left behind by an unfinished ROS command.
+ *
+ * Reads num_extents, old_extent_id, new_extent_id and working_column_id from
+ * the main relation variables, then:
+ * - if an old extent is recorded, rewrites its xdel (see the switch below);
+ * - if a new extent is recorded and its ID is below num_extents, clears its
+ * xgen, sets its xdel to FrozenTransactionId and stores the working
+ * column ID in it.
+ *
+ * command is the ROS command that was recorded as running.
+ */
+void
+vci_RecoveryExtentInfo(vci_MainRelHeaderInfo *info, vci_ros_command_t command)
+{
+ int32 numExtents;
+ int32 oldExtentId;
+ int32 newExtentId;
+ Buffer s_buffer = InvalidBuffer;
+ Buffer d_buffer = InvalidBuffer;
+ vcis_m_extent_t *extentInfo;
+ int16 colId;
+
+ numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+ oldExtentId = vci_GetMainRelVar(info, vcimrv_old_extent_id, 0);
+ newExtentId = vci_GetMainRelVar(info, vcimrv_new_extent_id, 0);
+ colId = vci_GetMainRelVar(info, vcimrv_working_column_id, 0);
+
+ /* Restore the "from" extent, if the command had one. */
+ if (oldExtentId != VCI_INVALID_EXTENT_ID)
+ {
+ TransactionId recovery_xdel;
+
+ switch (command)
+ {
+ case vci_rc_collect_deleted:
+ /* the source extent gets its xdel cleared again */
+ Assert(oldExtentId < numExtents);
+ recovery_xdel = InvalidTransactionId;
+ break;
+ case vci_rc_collect_extent:
+ /* unuse extent Xdel -> Frozen(2) */
+ recovery_xdel = FrozenTransactionId;
+ break;
+ default:
+ /* no other command is expected to record an old extent */
+ Assert(0);
+ recovery_xdel = InvalidTransactionId;
+ break;
+ }
+
+ extentInfo = vci_GetMExtent(&s_buffer, info, oldExtentId); /* from */
+
+ LockBuffer(s_buffer, BUFFER_LOCK_EXCLUSIVE);
+ extentInfo->xdel = recovery_xdel;
+ vci_WriteOneItemPage(info->rel, s_buffer);
+ UnlockReleaseBuffer(s_buffer);
+ }
+
+ /*
+ * Invalidate the "to" extent. A new extent ID at or beyond num_extents is
+ * left untouched.
+ */
+ if ((newExtentId != VCI_INVALID_EXTENT_ID) && (newExtentId < numExtents))
+ {
+ Assert((command == vci_rc_wos_ros_conv) || (command == vci_rc_collect_deleted));
+ extentInfo = vci_GetMExtent(&d_buffer, info, newExtentId); /* to */
+
+ LockBuffer(d_buffer, BUFFER_LOCK_EXCLUSIVE);
+ extentInfo->xgen = InvalidTransactionId;
+ Assert((extentInfo->xdel == InvalidTransactionId) || (extentInfo->xdel == FrozenTransactionId));
+ extentInfo->xdel = FrozenTransactionId;
+ /* remember which column was being worked on when the command stopped */
+ extentInfo->flags |= VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID;
+ extentInfo->recovered_colid = colId;
+ vci_WriteOneItemPage(info->rel, d_buffer);
+ UnlockReleaseBuffer(d_buffer);
+ }
+}
+
+/*
+ * Save num_deleted_rows of every extent into num_deleted_rows_old, so that
+ * vci_RecoveryUpdateDelVec() can restore it later.
+ *
+ * The extents are visited with a meta item scan opened with
+ * BUFFER_LOCK_EXCLUSIVE.
+ */
+void
+vci_WriteRecoveryRecordForUpdateDelVec(vci_MainRelHeaderInfo *info)
+{
+    vci_meta_item_scanner_t *scanner;
+
+    scanner = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_EXCLUSIVE);
+
+    for (;;)
+    {
+        vcis_m_extent_t *ext = vci_GetMExtentNext(info, scanner);
+
+        if (ext == NULL)
+            break;
+
+        ext->num_deleted_rows_old = ext->num_deleted_rows;
+    }
+
+    vci_EndMetaItemScan(scanner);
+}
+
+/*
+ * Restore num_deleted_rows of every extent from num_deleted_rows_old,
+ * undoing an unfinished delete-vector update.
+ *
+ * The extents are visited with a meta item scan opened with
+ * BUFFER_LOCK_EXCLUSIVE.
+ */
+void
+vci_RecoveryUpdateDelVec(vci_MainRelHeaderInfo *info)
+{
+    vci_meta_item_scanner_t *scanner;
+
+    scanner = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_EXCLUSIVE);
+
+    for (;;)
+    {
+        vcis_m_extent_t *ext = vci_GetMExtentNext(info, scanner);
+
+        if (ext == NULL)
+            break;
+
+        ext->num_deleted_rows = ext->num_deleted_rows_old;
+    }
+
+    vci_EndMetaItemScan(scanner);
+}
+
+/*
+ * Return a human-readable, statically allocated name for a ROS command.
+ * Values without a dedicated case map to "unknown".
+ */
+const char *
+vci_GetRosCommandName(vci_ros_command_t command)
+{
+    const char *name;
+
+    switch (command)
+    {
+        case vci_rc_invalid:
+            name = "invalid";
+            break;
+        case vci_rc_vacuum:
+            name = "vacuum";
+            break;
+        case vci_rc_query:
+            name = "query";
+            break;
+        case vci_rc_drop_index:
+            name = "drop index";
+            break;
+        case vci_rc_wos_delete:
+            name = "wos delete";
+            break;
+        case vci_rc_wos_insert:
+            name = "wos insert";
+            break;
+        case vci_rc_recovery:
+            name = "recovery";
+            break;
+        case vci_rc_probe:
+            name = "probe";
+            break;
+        case vci_rc_wos_ros_conv_build:
+            name = "wos ros conv build";
+            break;
+        case vci_rc_generate_local_ros:
+            name = "generate local ros";
+            break;
+        case vci_rc_copy_command:
+            name = "copy command";
+            break;
+        case vci_rc_wos_ros_conv:
+            name = "wos2ros conversion";
+            break;
+        case vci_rc_update_del_vec:
+            name = "update delete vector";
+            break;
+        case vci_rc_collect_deleted:
+            name = "collect deleted rows";
+            break;
+        case vci_rc_collect_extent:
+            name = "collect extent";
+            break;
+        case vci_rc_update_tid_crid:
+            name = "update tid-crid tree";
+            break;
+        default:
+            name = "unknown";
+            break;
+    }
+
+    return name;
+}
+
+/*
+ * Read one block of an index ('i') or materialized-view ('m') relation and
+ * initialize the page with numItem items if it is still new.
+ *
+ * The page is tested without a lock first; only when it looks new is the
+ * buffer locked exclusively and the test repeated before initializing.  The
+ * buffer is returned unlocked.
+ */
+static Buffer
+ReadBufferWithPageInitCore(Relation reln, BlockNumber blockNumber, int16 numItem)
+{
+    Buffer      buf;
+    Page        pg;
+
+    Assert((reln->rd_rel->relkind == 'i') || (reln->rd_rel->relkind == 'm'));
+
+    buf = ReadBuffer(reln, blockNumber);
+    pg = BufferGetPage(buf);
+
+    /* Common case: the page has been initialized already. */
+    if (!PageIsNew(pg))
+        return buf;
+
+    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+    /* Re-check under the lock before initializing. */
+    if (PageIsNew(pg))
+        vci_InitPageCore(buf, numItem, true);
+
+    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+    return buf;
+}
+
+/**
+ * @brief Read the requested block of a VCI relation, initializing the page
+ * if it is new.
+ *
+ * Behaves like ReadBuffer(), except that a new page is initialized with one
+ * item before the buffer is returned.
+ *
+ * Use this function instead of ReadBuffer() for every kind of VCI relation
+ * other than the Data WOS, the Whiteout WOS and the delete vector.  Plain
+ * ReadBuffer() does not need to be replaced immediately after
+ * vci_PreparePagesIfNecessaryCore().
+ *
+ * @param[in] reln The relation.
+ * @param[in] blockNumber The block number to be read.
+ * @return The buffer holding the block.
+ */
+Buffer
+vci_ReadBufferWithPageInit(Relation reln, BlockNumber blockNumber)
+{
+    const int16 itemsPerPage = 1;
+
+    return ReadBufferWithPageInitCore(reln, blockNumber, itemsPerPage);
+}
+
+/**
+ * @brief Read the requested block of a delete vector, initializing the page
+ * if it is new.
+ *
+ * Behaves like ReadBuffer(), except that a new page is initialized with
+ * VCI_ITEMS_IN_PAGE_FOR_DELETE items before the buffer is returned.
+ *
+ * Use this function instead of ReadBuffer() to access a delete vector.
+ * Plain ReadBuffer() does not need to be replaced immediately after
+ * vci_PreparePagesIfNecessaryCore().
+ *
+ * @param[in] reln The relation.
+ * @param[in] blockNumber The block number to be read.
+ * @return The buffer holding the block.
+ */
+Buffer
+vci_ReadBufferWithPageInitDelVec(Relation reln, BlockNumber blockNumber)
+{
+    Buffer      buf;
+
+    buf = ReadBufferWithPageInitCore(reln, blockNumber,
+                                     VCI_ITEMS_IN_PAGE_FOR_DELETE);
+
+    return buf;
+}
diff --git a/contrib/vci/storage/vci_ros_command.c b/contrib/vci/storage/vci_ros_command.c
new file mode 100644
index 0000000..5d4252b
--- /dev/null
+++ b/contrib/vci/storage/vci_ros_command.c
@@ -0,0 +1,4131 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_command.c
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/storage/vci_ros_command.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifndef WIN32
+#include <sys/time.h>
+#endif
+
+#include "access/heapam.h"
+#include "access/heapam_xlog.h"
+#include "access/relscan.h"
+#include "access/tupdesc.h"
+#include "access/genam.h"
+#include "access/visibilitymap.h" /* for visibilitymap_set() */
+#include "access/xact.h"
+#include "access/tableam.h"
+#include "catalog/index.h"
+#include "catalog/pg_operator.h" /* for TIDLessOperator */
+#include "catalog/storage.h"
+#include "commands/vacuum.h"
+#include "storage/freespace.h"
+#include "storage/itemptr.h"
+#include "storage/lmgr.h"
+#include "storage/procarray.h"
+#include "storage/smgr.h" /* for RelationSetTargetBlock() */
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/tuplesort.h"
+
+#include "postgresql_copy.h"
+
+#include "vci.h"
+#include "vci_chunk.h"
+
+#include "vci_columns.h"
+#include "vci_columns_data.h"
+
+#include "vci_fetch.h"
+#include "vci_freelist.h"
+#include "vci_mem.h"
+#include "vci_ros.h"
+#include "vci_ros_command.h"
+#include "vci_tidcrid.h"
+#include "vci_wos.h"
+#include "vci_xact.h"
+
+extern bool HeapTupleSatisfiesWos2Ros(HeapTuple htup, Snapshot snapshot, Buffer buffer);
+extern bool HeapTupleSatisfiesLocalRos(HeapTuple htup, Snapshot snapshot, Buffer buffer);
+bool VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer);
+
+/*
+ * Selector passed to CountExtents(). Judging by the names, it chooses
+ * between counting deleted rows and counting unused extents -- confirm
+ * against CountExtents().
+ */
+typedef enum
+{
+ CEK_CountDeletedRows,
+ CEK_CountUnusedExtents,
+} CEKind;
+
+/*
+ * Identifies which of the two WOS relations an operation refers to: the
+ * Data WOS or the Whiteout WOS.
+ */
+typedef enum
+{
+ WOS_Data,
+ WOS_Whiteout,
+} WosKind;
+
+/*
+ * One WOS entry as collected into a TID array / sort state.
+ *
+ * NOTE(review): the field meanings below are inferred from the names and
+ * from how the orig_tids/wos_tids arrays are used in this file -- confirm
+ * against the code that fills this struct.
+ */
+typedef struct
+{
+ /* presumably the TID of the row in the original (heap) relation */
+ ItemPointerData orig_tid;
+
+ /* presumably the TID of the corresponding entry inside the WOS relation */
+ ItemPointerData wos_tid;
+
+ /* presumably whether this entry may be picked up for processing -- TODO confirm */
+ bool movable;
+
+ /* presumably a 64-bit transaction ID associated with the entry -- TODO confirm */
+ int64 xid64;
+
+} vci_tid_tid_xid64_t;
+
+static bool WaitTransactionEndOfLastRosCommand(vci_MainRelHeaderInfo *info);
+static void fillTidListFromTidSortState(vci_RosCommandContext *comContext, int numRows);
+static int ConvertWos2Ros(vci_RosCommandContext *comContext);
+static void FillValuesColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage);
+static void FillIsNullColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage);
+static void FillIsNullRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage);
+static void FillValuesRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage);
+static void AppendDataToLocalRos(vci_local_ros_t *localRos, RosChunkStorage *storage, vci_MainRelHeaderInfo *info);
+static Size ConvertWos2LocalRos(vci_RosCommandContext *comContext);
+static void FillOneRosChunkBuffer(vci_RosCommandContext *comContext, int rowId, int numRowsToConvert);
+static void ReadOneExtentAndStoreInChunkStorage(vci_RosCommandContext *comContext);
+static Size ConvertWhiteOut2LocalDeleteList(vci_RosCommandContext *comContext, int sel);
+static bool NeedMainRelHeaderUpdate(vci_ros_command_t command);
+static int CmpUint64(const void *pa, const void *pb);
+static void FlushTidCridPairListToTreeForBuild(vci_TidCridRelations *relPair, vcis_tidcrid_pair_list_t *appList, BlockNumber blockNumber);
+static void UpdateTidCridForBuild(vci_RosCommandContext *comContext);
+static void vci_build_callback(Relation rel, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state);
+static void FinalizeBuild(vci_RosCommandContext *comContext);
+static double GetEstimatedNumRows(Oid relid);
+static void RemoveWosEntries(vci_RosCommandContext *comContext, WosKind wos_kind);
+static uint64 cleanUpWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType);
+static uint64 UpdateDelVec(vci_RosCommandContext *comContext, Size workareaSize, uint64 numRowsAtOnce);
+static void writeNumDeleteRowsIntoExntetInfo(vci_MainRelHeaderInfo *info, int32 topExtentId, uint32 numExtents, uint32 *numDeletedRows);
+static vci_target_extent_info_t CountExtents(Relation mainRel, uint32 threshold, CEKind kind);
+static HeapTuple getTupleFromVector(int offset, TupleDesc tupleDesc, vci_virtual_tuples_t *vecSet);
+static void FillOneRosChunkBufferFromExtent(vci_RosCommandContext *comContext, int32 extentId, uint32 *rowIdInExtent);
+static bool isCdrTargetExtentValid(vci_RosCommandContext *comContext);
+static int32 CollectDeletedRows(vci_RosCommandContext *comContext, Snapshot snapshot);
+static uint32 SearchUnusedExtent(vci_MainRelHeaderInfo *info);
+static void CollectUnusedExtent(vci_RosCommandContext *comContext);
+static void UpdateTidCrid(vci_RosCommandContext *comContext, Size workareaSize);
+static void collectBlockNumberToMove(vci_RosCommandContext *comContext, int numPages);
+static void freezeMainAndRos(vci_RosCommandContext *comContext);
+static void freezeWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType, Snapshot snapshot);
+static void truncateRos(vci_RosCommandContext *comContext);
+static void truncateWos(vci_RosCommandContext *comContext);
+static void constructTidArray(vci_RosCommandContext *comContext, int max_data_wos_entries, int max_whiteout_wos_entries);
+static int comparator_orig_tid_xid64(const void *pa, const void *pb);
+static bool can_select_candidate_for_wos2ros_conv(vci_tid_tid_xid64_t *data_wos_item, vci_RosCommandContext *comContext, ItemPointer last_whiteout_orig_tid);
+static bool can_select_candidate_for_update_delvec(vci_tid_tid_xid64_t *whiteout_wos_item, vci_RosCommandContext *comContext);
+static void put_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid);
+static bool get_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid);
+static int readTidListFromWosIntoTidArray(Oid wos_od, WosKind wos_kind, vci_tid_tid_xid64_t *wos_entris, int max_wos_entries, Snapshot snapshot);
+static void constructTidSortState(vci_RosCommandContext *comContext);
+static void readTidListFromWosIntoTidSortState(Oid wos_oid, WosKind wos_kind, TupleTableSlot *slot, Tuplesortstate *sortstate, Snapshot snapshot, TransactionId wosros_xid);
+static bool getValidTidSortState(Tuplesortstate *sortstate, TupleTableSlot *slot, vci_tid_tid_xid64_t *item);
+static int32 compareXid64(int64 data_wos_xid64, int64 whiteout_wos_xid64);
+
+/*
+ * WOS -> ROS conversion
+ * We have two situations of WOS -> ROS conversion.
+ * 1. conversion process to reduce WOS and move data into ROS.
+ * In this case, all columns registered to the VCI are converted into
+ * ROS style and stored in each relation. The column meta data relations
+ * are also updated. We normally convert one full extent at a time.
+ * The precise description is,
+ * A. take an exclusive lock to the main relation header.
+ * B. recover ROS if broken.
+ * C. scan WOS with care of freeze condition and deleted condition
+ * and collect live TID, up to 256 K rows.
+ * D. sort TID.
+ * E. write conversion information into VCI main relation header and
+ * extent info.
+ * F. collect target tuples and build ROS data. Here we have to chunk
+ * the data, since the work area might be limited.
+ * G. Find extent and free spaces to write the data.
+ * H. Write meta data.
+ * I. Write extent.
+ * J. Finalize meta data and VCI main relation.
+ * K. release the main relation header.
+ * For this purpose, we need VCI main relation, size of workarea.
+ *
+ * 2. local ROS conversion.
+ * In this case, given columns are converted into ROS style and stored
+ * in memory. All the visible data are converted.
+ * The precise description is,
+ * A. scan WOS with care of visibility and deleted condition and collect
+ * visible TID.
+ * B. sort TID.
+ * C. take an exclusive lock to the main relation header.
+ * D. recover ROS if broken.
+ * E. collect target tuples and build local ROS data.
+ * F. release the main relation header.
+ * For this purpose, we need VCI main relation, size of area to store,
+ * necessary column ID list.
+ *
+ */
+
+/* -------------------------------------------------------------- */
+
+/*
+ * Polling parameters of WaitTransactionEndOfLastRosCommand(): the sleep
+ * between two checks, and the maximum number of checks. With a 1 ms sleep,
+ * 100000 checks add up to roughly 100 seconds.
+ */
+#define PERIOD_TO_CHECK_TRANSACTION_END (INT64CONST(1000)) /* sleep per check: 1 ms */
+#define DURATION_TO_CHECK_TRANSACTION_END (100000) /* max. checks: about 100 s */
+
+/*
+ * Copy from vacuumlazy.c
+ */
+#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
+#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
+#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
+
+/**
+ * @brief Wait for the transaction of the previous ROS command to end.
+ *
+ * This function is designed to detect the transaction end after the VCI
+ * exclusive write lock has been released.  We expect that a ROS command is
+ * normally committed soon after the lock is released, but its transaction
+ * may be neither committed nor aborted yet when we get here.  In that case
+ * the end is polled for once per PERIOD_TO_CHECK_TRANSACTION_END, at most
+ * DURATION_TO_CHECK_TRANSACTION_END times (originally about 100 seconds in
+ * total).
+ *
+ * @param[in] info Pointer to vci_MainRelHeaderInfo whose VCI index is
+ * determined.
+ * @retval true The current ROS version is not a valid transaction ID, or it
+ * is the current transaction, or the end of the previous ROS command's
+ * transaction was detected within the wait time.
+ * @retval false The transaction end was not detected.
+ */
+static bool
+WaitTransactionEndOfLastRosCommand(vci_MainRelHeaderInfo *info)
+{
+    /* The current ROS version is the transaction ID of the last ROS command. */
+    TransactionId curRosVer = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0);
+    int         attempt;
+
+    /* Nothing to wait for. */
+    if (!TransactionIdIsValid(curRosVer) ||
+        TransactionIdIsCurrentTransactionId(curRosVer))
+        return true;
+
+    for (attempt = 0; attempt < DURATION_TO_CHECK_TRANSACTION_END; attempt++)
+    {
+        if (ConditionalXactLockTableWait(curRosVer, false))
+            return true;
+
+        pg_usleep(PERIOD_TO_CHECK_TRANSACTION_END); /* wait 1 ms */
+    }
+
+    return false;
+}
+
+/**
+ * @brief Determine how the transaction of the previous ROS command ended
+ * and repair the VCI relations if it did not commit.
+ *
+ * First, waits for the end of the previous ROS command's transaction if
+ * necessary.  If the current and last ROS versions already agree, nothing
+ * else is done.  Otherwise:
+ * - when the transaction committed, the last ROS version (and the other
+ *   saved variables) are updated from the current ones;
+ * - when it aborted or crashed, the command recorded in the main relation
+ *   is rolled back by the matching recovery routines, followed by
+ *   vci_RecoveryDone().
+ *
+ * info->command is switched to vci_rc_recovery for the duration of the call
+ * and restored before returning.
+ *
+ * Raises ERROR when the previous transaction does not end within the wait
+ * time, and PANIC on states that should be impossible (another ROS command
+ * still in progress, an unknown recorded command, an invalid transaction
+ * type).
+ *
+ * @param[in] info Pointer to vci_MainRelHeaderInfo whose VCI index is
+ * determined.
+ *
+ * @note Assuming that this function is called while the main relation is
+ * locked exclusively.
+ */
+void
+vci_RecoverOneVCIIfNecessary(vci_MainRelHeaderInfo *info)
+{
+    TransactionId curRosVer;
+    TransactionId lastRosVer;
+    vci_ros_command_t commandSave;
+
+    /* Validate the pointer before it is dereferenced for the first time. */
+    Assert(info);
+
+    commandSave = info->command;
+
+    vci_ChangeCommand(info, vci_rc_recovery);
+
+    /*
+     * Since the transaction is committed or aborted after the lock is
+     * released, we have to wait for it.
+     */
+    if (!WaitTransactionEndOfLastRosCommand(info))
+        elog(ERROR, "unterminated ROS command");
+
+    curRosVer = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0);
+    lastRosVer = vci_GetMainRelVar(info, vcimrv_last_ros_version, 0);
+
+    if (!TransactionIdEquals(curRosVer, lastRosVer))
+    {
+        switch (vci_transaction_get_type(curRosVer))
+        {
+            case VCI_XACT_SELF:
+                /* The last ROS version has been already updated */
+                break;
+
+            case VCI_XACT_IN_PROGRESS:
+                elog(PANIC, "internal error. multiple ROS command running");
+                break;
+
+            case VCI_XACT_DID_COMMIT:
+                /* update last ROS version and others */
+                vci_UpdateLastRosVersionAndOthers(info);
+                break;
+
+            case VCI_XACT_DID_ABORT:
+            case VCI_XACT_DID_CRASH:
+                {
+                    vci_ros_command_t command;
+
+                    command = vci_GetMainRelVar(info, vcimrv_ros_command, 0);
+
+                    elog(DEBUG1, "crash recovery: previous command=\"%s\"(%d)",
+                         vci_GetRosCommandName(command), command);
+
+                    /* Undo whatever the recorded command left half-done. */
+                    switch (command)
+                    {
+                        case vci_rc_update_del_vec:
+                            vci_RecoveryUpdateDelVec(info);
+                            break;
+
+                        case vci_rc_wos_ros_conv:
+                        case vci_rc_collect_deleted:
+                        case vci_rc_collect_extent:
+                            vci_RecoveryExtentInfo(info, command);
+                            vci_RecoveryFreeSpace(info, command);
+                            break;
+
+                        case vci_rc_update_tid_crid:
+                            vci_RecoveryTidCrid(info);
+                            vci_RecoveryFreeSpaceForTidCrid(info);
+                            break;
+
+                        default:
+                            elog(PANIC, "last recorded ros command is fatally broken.");
+                            break;
+                    }
+
+                    vci_RecoveryDone(info);
+                }
+                break;
+
+            case VCI_XACT_INVALID:
+                elog(PANIC, "should not reach here");
+                break;
+        }
+    }
+
+    vci_ChangeCommand(info, commandSave);
+}
+
+/*
+ * Pull up to numRows Data-WOS entries via get_entry_into_tid_list() into
+ * comContext->wos2ros_array (orig_tids[] and wos_tids[]).
+ *
+ * Stops early when no further entry is available.  The number of entries
+ * actually stored is recorded in wos2ros_array.num and numRowsToConvert.
+ */
+static void
+fillTidListFromTidSortState(vci_RosCommandContext *comContext, int numRows)
+{
+    int         filled = 0;
+
+    Assert(numRows <= VCI_NUM_ROWS_IN_EXTENT);
+
+    while (filled < numRows)
+    {
+        Assert(filled < comContext->wos2ros_array.max);
+
+        if (!get_entry_into_tid_list(comContext, WOS_Data,
+                                     &comContext->wos2ros_array.orig_tids[filled],
+                                     &comContext->wos2ros_array.wos_tids[filled]))
+            break;
+
+        filled++;
+    }
+
+    comContext->numRowsToConvert = filled;
+    comContext->wos2ros_array.num = filled;
+}
+
+/*
+ * Convert the rows selected in comContext into one ROS extent.
+ *
+ * Does nothing and returns 0 when comContext->numRowsToConvert is below 1.
+ * Otherwise records the conversion in the main relation, reads the rows
+ * into the chunk storage, registers them in the TID-CRID update list and
+ * writes the extent comContext->extentId.
+ *
+ * Returns the number of rows held in the chunk storage after reading, i.e.
+ * the number of rows converted.
+ */
+static int
+ConvertWos2Ros(vci_RosCommandContext *comContext)
+{
+ int result = 0;
+
+ if (comContext->numRowsToConvert < 1)
+ {
+ elog(DEBUG2, "stop WOS to ROS conversion numRowsToConvert = %d", comContext->numRowsToConvert);
+ return 0;
+ }
+
+ elog(DEBUG2, "start to convert WOS to ROS");
+
+ /* obtain target extent ID */
+ /* comContext->extentId = vci_GetFreeExtentId(&(comContext->info)); */
+ /* NOTE(review): with the call above disabled, extentId must already be set by the caller. */
+ elog(DEBUG2,
+ "WOS -> ROS conversion: index: %s extent ID: " INT64_FORMAT,
+ RelationGetRelationName(comContext->info.rel),
+ (int64) comContext->extentId);
+
+ /*
+ * Set WOS->ROS conversion data and write main relation for recovery.
+ * Header and extent info. Here, we also put current ROS version to the
+ * actual current transaction ID.
+ */
+ vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info),
+ comContext->extentId,
+ comContext->xid);
+
+ /* start from an empty chunk storage and chunk buffer */
+ vci_ResetRosChunkStorage(&(comContext->storage));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+ /* read data for one extent */
+ ReadOneExtentAndStoreInChunkStorage(comContext);
+
+ /* write one extent into ROS */
+ vci_AddTidCridUpdateList(&(comContext->info),
+ &(comContext->storage),
+ comContext->extentId);
+ vci_WriteOneExtent(&(comContext->info),
+ &(comContext->storage),
+ comContext->extentId,
+ comContext->xid,
+ InvalidTransactionId,
+ comContext->xid);
+
+ result = comContext->storage.numTotalRows;
+
+ elog(DEBUG2, "converted %d rows into ROS", result);
+
+ return result;
+}
+
+/*
+ * Fill the values of every column of vTuples, in columnar layout, from the
+ * chunk storage.
+ *
+ * Each column is dispatched on its compression type to the fixed-width or
+ * the variable-width fill routine; any other compression type raises an
+ * error.
+ */
+static void
+FillValuesColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+                                        RosChunkStorage *rosChunkStorage)
+{
+    int16       col;
+
+    for (col = 0; col < vTuples->num_columns; col++)
+    {
+        if (vTuples->column_info[col].comp_type == vcis_compression_type_fixed_raw)
+        {
+            vci_FillFixedWidthColumnarFromRosChunkStorage(vTuples, col,
+                                                          rosChunkStorage);
+        }
+        else if (vTuples->column_info[col].comp_type == vcis_compression_type_variable_raw)
+        {
+            vci_FillVariableWidthColumnarFromRosChunkStorage(vTuples, col,
+                                                             rosChunkStorage);
+        }
+        else
+        {
+            Assert(false);
+            elog(ERROR, "internal error: unsupported compression type");
+        }
+    }
+}
+
+/*
+ * Expand the per-row null bits kept in the chunk storage into the
+ * column-major isnull array of vTuples.
+ *
+ * The isnull array is cleared first. Then, for every chunk and every
+ * nullable-column bit whose column ID is at least VCI_FIRST_NORMALCOLUMN_ID,
+ * the bit of each row is stored at
+ * isnull[num_rows_in_extent * colId + (rows of earlier chunks) + row].
+ *
+ * Returns without touching anything when vTuples has no columns.
+ */
+static void
+FillIsNullColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+ RosChunkStorage *rosChunkStorage)
+{
+ /* rows are handled in blocks of this many rows per column */
+ const int16 strideR = 64;
+ /* number of rows contributed by the chunks processed so far */
+ int baseOffset = 0;
+ /* maps a null-bit position to its column ID */
+ int16 *nullableColumnId = vci_GetNullableColumnIds(vTuples);
+
+ if (vTuples->num_columns < 1)
+ return;
+
+ Assert(0 < rosChunkStorage->numFilled);
+ Assert(vTuples->num_columns <= rosChunkStorage->chunk[0]->numColumns);
+ Assert(vTuples->fetch_context->query_context->num_nullable_columns <= rosChunkStorage->chunk[0]->numNullableColumns);
+ Assert(rosChunkStorage->numTotalRows <= vTuples->num_rows_in_extent);
+
+ /* NOTE(review): the size assumes one byte per isnull element -- confirm. */
+ MemSet(vTuples->isnull, 0, vTuples->num_columns * vTuples->num_rows_in_extent);
+
+ for (int sId = 0; sId < rosChunkStorage->numFilled; ++sId)
+ {
+ RosChunkBuffer *chunk = rosChunkStorage->chunk[sId];
+
+ for (int rId = 0; rId < chunk->numFilled; rId += strideR)
+ {
+ /* end (exclusive) of the current block of rows */
+ int pIdMax = Min(rId + strideR, chunk->numFilled);
+
+ for (int bitId = 0; bitId < chunk->numNullableColumns; ++bitId)
+ {
+ int colId = nullableColumnId[bitId];
+
+ /* column IDs below VCI_FIRST_NORMALCOLUMN_ID are skipped */
+ if (VCI_FIRST_NORMALCOLUMN_ID <= colId)
+ {
+ uint8 *dst = (uint8 *) &(vTuples->isnull[(vTuples->num_rows_in_extent * colId) + baseOffset]);
+
+ /* each row owns nullWidthInByte bytes of null bits in the chunk */
+ for (int pId = rId; pId < pIdMax; ++pId)
+ dst[pId] = vci_GetBit((uint8 *) &(chunk->nullData[chunk->nullWidthInByte * pId]), bitId);
+ }
+ }
+ }
+ baseOffset += chunk->numFilled;
+ }
+ Assert(rosChunkStorage->numTotalRows == baseOffset);
+}
+
+/*
+ * Row-wise counterpart of FillIsNullColumnwiseFromRosChunkStorage().
+ *
+ * Not implemented.  Reaching this function is an internal error; it is
+ * reported the same way as the unsupported-compression case in
+ * FillValuesColumnwiseFromRosChunkStorage(): an assertion failure in
+ * assert-enabled builds and an ordinary ERROR otherwise, instead of a bare
+ * abort() that would bring down the whole server.
+ */
+static void
+FillIsNullRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+                                     RosChunkStorage *rosChunkStorage)
+{
+    Assert(false);
+    elog(ERROR, "internal error: row-wise local ROS is not supported");
+}
+
+/*
+ * Row-wise counterpart of FillValuesColumnwiseFromRosChunkStorage().
+ *
+ * Not implemented.  Reaching this function is an internal error; it is
+ * reported the same way as the unsupported-compression case in
+ * FillValuesColumnwiseFromRosChunkStorage(): an assertion failure in
+ * assert-enabled builds and an ordinary ERROR otherwise, instead of a bare
+ * abort() that would bring down the whole server.
+ */
+static void
+FillValuesRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+                                     RosChunkStorage *rosChunkStorage)
+{
+    Assert(false);
+    elog(ERROR, "internal error: row-wise local ROS is not supported");
+}
+
+/*
+ * Append the rows held in storage to localRos as one more local extent.
+ *
+ * Grows localRos->extent by one slot, creates a virtual-tuple set sized for
+ * storage->numTotalRows rows and fills it (CRID, skip, TID, null flags and
+ * values) from storage. Local extents get negative IDs: -1, -2, ...
+ * All work is done in localRos->memory_context.
+ *
+ * The info parameter is not used by this function.
+ */
+static void
+AppendDataToLocalRos(vci_local_ros_t *localRos,
+ RosChunkStorage *storage,
+ vci_MainRelHeaderInfo *info)
+{
+ MemoryContext oldMemCtx;
+ struct vci_virtual_tuples *vTuples;
+ int32 extentId;
+
+ oldMemCtx = MemoryContextSwitchTo(localRos->memory_context);
+
+ /* the n-th local extent gets the extent ID -n */
+ ++(localRos->num_local_extents);
+ extentId = -(localRos->num_local_extents);
+
+ localRos->extent = vci_repalloc(localRos->extent,
+ sizeof(vci_virtual_tuples_t *) *
+ localRos->num_local_extents);
+ vTuples = vci_CSCreateVirtualTuplesWithNumRows(localRos->fetch_context,
+ storage->numTotalRows);
+ localRos->extent[localRos->num_local_extents - 1] = vTuples;
+
+ /*
+ * Originally, localRos->size_vector_memory_context has the total size of
+ * vector sets. The third parameter of vci_CSInitializeVectorSet() is the
+ * size for one vector set. Normally, we give up when many data are stored
+ * in ROS. So, we can fix the maximum number of extents.
+ *
+ * NOTE(review): vci_CSInitializeVectorSet() is not called in this
+ * function; this comment may be stale.
+ */
+
+ /* the whole local extent is presented as already read */
+ vTuples->num_rows = storage->numTotalRows;
+ vTuples->extent_id = extentId;
+ vTuples->num_rows_in_extent = storage->numTotalRows;
+ vTuples->row_id_in_extent = 0;
+ vTuples->status = vcirvs_read_whole;
+
+ /* crid and tid are filled only when the tuple set has them */
+ if (vTuples->crid)
+ vci_FillCridInVirtualTuples(vTuples);
+
+ MemSet(vTuples->skip, 0, sizeof(uint16) * vTuples->num_rows_in_extent);
+
+ if (vTuples->tid)
+ vci_FillFixedWidthColumnarFromRosChunkStorage(vTuples, VCI_COLUMN_ID_TID, storage);
+
+ if (vTuples->use_column_store)
+ {
+ FillIsNullColumnwiseFromRosChunkStorage(vTuples, storage);
+ FillValuesColumnwiseFromRosChunkStorage(vTuples, storage);
+ }
+ else
+ {
+ FillIsNullRowwiseFromRosChunkStorage(vTuples, storage);
+ FillValuesRowwiseFromRosChunkStorage(vTuples, storage);
+ }
+
+ MemoryContextSwitchTo(oldMemCtx);
+}
+
+/*
+ * Convert the rows selected in comContext into local ROS extents.
+ *
+ * Repeats "read one extent worth of rows, append it to the local ROS" with
+ * extent IDs -1, -2, ... until comContext->done is set.  An iteration that
+ * read no rows appends nothing.
+ *
+ * Returns the total number of rows converted; 0 when
+ * comContext->numRowsToConvert is below 1.
+ */
+static Size
+ConvertWos2LocalRos(vci_RosCommandContext *comContext)
+{
+    Size        result = 0;
+
+    if (comContext->numRowsToConvert < 1)
+        return 0;
+
+    elog(DEBUG2, "start to generate local ROS");
+
+    for (comContext->extentId = -1; (!comContext->done);
+         comContext->extentId -= 1)
+    {
+        /* Log messages must not end with a newline. */
+        elog(DEBUG3,
+             "WOS -> local ROS conversion: index: %s extent ID:%d",
+             RelationGetRelationName(comContext->info.rel),
+             comContext->extentId);
+
+        vci_ResetRosChunkStorage(&(comContext->storage));
+        vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+        /* read data for one extent */
+        ReadOneExtentAndStoreInChunkStorage(comContext);
+
+        /* append one extent to the local ROS */
+        if (0 < comContext->storage.numTotalRows)
+            AppendDataToLocalRos(comContext->local_ros,
+                                 &(comContext->storage),
+                                 &(comContext->info));
+
+        result += comContext->storage.numTotalRows;
+        elog(DEBUG2, "converted %llu rows into local ROS",
+             (unsigned long long) result);
+    }
+
+    return result;
+}
+
+/* **************************************
+ * ** CAUTION: AttrNumber is 1 origin. **
+ * **************************************
+ */
+/**
+ * @brief Fetch heap tuples listed in comContext->wos2ros_array and append
+ * them to the ROS chunk buffer.
+ *
+ * Entries are taken from wos2ros_array.orig_tids[], starting at index
+ * (rowId + wos2ros_array.offset), for at most numRowsToConvert entries.
+ * Each TID is fetched from the heap relation with the active snapshot and
+ * added to comContext->buffer.  When comContext->data_wos_del_list is set,
+ * the matching WOS TID is additionally put into that tuplesort.
+ *
+ * When the end of the TID array (wos2ros_array.num) is reached,
+ * comContext->done is set to true and the loop stops.  Nothing is done when
+ * wos2ros_array.max is not positive.
+ *
+ * Raises ERROR (ERRCODE_DATA_CORRUPTED) when a listed TID cannot be fetched.
+ * In assert-enabled builds, for WOS->ROS conversion and deleted-row
+ * collection, it also raises that error when a TID already has a CRID in
+ * the current TID-CRID update list.
+ *
+ * @param[in,out] comContext Command context holding the TID array, the heap
+ * relation and the chunk buffer.
+ * @param[in] rowId Row position, relative to wos2ros_array.offset, of the
+ * first entry to read.
+ * @param[in] numRowsToConvert Maximum number of entries to read.
+ */
+static void
+FillOneRosChunkBuffer(vci_RosCommandContext *comContext,
+                      int rowId,
+                      int numRowsToConvert)
+{
+    TupleDesc   tupleDesc = RelationGetDescr(comContext->heapRel);
+    Snapshot    snapshot = GetActiveSnapshot();
+
+    if (comContext->wos2ros_array.max > 0)
+    {
+        uint32      sel PG_USED_FOR_ASSERTS_ONLY;
+        vci_ros_command_t command = comContext->command;
+
+#ifdef USE_ASSERT_CHECKING
+        vci_TidCridUpdateListContext *oldListContext = NULL;
+#endif
+
+        if ((command == vci_rc_wos_ros_conv) ||
+            (command == vci_rc_collect_deleted))
+        {
+            sel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+
+#ifdef USE_ASSERT_CHECKING
+            oldListContext = vci_OpenTidCridUpdateList(&comContext->info, sel);
+#endif
+        }
+        else if (command == vci_rc_generate_local_ros)
+        {
+            sel = comContext->local_ros->fetch_context->query_context->tid_crid_diff_sel;
+        }
+
+        for (int offset = 0; offset < numRowsToConvert; ++offset)
+        {
+            HeapTupleData tuple;
+            Buffer      buffer;
+            int         actualOffset = rowId + comContext->wos2ros_array.offset + offset;
+
+            /* ran off the end of the TID array: tell the caller we are done */
+            if (comContext->wos2ros_array.num <= actualOffset)
+            {
+                comContext->done = true;
+                break;
+            }
+
+            CHECK_FOR_INTERRUPTS();
+
+            tuple.t_self = comContext->wos2ros_array.orig_tids[actualOffset];
+
+            /* BlockNumber is unsigned, hence %u for the block part of a TID */
+            if (!heap_fetch(comContext->heapRel, snapshot, &tuple, &buffer, true))
+            {
+                ereport(ERROR,
+                        (errcode(ERRCODE_DATA_CORRUPTED),
+                         errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+                         errdetail("TID (%u,%d) has been deleted from table \"%s\"",
+                                   ItemPointerGetBlockNumber(&tuple.t_self),
+                                   ItemPointerGetOffsetNumber(&tuple.t_self),
+                                   RelationGetRelationName(comContext->heapRel)),
+                         errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+            }
+
+#ifdef USE_ASSERT_CHECKING
+            if (oldListContext)
+            {
+                uint64      cridUint = vci_GetCridFromTid(oldListContext, &tuple.t_self, NULL);
+
+                /* a TID that already maps to a CRID must not be converted again */
+                if (cridUint != VCI_INVALID_CRID)
+                    ereport(ERROR,
+                            (errcode(ERRCODE_DATA_CORRUPTED),
+                             errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+                             errdetail("try to insert TID (%u,%d) into ROS twice: extentId=%d, index=%d",
+                                       ItemPointerGetBlockNumber(&tuple.t_self),
+                                       ItemPointerGetOffsetNumber(&tuple.t_self),
+                                       vci_CalcExtentIdFromCrid64(cridUint),
+                                       vci_CalcRowIdInExtentFromCrid64(cridUint)),
+                             errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+            }
+#endif
+
+            vci_FillOneRowInRosChunkBuffer(&(comContext->buffer),
+                                           &(comContext->info),
+                                           &tuple.t_self,
+                                           &tuple,
+                                           comContext->indxColumnIdList,
+                                           comContext->heapAttrNumList,
+                                           tupleDesc);
+
+            /* remember the WOS entry that corresponds to the converted row */
+            if (comContext->data_wos_del_list)
+            {
+                tuplesort_putdatum(comContext->data_wos_del_list,
+                                   ItemPointerGetDatum(&comContext->wos2ros_array.wos_tids[actualOffset]), false);
+            }
+
+            ReleaseBuffer(buffer);
+        }
+
+#ifdef USE_ASSERT_CHECKING
+        if (oldListContext)
+            vci_CloseTidCridUpdateList(oldListContext);
+#endif
+    }
+}
+
+/*
+ * Read the rows for one extent, chunk by chunk, into the chunk storage.
+ *
+ * Processes comContext->numRowsToConvert rows in steps of
+ * comContext->numRowsAtOnce: each step fills the chunk buffer from the heap
+ * and, if anything was filled, registers the buffer in comContext->storage
+ * and resets the buffer counter. Finally advances wos2ros_array.offset by
+ * numRowsToConvert, so the next call continues after these rows.
+ */
+static void
+ReadOneExtentAndStoreInChunkStorage(vci_RosCommandContext *comContext)
+{
+ /* collect data for one extent */
+ for (Size rowId = 0;
+ rowId < comContext->numRowsToConvert;
+ rowId += comContext->numRowsAtOnce)
+ {
+ /* the number of rows in one chunk */
+ int numRowsToConvert = comContext->numRowsToConvert - rowId;
+
+ /* never ask for more rows than the chunk buffer has room for */
+ if (comContext->numRowsAtOnce - comContext->buffer.numFilled < numRowsToConvert)
+ numRowsToConvert = comContext->numRowsAtOnce - comContext->buffer.numFilled;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* fetch the data from original relation */
+ FillOneRosChunkBuffer(comContext, rowId, numRowsToConvert);
+ if (0 < comContext->buffer.numFilled)
+ {
+ /* copy chunk buffer in a compact manner */
+ vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+ }
+ }
+
+ comContext->wos2ros_array.offset += comContext->numRowsToConvert;
+}
+
+/*
+ * Translate the TIDs in comContext->delvec_array into CRIDs and append them
+ * to the local delete list of the local ROS.
+ *
+ * The CRIDs are looked up in the TID-CRID update list selected by sel.  A
+ * TID without a CRID is reported as index corruption
+ * (ERRCODE_DATA_CORRUPTED).
+ *
+ * Returns the number of entries in the local delete list afterwards.
+ *
+ * NOTE(review): the list capacity (list->length) is only asserted once
+ * before the loop, not per appended entry -- confirm that callers size the
+ * list for delvec_array.num entries.
+ */
+static Size
+ConvertWhiteOut2LocalDeleteList(vci_RosCommandContext *comContext,
+                                int sel)
+{
+    vci_local_delete_list *list = &(comContext->local_ros->local_delete_list);
+    vci_TidCridUpdateListContext *tidCridListContext;
+
+    Assert(list);
+    Assert(list->num_entry < list->length);
+
+    tidCridListContext = vci_OpenTidCridUpdateList(&comContext->info, sel);
+
+    for (int cId = 0; cId < comContext->delvec_array.num; cId++)
+    {
+        ItemPointerData orig_tid;
+        uint64      crid;
+
+        orig_tid = comContext->delvec_array.orig_tids[cId];
+
+        crid = vci_GetCridFromTid(tidCridListContext, &orig_tid, NULL);
+
+        /* BlockNumber is unsigned, hence %u for the block part of the TID */
+        if (crid == VCI_INVALID_CRID)
+            ereport(ERROR,
+                    (errcode(ERRCODE_DATA_CORRUPTED),
+                     errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+                     errdetail("try to delete TID (%u,%d) into local delete list",
+                               ItemPointerGetBlockNumber(&orig_tid),
+                               ItemPointerGetOffsetNumber(&orig_tid)),
+                     errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+
+        list->crid_list[list->num_entry] = crid;
+        list->num_entry++;
+    }
+
+    vci_CloseTidCridUpdateList(tidCridListContext);
+
+    return list->num_entry;
+}
+
+/*
+ * Classify a ROS command: true for the commands listed in the first group
+ * below, false for those in the second.  Any other value is an internal
+ * error.
+ */
+static bool
+NeedMainRelHeaderUpdate(vci_ros_command_t command)
+{
+    bool        needUpdate = false;
+
+    switch (command)
+    {
+            /* commands that require a main relation header update */
+        case vci_rc_recovery:
+        case vci_rc_wos_ros_conv:
+        case vci_rc_update_del_vec:
+        case vci_rc_collect_deleted:
+            /* case vci_rc_compaction: */
+        case vci_rc_update_tid_crid:
+        case vci_rc_collect_extent:
+        case vci_rc_copy_command:
+        case vci_rc_wos_ros_conv_build:
+            needUpdate = true;
+            break;
+
+            /* commands that do not */
+        case vci_rc_wos_delete:
+        case vci_rc_wos_insert:
+        case vci_rc_probe:
+        case vci_rc_query:
+        case vci_rc_generate_local_ros:
+        case vci_rc_drop_index:
+        case vci_rc_vacuum:
+            needUpdate = false;
+            break;
+
+        default:
+            Assert(false);
+            elog(ERROR, "internal error: unexpected ROS command");
+    }
+
+    return needUpdate;
+}
+
+/*
+ * Release the main relation header held by the command context.
+ */
+void
+vci_ReleaseMainRelInCommandContext(vci_RosCommandContext *comContext)
+{
+    vci_MainRelHeaderInfo *mainInfo = &(comContext->info);
+
+    vci_ReleaseMainRelHeader(mainInfo);
+}
+
+/*
+ * Close the heap relation of the command context, if one is open, releasing
+ * its AccessShareLock.  The heapRel field is reset to NULL in either case.
+ */
+void
+vci_CloseHeapRelInCommandContext(vci_RosCommandContext *comContext)
+{
+    Relation    heap = comContext->heapRel;
+
+    if (RelationIsValid(heap))
+        table_close(heap, AccessShareLock);
+
+    comContext->heapRel = NULL;
+}
+
+/*
+ * qsort() comparator that orders uint64 values ascending.
+ */
+static int
+CmpUint64(const void *pa, const void *pb)
+{
+	const uint64 lhs = *(const uint64 *) pa;
+	const uint64 rhs = *(const uint64 *) pb;
+
+	if (lhs < rhs)
+		return -1;
+	if (rhs < lhs)
+		return 1;
+	return 0;
+}
+
+/*
+ * vci_InitRosCommandContext0
+ *		First initialization stage of a ROS command context.
+ *
+ * Zeroes the whole context, records the command and the OID of rel, then
+ * initializes and keeps the main relation header information.
+ */
+void
+vci_InitRosCommandContext0(vci_RosCommandContext *context,
+						   Relation rel, vci_ros_command_t command)
+{
+	vci_MainRelHeaderInfo *mainInfo;
+
+	Assert(context);
+
+	/* Start from an all-zero context so unset members read as 0/NULL. */
+	MemSet(context, 0, sizeof(*context));
+
+	context->indexOid = RelationGetRelid(rel);
+	context->command = command;
+
+	mainInfo = &(context->info);
+	vci_InitMainRelHeaderInfo(mainInfo, rel, command);
+	vci_KeepMainRelHeader(mainInfo);
+}
+
+/*
+ * vci_InitRosCommandContext1
+ *  Second initialization stage of a ROS command context.
+ *
+ * comContext->command and comContext->info must already be set (the callers
+ * in this file run vci_InitRosCommandContext0() first).
+ *
+ * workareaSize: memory budget used to derive numRowsAtOnce; only consulted
+ *  when readOriginalData is true.
+ * numInsertRows / numDeleteRows: sizes of the TID work arrays allocated for
+ *  the commands that need them, and inputs to numRowsToConvert.
+ * readOriginalData: when true, the column lists are built and the heap
+ *  relation is opened with AccessShareLock; otherwise those members are
+ *  set to NULL and numRowsAtOnce falls back to VCI_COMPACTION_UNIT_ROW.
+ *
+ * Allocations are made with palloc in the current memory context.
+ */
+void
+vci_InitRosCommandContext1(vci_RosCommandContext *comContext,
+ Size workareaSize,
+ int numInsertRows,
+ int numDeleteRows,
+ bool readOriginalData)
+{
+ Size worstCaseTupleSize;
+ int numColumns;
+
+ Assert(comContext);
+
+ /* Query and local-ROS generation do not acquire a transaction id here. */
+ comContext->xid = ((vci_rc_query == comContext->command) ||
+ (vci_rc_generate_local_ros == comContext->command)) ?
+ InvalidTransactionId : GetCurrentTransactionId();
+
+ comContext->heapOid = IndexGetRelation(comContext->info.rel->rd_id, false);
+
+ comContext->local_ros = NULL;
+ comContext->done = false;
+
+ /* Allocate the TID work arrays that the command needs. */
+ switch (comContext->command)
+ {
+ case vci_rc_generate_local_ros:
+ comContext->wos2ros_array.orig_tids = palloc_array(ItemPointerData, numInsertRows);
+ comContext->wos2ros_array.max = numInsertRows;
+ comContext->delvec_array.orig_tids = palloc_array(ItemPointerData, numDeleteRows);
+ comContext->delvec_array.max = numDeleteRows;
+ break;
+
+ case vci_rc_wos_ros_conv:
+ case vci_rc_collect_deleted:
+ comContext->wos2ros_array.orig_tids = palloc_array(ItemPointerData, numInsertRows);
+ comContext->wos2ros_array.wos_tids = palloc_array(ItemPointerData, numInsertRows);
+ comContext->wos2ros_array.max = numInsertRows;
+ break;
+
+ default:
+ break;
+ }
+
+ /* Never convert more than one extent's worth of rows. */
+ comContext->numRowsToConvert = Min(Max(numInsertRows, numDeleteRows), VCI_NUM_ROWS_IN_EXTENT);
+
+ /*
+ * Column sizes
+ */
+ numColumns = vci_GetMainRelVar(&(comContext->info), vcimrv_num_columns, 0);
+
+ /*
+ * get column size in worst case and column ID lists for both original
+ * relation and VCI relation
+ */
+ comContext->numColumns = numColumns;
+
+ if (readOriginalData)
+ {
+ Size allocatableSize = Min(workareaSize, MaxAllocSize);
+ int numRowsAtOnce;
+ int largestTupleSize;
+
+ comContext->heapAttrNumList = palloc_array(AttrNumber, numColumns);
+ comContext->indxColumnIdList = palloc_array(int16, numColumns);
+ comContext->columnSizeList = palloc_array(int16, numColumns);
+ worstCaseTupleSize = vci_GetColumnIdsAndSizes(
+ comContext->heapAttrNumList,
+ comContext->indxColumnIdList,
+ comContext->columnSizeList,
+ numColumns,
+ &(comContext->info),
+ comContext->heapOid);
+
+ comContext->heapRel = table_open(comContext->heapOid, AccessShareLock);
+
+ /*
+ * PostgreSQL limits the tuple size by TOAST_TUPLE_TARGET, normally.
+ * The upper limit of the tuple size is smaller than BLCKSZ. We use
+ * other area to keep the offset or data size in the chunk buffers or
+ * ROS. Here, we assume the type of offset is uint32.
+ */
+ largestTupleSize = worstCaseTupleSize +
+ (comContext->numColumns * sizeof(uint32));
+
+ /*
+ * The number of rows in one chunk: derived from the usable work area,
+ * rounded down to a multiple of VCI_COMPACTION_UNIT_ROW, and clamped to
+ * the range [VCI_COMPACTION_UNIT_ROW, VCI_NUM_ROWS_IN_EXTENT].
+ */
+ numRowsAtOnce = (int) (allocatableSize * VCI_WOS_ROS_WORKAREA_SAFE_RATIO /
+ largestTupleSize);
+ numRowsAtOnce = (numRowsAtOnce / VCI_COMPACTION_UNIT_ROW) * VCI_COMPACTION_UNIT_ROW;
+ numRowsAtOnce = Max(numRowsAtOnce, VCI_COMPACTION_UNIT_ROW);
+ numRowsAtOnce = Min(numRowsAtOnce, VCI_NUM_ROWS_IN_EXTENT);
+
+ comContext->numRowsAtOnce = numRowsAtOnce;
+ }
+ else
+ {
+ comContext->heapAttrNumList = NULL;
+ comContext->indxColumnIdList = NULL;
+ comContext->columnSizeList = NULL;
+ comContext->heapRel = NULL;
+ comContext->numRowsAtOnce = VCI_COMPACTION_UNIT_ROW;
+ }
+
+ comContext->scan = NULL;
+
+ /* Only the commands listed first record an oldest-xmin horizon. */
+ switch (comContext->command)
+ {
+ case vci_rc_wos_ros_conv:
+ case vci_rc_collect_deleted:
+ case vci_rc_update_del_vec:
+ case vci_rc_vacuum:
+ comContext->oldestXmin = GetOldestNonRemovableTransactionId(comContext->info.rel);
+ comContext->wos2rosXid = comContext->oldestXmin;
+ break;
+
+ case vci_rc_generate_local_ros:
+ default:
+ comContext->oldestXmin = InvalidTransactionId;
+ comContext->wos2rosXid = InvalidTransactionId;
+ break;
+ }
+
+}
+
+/*
+ * vci_InitRosCommandContext2
+ *		Third initialization stage: create the tuplesort states of the
+ *		command context.
+ *
+ * The two WOS delete lists (sorted by TID) are always created.  For
+ * vci_rc_wos_ros_conv and vci_rc_collect_deleted a (orig_tid, wos_tid) sort
+ * is created in wos2ros_tid_list; for vci_rc_update_del_vec the same kind of
+ * sort is created in delvec_tid_list.  Every sort is given
+ * workareaSize / 1024 / 3 (capped at INT_MAX) as its memory limit.
+ */
+void
+vci_InitRosCommandContext2(vci_RosCommandContext *comContext, Size workareaSize)
+{
+	AttrNumber	sortKeys[] = {1};
+	Oid			sortOperators[] = {TIDLessOperator};
+	Oid			sortCollations[] = {InvalidOid};
+	bool		nullsFirstFlags[] = {false};
+	int			sortMem = Min(workareaSize / 1024 / 3, INT_MAX);
+	bool		wantWos2RosList;
+	bool		wantDelvecList;
+	TupleDesc	tupDesc;
+
+	comContext->data_wos_del_list =
+		tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+							  sortMem, NULL, TUPLESORT_NONE);
+	comContext->whiteout_wos_del_list =
+		tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+							  sortMem, NULL, TUPLESORT_NONE);
+
+	wantWos2RosList = (comContext->command == vci_rc_wos_ros_conv ||
+					   comContext->command == vci_rc_collect_deleted);
+	wantDelvecList = (comContext->command == vci_rc_update_del_vec);
+
+	/* Other commands only need the two delete lists. */
+	if (!wantWos2RosList && !wantDelvecList)
+		return;
+
+	/* Two-column (orig_tid, wos_tid) descriptor, sorted on column 1. */
+	tupDesc = CreateTemplateTupleDesc(2);
+	TupleDescInitEntry(tupDesc, (AttrNumber) 1, "orig_tid", TIDOID, -1, 0);
+	TupleDescInitEntry(tupDesc, (AttrNumber) 2, "wos_tid", TIDOID, -1, 0);
+
+	comContext->tid_tid_tupdesc = tupDesc;
+	comContext->tid_tid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
+
+	if (wantWos2RosList)
+		comContext->wos2ros_tid_list =
+			tuplesort_begin_heap(tupDesc, 1,
+								 sortKeys, sortOperators, sortCollations, nullsFirstFlags,
+								 sortMem, NULL, TUPLESORT_NONE);
+
+	if (wantDelvecList)
+		comContext->delvec_tid_list =
+			tuplesort_begin_heap(tupDesc, 1,
+								 sortKeys, sortOperators, sortCollations, nullsFirstFlags,
+								 sortMem, NULL, TUPLESORT_NONE);
+}
+
+/*
+ * Initialize the chunk buffer and the chunk storage of the command context,
+ * both sized for comContext->numRowsAtOnce rows.
+ *
+ * The heap relation must already be open in the context.
+ */
+void
+vci_InitRosChunkStroageAndBuffer(vci_RosCommandContext *comContext, bool forAppending)
+{
+	int			rowsPerChunk;
+
+	Assert(RelationIsValid(comContext->heapRel));
+
+	rowsPerChunk = comContext->numRowsAtOnce;
+
+	/* buffer in which one chunk of ROS data is assembled */
+	vci_InitOneRosChunkBuffer(&(comContext->buffer),
+							  rowsPerChunk,
+							  comContext->columnSizeList,
+							  comContext->numColumns,
+							  false,
+							  &(comContext->info));
+
+	/* storage that collects the assembled chunks */
+	vci_InitRosChunkStorage(&(comContext->storage), rowsPerChunk, forAppending);
+}
+
+/*
+ * vci_CleanRosCommandContext
+ *		Release the resources a ROS command context acquired in the
+ *		vci_InitRosCommandContext1/2 stages.
+ *
+ * Drops the TID/TID slot, ends every tuplesort, frees the tuple descriptor,
+ * closes the heap relation, destroys the chunk buffer and storage (only for
+ * vci_rc_wos_ros_conv) and frees the work arrays.  Each released member is
+ * reset to NULL, so calling this again on the same context is harmless for
+ * those members.
+ *
+ * neverWrite: when true, return without touching the main relation header.
+ * When false, the header is written with vci_wmrv_update if
+ * NeedMainRelHeaderUpdate() says the command requires it.
+ *
+ * The main relation header itself is not released here.
+ */
+void
+vci_CleanRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite)
+{
+	if (comContext->tid_tid_slot)
+	{
+		/*
+		 * The slot was created with MakeSingleTupleTableSlot(); use the
+		 * matching destructor rather than a bare pfree() so the slot's own
+		 * release callback and descriptor unpinning are not bypassed.
+		 */
+		ExecDropSingleTupleTableSlot(comContext->tid_tid_slot);
+		comContext->tid_tid_slot = NULL;
+	}
+
+	if (comContext->data_wos_del_list)
+	{
+		tuplesort_end(comContext->data_wos_del_list);
+		comContext->data_wos_del_list = NULL;
+	}
+
+	if (comContext->whiteout_wos_del_list)
+	{
+		tuplesort_end(comContext->whiteout_wos_del_list);
+		comContext->whiteout_wos_del_list = NULL;
+	}
+
+	if (comContext->wos2ros_tid_list)
+	{
+		tuplesort_end(comContext->wos2ros_tid_list);
+		comContext->wos2ros_tid_list = NULL;
+	}
+
+	if (comContext->delvec_tid_list)
+	{
+		tuplesort_end(comContext->delvec_tid_list);
+		comContext->delvec_tid_list = NULL;
+	}
+
+	/* Free the descriptor only after the slot and sorts that used it. */
+	if (comContext->tid_tid_tupdesc)
+	{
+		FreeTupleDesc(comContext->tid_tid_tupdesc);
+		comContext->tid_tid_tupdesc = NULL;
+	}
+
+	/* Close original heap relation if it is opened. */
+	vci_CloseHeapRelInCommandContext(comContext);
+
+	/*
+	 * Release chunk buffers - WOS ROS Conv.
+	 */
+	if (comContext->command == vci_rc_wos_ros_conv)
+	{
+		vci_DestroyOneRosChunkBuffer(&(comContext->buffer));
+		vci_DestroyRosChunkStorage(&(comContext->storage));
+	}
+
+	/* The three column lists are allocated together, so free them together. */
+	if (NULL != comContext->heapAttrNumList)
+	{
+		pfree(comContext->heapAttrNumList);
+		pfree(comContext->indxColumnIdList);
+		pfree(comContext->columnSizeList);
+		comContext->heapAttrNumList = NULL;
+		comContext->indxColumnIdList = NULL;
+		comContext->columnSizeList = NULL;
+	}
+
+	/* release the TID / block-number work arrays */
+	if (comContext->wos2ros_array.orig_tids)
+	{
+		pfree(comContext->wos2ros_array.orig_tids);
+		comContext->wos2ros_array.orig_tids = NULL;
+	}
+
+	if (comContext->wos2ros_array.wos_tids)
+	{
+		pfree(comContext->wos2ros_array.wos_tids);
+		comContext->wos2ros_array.wos_tids = NULL;
+	}
+
+	if (comContext->delvec_array.orig_tids)
+	{
+		pfree(comContext->delvec_array.orig_tids);
+		comContext->delvec_array.orig_tids = NULL;
+	}
+
+	if (comContext->utility_array.orig_blknos)
+	{
+		pfree(comContext->utility_array.orig_blknos);
+		comContext->utility_array.orig_blknos = NULL;
+	}
+
+	if (neverWrite)
+		return;
+
+	/* write header of the main relation */
+	if (NeedMainRelHeaderUpdate(comContext->command))
+		vci_WriteMainRelVar(&(comContext->info),
+							vci_wmrv_update);
+}
+
+/*
+ * vci_FinRosCommandContext
+ *		Finish a ROS command: clean the context (see
+ *		vci_CleanRosCommandContext() for the meaning of neverWrite), release
+ *		the main relation header and mark the context as invalid.
+ */
+void
+vci_FinRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite)
+{
+	vci_CleanRosCommandContext(comContext, neverWrite);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(&(comContext->info));
+
+	/* invalidate the identifying members */
+	comContext->command = vci_rc_invalid;
+	comContext->indexOid = InvalidOid;
+}
+
+/**
+ * Convert WOS entries into one ROS extent.
+ *
+ * numRows must be in the range 1 .. VCI_NUM_ROWS_IN_EXTENT.
+ * workareaSize should be taken from the configuration parameter
+ * in postgresql.conf.  Two thirds of it are passed to
+ * vci_InitRosCommandContext1() and one third to
+ * vci_InitRosCommandContext2().
+ * This function converts just one extent per call.
+ *
+ * Returns the value returned by ConvertWos2Ros(), or -1 when the conversion
+ * is skipped because the current transaction id precedes the recorded
+ * current ROS version.
+ */
+int
+vci_ConvertWos2Ros(Relation mainRel, Size workareaSize, int numRows)
+{
+ vci_RosCommandContext comContext;
+ MemoryContext memCtxWos2Ros;
+ MemoryContext oldMemCtx;
+ int result = -1;
+
+ Assert((0 < numRows) && (numRows <= VCI_NUM_ROWS_IN_EXTENT));
+
+ vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_wos_ros_conv);
+
+ /* recover ROS if necessary */
+ vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+ /* prepare local work area */
+ memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+ "WOS->ROS conversion",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+ vci_InitRosCommandContext1(&comContext,
+ workareaSize / 3 * 2,
+ numRows, 0,
+ true);
+
+ vci_InitRosCommandContext2(&comContext, workareaSize / 3);
+
+ /* Skip the conversion when our xid precedes the current ROS version. */
+ if (TransactionIdPrecedes(GetCurrentTransactionId(),
+ (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+ goto done;
+
+ /*
+ * NOTE(review): the returned snapshot is discarded; presumably this call
+ * only serves to check that an active snapshot exists -- confirm.
+ */
+ GetActiveSnapshot();
+
+ /* obtain new extent ID */
+ comContext.extentIdSrc = VCI_INVALID_EXTENT_ID;
+ comContext.extentId = vci_GetFreeExtentId(&(comContext.info));
+
+ /* Write Recovery Information of this command. */
+ vci_WriteRecoveryRecordForExtentInfo(&comContext.info, comContext.extentId, comContext.extentIdSrc);
+ vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+ vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+ constructTidSortState(&comContext);
+
+ /* call Main routine */
+ fillTidListFromTidSortState(&comContext, numRows);
+
+ vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ );
+
+ result = ConvertWos2Ros(&comContext);
+
+ /* remove WOS entries */
+ cleanUpWos(&comContext, vcimrv_data_wos_oid);
+ cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+
+ /* Xmax WOS entry */
+ RemoveWosEntries(&comContext, WOS_Data);
+ RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+ /* Finalize ROS (also reached on the early-exit path above) */
+ vci_FinRosCommandContext(&comContext, false);
+
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(memCtxWos2Ros);
+
+ return result;
+}
+
+/*
+ * FlushTidCridPairListToTreeForBuild
+ *		Write the accumulated TID/CRID pairs into the sub-tree for
+ *		blockNumber, creating that sub-tree if it does not exist yet, and
+ *		empty the list.
+ *
+ * An empty list is only reset; no sub-tree is looked up or created.
+ */
+static void
+FlushTidCridPairListToTreeForBuild(vci_TidCridRelations *relPair,
+								   vcis_tidcrid_pair_list_t *appList,
+								   BlockNumber blockNumber)
+{
+	ItemPointerData subTree;
+
+	if (appList->num <= 0)
+	{
+		appList->num = 0;
+		return;
+	}
+
+	vci_GetTidCridSubTree(relPair, blockNumber, &subTree);
+	if (!ItemPointerIsValid(&subTree))
+		vci_CreateTidCridSubTree(relPair, blockNumber, &subTree);
+
+	vci_UpdateTidCridSubTree(relPair, &subTree, appList);
+
+	appList->num = 0;
+}
+
+/*
+ * UpdateTidCridForBuild
+ *  Register TID->CRID pairs for every row currently held in
+ *  comContext->storage, for the extent comContext->extentId.
+ *
+ * Rows receive consecutive row ids within the extent, in storage order
+ * (chunk by chunk, row by row).  Pairs are collected in a temporary list and
+ * flushed to the TID/CRID tree each time the heap block number of the TID
+ * changes, and once more at the end.
+ */
+static void
+UpdateTidCridForBuild(vci_RosCommandContext *comContext)
+{
+ RosChunkStorage *src = &(comContext->storage);
+ vci_TidCridRelations relPair;
+ const LOCKMODE lockmode = ExclusiveLock;
+ BlockNumber blockNumber = InvalidBlockNumber;
+ int32 offset = offsetof(vcis_tidcrid_pair_list_t, body);
+ int rowIdInExt = 0;
+ /* list sized for the worst case: every row of the storage in one flush */
+ vcis_tidcrid_pair_list_t *appList = palloc(offset
+ + (sizeof(vcis_tidcrid_pair_item_t) * src->numTotalRows));
+
+ vci_OpenTidCridRelations(&relPair, &comContext->info, lockmode);
+ appList->num = 0;
+
+ for (int chunkId = 0; chunkId < src->numFilled; ++chunkId)
+ {
+ for (int rowId = 0; rowId < src->chunk[chunkId]->numFilled; ++rowId)
+ {
+ /* tidData is indexed in units of sizeof(ItemPointerData) bytes */
+ ItemPointer itemPtr = (ItemPointer) &(src->chunk[chunkId]->
+ tidData[sizeof(ItemPointerData) * rowId]);
+
+ /* Block changed: flush what was collected for the previous block. */
+ if (blockNumber != ItemPointerGetBlockNumber(itemPtr))
+ {
+ if (BlockNumberIsValid(blockNumber))
+ FlushTidCridPairListToTreeForBuild(&relPair, appList,
+ blockNumber);
+ blockNumber = ItemPointerGetBlockNumber(itemPtr);
+ }
+
+ Assert(appList->num < src->numTotalRows);
+ appList->body[appList->num].crid = vci_GetCridFromUint64(
+ vci_CalcCrid64(comContext->extentId, rowIdInExt));
+ ItemPointerCopy(itemPtr, &appList->body[appList->num].page_item_id);
+ (appList->num)++;
+
+ Assert(rowIdInExt < src->numTotalRows);
+ rowIdInExt++;
+ }
+ }
+ /* flush the pairs of the last block */
+ if (BlockNumberIsValid(blockNumber))
+ FlushTidCridPairListToTreeForBuild(&relPair, appList, blockNumber);
+ pfree(appList);
+ vci_CloseTidCridRelations(&relPair, lockmode);
+}
+
+/*
+ * Implementation of callback interface:IndexBuildCallback
+ *
+ * state is the vci_RosCommandContext of the running build.  Tuples that are
+ * not alive are ignored.  A live tuple is appended to the chunk buffer; a
+ * full buffer is registered into the chunk storage, and once the storage
+ * plus buffer hold VCI_NUM_ROWS_IN_EXTENT rows the extent is written out and
+ * comContext->extentId is advanced.
+ *
+ * The tid, values and isnull arguments are not used in this function; the
+ * tuple is taken from IndexHeapTuple instead.
+ */
+static void
+vci_build_callback(Relation rel,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ vci_RosCommandContext *comContext = (vci_RosCommandContext *) state;
+
+ Assert(comContext);
+
+ if (tupleIsAlive)
+ {
+ Assert((0 <= comContext->buffer.numFilled) &&
+ (comContext->buffer.numFilled < comContext->numRowsAtOnce));
+
+ vci_FillOneRowInRosChunkBuffer(&(comContext->buffer),
+ &(comContext->info),
+ &IndexHeapTuple->t_self, /* use the original heap
+ * tuple saved in
+ * heapam_index_build_range_scan() */
+ IndexHeapTuple, /* use the original heap
+ * tuple saved in
+ * heapam_index_build_range_scan() */
+ comContext->indxColumnIdList,
+ comContext->heapAttrNumList,
+ RelationGetDescr(comContext->heapRel));
+
+ /* Chunk buffer is full: move it into the storage and start afresh. */
+ if (comContext->numRowsAtOnce <= comContext->buffer.numFilled)
+ {
+ vci_RegisterChunkBuffer(&(comContext->storage),
+ &(comContext->buffer));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+ }
+
+ /* One extent's worth of rows collected: write the extent. */
+ if (VCI_NUM_ROWS_IN_EXTENT <=
+ (comContext->storage.numTotalRows + comContext->buffer.numFilled))
+ {
+ Assert(TransactionIdIsValid(comContext->xid));
+ if (0 < comContext->buffer.numFilled)
+ {
+ vci_RegisterChunkBuffer(&(comContext->storage),
+ &(comContext->buffer));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+ }
+ vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info),
+ comContext->extentId,
+ comContext->xid);
+ UpdateTidCridForBuild(comContext);
+ vci_WriteOneExtent(&(comContext->info),
+ &(comContext->storage),
+ comContext->extentId,
+ comContext->xid,
+ InvalidTransactionId,
+ comContext->xid);
+ vci_ResetRosChunkStorage(&(comContext->storage));
+ comContext->extentId++;
+ }
+ }
+}
+
+/*
+ * FinalizeBuild
+ *		Write out the rows still pending after the build scan as one last
+ *		extent.
+ *
+ * Does nothing beyond registering the chunk buffer when no rows are
+ * pending.  On a write, comContext->extentId is advanced.
+ */
+static void
+FinalizeBuild(vci_RosCommandContext *comContext)
+{
+	RosChunkStorage *storage = &(comContext->storage);
+
+	/* Hand a partially filled chunk buffer over to the storage first. */
+	if (comContext->buffer.numFilled > 0)
+		vci_RegisterChunkBuffer(storage, &(comContext->buffer));
+
+	/* No pending rows: there is no trailing extent to write. */
+	if (storage->numTotalRows <= 0)
+		return;
+
+	Assert(TransactionIdIsValid(comContext->xid));
+
+	vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info),
+												  comContext->extentId,
+												  comContext->xid);
+	UpdateTidCridForBuild(comContext);
+	vci_WriteOneExtent(&(comContext->info),
+					   storage,
+					   comContext->extentId,
+					   comContext->xid,
+					   InvalidTransactionId,
+					   comContext->xid);
+
+	comContext->extentId++;
+}
+
+/**
+ * @brief Look up the row-count estimate stored in pg_class.reltuples for a
+ * relation (the figure maintained by ANALYZE or VACUUM).
+ *
+ * @param[in] relid The Oid of the relation.
+ * @return The estimated number of rows; 0 when the stored value is not
+ * positive or when no pg_class entry is found.
+ */
+static double
+GetEstimatedNumRows(Oid relid)
+{
+	double		estimate = 0.0;
+	HeapTuple	classTuple;
+
+	classTuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
+	if (!HeapTupleIsValid(classTuple))
+		return 0.0;
+
+	/* Clamp negative stored values to zero. */
+	if (((Form_pg_class) GETSTRUCT(classTuple))->reltuples > 0)
+		estimate = ((Form_pg_class) GETSTRUCT(classTuple))->reltuples;
+
+	ReleaseSysCache(classTuple);
+
+	return estimate;
+}
+
+/**
+ * Build-time conversion, used when a VCI index is newly built: scan the
+ * whole heap relation and write its live tuples into ROS extents, starting
+ * at VCI_FIRST_NORMAL_EXTENT_ID.
+ *
+ * @param[in] mainRel       The VCI main relation (the index being built).
+ * @param[in] workareaSize  Work area size handed to
+ *                          vci_InitRosCommandContext1().
+ * @param[in,out] indexInfo Index build information passed through to
+ *                          table_index_build_scan(); ii_BrokenHotChain is
+ *                          set to true after the scan.
+ * @return The value returned by table_index_build_scan().
+ */
+double
+vci_ConvertWos2RosForBuild(Relation mainRel,
+						   Size workareaSize,
+						   IndexInfo *indexInfo)
+{
+	vci_RosCommandContext comContext;
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	double		result = 0;
+
+	vci_InitRosCommandContext0(&comContext, mainRel,
+							   vci_rc_wos_ros_conv_build);
+
+	/* prepare local work area */
+	memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+										  "WOS->ROS conversion",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize,
+							   VCI_NUM_ROWS_IN_EXTENT, 0,
+							   true);
+
+	vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ );
+
+	comContext.extentId = VCI_FIRST_NORMAL_EXTENT_ID;
+
+	/*
+	 * Initialize information for printing progress
+	 */
+	comContext.estimatedNumRows = GetEstimatedNumRows(
+													  RelationGetRelid(comContext.heapRel));
+	if (comContext.estimatedNumRows < 1)
+		comContext.estimatedNumRows = 1;
+	comContext.numConvertedRows = 0;
+
+	/*
+	 * Bounded copy: never write past the end of relName, and always
+	 * NUL-terminate it, whatever the size of the destination array is.
+	 */
+	strlcpy(comContext.relName, RelationGetRelationName(mainRel),
+			sizeof(comContext.relName));
+
+	result = table_index_build_scan(comContext.heapRel,
+									mainRel,
+									indexInfo,
+									true,	/* allow syncscan */
+									true,	/* report progress */
+									vci_build_callback,
+									(void *) &comContext, NULL);
+	indexInfo->ii_BrokenHotChain = true;
+
+	/* write the rows still pending after the scan */
+	FinalizeBuild(&comContext);
+
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/*
+ * RemoveWosEntries
+ *		Delete from one WOS relation every tuple whose TID was collected in
+ *		the matching delete list of the command context.
+ *
+ * wos_kind selects both the relation (data WOS or whiteout WOS) and the
+ * tuplesort holding the TIDs to delete.  The list is sorted first, so the
+ * deletions are issued in TID order.  Any other wos_kind raises an error
+ * instead of going on with a NULL sort state and an invalid relation OID.
+ */
+static void
+RemoveWosEntries(vci_RosCommandContext *comContext, WosKind wos_kind)
+{
+	Datum		value;
+	bool		isnull;
+	Relation	rel;
+	Oid			wos_oid = InvalidOid;
+	Tuplesortstate *sortstate = NULL;
+
+	switch (wos_kind)
+	{
+		case WOS_Data:
+			wos_oid = vci_GetMainRelVar(&comContext->info, vcimrv_data_wos_oid, 0);
+			sortstate = comContext->data_wos_del_list;
+			break;
+
+		case WOS_Whiteout:
+			wos_oid = vci_GetMainRelVar(&comContext->info, vcimrv_whiteout_wos_oid, 0);
+			sortstate = comContext->whiteout_wos_del_list;
+			break;
+
+		default:
+			elog(ERROR, "unexpected WOS kind: %d", (int) wos_kind);
+			return;				/* keep compiler quiet */
+	}
+
+	tuplesort_performsort(sortstate);
+
+	rel = relation_open(wos_oid, RowExclusiveLock);
+
+	while (tuplesort_getdatum(sortstate, true, true, &value, &isnull, NULL))
+	{
+		ItemPointer tid;
+
+		tid = DatumGetItemPointer(value);
+
+		simple_heap_delete(rel, tid);
+	}
+
+	/* forget the cached insertion target block of this relation */
+	RelationSetTargetBlock(rel, InvalidBlockNumber);
+
+	relation_close(rel, RowExclusiveLock);
+}
+
+/*
+ * cleanUpWos
+ *  Scan every block of one WOS relation and try to reclaim the line
+ *  pointers of dead tuples.
+ *
+ * wosType selects the main-relation variable holding the OID of the relation
+ * to scan (the callers visible in this file pass vcimrv_data_wos_oid and
+ * vcimrv_whiteout_wos_oid).  The relation is opened with
+ * ShareUpdateExclusiveLock.  Blocks whose cleanup lock cannot be obtained
+ * immediately are skipped.
+ *
+ * Returns the number of tuples classified HEAPTUPLE_LIVE in the blocks that
+ * were examined.
+ */
+static uint64
+cleanUpWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType)
+{
+ const LOCKMODE lockmode = ShareUpdateExclusiveLock;
+ vci_MainRelHeaderInfo *info;
+ BlockNumber nblocks;
+ ItemPointer dead_tuples;
+ int max_dead_tuples;
+ uint64 total_live = 0;
+
+ HeapTupleData tuple;
+
+ Oid oidWosType;
+ TransactionId oldestXmin;
+ Relation rel;
+
+ info = &comContext->info;
+
+ oldestXmin = comContext->oldestXmin;
+
+ oidWosType = vci_GetMainRelVar(info, wosType, 0);
+
+ rel = table_open(oidWosType, lockmode);
+
+ /* dead_tuples is reused for every block, so one page's worth suffices */
+ max_dead_tuples = MaxHeapTuplesPerPage;
+ dead_tuples = palloc0_array(ItemPointerData, max_dead_tuples);
+
+ nblocks = RelationGetNumberOfBlocks(rel);
+ for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
+ {
+ Size freespace;
+ int num_dead_tuples = 0;
+ TransactionId snapshotConflictHorizon = InvalidTransactionId;
+
+ Buffer buffer;
+ Buffer vmbuffer = InvalidBuffer;
+ Page page;
+ OffsetNumber maxoff;
+
+ OffsetNumber unused[MaxOffsetNumber];
+ int nunused = 0;
+ bool is_visible_page = true;
+
+ /* Get a buffer containing the target block. */
+ buffer = ReadBuffer(rel, blkno);
+ page = BufferGetPage(buffer);
+
+ /* Do not wait for the cleanup lock; skip the block instead. */
+ if (!ConditionalLockBufferForCleanup(buffer))
+ {
+ ReleaseBuffer(buffer);
+ continue;
+ }
+
+ /* Collect removable dead tuples in the target block. */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (OffsetNumber offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ /* Unused items require no processing; they are simply skipped */
+ if (!ItemIdIsUsed(itemid))
+ continue;
+
+ /* Redirect items mustn't be touched */
+ if (ItemIdIsRedirected(itemid))
+ continue;
+
+ ItemPointerSet(&(tuple.t_self), blkno, offnum);
+
+ /*
+ * Item pointers already marked DEAD are recorded as removal
+ * candidates without looking at any tuple header.
+ */
+ if (ItemIdIsDead(itemid))
+ {
+ dead_tuples[num_dead_tuples++] = tuple.t_self;
+ continue;
+ }
+
+ Assert(ItemIdIsNormal(itemid));
+
+ tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+ tuple.t_len = ItemIdGetLength(itemid);
+ tuple.t_tableOid = RelationGetRelid(rel);
+
+ switch (HeapTupleSatisfiesVacuum(&tuple, oldestXmin, buffer))
+ {
+ case HEAPTUPLE_DEAD:
+ dead_tuples[num_dead_tuples++] = tuple.t_self;
+ HeapTupleHeaderAdvanceConflictHorizon(tuple.t_data,
+ &snapshotConflictHorizon);
+ break;
+ case HEAPTUPLE_LIVE:
+ ++total_live;
+ break;
+ case HEAPTUPLE_RECENTLY_DEAD:
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+ break;
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ break;
+ }
+ }
+
+ if (num_dead_tuples == 0)
+ {
+ /*
+ * No dead tuple exists in this block, so there is no fragmentation
+ * to repair; skip it.
+ */
+ UnlockReleaseBuffer(buffer);
+ continue;
+ }
+
+ visibilitymap_pin(rel, blkno, &vmbuffer);
+
+ /*
+ * this routine is copied from lazy_vacuum_heap_rel() &
+ * lazy_vacuum_heap_page(), and modified.
+ */
+
+ START_CRIT_SECTION();
+
+ for (int tupindex = 0; tupindex < num_dead_tuples; tupindex++)
+ {
+ BlockNumber tblk;
+ OffsetNumber toff;
+ ItemId itemid;
+
+ HeapTupleHeader htup;
+
+ tblk = ItemPointerGetBlockNumber(&dead_tuples[tupindex]);
+ if (tblk != blkno)
+ break; /* past end of tuples for this block */
+ toff = ItemPointerGetOffsetNumber(&dead_tuples[tupindex]);
+
+ /*
+ * Only items that are both marked DEAD and still have storage get
+ * past the two checks below.
+ *
+ * NOTE(review): tuples recorded above as HEAPTUPLE_DEAD have normal
+ * item pointers and are therefore skipped by the second check, and
+ * DEAD item pointers without storage are skipped by the first one.
+ * Confirm that this is the intended set of items to reclaim.
+ */
+ itemid = PageGetItemId(page, toff);
+ if (!ItemIdHasStorage(itemid))
+ continue;
+ if (!ItemIdIsDead(itemid))
+ continue;
+
+ /* Overwrite the list entry with the TID stored at the tuple's data start. */
+ htup = (HeapTupleHeader) PageGetItem(page, itemid);
+ dead_tuples[tupindex] = *(ItemPointer) ((char *) htup + htup->t_hoff);
+
+ /*
+ * NOTE(review): this point is only reached when ItemIdHasStorage()
+ * was true just above, so the !ItemIdHasStorage() half of this
+ * assertion looks unsatisfiable here -- confirm.
+ */
+ Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
+ ItemIdSetUnused(itemid);
+ unused[nunused++] = toff;
+ }
+
+ /* Attempt to truncate line pointer array now */
+ if (nunused > 0)
+ PageTruncateLinePointerArray(page);
+
+
+ /* Mark buffer dirty before we write WAL. */
+ MarkBufferDirty(buffer);
+
+ /* The page counts as all-visible here only when no used item remains. */
+ for (OffsetNumber offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsUsed(itemid))
+ {
+ is_visible_page = false;
+ break;
+ }
+ }
+
+ if (BufferIsValid(vmbuffer))
+ {
+ if (is_visible_page)
+ {
+ PageSetAllVisible(page);
+ MarkBufferDirty(buffer);
+ visibilitymap_set(rel, blkno, buffer, InvalidXLogRecPtr,
+ vmbuffer, InvalidTransactionId, VISIBILITYMAP_ALL_VISIBLE);
+ }
+
+ ReleaseBuffer(vmbuffer);
+ }
+
+ /* XLOG stuff */
+ if (nunused > 0 && RelationNeedsWAL(rel))
+ {
+ /*
+ * Commit add323d added the vmbuffer/vmflags parameters.
+ * A quick fix was needed to allow build to proceed.
+ *
+ * TODO Confirm if passing InvalidBuffer, 0 is OK here.
+ */
+ log_heap_prune_and_freeze(rel, buffer,
+ InvalidBuffer, /* vmbuffer */
+ 0, /* vmflags */
+ snapshotConflictHorizon,
+ false, /* no cleanup lock required */
+ PRUNE_ON_ACCESS,
+ NULL, 0, /* frozen */
+ NULL, 0, /* redirected */
+ NULL, 0, /* dead */
+ unused, nunused);
+ }
+
+ END_CRIT_SECTION();
+
+ freespace = PageGetHeapFreeSpace(page);
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+
+ RecordPageWithFreeSpace(rel, blkno, freespace);
+
+ /*
+ * In vci_UnregisterTIDFromTIDTree(), the in-memory TidTree is rebuilt
+ * when it has grown too large to keep in memory, and its size is
+ * reduced to fit the memory limit.  At that time the data WOS is
+ * scanned to obtain the TID list.  So vci_UnregisterTIDFromTIDTree()
+ * cannot be called in the critical section above.
+ */
+ }
+
+ pfree(dead_tuples);
+ table_close(rel, lockmode);
+
+ return total_live;
+}
+
+/**
+ * Generate a local ROS for a query.
+ *
+ * This function is assumed to be called in a backend process, not in a
+ * parallel background worker. Here, vci_CSFetchContext is used unlocalized.
+ *
+ * The result, its fetch context and its local delete list are allocated in
+ * a new memory context created under queryContext->shared_memory_context;
+ * temporary work data lives in a context under TopTransactionContext that is
+ * deleted before returning.  On success queryContext is updated with the
+ * local ROS, the number of local extents and the sorted delete list.
+ *
+ * A quarter of workareaSize is passed to vci_InitRosCommandContext1().
+ * numDataWosRows and numWhiteoutWosRows size the TID work arrays; their sum
+ * is the capacity of the local delete list.
+ *
+ * Returns the newly created local ROS.
+ */
+vci_local_ros_t *
+vci_GenerateLocalRos(vci_CSQueryContext queryContext,
+ Size workareaSize,
+ int64 numDataWosRows,
+ int64 numWhiteoutWosRows)
+{
+ vci_RosCommandContext comContext;
+ int numRowsInExtent;
+ MemoryContext localMemCtx;
+ MemoryContext sharedMemCtx;
+ MemoryContext oldMemCtx;
+ vci_local_ros_t *result;
+ Size partedWorkareaSize = workareaSize / 4;
+ int64 numLocalDeleteListRows;
+
+ numRowsInExtent = vci_GetNumRowsInLocalRosExtent(queryContext->num_columns);
+
+ sharedMemCtx = AllocSetContextCreate(queryContext->shared_memory_context,
+ "Work for Local ROS generation",
+ ALLOCSET_DEFAULT_SIZES);
+
+ result = MemoryContextAllocZero(sharedMemCtx, sizeof(vci_local_ros_t));
+ result->num_local_extents = 0;
+ result->extent = NULL;
+ result->memory_context = sharedMemCtx;
+ result->fetch_context = vci_CSCreateFetchContextBase(queryContext,
+ Min(numRowsInExtent, numDataWosRows),
+ queryContext->num_columns,
+ queryContext->attr_num,
+ true,
+ true,
+ true,
+ false); /* no compression */
+
+ /* From here on use the row count the fetch context actually settled on. */
+ numRowsInExtent = result->fetch_context->num_rows_read_at_once;
+
+ Assert(queryContext == result->fetch_context->query_context);
+
+ /*
+ * Local Delete List
+ */
+ numLocalDeleteListRows = numDataWosRows + numWhiteoutWosRows;
+
+ result->local_delete_list.crid_list =
+ MemoryContextAllocZero(result->memory_context,
+ sizeof(*(result->local_delete_list.crid_list)) * numLocalDeleteListRows);
+ result->local_delete_list.num_entry = 0;
+ result->local_delete_list.length = numLocalDeleteListRows;
+
+ Assert(0 == ((uintptr_t) (result->local_delete_list.crid_list) & (MAXIMUM_ALIGNOF - 1)));
+
+ /* Temporary work data goes into a transaction-local context. */
+ localMemCtx = AllocSetContextCreate(TopTransactionContext,
+ "Work for Local ROS generation",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(localMemCtx);
+
+ vci_InitRosCommandContext0(&comContext, queryContext->info->rel,
+ vci_rc_generate_local_ros);
+ vci_InitRosCommandContext1(&comContext,
+ partedWorkareaSize,
+ numDataWosRows, numWhiteoutWosRows,
+ true);
+
+ vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ );
+
+ comContext.inclusiveXid = queryContext->inclusive_xid;
+ comContext.exclusiveXid = queryContext->exclusive_xid;
+
+ /* These bounds are what make the int casts just below safe. */
+ Assert(queryContext->num_data_wos_entries <= VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS);
+ Assert(queryContext->num_whiteout_wos_entries <= VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS);
+
+ constructTidArray(&comContext,
+ (int) queryContext->num_data_wos_entries,
+ (int) queryContext->num_whiteout_wos_entries);
+
+ comContext.numRowsToConvert = Min(comContext.numRowsToConvert,
+ numRowsInExtent);
+ comContext.local_ros = result;
+ queryContext->local_ros = result;
+
+ /* The conversion below allocates in the context that owns the result. */
+ MemoryContextSwitchTo(sharedMemCtx);
+
+ PG_TRY();
+ {
+ ConvertWos2LocalRos(&comContext);
+
+ comContext.local_ros = result;
+
+ ConvertWhiteOut2LocalDeleteList(&comContext,
+ result->fetch_context->query_context->tid_crid_diff_sel);
+
+ /* Keep the delete list sorted by CRID. */
+ qsort(result->local_delete_list.crid_list,
+ result->local_delete_list.num_entry,
+ sizeof(uint64),
+ CmpUint64);
+
+ queryContext->local_ros = result;
+ queryContext->num_local_ros_extents = result->num_local_extents;
+ queryContext->delete_list = comContext.local_ros->local_delete_list.crid_list;
+ queryContext->num_delete = comContext.local_ros->local_delete_list.num_entry;
+ }
+ PG_CATCH();
+ {
+ /*
+ * Only an out-of-memory error is cleaned up here before re-throwing;
+ * every other error is re-thrown as is.
+ * NOTE(review): presumably the caller handles out-of-memory specially
+ * and other errors rely on transaction abort for cleanup -- confirm.
+ */
+ if (geterrcode() == ERRCODE_OUT_OF_MEMORY)
+ {
+ vci_FinRosCommandContext(&comContext, true /* never write */ );
+
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(localMemCtx);
+ }
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ vci_FinRosCommandContext(&comContext, false);
+
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(localMemCtx);
+
+ return result;
+}
+
+/**
+ * Destroy a local ROS by deleting the memory context recorded in
+ * localRos->memory_context.
+ * There is no need to pfree() each element separately.
+ */
+void
+vci_DestroyLocalRos(vci_local_ros_t *localRos)
+{
+	Assert(localRos);
+
+	MemoryContextDelete(localRos->memory_context);
+}
+
+/*
+ * vci_CountFreezedInDataWos
+ *		Count the data WOS tuples returned by a scan that uses the snapshot
+ *		from vci_GetSnapshotForWos2Ros().
+ *
+ * workareaSize is not used by this function.
+ */
+uint32
+vci_CountFreezedInDataWos(Relation mainRel, Size workareaSize)
+{
+	vci_MainRelHeaderInfo info = {0};
+	Relation	wosRel;
+	Snapshot	snapshot;
+	TableScanDesc scan;
+	uint32		numTuples = 0;
+
+	vci_InitMainRelHeaderInfo(&info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(&info);
+
+	wosRel = table_open((Oid) vci_GetMainRelVar(&info, vcimrv_data_wos_oid, 0),
+						AccessShareLock);
+
+	snapshot = vci_GetSnapshotForWos2Ros();
+
+	/* scan tuple by tuple, with page-at-a-time mode switched off */
+	scan = table_beginscan(wosRel, snapshot, 0, NULL);
+	scan->rs_flags &= ~SO_ALLOW_PAGEMODE;
+
+	while (heap_getnext(scan, ForwardScanDirection) != NULL)
+		numTuples++;
+
+	table_endscan(scan);
+
+	PopActiveSnapshot();
+
+	/* release the data WOS relation */
+	table_close(wosRel, AccessShareLock);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(&info);
+
+	return numTuples;
+}
+
+/* --------------------------------------------------------------*/
+/* Update Delete Lists */
+/* --------------------------------------------------------------*/
+
+/*
+ * vci_CountFreezedInWhiteoutWos
+ *		Count the whiteout WOS tuples returned by a scan that uses the
+ *		snapshot from vci_GetSnapshotForWos2Ros().
+ *
+ * workareaSize is not used by this function.
+ */
+uint32
+vci_CountFreezedInWhiteoutWos(Relation mainRel, Size workareaSize)
+{
+	vci_MainRelHeaderInfo info = {0};
+	Relation	wosRel;
+	Snapshot	snapshot;
+	TableScanDesc scan;
+	uint32		numTuples = 0;
+
+	vci_InitMainRelHeaderInfo(&info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(&info);
+
+	wosRel = table_open((Oid) vci_GetMainRelVar(&info, vcimrv_whiteout_wos_oid, 0),
+						AccessShareLock);
+
+	snapshot = vci_GetSnapshotForWos2Ros();
+
+	/* scan tuple by tuple, with page-at-a-time mode switched off */
+	scan = table_beginscan(wosRel, snapshot, 0, NULL);
+	scan->rs_flags &= ~SO_ALLOW_PAGEMODE;
+
+	while (heap_getnext(scan, ForwardScanDirection) != NULL)
+		numTuples++;
+
+	table_endscan(scan);
+
+	PopActiveSnapshot();
+
+	/* release the whiteout WOS relation */
+	table_close(wosRel, AccessShareLock);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(&info);
+
+	return numTuples;
+}
+
+/*
+ * UpdateDelVec
+ *		Apply pending whiteout entries to the delete vector column.
+ *
+ * Phase 1 pulls up to numRowsAtOnce (original TID, whiteout-WOS TID) pairs
+ * with get_entry_into_tid_list(), resolves every original TID to a CRID
+ * through the currently selected TID-CRID update list, and then merges the
+ * processed TIDs into the other update list with an invalid CRID.
+ *
+ * Phase 2 reads the collected CRIDs back in sorted order, sets the matching
+ * bit in the delete vector item, and accumulates per-extent counters that
+ * are written to the extent info each time the extent-info block changes.
+ *
+ * Each of the two tuplesorts is given half of workareaSize (in kB, capped at
+ * INT_MAX).
+ *
+ * Returns the number of entries processed, or 0 without doing anything when
+ * comContext->num_delvec_tids is 0.  Raises ERRCODE_DATA_CORRUPTED when a
+ * TID resolves to VCI_INVALID_CRID.
+ */
+static uint64
+UpdateDelVec(vci_RosCommandContext *comContext, Size workareaSize, uint64 numRowsAtOnce)
+{
+	uint32		numExtents;
+	Tuplesortstate *cridList;
+	uint64		result = 0;
+
+	if (comContext->num_delvec_tids == 0)
+		return 0;
+
+	numExtents = vci_GetMainRelVar(&comContext->info, vcimrv_num_extents, 0);
+
+	cridList =
+		tuplesort_begin_datum(INT8OID, Int8LessOperator, InvalidOid, false,
+							  Min(workareaSize / 1024 / 2, INT_MAX), NULL, TUPLESORT_NONE);
+
+	/*
+	 * Phase 1. Convert TID List -> CRID List
+	 */
+	do
+	{
+		vci_TidCridUpdateListContext *oldListContext;
+		Tuplesortstate *addList;
+
+		uint32		oldSel;
+		uint32		newSel;
+
+		/* the two update lists are addressed by selector 0 / 1 */
+		oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+		newSel = 1 ^ oldSel;
+
+		oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+		addList =
+			tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+								  Min(workareaSize / 1024 / 2, INT_MAX), NULL, TUPLESORT_NONE);
+		while (result < numRowsAtOnce)
+		{
+			ItemPointerData orig_tid;
+			ItemPointerData wos_tid;
+			uint64		cridUint;
+
+			if (!get_entry_into_tid_list(comContext, WOS_Whiteout, &orig_tid, &wos_tid))
+				break;
+
+			/* remember the whiteout WOS row when a deletion list is kept */
+			if (comContext->whiteout_wos_del_list)
+				tuplesort_putdatum(comContext->whiteout_wos_del_list, ItemPointerGetDatum(&wos_tid), false);
+
+			cridUint = vci_GetCridFromTid(oldListContext, &orig_tid, NULL);
+
+			/* block and offset numbers are unsigned, so print them with %u */
+			if (cridUint == VCI_INVALID_CRID)
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+						 errdetail("try to delete TID (%u,%u) into delete vector twice",
+								   ItemPointerGetBlockNumber(&orig_tid),
+								   ItemPointerGetOffsetNumber(&orig_tid)),
+						 errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+
+			/* list for storage */
+			tuplesort_putdatum(addList, ItemPointerGetDatum(&orig_tid), false);
+
+			/* list for operation */
+			tuplesort_putdatum(cridList, Int64GetDatum((int64) cridUint), false);
+
+			result++;
+		}
+
+		vci_CloseTidCridUpdateList(oldListContext);
+
+		tuplesort_performsort(addList);
+
+		/* Insert TID->CRID(Invalid) List */
+		vci_MergeAndWriteTidCridUpdateList(&comContext->info, newSel, oldSel, addList, vci_GetCridFromUint64(VCI_INVALID_CRID));
+
+		tuplesort_end(addList);
+
+	} while (false);			/* phase1 */
+
+	elog(DEBUG2, "CRID List OK");
+
+	/*
+	 * Phase 2. loop for crid
+	 */
+	do
+	{
+		LOCKMODE	lockmode = RowExclusiveLock;
+		vci_ColumnRelations delvecCol;
+
+		BlockNumber prevBlkno = InvalidBlockNumber;
+		OffsetNumber prevOffset = InvalidOffsetNumber;
+
+		Buffer		buffer = InvalidBuffer;
+		Page		page = NULL;
+
+		bool		readFirstBlock = false;
+		Datum		value;
+		bool		isnull;
+
+		/* counters for the extents whose info lives on one main-relation block */
+		uint32		numDeletedRows[VCI_MAX_PAGE_SPACE / sizeof(vcis_m_extent_t)];
+		int32		topExtentId = -1;
+		BlockNumber topBlockNumber = InvalidBlockNumber;
+
+		memset(numDeletedRows, 0, sizeof(numDeletedRows));
+
+		tuplesort_performsort(cridList);
+
+		vci_OpenColumnRelations(&delvecCol, &comContext->info,
+								VCI_COLUMN_ID_DELETE, lockmode);
+
+		while (tuplesort_getdatum(cridList, true, true, &value, &isnull, NULL))
+		{
+			HeapTupleHeader htup;
+			int32		extentId;
+			BlockNumber blkno;
+			OffsetNumber offset;
+			uint32		byte_num;
+			uint32		setBitPos;
+			uint64		crid;
+			BlockNumber extentInfoBlkno;
+			OffsetNumber extentInfoOffset;
+
+			crid = (uint64) DatumGetInt64(value);
+
+			extentId = vci_CalcExtentIdFromCrid64(crid);
+			blkno = vci_CalcBlockNumberFromCrid64ForDelete(crid);
+			offset = vci_CalcOffsetNumberFromCrid64ForDelete(crid);
+			byte_num = vci_CalcByteFromCrid64ForDelete(crid);
+			setBitPos = vci_CalcBitFromCrid64ForDelete(crid);
+
+			/* moving on to another item: write out the one just modified */
+			if ((blkno != prevBlkno) || (offset != prevOffset))
+			{
+				if (readFirstBlock)
+				{
+					/* write Tuple & WAL */
+					vci_WriteItem(delvecCol.data, buffer, prevOffset);
+				}
+			}
+
+			/* moving on to another page: swap the locked buffer */
+			if (blkno != prevBlkno)
+			{
+				if (readFirstBlock)
+					UnlockReleaseBuffer(buffer);
+
+				buffer = vci_ReadBufferWithPageInitDelVec(delvecCol.data, blkno);
+				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+				page = BufferGetPage(buffer);
+
+				readFirstBlock = true;
+			}
+
+			/* Calc bits & overwrite */
+			htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offset));
+			*((char *) htup + htup->t_hoff + byte_num) |= 1 << setBitPos;
+
+			vci_GetExtentInfoPosition(&extentInfoBlkno, &extentInfoOffset, extentId);
+
+			/*
+			 * The extent info block changed: flush the counters gathered so
+			 * far (a no-op while topExtentId is still -1) and start a new
+			 * group whose first extent is the current one.
+			 */
+			if (topBlockNumber != extentInfoBlkno)
+			{
+				writeNumDeleteRowsIntoExntetInfo(&comContext->info, topExtentId, numExtents, numDeletedRows);
+
+				memset(numDeletedRows, 0, sizeof(numDeletedRows));
+
+				topExtentId = extentId;
+				topBlockNumber = extentInfoBlkno;
+			}
+
+			numDeletedRows[extentId - topExtentId]++;
+
+			prevBlkno = blkno;
+			prevOffset = offset;
+		}
+
+		/* write remaining Tuple & WAL, and release buffer */
+		if (readFirstBlock)
+		{
+			Assert(BufferIsValid(buffer));
+			vci_WriteItem(delvecCol.data, buffer, prevOffset);
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/* Close Column */
+		vci_CloseColumnRelations(&delvecCol, lockmode);
+
+		/* flush the counters of the last group */
+		if (BlockNumberIsValid(topBlockNumber))
+			writeNumDeleteRowsIntoExntetInfo(&comContext->info, topExtentId, numExtents, numDeletedRows);
+
+	} while (false);			/* phase 2 */
+
+	tuplesort_end(cridList);
+
+	elog(DEBUG2, "update delvec OK");
+
+	return result;
+}
+
+/*
+ * writeNumDeleteRowsIntoExntetInfo
+ *		Add accumulated deletion counts to the extent info entries that share
+ *		one block of the main relation.
+ *
+ * numDeletedRows[k] is added to the entry of extent (topExtentId + k).  The
+ * walk starts at topExtentId and stops at numExtents or at the first extent
+ * whose info is located on a different block than topExtentId's.
+ *
+ * Does nothing when topExtentId is negative.
+ */
+static void
+writeNumDeleteRowsIntoExntetInfo(vci_MainRelHeaderInfo *info, int32 topExtentId, uint32 numExtents, uint32 *numDeletedRows)
+{
+	BlockNumber firstBlk;
+	OffsetNumber firstOff;
+	Buffer		buf;
+	char	   *pageBytes;
+	int32		curId;
+
+	/* nothing has been accumulated yet */
+	if (topExtentId < 0)
+		return;
+
+	vci_GetExtentInfoPosition(&firstBlk, &firstOff, topExtentId);
+
+	buf = vci_ReadBufferWithPageInit(info->rel, firstBlk);
+	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+	pageBytes = (char *) BufferGetPage(buf);
+
+	curId = topExtentId;
+	while ((uint32) curId < numExtents)
+	{
+		BlockNumber blk;
+		OffsetNumber off;
+		vcis_m_extent_t *entry;
+
+		vci_GetExtentInfoPosition(&blk, &off, curId);
+
+		/* entries on later blocks belong to another group */
+		if (blk != firstBlk)
+			break;
+
+		entry = (vcis_m_extent_t *) (pageBytes + off);
+		entry->num_deleted_rows += numDeletedRows[curId - topExtentId];
+
+		curId++;
+	}
+
+	vci_WriteOneItemPage(info->rel, buf);
+
+	UnlockReleaseBuffer(buf);
+}
+
+/*
+ * vci_UpdateDelVec
+ * Entry point of the "update delete vector" ROS command.
+ *
+ * Builds a command context for mainRel, runs recovery if necessary, switches
+ * to a private memory context created under TopTransactionContext, and calls
+ * UpdateDelVec() with at most Min(numRows, VCI_NUM_ROWS_IN_EXTENT) rows.
+ * Afterwards the data WOS and the whiteout WOS are cleaned up and their
+ * entries removed.
+ *
+ * vci_InitRosCommandContext1(), vci_InitRosCommandContext2() and
+ * UpdateDelVec() are each given workareaSize / 2.
+ *
+ * Returns the value of UpdateDelVec(), or -1 when the command is skipped
+ * because the current transaction ID precedes the stored
+ * vcimrv_current_ros_version.
+ */
+int
+vci_UpdateDelVec(Relation mainRel, Size workareaSize, int numRows)
+{
+ int result = -1;
+ MemoryContext memCtxWos2Ros;
+ MemoryContext oldMemCtx;
+ vci_RosCommandContext comContext;
+
+ vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_update_del_vec);
+
+ /* recover ROS if necessary */
+ vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+ /* Change Mem Context */
+ memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+ "Delete Vector Update.",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+ /* Create TID List from Whiteout WOS */
+ vci_InitRosCommandContext1(&comContext,
+ workareaSize / 2,
+ 0, numRows,
+ false);
+
+ vci_InitRosCommandContext2(&comContext, workareaSize / 2);
+
+ /* skip the command; result stays -1 */
+ if (TransactionIdPrecedes(GetCurrentTransactionId(),
+ (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+ goto done;
+
+ /* the returned snapshot is not used here */
+ GetActiveSnapshot();
+
+ /* Write Recovery Information */
+ vci_WriteRecoveryRecordForUpdateDelVec(&comContext.info);
+
+ vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+ constructTidSortState(&comContext);
+
+ /* call Main routine */
+ result = UpdateDelVec(&comContext, workareaSize / 2, Min(numRows, VCI_NUM_ROWS_IN_EXTENT));
+
+ /* Clean up WOS entry */
+ cleanUpWos(&comContext, vcimrv_data_wos_oid);
+ cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+
+ /* Xmax WOS entry */
+ RemoveWosEntries(&comContext, WOS_Data);
+ RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+ /* Finalize ROS */
+ vci_FinRosCommandContext(&comContext, false);
+
+ /* leave and drop the private memory context */
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(memCtxWos2Ros);
+
+ return result;
+}
+
+/* -------------------------------------------------------------- */
+/* Correction Deleted Rows */
+/* -------------------------------------------------------------- */
+
+/*
+ * CountExtents
+ *		Scan the extent info of mainRel and count the extents that qualify
+ *		for the requested kind.
+ *
+ * Free extents are always ignored.
+ *
+ * CEK_CountDeletedRows: an extent qualifies when it is visible for the
+ * oldest non-removable transaction ID, has no valid xdel, and has at least
+ * 'threshold' deleted rows.  best_extent_id is the qualifying extent with
+ * the most deleted rows; on ties the one scanned last wins.
+ *
+ * Any other kind: an extent qualifies when vci_ExtentIsCollectable() says
+ * so.  best_extent_id is the last qualifying extent scanned.
+ *
+ * The result starts out as {0, -1}, -1 being the not-found value.
+ */
+static vci_target_extent_info_t
+CountExtents(Relation mainRel, uint32 threshold, CEKind kind)
+{
+	vci_MainRelHeaderInfo hdrData = {0};
+	vci_MainRelHeaderInfo *hdr = &hdrData;
+	vci_target_extent_info_t found = {0, -1 /* not-found-value */ };
+	vci_meta_item_scanner_t *scanner;
+	vcis_m_extent_t *ext;
+	TransactionId horizonXid;
+	uint32		mostDeleted = 0;
+
+	horizonXid = GetOldestNonRemovableTransactionId(mainRel);
+
+	vci_InitMainRelHeaderInfo(hdr, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(hdr);
+	vci_GetMainRelVar(hdr, vcimrv_num_extents, 0);
+
+	scanner = vci_BeginMetaItemScan(hdr->rel, BUFFER_LOCK_SHARE);
+	while ((ext = vci_GetMExtentNext(hdr, scanner)) != NULL)
+	{
+		/* free extents never qualify, whichever kind is counted */
+		if (vci_ExtentIsFree(ext))
+			continue;
+
+		if (kind != CEK_CountDeletedRows)
+		{
+			if (vci_ExtentIsCollectable(ext, horizonXid))
+			{
+				found.best_extent_id = scanner->index;
+				found.num_fit_extents++;
+			}
+			continue;
+		}
+
+		/* counting deleted rows: only live, visible extents are candidates */
+		if (!vci_ExtentIsVisible(ext, horizonXid) ||
+			TransactionIdIsValid(ext->xdel))
+			continue;
+
+		if (ext->num_deleted_rows < threshold)
+			continue;
+
+		if (mostDeleted <= ext->num_deleted_rows)
+		{
+			found.best_extent_id = scanner->index;
+			mostDeleted = ext->num_deleted_rows;
+		}
+		found.num_fit_extents++;
+	}
+	vci_EndMetaItemScan(scanner);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(hdr);
+
+	return found;
+}
+
+/*
+ * vci_CountDeletedRowsInROS
+ *		Run CountExtents() in CEK_CountDeletedRows mode with the given
+ *		threshold and hand back its result.
+ */
+vci_target_extent_info_t
+vci_CountDeletedRowsInROS(Relation mainRel, uint32 threshold)
+{
+	vci_target_extent_info_t counted;
+
+	counted = CountExtents(mainRel, threshold, CEK_CountDeletedRows);
+
+	return counted;
+}
+
+/*
+ * getTupleFromVector
+ * Form a heap tuple from row 'offset' of a fetched virtual tuple set.
+ *
+ * For every column cId of vecSet the value and null flag at 'offset' are
+ * stored at index queryContext->column_id[fetchContext->column_link[cId]]
+ * of the arrays handed to heap_form_tuple().  t_self of the result is set
+ * from the TID kept in vecSet for that row.
+ *
+ * tupleDesc must have as many attributes as vecSet has columns (asserted).
+ */
+static HeapTuple
+getTupleFromVector(int offset,
+ TupleDesc tupleDesc,
+ vci_virtual_tuples_t *vecSet)
+{
+ HeapTuple result;
+ vci_CSFetchContext fetchContext = vecSet->fetch_context;
+ vci_CSQueryContext queryContext = fetchContext->query_context;
+ Datum values[MaxAttrNumber];
+ bool isnull[MaxAttrNumber];
+
+ Assert((0 <= offset) && (offset < vecSet->num_rows));
+ Assert(tupleDesc->natts == vecSet->num_columns);
+ for (int cId = 0; cId < vecSet->num_columns; ++cId)
+ {
+ /* position of this fetched column within the tuple being formed */
+ int tgtId = queryContext->column_id[fetchContext->column_link[cId]];
+
+ Assert((0 <= tgtId) && (tgtId < queryContext->num_columns));
+ values[tgtId] = vci_CSGetValuesOfVirtualTupleColumnar(vecSet, cId)[offset];
+ isnull[tgtId] = vci_CSGetIsNullOfVirtualTupleColumnar(vecSet, cId)[offset];
+ }
+ result = heap_form_tuple(tupleDesc, values, isnull);
+ /* NOTE(review): the accessor result is used differently on s390x; its definition is not visible here -- confirm */
+#ifdef __s390x__
+ result->t_self = vci_CSGetTidInItemPointerFromVirtualTuples(vecSet, offset);
+#else
+ result->t_self = *(vci_CSGetTidInItemPointerFromVirtualTuples(vecSet, offset));
+#endif
+
+ return result;
+}
+
+/*
+ * FillOneRosChunkBufferFromExtent
+ * Read rows of extent 'extentId', starting at *rowIdInExtent, and append
+ * them to comContext->buffer until it holds comContext->numRowsAtOnce rows.
+ *
+ * Query, fetch and localized fetch contexts are created on every call and
+ * destroyed before returning.  *rowIdInExtent is advanced by the rows
+ * consumed, including the rows reported as skipped by the fetch.
+ *
+ * comContext->done is set when the extent does not exist or is not visible,
+ * when *rowIdInExtent has reached VCI_NUM_ROWS_IN_EXTENT, or when a fetch
+ * returns less than one row.
+ *
+ * comContext->info.command is saved on entry and restored on exit.
+ */
+static void
+FillOneRosChunkBufferFromExtent(vci_RosCommandContext *comContext,
+ int32 extentId,
+ uint32 *rowIdInExtent)
+{
+ vci_CSQueryContext queryContext;
+ vci_CSFetchContext fetchContext;
+ vci_CSFetchContext localContext;
+ vci_virtual_tuples_t *vectorSet = NULL;
+
+ TupleDesc tupleDesc;
+ AttrNumber *tableAttrNumList;
+ AttrNumber *fetchAttrNumList;
+ int numFetchRowsAtOnce = Min(comContext->numRowsAtOnce, VCI_MAX_NUM_ROW_TO_FETCH);
+ vci_ros_command_t saveCommand1;
+
+ saveCommand1 = comContext->info.command;
+
+ /* Get a descriptor of the index relation (VCI main relation). */
+ /* This is not a descriptor of the table relation. */
+ /* It includes only the target columns for VCI. */
+ tupleDesc = vci_GetTupleDescr(&comContext->info);
+ Assert(comContext->numColumns == tupleDesc->natts);
+
+ /* Create the pg_attribute::attnum list of the table relation for initialization, */
+ /* and the list of one-based ROS column numbers for the fetch. */
+ tableAttrNumList = palloc_array(AttrNumber, comContext->numColumns);
+ fetchAttrNumList = palloc_array(AttrNumber, comContext->numColumns);
+ for (int colId = VCI_FIRST_NORMALCOLUMN_ID; colId < comContext->numColumns; ++colId)
+ {
+ tableAttrNumList[colId] = comContext->heapAttrNumList[colId];
+ fetchAttrNumList[colId] = (AttrNumber) (comContext->indxColumnIdList[colId] + 1);
+ }
+
+ /* queryContext */
+ queryContext = vci_CSCreateQueryContext(RelationGetRelid(comContext->info.rel),
+ comContext->numColumns,
+ tableAttrNumList,
+ TopTransactionContext,
+ false,
+ false);
+
+ /* fetchContext */
+ fetchContext = vci_CSCreateFetchContext(queryContext,
+ numFetchRowsAtOnce,
+ comContext->numColumns,
+ tableAttrNumList,
+ true, /* use ColumnStore */
+ true, /* return Tid */
+ false); /* NOT return CRID */
+
+ localContext = vci_CSLocalizeFetchContext(fetchContext,
+ CurrentMemoryContext);
+
+ /* Give up early unless the extent both exists and is visible. */
+ {
+ vci_extent_status_t *status = vci_CSCreateCheckExtent(localContext);
+ bool extent_ok;
+
+ vci_CSCheckExtent(status, localContext, extentId, true);
+
+ elog(DEBUG2, "status: %d, %d, %d, %d", status->size, status->num_rows,
+ status->existence, status->visible);
+
+ extent_ok = status->existence && status->visible;
+
+ vci_CSDestroyCheckExtent(status);
+
+ if (!extent_ok)
+ {
+ comContext->done = true;
+ goto done;
+ }
+ }
+
+ /* VectorSet */
+ vectorSet = vci_CSCreateVirtualTuples(localContext);
+
+ {
+ while (comContext->buffer.numFilled < comContext->numRowsAtOnce)
+ {
+ /* int numFetchRows; */
+ int numRead;
+
+ /* the whole extent has been consumed */
+ if ((*rowIdInExtent) >= VCI_NUM_ROWS_IN_EXTENT)
+ {
+ comContext->done = true;
+ goto done;
+ }
+
+ /*
+ * if (((*rowIdInExtent) + numFetchRowsAtOnce) <=
+ * VCI_NUM_ROWS_IN_EXTENT) numFetchRows = numFetchRowsAtOnce; else
+ * numFetchRows = VCI_NUM_ROWS_IN_EXTENT - (*rowIdInExtent);
+ */
+
+ /* FIXME: Does it need to use numFetchRows?? */
+ /* let the vci_CSFetchVirtualTuples optimize the number of rows */
+ numRead = vci_CSFetchVirtualTuples(vectorSet,
+ vci_CalcCrid64(extentId, *rowIdInExtent),
+ numFetchRowsAtOnce);
+
+ if (numRead < 1)
+ {
+ comContext->done = true;
+ goto done;
+ }
+
+ /* Read fetched data as HeapTuple */
+ for (int offset = 0; offset < numRead; ++offset)
+ {
+ HeapTuple tuple = NULL;
+ uint16 skip = vci_CSGetSkipFromVirtualTuples(vectorSet)[offset];
+
+ /*
+ * A non-zero skip advances both the row id and the offset by
+ * 'skip' rows (the loop increment supplies the final +1).
+ */
+ if (0 < skip)
+ {
+ (*rowIdInExtent) += skip;
+ offset += skip - 1;
+ continue;
+ }
+
+ tuple = getTupleFromVector(offset, tupleDesc, vectorSet);
+ (*rowIdInExtent) += 1;
+
+ if (tuple != NULL)
+ {
+ /* ... and register to ROS Chunk. */
+ vci_FillOneRowInRosChunkBuffer(&(comContext->buffer),
+ &(comContext->info),
+ &tuple->t_self,
+ tuple,
+ comContext->indxColumnIdList,
+ fetchAttrNumList,
+ tupleDesc);
+ /* buffer full: the remaining fetched rows are left for the next call */
+ if (comContext->buffer.numFilled == comContext->numRowsAtOnce)
+ break;
+ }
+ else
+ {
+ Assert(false);
+ elog(LOG, "internal error: CDR command failed");
+ }
+ }
+ }
+ }
+
+done:
+ /* vectorSet is still NULL when the extent check failed */
+ if (vectorSet)
+ vci_CSDestroyVirtualTuples(vectorSet);
+ vci_CSDestroyFetchContext(localContext);
+ vci_CSDestroyFetchContext(fetchContext);
+ vci_CSDestroyQueryContext(queryContext);
+
+ pfree(fetchAttrNumList);
+ pfree(tableAttrNumList);
+
+ comContext->info.command = saveCommand1;
+}
+
+/*
+ * isCdrTargetExtentValid
+ *		Check whether comContext->extentIdSrc can serve as the source extent
+ *		of a collect-deleted-rows run.
+ *
+ * The source must differ from the destination extent, lie below the stored
+ * number of extents, be visible for comContext->wos2rosXid, and carry no
+ * valid xdel.
+ */
+static bool
+isCdrTargetExtentValid(vci_RosCommandContext *comContext)
+{
+	Buffer		buf = InvalidBuffer;
+	vcis_m_extent_t *src;
+	uint32		extentCount;
+	bool		usable = false;
+
+	if (comContext->extentId != comContext->extentIdSrc)
+	{
+		extentCount = vci_GetMainRelVar(&comContext->info, vcimrv_num_extents, 0);
+
+		if (comContext->extentIdSrc < extentCount)
+		{
+			src = vci_GetMExtent(&buf, &comContext->info, comContext->extentIdSrc);
+
+			/* inspect the extent info under a share lock */
+			LockBuffer(buf, BUFFER_LOCK_SHARE);
+			if (vci_ExtentIsVisible(src, comContext->wos2rosXid))
+				usable = !TransactionIdIsValid(src->xdel);
+			UnlockReleaseBuffer(buf);
+		}
+	}
+
+	return usable;
+}
+
+/*
+ * CollectDeletedRows
+ * Copy the rows of extent comContext->extentIdSrc into the new extent
+ * comContext->extentId, then top the new extent up with rows from the WOS.
+ *
+ * Steps, in order: write the extent info of the new extent for recovery,
+ * read the source extent chunk by chunk into the chunk storage, stamp the
+ * source extent's xdel with comContext->xid, append WOS rows while there is
+ * room, and finally update the TID-CRID update list and write the new
+ * extent.
+ *
+ * Returns the number of rows stored in the new extent.  When no row was
+ * collected, vcimrv_new_extent_id is reset to VCI_INVALID_EXTENT_ID and 0 is
+ * returned.
+ *
+ * The 'snapshot' argument is not referenced in this function.
+ */
+static int32
+CollectDeletedRows(vci_RosCommandContext *comContext, Snapshot snapshot)
+{
+ uint32 rowIdInExtent;
+
+ vcis_m_extent_t *extentInfo;
+ Buffer buffer = InvalidBuffer;
+
+ int numRows;
+
+ Assert(0 == (comContext->numRowsAtOnce % VCI_COMPACTION_UNIT_ROW));
+
+ /*
+ * Set CDR data and write main relation for recovery. Header and extent
+ * info. Here, we also put current ROS version to the actual current
+ * transaction ID.
+ */
+ vci_WriteExtentInfoInMainRosForWriteExtent(&comContext->info,
+ comContext->extentId,
+ comContext->xid,
+ vci_rc_collect_deleted);
+
+ /* Create ROS Chunk from target Extent */
+ vci_ResetRosChunkStorage(&(comContext->storage));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+ /* collect data from old extent for new extent */
+ rowIdInExtent = 0;
+ while (!comContext->done)
+ {
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* fetch the data from old extents for one chunk */
+ FillOneRosChunkBufferFromExtent(comContext,
+ comContext->extentIdSrc, &rowIdInExtent);
+
+ if (comContext->buffer.numFilled == comContext->numRowsAtOnce)
+ {
+ /* copy chunk buffer in a compact manner */
+ vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+ Assert(comContext->storage.numTotalRows <= VCI_NUM_ROWS_IN_EXTENT);
+ }
+ else
+ {
+ Assert(comContext->done);
+
+ /*
+ * We read and fill data in unit of VCI_COMPACTION_UNIT_ROW. The
+ * remaining data is read outside this loop to merge data read
+ * newly from WOS.
+ */
+ }
+ }
+ /* reset the flag that ended the loop above */
+ comContext->done = false;
+
+ elog(DEBUG2, "... collected deleted extent %d -> %d", comContext->extentIdSrc,
+ comContext->extentId);
+
+ /*
+ * Now, reading from old extent was completed. Write Current ROS Version
+ * to VCI main relation as the XDel of old extent.
+ */
+ extentInfo = vci_GetMExtent(&buffer, &(comContext->info),
+ comContext->extentIdSrc);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ extentInfo->xdel = comContext->xid;
+ vci_WriteOneItemPage(comContext->info.rel, buffer);
+ UnlockReleaseBuffer(buffer);
+
+ /* Append data from WOS */
+
+ /* room left in the new extent, bounded by numRowsToConvert */
+ numRows = Min((VCI_NUM_ROWS_IN_EXTENT - comContext->storage.numTotalRows
+ - comContext->buffer.numFilled),
+ comContext->numRowsToConvert);
+
+ if (numRows > 0)
+ {
+ fillTidListFromTidSortState(comContext, numRows);
+
+ ReadOneExtentAndStoreInChunkStorage(comContext);
+ }
+
+ /* Copy the remaining data to chunk buffer in a compact manner */
+ if (0 < comContext->buffer.numFilled)
+ {
+ vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+ vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+ Assert(comContext->storage.numTotalRows <= VCI_NUM_ROWS_IN_EXTENT);
+ }
+
+ /*
+ * Update TID-CRID List, and Write Ros Chunk into new extent.
+ */
+ comContext->numRowsToConvert = comContext->storage.numTotalRows;
+
+ /* nothing to write: withdraw the new extent id and stop */
+ if (comContext->numRowsToConvert == 0)
+ {
+
+ vci_SetMainRelVar(&comContext->info, vcimrv_new_extent_id, 0, VCI_INVALID_EXTENT_ID);
+
+ return 0;
+ }
+
+ vci_AddTidCridUpdateList(&(comContext->info),
+ &(comContext->storage),
+ comContext->extentId);
+ vci_WriteOneExtent(&(comContext->info),
+ &(comContext->storage),
+ comContext->extentId, /* to */
+ comContext->xid,
+ InvalidTransactionId,
+ comContext->xid);
+
+ return comContext->storage.numTotalRows;
+}
+
+/*
+ * vci_CollectDeletedRows
+ * Entry point of the "collect deleted rows" ROS command for one extent.
+ *
+ * 'extentId' is the source extent; the destination is obtained from
+ * vci_GetFreeExtentId().  Work is done in a private memory context created
+ * under TopTransactionContext.  Two thirds of workareaSize go to
+ * vci_InitRosCommandContext1() and one third to vci_InitRosCommandContext2().
+ *
+ * Returns the value of CollectDeletedRows(), or -1 when the command is
+ * skipped: either the current transaction ID precedes the stored
+ * vcimrv_current_ros_version, or isCdrTargetExtentValid() rejects the
+ * source extent.
+ */
+int
+vci_CollectDeletedRows(Relation mainRel, Size workareaSize, int32 extentId)
+{
+ int result = -1;
+
+ MemoryContext memCtxWos2Ros;
+ MemoryContext oldMemCtx;
+ vci_RosCommandContext comContext;
+ Snapshot snapshot;
+
+ vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_collect_deleted);
+
+ /* execute recovery of the previous ROS command if necessary */
+ vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+ /* Change Mem Context */
+ memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+ "Collect Deleted Rows",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+ /* CommandContext */
+ vci_InitRosCommandContext1(&comContext,
+ workareaSize / 3 * 2,
+ VCI_NUM_ROWS_IN_EXTENT, 0,
+ true);
+
+ vci_InitRosCommandContext2(&comContext, workareaSize / 3);
+
+ /* skip the command; result stays -1 */
+ if (TransactionIdPrecedes(GetCurrentTransactionId(),
+ (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+ goto done;
+
+ snapshot = GetActiveSnapshot();
+
+ /* obtain new extent ID */
+ comContext.extentIdSrc = extentId;
+ comContext.extentId = vci_GetFreeExtentId(&(comContext.info));
+
+ if (!isCdrTargetExtentValid(&comContext))
+ goto done;
+
+ /* Write Recovery Information of this command. */
+ vci_WriteRecoveryRecordForExtentInfo(&comContext.info, comContext.extentId, comContext.extentIdSrc);
+ vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+ vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+ constructTidSortState(&comContext);
+
+ vci_InitRosChunkStroageAndBuffer(&comContext, true /* append */ );
+
+ /* call Main routine */
+ result = CollectDeletedRows(&comContext, snapshot);
+
+ /* Clean up WOS entry */
+ cleanUpWos(&comContext, vcimrv_data_wos_oid);
+ cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+
+ /* Xmax WOS entry */
+ RemoveWosEntries(&comContext, WOS_Data);
+ RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+ /* Finalize ROS */
+ vci_FinRosCommandContext(&comContext, false);
+
+ /* leave and drop the private memory context */
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(memCtxWos2Ros);
+
+ return result;
+}
+
+/* -------------------------------------------------------------- */
+/* Collect Unused Extent */
+/* -------------------------------------------------------------- */
+
+/*
+ * vci_CountUnusedExtents
+ *		Run CountExtents() in CEK_CountUnusedExtents mode (the threshold is
+ *		passed as 0) and hand back its result.
+ */
+vci_target_extent_info_t
+vci_CountUnusedExtents(Relation mainRel)
+{
+	vci_target_extent_info_t counted;
+
+	counted = CountExtents(mainRel, 0, CEK_CountUnusedExtents);
+
+	return counted;
+}
+
+/*
+ * SearchUnusedExtent
+ *		Return the index of the first extent that vci_ExtentIsCollectable()
+ *		accepts for the oldest non-removable transaction ID of info->rel, or
+ *		VCI_INVALID_EXTENT_ID when the scan finds none.
+ */
+static uint32
+SearchUnusedExtent(vci_MainRelHeaderInfo *info)
+{
+	vci_meta_item_scanner_t *scanner;
+	vcis_m_extent_t *ext;
+	TransactionId horizon = GetOldestNonRemovableTransactionId(info->rel);
+	int32		hit = VCI_INVALID_EXTENT_ID;
+
+	/* search deleted extent */
+	scanner = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE);
+	for (;;)
+	{
+		ext = vci_GetMExtentNext(info, scanner);
+		if (ext == NULL)
+			break;
+
+		if (!vci_ExtentIsCollectable(ext, horizon))
+			continue;
+
+		/* the first match ends the search */
+		hit = scanner->index;
+		break;
+	}
+	vci_EndMetaItemScan(scanner);
+
+	return hit;
+}
+
+/*
+ * CollectUnusedExtent
+ *		Release the storage of extent comContext->extentId in every column
+ *		relation and reset its extent info in the main relation.
+ *
+ * For each column, starting at VCI_COLUMN_ID_NULL:
+ *	- a column whose extent pointer is not enabled is skipped entirely;
+ *	- unless the column is the one recorded as recovered_colid in the extent
+ *	  info, its extent is handed back: a column that is not fixed-raw
+ *	  compressed goes through the free-space link list, otherwise the extent
+ *	  header type is simply set to vcis_free_space;
+ *	- the column's extent pointer is then rewritten with InvalidBlockNumber.
+ *
+ * Finally the extent info of the main relation is overwritten with zeroes
+ * and invalid transaction IDs.
+ */
+static void
+CollectUnusedExtent(vci_RosCommandContext *comContext)
+{
+	int16		numColumns = vci_GetMainRelVar(&comContext->info, vcimrv_num_columns, 0);
+	int16		recoveredColId = VCI_INVALID_COLUMN_ID;
+	vcis_m_extent_t *extentInfo;
+	Buffer		buffer = InvalidBuffer;
+
+	extentInfo = vci_GetMExtent(&buffer, &comContext->info, comContext->extentId);
+
+	LockBuffer(buffer, BUFFER_LOCK_SHARE);
+	if (extentInfo->flags & VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID)
+		recoveredColId = extentInfo->recovered_colid;
+	UnlockReleaseBuffer(buffer);
+
+	for (int16 colId = VCI_COLUMN_ID_NULL; colId < numColumns; ++colId)
+	{
+		vci_ColumnRelations relPairData;
+		vci_ColumnRelations *relPair = &relPairData;
+		vcis_c_extent_t *extentPointer;
+
+		LOCKMODE	lockmode = RowExclusiveLock;
+
+		Buffer		bufData;
+		Buffer		bufMeta;
+		BlockNumber blockNumber;
+		BlockNumber startBlockNumber;
+
+		Page		page;
+
+		vcis_extent_t *extentHead;
+
+		vci_OpenColumnRelations(relPair, &comContext->info, colId, lockmode);
+
+		/* target column-extent pointer */
+		extentPointer = vci_GetColumnExtent(&bufMeta, &blockNumber,
+											relPair->meta,
+											comContext->extentId);
+		startBlockNumber = extentPointer->enabled ? extentPointer->block_number : InvalidBlockNumber;
+		ReleaseBuffer(bufMeta);
+
+		if (!BlockNumberIsValid(startBlockNumber))
+		{
+			/* Close Column */
+			elog(DEBUG2, "this is invalid extent pointer!!");
+			vci_CloseColumnRelations(relPair, lockmode);
+			continue;
+		}
+
+		/* get extent Header */
+		bufData = vci_ReadBufferWithPageInit(relPair->data, startBlockNumber);
+		page = BufferGetPage(bufData);
+		extentHead = vci_GetExtentT(page);
+
+		if (colId == recoveredColId)
+		{
+			/*
+			 * The free-list step is skipped for this column, so nothing
+			 * below would drop the pin taken on the extent header page.
+			 * Release it here to avoid leaking the buffer pin.
+			 */
+			ReleaseBuffer(bufData);
+			goto skip_collect_freelist;
+		}
+
+		/* Freelist link node */
+		{
+			bool		isFixedLength;
+
+			/* special columns below VCI_FIRST_NORMALCOLUMN_ID count as fixed */
+			isFixedLength = true;
+			if (VCI_FIRST_NORMALCOLUMN_ID <= colId)
+			{
+				vcis_m_column_t *colInfo;
+
+				colInfo = vci_GetMColumn(&comContext->info, colId);
+				if (colInfo->comp_type != vcis_compression_type_fixed_raw)
+					isFixedLength = false;
+			}
+
+			if (!isFixedLength)
+			{
+				vcis_free_space_t newFS;
+				BlockNumber newFSBlockNumber;
+
+				vci_MakeFreeSpace(relPair, startBlockNumber, &newFSBlockNumber, &newFS, true);
+
+				/* FIXME */ /* The common dictionary should be collected? */
+				vci_WriteRecoveryRecordForFreeSpace(relPair,
+													colId, VCI_INVALID_DICTIONARY_ID,
+													newFSBlockNumber,
+													&newFS);
+
+				ReleaseBuffer(bufData);
+				vci_AppendFreeSpaceToLinkList(relPair,
+											  newFSBlockNumber,
+											  newFS.prev_pos,
+											  newFS.next_pos,
+											  newFS.size);
+			}
+			else
+			{
+				LockBuffer(bufData, BUFFER_LOCK_EXCLUSIVE);
+				extentHead->type = vcis_free_space;
+				vci_WriteOneItemPage(relPair->data, bufData);
+				UnlockReleaseBuffer(bufData);
+			}
+		}
+
+skip_collect_freelist:
+		/* clear the column's pointer to this extent */
+		vci_WriteRawDataExtentInfo(relPair->meta,
+								   comContext->extentId,
+								   InvalidBlockNumber,
+								   0,
+								   NULL,	/* min */
+								   NULL,	/* max */
+								   false,
+								   false);
+
+		/* Close Column */
+		vci_CloseColumnRelations(relPair, lockmode);
+	}
+	/* loop for each column */
+
+	vci_WriteExtentInfo(&comContext->info,
+						comContext->extentId,
+						0,
+						0,
+						0,
+						InvalidTransactionId,
+						InvalidTransactionId);
+}
+
+/*
+ * vci_CollectUnusedExtent
+ * Entry point of the "collect unused extent" ROS command.
+ *
+ * Looks for one collectable extent with SearchUnusedExtent() and, when one
+ * is found, writes the recovery records and calls CollectUnusedExtent() on
+ * it.  Work is done in a private memory context created under
+ * TopTransactionContext.
+ *
+ * Returns the ID of the collected extent, or -1 when nothing was done:
+ * either the current transaction ID precedes the stored
+ * vcimrv_current_ros_version, or no collectable extent exists.
+ */
+int
+vci_CollectUnusedExtent(Relation mainRel, Size workareaSize)
+{
+ int result = -1;
+
+ MemoryContext memCtxWos2Ros;
+ MemoryContext oldMemCtx;
+ vci_RosCommandContext comContext;
+
+ vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_collect_extent);
+
+ /* execute recovery of the previous ROS command if necessary */
+ vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+ /* Change Mem Context */
+ memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+ "Collect Deleted Extent",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+ /* CommandContext */
+ vci_InitRosCommandContext1(&comContext,
+ workareaSize,
+ 0, 0,
+ false);
+
+ /* skip the command; result stays -1 */
+ if (TransactionIdPrecedes(GetCurrentTransactionId(),
+ (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+ goto done;
+
+ /* this command has no source extent, only the extent to collect */
+ comContext.extentIdSrc = VCI_INVALID_EXTENT_ID;
+ comContext.extentId = SearchUnusedExtent(&comContext.info);
+
+ if (comContext.extentId == VCI_INVALID_EXTENT_ID)
+ goto done;
+
+ /* Write Recovery Information of this command. */
+ vci_WriteRecoveryRecordForExtentInfo(&comContext.info, VCI_INVALID_EXTENT_ID, comContext.extentId);
+ vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+ vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+ /* call Main routine */
+ CollectUnusedExtent(&comContext);
+
+ result = comContext.extentId;
+
+done:
+ /* Finalize ROS */
+ vci_FinRosCommandContext(&comContext, false);
+
+ /* leave and drop the private memory context */
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(memCtxWos2Ros);
+
+ return result;
+}
+
+/* -------------------------------------------------------------- */
+/* Update TID-CRID Tree */
+/* -------------------------------------------------------------- */
+
+/*
+ * vci_CountTidCridUpdateListLength
+ *		Return the length of the TID-CRID update list currently selected by
+ *		vcimrv_tid_crid_diff_sel in the main relation.
+ *
+ * The 'workarea' argument is not referenced.
+ */
+int32
+vci_CountTidCridUpdateListLength(Relation mainRel, Size workarea)
+{
+	vci_MainRelHeaderInfo hdrData = {0};
+	vci_MainRelHeaderInfo *hdr = &hdrData;
+	int32		activeSel;
+	int32		length;
+
+	vci_InitMainRelHeaderInfo(hdr, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(hdr);
+
+	activeSel = vci_GetMainRelVar(hdr, vcimrv_tid_crid_diff_sel, 0);
+	length = vci_GetTidCridUpdateListLength(hdr, activeSel);
+
+	/* done with the main relation header */
+	vci_ReleaseMainRelHeader(hdr);
+
+	return length;
+}
+
+/**
+ * Move entries of the TID-CRID update list into the TID-CRID tree.
+ *
+ * For every block number in comContext->utility_array.orig_blknos, the
+ * consecutive entries of the currently selected update list whose TID lies
+ * on that block are gathered and passed to vci_UpdateTidCridSubTree(); the
+ * subtree is created first when it does not exist yet.  The TIDs of all
+ * gathered entries are then merged into the other update list with
+ * VCI_MOVED_CRID.
+ *
+ * @param[in] comContext Conv Context; supplies the main relation header and
+ * the list of block numbers to move
+ * @param[in] workareaSize memory budget, passed (in kB, capped at INT_MAX)
+ * to the tuplesort that collects the moved TIDs
+ */
+static void
+UpdateTidCrid(vci_RosCommandContext *comContext, Size workareaSize)
+{
+ const LOCKMODE lockmode = RowExclusiveLock;
+ int i;
+
+ vci_TidCridRelations relPairData;
+ vci_TidCridRelations *relPair = &relPairData;
+
+ vci_TidCridUpdateListContext *oldListContext = NULL;
+ BlockNumber prevOldListBlkno = InvalidBlockNumber;
+ vcis_tidcrid_pair_item_t *array;
+
+ vcis_tidcrid_pair_list_t *moveList;
+ Tuplesortstate *deleteList;
+
+ uint32 oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+ uint32 newSel = 1 ^ oldSel;
+
+ oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+ /* room for MaxHeapTuplesPerPage items behind the list header */
+ moveList = palloc(offsetof(vcis_tidcrid_pair_list_t, body) + (sizeof(vcis_tidcrid_pair_item_t) * MaxHeapTuplesPerPage));
+ moveList->num = 0;
+
+ deleteList =
+ tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+ Min(workareaSize / 1024, INT_MAX), NULL, TUPLESORT_NONE);
+ /* holds the items of one update-list block at a time */
+ array = palloc_array(vcis_tidcrid_pair_item_t, VCI_TID_CRID_UPDATE_PAGE_ITEMS);
+
+ vci_OpenTidCridRelations(relPair, &comContext->info, lockmode);
+
+ /* cursor into the old update list; it is NOT reset per block to move */
+ i = 0;
+
+ for (uint32 toMove = 0; toMove < comContext->utility_array.num; toMove++)
+ {
+ ItemPointerData treeNodeData;
+ ItemPointer treeNode = &treeNodeData;
+
+ BlockNumber blkToMove;
+
+ blkToMove = comContext->utility_array.orig_blknos[toMove];
+
+ moveList->num = 0;
+
+ for (; i < oldListContext->count; i++)
+ {
+ BlockNumber blkno = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + (i / VCI_TID_CRID_UPDATE_PAGE_ITEMS);
+ vcis_tidcrid_pair_item_t item;
+
+ /* load the list block holding item i unless it is already in 'array' */
+ if (prevOldListBlkno != blkno)
+ {
+ vci_ReadOneBlockFromTidCridUpdateList(oldListContext, blkno, array);
+ prevOldListBlkno = blkno;
+ }
+
+ item = array[i % VCI_TID_CRID_UPDATE_PAGE_ITEMS];
+
+ /* first item of another block: leave it for a later iteration */
+ if (ItemPointerGetBlockNumber(&item.page_item_id) != blkToMove)
+ break;
+
+ Assert(moveList->num < MaxHeapTuplesPerPage);
+
+ moveList->body[moveList->num] = item;
+ moveList->num++;
+
+ tuplesort_putdatum(deleteList, ItemPointerGetDatum(&item.page_item_id), false);
+ }
+
+ if (moveList->num == 0)
+ continue;
+
+ vci_GetTidCridSubTree(relPair, blkToMove, treeNode);
+
+ if (!ItemPointerIsValid(treeNode))
+ vci_CreateTidCridSubTree(relPair, blkToMove, treeNode);
+
+ vci_UpdateTidCridSubTree(relPair, treeNode, moveList);
+ }
+
+ pfree(array);
+ pfree(moveList);
+
+ vci_CloseTidCridRelations(relPair, lockmode);
+
+ vci_CloseTidCridUpdateList(oldListContext);
+
+ tuplesort_performsort(deleteList);
+
+ /* write the new update list, tagging the moved TIDs with VCI_MOVED_CRID */
+ vci_MergeAndWriteTidCridUpdateList(&comContext->info, newSel, oldSel, deleteList, vci_GetCridFromUint64(VCI_MOVED_CRID));
+
+ tuplesort_end(deleteList);
+}
+
+/**
+ * Collect the distinct heap block numbers found at the head of the currently
+ * selected TID-CRID update list.
+ *
+ * The list is read block by block; every time the block number of an item's
+ * TID differs from the previous item's, it is appended to
+ * comContext->utility_array.orig_blknos.  Collection stops once numPages
+ * block numbers are stored or the list is exhausted.
+ *
+ * @param[in,out] comContext Conv Context; utility_array.num is reset and
+ * utility_array.orig_blknos is filled
+ * @param[in] numPages maximum number of block numbers to store.  A value
+ * of zero or less stores nothing, because the first store
+ * would otherwise already exceed the requested capacity.
+ */
+static void
+collectBlockNumberToMove(vci_RosCommandContext *comContext, int numPages)
+{
+	uint32		oldSel;
+	vci_TidCridUpdateListContext *oldListContext;
+	BlockNumber prevblk = InvalidBlockNumber;
+	vcis_tidcrid_pair_item_t *array;
+	BlockNumber blockNumber = VCI_TID_CRID_UPDATE_BODY_PAGE_ID;
+	uint64		count = 0;
+
+	comContext->utility_array.num = 0;
+
+	/*
+	 * The stop test below only fires after an entry has been stored, so with
+	 * a non-positive limit it would never fire and the array would be
+	 * overrun.
+	 */
+	if (numPages <= 0)
+		return;
+
+	oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+	oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+	/* holds the items of one update-list block at a time */
+	array = palloc_array(vcis_tidcrid_pair_item_t, VCI_TID_CRID_UPDATE_PAGE_ITEMS);
+
+	while (blockNumber < oldListContext->nblocks)
+	{
+		int			i;
+
+		vci_ReadOneBlockFromTidCridUpdateList(oldListContext, blockNumber, array);
+
+		/* 'count' bounds the walk to the items actually in the list */
+		for (i = 0; (i < VCI_TID_CRID_UPDATE_PAGE_ITEMS) && (count < oldListContext->count); i++)
+		{
+			BlockNumber blkno = ItemPointerGetBlockNumber(&array[i].page_item_id);
+
+			if (prevblk != blkno)
+			{
+				comContext->utility_array.orig_blknos[comContext->utility_array.num++] = blkno;
+				prevblk = blkno;
+
+				if (numPages == comContext->utility_array.num)
+					goto done;
+			}
+
+			count++;
+		}
+
+		blockNumber++;
+	}
+
+done:
+	pfree(array);
+
+	vci_CloseTidCridUpdateList(oldListContext);
+}
+
+/*
+ * vci_UpdateTidCrid
+ * Entry point of the "update TID-CRID tree" ROS command.
+ *
+ * Collects up to numPages heap block numbers from the TID-CRID update list
+ * with collectBlockNumberToMove(), writes the recovery records, and moves
+ * the matching entries into the tree with UpdateTidCrid().  Work is done in
+ * a private memory context created under TopTransactionContext.
+ *
+ * Returns the number of block numbers collected, or 0 when the command is
+ * skipped because the current transaction ID precedes the stored
+ * vcimrv_current_ros_version.
+ */
+int
+vci_UpdateTidCrid(Relation mainRel, Size workareaSize, int numPages)
+{
+ int result = 0;
+
+ MemoryContext memCtxWos2Ros;
+ MemoryContext oldMemCtx;
+ vci_RosCommandContext comContext;
+
+ vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_update_tid_crid);
+
+ /* execute recovery of the previous ROS command if necessary */
+ vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+ /* Change Mem Context */
+ memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+ "TIDCRID Tree Update",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+ /* CommandContext */
+ vci_InitRosCommandContext1(&comContext,
+ workareaSize,
+ 0, 0,
+ false);
+
+ /* skip the command; result stays 0 */
+ if (TransactionIdPrecedes(GetCurrentTransactionId(),
+ (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+ goto done;
+
+ /* array receiving the block numbers; allocated in the private context */
+ comContext.utility_array.orig_blknos = palloc_array(BlockNumber, numPages);
+ comContext.utility_array.max = numPages;
+
+ collectBlockNumberToMove(&comContext, numPages);
+
+ result = comContext.utility_array.num;
+
+ /* Write Recovery Information of this command. */
+ vci_InitRecoveryRecordForTidCrid(&comContext.info);
+ vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+ vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+ /* call Main routine */
+ UpdateTidCrid(&comContext, workareaSize);
+
+done:
+ /* Finalize ROS */
+ vci_FinRosCommandContext(&comContext, false);
+
+ /* leave and drop the private memory context */
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(memCtxWos2Ros);
+
+ return result;
+}
+
+/* -------------------------------------------------------------- */
+/* Vacuum and Freeze */
+/* -------------------------------------------------------------- */
+
+/*
+ * freezeMainAndRos
+ *		Replace old transaction IDs kept in the main relation with
+ *		FrozenTransactionId.
+ *
+ * A transaction ID is replaced when it is a normal XID that precedes
+ * comContext->wos2rosXid.  This is applied to vcimrv_last_ros_version and to
+ * the xgen and xdel of every extent info entry returned by the scan.
+ */
+static void
+freezeMainAndRos(vci_RosCommandContext *comContext)
+{
+	vci_MainRelHeaderInfo *hdr = &comContext->info;
+	TransactionId cutoff = comContext->wos2rosXid;
+	TransactionId lastVer;
+	vci_meta_item_scanner_t *scanner;
+	vcis_m_extent_t *ext;
+
+	lastVer = vci_GetMainRelVar(hdr, vcimrv_last_ros_version, 0);
+	if (TransactionIdIsNormal(lastVer) && TransactionIdPrecedes(lastVer, cutoff))
+		vci_SetMainRelVar(hdr, vcimrv_last_ros_version, 0, FrozenTransactionId);
+
+	scanner = vci_BeginMetaItemScan(hdr->rel, BUFFER_LOCK_EXCLUSIVE);
+	for (ext = vci_GetMExtentNext(hdr, scanner);
+		 ext != NULL;
+		 ext = vci_GetMExtentNext(hdr, scanner))
+	{
+		/* xgen < cutoff */
+		bool		freezeGen = TransactionIdIsNormal(ext->xgen) &&
+			TransactionIdPrecedes(ext->xgen, cutoff);
+
+		/* xdel < cutoff */
+		bool		freezeDel = TransactionIdIsNormal(ext->xdel) &&
+			TransactionIdPrecedes(ext->xdel, cutoff);
+
+		if (freezeGen)
+			ext->xgen = FrozenTransactionId;
+
+		if (freezeDel)
+			ext->xdel = FrozenTransactionId;
+	}
+	vci_EndMetaItemScan(scanner);
+}
+
+/*
+ * VCITupleSatisfiesVisibility
+ * Visibility test for a heap tuple that also handles the VCI snapshot types.
+ *
+ * SNAPSHOT_VCI_WOS2ROS is answered by HeapTupleSatisfiesWos2Ros() and
+ * SNAPSHOT_VCI_LOCALROS by HeapTupleSatisfiesLocalRos(); any other snapshot
+ * type is handed to HeapTupleSatisfiesVisibility().
+ *
+ * Notes:
+ * Assumes heap tuple is valid, and buffer at least share locked.
+ * Modelled on PostgreSQL's HeapTupleSatisfiesVisibility().
+ */
+bool
+VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
+{
+ if (snapshot->snapshot_type == SNAPSHOT_VCI_WOS2ROS)
+ return HeapTupleSatisfiesWos2Ros(htup, snapshot, buffer);
+
+ if (snapshot->snapshot_type == SNAPSHOT_VCI_LOCALROS)
+ return HeapTupleSatisfiesLocalRos(htup, snapshot, buffer);
+
+ /* Not a VCI snapshot type: use the regular heap visibility rules. */
+ return HeapTupleSatisfiesVisibility(htup, snapshot, buffer);
+}
+/* Freeze the xmin of tuples in the WOS relation selected by wosType that pass the snapshot test and precede comContext->oldestXmin. */
+static void
+freezeWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType, Snapshot snapshot)
+{
+ LOCKMODE lockmode = ShareUpdateExclusiveLock;
+ Oid oid;
+ HeapTupleFreeze *frozen;
+ Relation rel;
+ BlockNumber nblocks;
+
+ frozen = palloc0_array(HeapTupleFreeze, MaxHeapTuplesPerPage); /* freeze plans, reused for every page */
+
+ oid = vci_GetMainRelVar(&comContext->info, wosType, 0); /* wosType names the variable holding the WOS OID */
+
+ rel = table_open(oid, lockmode);
+
+ nblocks = RelationGetNumberOfBlocks(rel);
+ /* Visit every block counted above, one exclusively locked buffer at a time */
+ for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber maxoff;
+ int nfrozen = 0;
+
+ buffer = ReadBuffer(rel, blkno);
+
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ page = BufferGetPage(buffer);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (OffsetNumber offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid;
+ HeapTupleData loctup;
+
+ itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsNormal(itemid))
+ {
+ bool valid;
+ TransactionId xmin;
+ /* Describe the on-page tuple in a local HeapTupleData */
+ loctup.t_tableOid = RelationGetRelid(rel);
+ loctup.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+ loctup.t_len = ItemIdGetLength(itemid);
+ ItemPointerSet(&loctup.t_self, blkno, offnum);
+
+ valid = VCITupleSatisfiesVisibility(&loctup, snapshot, buffer);
+
+ HeapCheckForSerializableConflictOut(valid, rel, &loctup, buffer, snapshot);
+
+ xmin = HeapTupleHeaderGetXmin(loctup.t_data);
+ /* Plan a freeze only for visible tuples whose xmin is not yet frozen and precedes oldestXmin */
+ if (valid &&
+ !TransactionIdEquals(xmin, FrozenTransactionId) &&
+ TransactionIdPrecedes(xmin, comContext->oldestXmin))
+ {
+ HeapTupleFreeze *frz = &frozen[nfrozen];
+ HeapTupleHeader tuple = loctup.t_data;
+ /* Only these fields are assigned; the rest keep their palloc0 zeros (assuming the heap_* calls below leave the array untouched) */
+ frz->frzflags = 0;
+ frz->t_infomask2 = tuple->t_infomask2;
+ frz->t_infomask = tuple->t_infomask | HEAP_XMIN_FROZEN;
+ frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
+ frz->offset = offnum;
+
+ nfrozen++;
+ }
+ }
+ }
+ /* Apply this page's plans and WAL-log them inside a single critical section */
+ if (nfrozen > 0)
+ {
+ heap_pre_freeze_checks(buffer, frozen, nfrozen);
+ START_CRIT_SECTION();
+ heap_freeze_prepared_tuples(buffer, frozen, nfrozen);
+ MarkBufferDirty(buffer);
+
+ /* Now WAL-log freezing if necessary */
+ if (RelationNeedsWAL(rel))
+ {
+ /*
+ * Commit add323d added the vmbuffer/vmflags parameters.
+ * A quick fix was needed to allow build to proceed.
+ *
+ * TODO Confirm if passing InvalidBuffer, 0 is OK here.
+ */
+ log_heap_prune_and_freeze(rel, buffer,
+ InvalidBuffer, /* vmbuffer */
+ 0, /* vmflags */
+ comContext->oldestXmin,
+ false, /* no cleanup lock
+ * required */
+ PRUNE_VACUUM_SCAN,
+ frozen, nfrozen,
+ NULL, 0, /* redirected */
+ NULL, 0, /* dead */
+ NULL, 0); /* unused */
+ }
+ END_CRIT_SECTION();
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+
+ table_close(rel, lockmode);
+
+ pfree(frozen);
+}
+
+/**
+ * Shrink the ROS: set vcimrv_num_extents to one past the last extent that
+ * still has a valid xgen or xdel, then truncate every column's data relation.
+ * @param[in] comContext Conv Context
+ *
+ * @note Not transaction-safe: truncation is immediate and cannot be rolled
+ * back.  Caller must have checked permissions and hold AccessExclusiveLock.
+ */
+static void
+truncateRos(vci_RosCommandContext *comContext)
+{
+ const LOCKMODE lockmode = ShareUpdateExclusiveLock; /* NOTE(review): weaker than the AccessExclusiveLock the note above asks for -- confirm */
+
+ vci_meta_item_scanner_t *scan;
+ vcis_m_extent_t *extentInfo;
+ int32 lastAvailableExtentId = -1;
+ /* Find the last extent whose xgen or xdel is still a valid XID (-1 if none) */
+ scan = vci_BeginMetaItemScan(comContext->info.rel, BUFFER_LOCK_SHARE);
+ while ((extentInfo = vci_GetMExtentNext(&comContext->info, scan)) != NULL)
+ {
+ if (TransactionIdIsValid(extentInfo->xgen) ||
+ TransactionIdIsValid(extentInfo->xdel))
+ lastAvailableExtentId = scan->index;
+ }
+ vci_EndMetaItemScan(scan);
+
+ vci_SetMainRelVar(&comContext->info, vcimrv_num_extents, 0, lastAvailableExtentId + 1);
+
+ for (int colId = VCI_FIRST_NORMALCOLUMN_ID; colId < comContext->numColumns; ++colId)
+ {
+ vcis_m_column_t *colInfo;
+
+ vci_ColumnRelations relPairData;
+ vci_ColumnRelations *relPair = &relPairData;
+
+ BlockNumber nblocks;
+
+ colInfo = vci_GetMColumn(&comContext->info, colId);
+
+ vci_OpenColumnRelations(relPair, &comContext->info, colId, lockmode);
+
+ nblocks = RelationGetNumberOfBlocks(relPair->data);
+
+ if (colInfo->comp_type != vcis_compression_type_fixed_raw)
+ {
+ BlockNumber sentinelBlockNumber;
+ vcis_column_meta_t *columnMeta;
+
+ elog(DEBUG2, " -- colId %d ,variable column ", colId);
+ /* Keep blocks up to and including the column meta's free_page_end_id */
+ columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta);
+ sentinelBlockNumber = columnMeta->free_page_end_id;
+ ReleaseBuffer(relPair->bufMeta);
+
+ Assert(sentinelBlockNumber + 1 <= nblocks);
+ /* NOTE(review): truncates even when nothing would be removed, unlike the fixed-length branch */
+ RelationTruncate(relPair->data, sentinelBlockNumber + 1);
+ elog(DEBUG2, " end");
+ }
+ else
+ {
+ int16 columnSize;
+ int extentHeaderSize;
+ Size dataSize;
+ int numExtentPages;
+ BlockNumber startBlockNumber;
+
+ elog(DEBUG2, " -- colId %d ,fixed column ", colId);
+ /* Every extent takes numExtentPages blocks; startBlockNumber is the first block past the last kept extent */
+ columnSize = vci_GetFixedColumnSize(&comContext->info, colId);
+ extentHeaderSize = vci_GetExtentFixedLengthRawDataHeaderSize(VCI_NUM_ROWS_IN_EXTENT);
+ dataSize = (Size) columnSize * VCI_NUM_ROWS_IN_EXTENT;
+ numExtentPages = vci_GetNumBlocks(dataSize + extentHeaderSize);
+ startBlockNumber = (lastAvailableExtentId + 1) * numExtentPages;
+
+ if (startBlockNumber < nblocks)
+ RelationTruncate(relPair->data, startBlockNumber);
+
+ elog(DEBUG2, " end");
+
+ }
+
+ vci_CloseColumnRelations(relPair, lockmode);
+ }
+}
+
+/** Truncate trailing pages that hold no used line pointer from the data WOS and the whiteout WOS.
+ * @param[in] comContext Conv Context
+ */
+static void
+truncateWos(vci_RosCommandContext *comContext)
+{
+ LOCKMODE lockmode = ShareUpdateExclusiveLock;
+
+ Oid oid[2] = {
+ vci_GetMainRelVar(&comContext->info, vcimrv_data_wos_oid, 0),
+ vci_GetMainRelVar(&comContext->info, vcimrv_whiteout_wos_oid, 0)
+ };
+
+ for (int i = 0; i < 2; i++)
+ {
+ Relation rel = table_open(oid[i], lockmode);
+ int lock_retry = 0;
+ BlockNumber old_rel_pages;
+ BlockNumber new_rel_pages;
+ BlockNumber blkno;
+ /* Poll for AccessExclusiveLock without blocking; give up after a bounded number of retries */
+ while (true)
+ {
+ if (ConditionalLockRelation(rel, AccessExclusiveLock))
+ break;
+
+ /*
+ * Check for interrupts while trying to (re-)acquire the
+ * exclusive lock.
+ */
+ CHECK_FOR_INTERRUPTS();
+ /* NOTE(review): giving up returns from the function, so a later relation in oid[] is not attempted -- confirm intended */
+ if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
+ VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
+ {
+ table_close(rel, lockmode);
+ return;
+ }
+ /* NOTE(review): pg_usleep() takes microseconds; confirm the unit of VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL */
+ pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL);
+ }
+ /* Walk backwards from the last block; stop at the first page that holds a used line pointer */
+ blkno = old_rel_pages = new_rel_pages = RelationGetNumberOfBlocks(rel);
+
+ while (blkno > 0)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber maxoff;
+
+ blkno--;
+
+ buffer = ReadBuffer(rel, blkno);
+
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ page = BufferGetPage(buffer);
+ /* A new or empty page can be cut; keep scanning towards block 0 */
+ if (PageIsNew(page) || PageIsEmpty(page))
+ {
+ UnlockReleaseBuffer(buffer);
+
+ new_rel_pages = blkno;
+ continue;
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ for (OffsetNumber offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid;
+
+ itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsUsed(itemid))
+ {
+ UnlockReleaseBuffer(buffer);
+ goto found_use_item;
+ }
+ }
+
+ UnlockReleaseBuffer(buffer);
+ /* No used line pointer on this page either, so it can be cut as well */
+ new_rel_pages = blkno;
+ }
+
+found_use_item:
+ if (new_rel_pages < old_rel_pages)
+ RelationTruncate(rel, new_rel_pages);
+
+ UnlockRelation(rel, AccessExclusiveLock);
+
+ table_close(rel, lockmode);
+ }
+}
+/* Vacuum one VCI: clean up, freeze and truncate both WOS relations, then freeze and truncate the ROS. vacuumInfo is not referenced in the body. */
+void
+vci_VacuumRos(Relation mainRel, IndexVacuumInfo *vacuumInfo)
+{
+ MemoryContext memCtxWos2Ros;
+ MemoryContext oldMemCtx;
+ vci_RosCommandContext comContext;
+ Snapshot snapshot;
+
+ vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_vacuum);
+
+ /* recover ROS if necessary */
+ vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+ /* Work in a private memory context; it is deleted before returning */
+ memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+ "Vacuum",
+ ALLOCSET_DEFAULT_SIZES);
+ oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+ vci_InitRosCommandContext1(&comContext, 0, 0, 0, false);
+ /* The active snapshot is what freezeWos() uses for its visibility test */
+ snapshot = GetActiveSnapshot();
+
+ /* remove WOS entries */
+ elog(DEBUG2, " -- wos");
+ cleanUpWos(&comContext, vcimrv_data_wos_oid);
+ cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+ freezeWos(&comContext, vcimrv_data_wos_oid, snapshot);
+ freezeWos(&comContext, vcimrv_whiteout_wos_oid, snapshot);
+ truncateWos(&comContext);
+
+ elog(DEBUG2, " -- ros");
+ freezeMainAndRos(&comContext);
+ truncateRos(&comContext);
+
+ elog(DEBUG2, " -- end");
+
+ vci_UpdateXidGeneration(&comContext.info);
+
+ /* Finalize ROS */
+ vci_FinRosCommandContext(&comContext, true /* never write */ );
+
+ MemoryContextSwitchTo(oldMemCtx);
+ MemoryContextDelete(memCtxWos2Ros);
+}
+/* Read both WOS relations into arrays, sort them, and merge them into comContext->wos2ros_array and comContext->delvec_array. */
+static void
+constructTidArray(vci_RosCommandContext *comContext, int max_data_wos_entries, int max_whiteout_wos_entries)
+{
+ vci_MainRelHeaderInfo *info;
+ Snapshot snapshot;
+ Oid data_wos_oid;
+ Oid whiteout_wos_oid;
+ vci_tid_tid_xid64_t *data_wos_entries;
+ vci_tid_tid_xid64_t *whiteout_wos_entries;
+ int num_data_wos_entries = 0;
+ int num_whiteout_wos_entries = 0;
+ int data_wos_entries_pos = 0;
+ int whiteout_wos_entries_pos = 0;
+
+ info = &comContext->info;
+
+ data_wos_oid = vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0);
+ whiteout_wos_oid = vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0);
+
+ data_wos_entries = palloc_array(vci_tid_tid_xid64_t, max_data_wos_entries);
+ whiteout_wos_entries = palloc_array(vci_tid_tid_xid64_t, max_whiteout_wos_entries);
+ /* NOTE(review): the PopActiveSnapshot() at the end suggests this call pushes the snapshot -- confirm */
+ snapshot = vci_GetSnapshotForLocalRos(comContext->inclusiveXid, comContext->exclusiveXid);
+
+ num_data_wos_entries =
+ readTidListFromWosIntoTidArray(data_wos_oid, WOS_Data,
+ data_wos_entries, max_data_wos_entries,
+ snapshot);
+
+ num_whiteout_wos_entries =
+ readTidListFromWosIntoTidArray(whiteout_wos_oid, WOS_Whiteout,
+ whiteout_wos_entries, max_whiteout_wos_entries,
+ snapshot);
+
+ Assert(num_data_wos_entries <= max_data_wos_entries);
+ Assert(num_whiteout_wos_entries <= max_whiteout_wos_entries);
+ /* Sort both arrays by (orig_tid, xid64) so they can be merged in one pass */
+ qsort(data_wos_entries, num_data_wos_entries, sizeof(vci_tid_tid_xid64_t), comparator_orig_tid_xid64);
+ qsort(whiteout_wos_entries, num_whiteout_wos_entries, sizeof(vci_tid_tid_xid64_t), comparator_orig_tid_xid64);
+ /* Merge: the side that compares lower is emitted; entries that compare equal are dropped from both sides */
+ while ((data_wos_entries_pos < num_data_wos_entries) &&
+ (whiteout_wos_entries_pos < num_whiteout_wos_entries))
+ {
+ int32 res;
+ vci_tid_tid_xid64_t data_wos_item;
+ vci_tid_tid_xid64_t whiteout_wos_item;
+
+ data_wos_item = data_wos_entries[data_wos_entries_pos];
+ whiteout_wos_item = whiteout_wos_entries[whiteout_wos_entries_pos];
+
+ res = ItemPointerCompare(&data_wos_item.orig_tid, &whiteout_wos_item.orig_tid);
+ /* Same original TID: compareXid64() decides, and it yields only 0 or +1 */
+ if (res == 0)
+ res = compareXid64(data_wos_item.xid64, whiteout_wos_item.xid64);
+
+ if (res < 0)
+ {
+ comContext->wos2ros_array.orig_tids[comContext->wos2ros_array.num] =
+ data_wos_item.orig_tid;
+
+ comContext->wos2ros_array.num++;
+ data_wos_entries_pos++;
+ }
+ else if (res > 0)
+ {
+ comContext->delvec_array.orig_tids[comContext->delvec_array.num] =
+ whiteout_wos_item.orig_tid;
+
+ comContext->delvec_array.num++;
+ whiteout_wos_entries_pos++;
+ }
+ else
+ {
+ data_wos_entries_pos++;
+ whiteout_wos_entries_pos++;
+ }
+ }
+ /* Whiteout side exhausted: every remaining data entry is emitted */
+ while (data_wos_entries_pos < num_data_wos_entries)
+ {
+ comContext->wos2ros_array.orig_tids[comContext->wos2ros_array.num] =
+ data_wos_entries[data_wos_entries_pos].orig_tid;
+
+ comContext->wos2ros_array.num++;
+ data_wos_entries_pos++;
+ }
+ /* Data side exhausted: every remaining whiteout entry is emitted */
+ while (whiteout_wos_entries_pos < num_whiteout_wos_entries)
+ {
+ comContext->delvec_array.orig_tids[comContext->delvec_array.num] =
+ whiteout_wos_entries[whiteout_wos_entries_pos].orig_tid;
+
+ comContext->delvec_array.num++;
+ whiteout_wos_entries_pos++;
+ }
+
+ PopActiveSnapshot();
+
+ pfree(data_wos_entries);
+ pfree(whiteout_wos_entries);
+}
+
+/*
+ * qsort() comparator for vci_tid_tid_xid64_t: orders by orig_tid, then xid64.
+ * Returns a negative, zero or positive value in the usual comparator sense.
+ */
+static int
+comparator_orig_tid_xid64(const void *pa, const void *pb)
+{
+ vci_tid_tid_xid64_t *lhs = (vci_tid_tid_xid64_t *) pa;
+ vci_tid_tid_xid64_t *rhs = (vci_tid_tid_xid64_t *) pb;
+ int tidOrder = ItemPointerCompare(&lhs->orig_tid, &rhs->orig_tid);
+
+ /* Different original TIDs decide the order on their own. */
+ if (tidOrder != 0)
+ return tidOrder;
+
+ /* Same original TID: fall back to ascending xid64. */
+ if (lhs->xid64 < rhs->xid64)
+ return -1;
+
+ return (lhs->xid64 > rhs->xid64) ? 1 : 0;
+}
+
+/**
+ * Merge the sorted data WOS and whiteout WOS into comContext's TID lists.
+ * @param[in,out] comContext Conv Context (the snapshot is obtained internally)
+ */
+static void
+constructTidSortState(vci_RosCommandContext *comContext)
+{
+ vci_MainRelHeaderInfo *info;
+ Snapshot snapshot;
+ Oid data_wos_oid;
+ Oid whiteout_wos_oid;
+ MemoryContext workcontext;
+ MemoryContext oldcontext;
+ TupleDesc tupDesc;
+ Tuplesortstate *data_wos_valid_tid_sortstate;
+ Tuplesortstate *whiteout_wos_valid_tid_sortstate;
+ AttrNumber sortKeys[2] = {1, 3}; /* columns orig_tid and xid64 of tupDesc below */
+ Oid sortOperators[2] = {TIDLessOperator, Int8LessOperator};
+ Oid sortCollations[2] = {InvalidOid, InvalidOid,};
+ bool nullsFirstFlags[2] = {false, false};
+ TupleTableSlot *data_wos_valid_slot;
+ TupleTableSlot *whiteout_wos_valid_slot;
+ vci_tid_tid_xid64_t data_wos_item;
+ vci_tid_tid_xid64_t whiteout_wos_item;
+ bool is_terminated_data_wos = false;
+ bool is_terminated_whiteout_wos = false;
+ int64 numInsertRows = 0;
+ int64 numDeleteRows = 0;
+ ItemPointerData last_whiteout_orig_tid;
+
+ info = &comContext->info;
+
+ data_wos_oid = vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0);
+ whiteout_wos_oid = vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0);
+ /* Scratch state is allocated in a private context that is deleted after the merge */
+ workcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "Construct Tid Sort State",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldcontext = MemoryContextSwitchTo(workcontext);
+ /* Four-column scratch tuple: orig_tid, wos_tid, xid64, movable */
+ tupDesc = CreateTemplateTupleDesc(4);
+
+ TupleDescInitEntry(tupDesc, (AttrNumber) 1, "orig_tid", TIDOID, -1, 0);
+ TupleDescInitEntry(tupDesc, (AttrNumber) 2, "wos_tid", TIDOID, -1, 0);
+ TupleDescInitEntry(tupDesc, (AttrNumber) 3, "xid64", INT8OID, -1, 0);
+ TupleDescInitEntry(tupDesc, (AttrNumber) 4, "movable", BOOLOID, -1, 0);
+ /* One sort per WOS, each given 3/8 of VciGuc.maintenance_work_mem */
+ data_wos_valid_tid_sortstate =
+ tuplesort_begin_heap(tupDesc, 2,
+ sortKeys, sortOperators, sortCollations, nullsFirstFlags,
+ VciGuc.maintenance_work_mem / 8 * 3, NULL,
+ TUPLESORT_NONE);
+
+ whiteout_wos_valid_tid_sortstate =
+ tuplesort_begin_heap(tupDesc, 2,
+ sortKeys, sortOperators, sortCollations, nullsFirstFlags,
+ VciGuc.maintenance_work_mem / 8 * 3, NULL,
+ TUPLESORT_NONE);
+
+ data_wos_valid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
+ whiteout_wos_valid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
+ /* NOTE(review): the PopActiveSnapshot() further down suggests this call pushes the snapshot -- confirm */
+ snapshot = vci_GetSnapshotForWos2Ros();
+
+ readTidListFromWosIntoTidSortState(data_wos_oid, WOS_Data,
+ data_wos_valid_slot,
+ data_wos_valid_tid_sortstate,
+ snapshot,
+ comContext->wos2rosXid);
+
+ readTidListFromWosIntoTidSortState(whiteout_wos_oid, WOS_Whiteout,
+ whiteout_wos_valid_slot,
+ whiteout_wos_valid_tid_sortstate,
+ snapshot,
+ comContext->wos2rosXid);
+
+ tuplesort_performsort(data_wos_valid_tid_sortstate);
+ tuplesort_performsort(whiteout_wos_valid_tid_sortstate);
+ /* Prime the merge with the first entry of each side */
+ if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item))
+ is_terminated_data_wos = true;
+
+ if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item))
+ is_terminated_whiteout_wos = true;
+
+ ItemPointerSetInvalid(&last_whiteout_orig_tid);
+
+ while (!is_terminated_data_wos && !is_terminated_whiteout_wos)
+ {
+ int32 res;
+
+ res = ItemPointerCompare(&data_wos_item.orig_tid, &whiteout_wos_item.orig_tid);
+
+ if (res == 0)
+ res = compareXid64(data_wos_item.xid64, whiteout_wos_item.xid64);
+ /* res < 0: the data entry is consumed; res > 0: the whiteout entry is consumed; 0: both are consumed as a pair */
+ if (res < 0)
+ {
+ if (can_select_candidate_for_wos2ros_conv(&data_wos_item, comContext, &last_whiteout_orig_tid))
+ {
+ put_entry_into_tid_list(comContext, WOS_Data, &data_wos_item.orig_tid, &data_wos_item.wos_tid);
+
+ numInsertRows++;
+ }
+
+ if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item))
+ is_terminated_data_wos = true;
+ }
+ else if (res > 0)
+ {
+ last_whiteout_orig_tid = whiteout_wos_item.orig_tid;
+
+ if (can_select_candidate_for_update_delvec(&whiteout_wos_item, comContext))
+ {
+ put_entry_into_tid_list(comContext, WOS_Whiteout, &whiteout_wos_item.orig_tid, &whiteout_wos_item.wos_tid);
+
+ numDeleteRows++;
+ }
+
+ if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item))
+ is_terminated_whiteout_wos = true;
+ }
+ else
+ {
+ if (data_wos_item.movable && whiteout_wos_item.movable) /* pair: hand both wos_tids to the *_del_list sorts that exist */
+ {
+ if (comContext->data_wos_del_list)
+ tuplesort_putdatum(comContext->data_wos_del_list,
+ ItemPointerGetDatum(&data_wos_item.wos_tid), false);
+
+ if (comContext->whiteout_wos_del_list)
+ tuplesort_putdatum(comContext->whiteout_wos_del_list,
+ ItemPointerGetDatum(&whiteout_wos_item.wos_tid), false);
+ }
+
+ if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item))
+ is_terminated_data_wos = true;
+
+ if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item))
+ is_terminated_whiteout_wos = true;
+ }
+ }
+ /* Drain whichever side still has entries, but only when its output list exists */
+ if (!is_terminated_data_wos && comContext->wos2ros_tid_list)
+ {
+ do
+ {
+ if (can_select_candidate_for_wos2ros_conv(&data_wos_item, comContext, &last_whiteout_orig_tid))
+ {
+ put_entry_into_tid_list(comContext, WOS_Data, &data_wos_item.orig_tid, &data_wos_item.wos_tid);
+ numInsertRows++;
+ }
+ } while (getValidTidSortState(data_wos_valid_tid_sortstate,
+ data_wos_valid_slot, &data_wos_item));
+ }
+
+ if (!is_terminated_whiteout_wos && comContext->delvec_tid_list)
+ {
+ do
+ {
+ if (can_select_candidate_for_update_delvec(&whiteout_wos_item, comContext))
+ {
+ put_entry_into_tid_list(comContext, WOS_Whiteout, &whiteout_wos_item.orig_tid, &whiteout_wos_item.wos_tid);
+
+ numDeleteRows++;
+ }
+ } while (getValidTidSortState(whiteout_wos_valid_tid_sortstate,
+ whiteout_wos_valid_slot, &whiteout_wos_item));
+ }
+
+ tuplesort_end(whiteout_wos_valid_tid_sortstate);
+ tuplesort_end(data_wos_valid_tid_sortstate);
+
+ FreeTupleDesc(tupDesc);
+
+ PopActiveSnapshot();
+
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(workcontext);
+ /* Sort the output lists that exist and record how many entries each received */
+ if (comContext->wos2ros_tid_list)
+ {
+ tuplesort_performsort(comContext->wos2ros_tid_list);
+ comContext->num_wos2ros_tids = numInsertRows;
+ }
+
+ if (comContext->delvec_tid_list)
+ {
+ tuplesort_performsort(comContext->delvec_tid_list);
+ comContext->num_delvec_tids = numDeleteRows;
+ }
+}
+
+/* True when a data WOS entry may be put on comContext->wos2ros_tid_list. */
+static bool
+can_select_candidate_for_wos2ros_conv(vci_tid_tid_xid64_t *data_wos_item, vci_RosCommandContext *comContext, ItemPointer last_whiteout_orig_tid)
+{
+ /* Needs a movable entry and an existing destination list. */
+ if (!data_wos_item->movable || !comContext->wos2ros_tid_list)
+ return false;
+
+ /* With no delvec list, reject an entry whose TID equals the last whiteout TID seen. */
+ if (comContext->delvec_tid_list == NULL &&
+ ItemPointerIsValid(last_whiteout_orig_tid) &&
+ ItemPointerEquals(last_whiteout_orig_tid, &data_wos_item->orig_tid))
+ return false;
+
+ return true;
+}
+
+/*
+ * True when a whiteout WOS entry may be put on comContext->delvec_tid_list:
+ * the entry must be movable and the list must exist.
+ */
+static bool
+can_select_candidate_for_update_delvec(vci_tid_tid_xid64_t *whiteout_wos_item, vci_RosCommandContext *comContext)
+{
+ bool has_target_list = (comContext->delvec_tid_list != NULL);
+
+ return whiteout_wos_item->movable && has_target_list;
+}
+
+/*
+ * Queue one (orig_tid, wos_tid) pair on the list that matches wos_kind:
+ * wos2ros_tid_list for WOS_Data, delvec_tid_list for any other kind.
+ * The pair is staged in comContext->tid_tid_slot as a virtual tuple.
+ */
+static void
+put_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid)
+{
+ TupleTableSlot *stage = comContext->tid_tid_slot;
+ Tuplesortstate *target = (wos_kind == WOS_Data) ?
+ comContext->wos2ros_tid_list : comContext->delvec_tid_list;
+
+ ExecClearTuple(stage);
+
+ Assert(target != NULL);
+
+ /* Column 0 receives orig_tid, column 1 receives wos_tid; neither is null. */
+ stage->tts_isnull[0] = false;
+ stage->tts_values[0] = ItemPointerGetDatum(orig_tid);
+ stage->tts_isnull[1] = false;
+ stage->tts_values[1] = ItemPointerGetDatum(wos_tid);
+
+ /* The empty flag is raised again right before the virtual-tuple store. */
+ stage->tts_flags |= TTS_FLAG_EMPTY;
+ ExecStoreVirtualTuple(stage);
+
+ tuplesort_puttupleslot(target, stage);
+}
+
+/*
+ * Fetch the next (orig_tid, wos_tid) pair from the list matching wos_kind
+ * (wos2ros_tid_list for WOS_Data, delvec_tid_list otherwise).
+ *
+ * Returns false, leaving both outputs untouched, when no tuple is returned.
+ */
+static bool
+get_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid)
+{
+ Tuplesortstate *source = (wos_kind == WOS_Data) ?
+ comContext->wos2ros_tid_list : comContext->delvec_tid_list;
+ TupleTableSlot *fetched;
+ bool found;
+ bool isnull;
+
+ /* A throwaway minimal-tuple slot that shares tid_tid_slot's descriptor. */
+ fetched = MakeSingleTupleTableSlot(comContext->tid_tid_slot->tts_tupleDescriptor, &TTSOpsMinimalTuple);
+ Assert(source != NULL);
+
+ found = tuplesort_gettupleslot(source, true, false, fetched, NULL);
+ if (found)
+ {
+ slot_getsomeattrs(fetched, 2);
+ *orig_tid = *DatumGetItemPointer(slot_getattr(fetched, 1, &isnull));
+ *wos_tid = *DatumGetItemPointer(slot_getattr(fetched, 2, &isnull));
+ }
+
+ ExecDropSingleTupleTableSlot(fetched);
+ return found;
+}
+
+/*
+ * Load every row of WOS relation wos_oid returned by a scan under snapshot
+ * into wos_entris (orig_tid, wos_tid, xid64; movable is always true) and
+ * return the row count.  Raises ERROR once max_wos_entries would be exceeded.
+ */
+static int
+readTidListFromWosIntoTidArray(Oid wos_oid, WosKind wos_kind, vci_tid_tid_xid64_t *wos_entris, int max_wos_entries, Snapshot snapshot)
+{
+ LOCKMODE lockmode = AccessShareLock;
+ TableScanDesc scan;
+ HeapTuple tuple;
+ Relation rel;
+ TupleDesc tupleDesc;
+ int num_rows = 0;
+
+ rel = relation_open(wos_oid, lockmode);
+ tupleDesc = RelationGetDescr(rel);
+
+ CHECK_FOR_INTERRUPTS();
+ scan = table_beginscan(rel, snapshot, 0, NULL);
+ scan->rs_flags &= ~SO_ALLOW_PAGEMODE;
+ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ bool isnull;
+
+ if (max_wos_entries <= num_rows)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("too many WOS rows over estimation")));
+
+ wos_entris[num_rows].orig_tid = *DatumGetItemPointer(heap_getattr(tuple, 1, tupleDesc, &isnull)); /* original_tid in WOS */
+ wos_entris[num_rows].wos_tid = tuple->t_self;
+ wos_entris[num_rows].xid64 = DatumGetInt64(heap_getattr(tuple, 2, tupleDesc, &isnull)); /* xid64 in WOS */
+ wos_entris[num_rows].movable = true;
+
+ Assert(ItemPointerIsValid(&wos_entris[num_rows].orig_tid));
+ CHECK_FOR_INTERRUPTS();
+
+ num_rows++;
+ }
+ table_endscan(scan);
+
+ relation_close(rel, lockmode); /* pairs with relation_open() above */
+
+ return num_rows;
+}
+/* Scan WOS relation wos_oid under snapshot and feed one (orig_tid, wos_tid, xid64, movable) tuple per row into sortstate; wos_kind is not referenced. */
+static void
+readTidListFromWosIntoTidSortState(Oid wos_oid, WosKind wos_kind,
+ TupleTableSlot *slot, Tuplesortstate *sortstate,
+ Snapshot snapshot,
+ TransactionId wos2ros_xid)
+{
+ LOCKMODE lockmode = AccessShareLock;
+ TableScanDesc scan;
+ HeapTuple tuple;
+ Relation rel;
+ TupleDesc tupleDesc;
+
+ rel = relation_open(wos_oid, lockmode);
+ tupleDesc = RelationGetDescr(rel);
+
+ CHECK_FOR_INTERRUPTS();
+ /* SO_ALLOW_PAGEMODE is cleared on the scan right after it is started */
+ scan = table_beginscan(rel, snapshot, 0, NULL);
+ scan->rs_flags &= ~SO_ALLOW_PAGEMODE;
+ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ TransactionId xmin;
+ bool isnull;
+ bool movable;
+ /* A row is movable when its xmin precedes wos2ros_xid */
+ xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+ movable = TransactionIdPrecedes(xmin, wos2ros_xid);
+ ExecClearTuple(slot);
+
+ slot->tts_values[0] = heap_getattr(tuple, 1, tupleDesc, &isnull); /* original_tid in WOS */
+ slot->tts_values[1] = ItemPointerGetDatum(&tuple->t_self);
+ slot->tts_values[2] = heap_getattr(tuple, 2, tupleDesc, &isnull); /* xid64 in WOS */
+ slot->tts_values[3] = BoolGetDatum(movable);
+ /* isnull from heap_getattr() is ignored; all four columns are stored as non-null */
+ slot->tts_isnull[0] = false;
+ slot->tts_isnull[1] = false;
+ slot->tts_isnull[2] = false;
+ slot->tts_isnull[3] = false;
+ /* NOTE(review): setting TTS_FLAG_EMPTY again after ExecClearTuple() looks redundant -- confirm */
+ slot->tts_flags |= TTS_FLAG_EMPTY;
+
+ ExecStoreVirtualTuple(slot);
+
+ tuplesort_puttupleslot(sortstate, slot);
+
+ CHECK_FOR_INTERRUPTS();
+ }
+ table_endscan(scan);
+
+ relation_close(rel, lockmode);
+}
+
+/*
+ * Pull the next tuple out of sortstate and unpack its four columns
+ * (orig_tid, wos_tid, xid64, movable) into *item.  The slot argument only
+ * supplies the tuple descriptor.  Returns false when no tuple is returned.
+ */
+static bool
+getValidTidSortState(Tuplesortstate *sortstate, TupleTableSlot *slot, vci_tid_tid_xid64_t *item)
+{
+ TupleTableSlot *fetched = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor, &TTSOpsMinimalTuple);
+ bool found = tuplesort_gettupleslot(sortstate, true, false, fetched, NULL);
+ bool isnull;
+
+ if (found)
+ {
+ slot_getsomeattrs(fetched, 4);
+ item->orig_tid = *DatumGetItemPointer(slot_getattr(fetched, 1, &isnull));
+ item->wos_tid = *DatumGetItemPointer(slot_getattr(fetched, 2, &isnull));
+ item->xid64 = DatumGetInt64(slot_getattr(fetched, 3, &isnull));
+ item->movable = DatumGetBool(slot_getattr(fetched, 4, &isnull));
+ }
+
+ ExecDropSingleTupleTableSlot(fetched);
+ return found;
+}
+
+/*
+ * Compare the xid64 of a data WOS entry with that of a whiteout WOS entry.
+ *
+ * Returns +1 when the data entry's xid64 is the larger one and 0 in every
+ * other case, i.e. also when it is the smaller one; -1 is never returned.
+ * NOTE(review): callers in this file treat 0 as "matching pair" -- presumably
+ * a data entry followed by a later whiteout is meant to cancel out; confirm.
+ */
+static int32
+compareXid64(int64 data_wos_xid64, int64 whiteout_wos_xid64)
+{
+ /* Both values are expected to be positive. */
+ Assert((data_wos_xid64 > 0) && (whiteout_wos_xid64 > 0));
+
+ if (data_wos_xid64 > whiteout_wos_xid64)
+ return +1;
+
+ return 0;
+}
diff --git a/contrib/vci/storage/vci_ros_daemon.c b/contrib/vci/storage/vci_ros_daemon.c
new file mode 100644
index 0000000..273585e
--- /dev/null
+++ b/contrib/vci/storage/vci_ros_daemon.c
@@ -0,0 +1,859 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_daemon.c
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/storage/vci_ros_daemon.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup.h"
+#include "access/htup_details.h"
+#include "access/htup_details.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "c.h"
+#include "catalog/index.h"
+#include "catalog/pg_database.h"
+#include "fmgr.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker.h"
+#include "storage/bufpage.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/procarray.h" /* for TransactionIdIsInProgress() */
+/* #include "storage/shmem.h" */
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "pgstat.h"
+
+#include "vci.h"
+#include "vci_mem.h"
+#include "vci_ros.h"
+#include "vci_ros_daemon.h"
+#include "vci_ros_command.h"
+
+#include "vci_memory_entry.h"
+
+/**
+ * Message to emit when a worker exits, and the log level setting to restore.
+ */
+typedef struct message_on_worker_exit
+{
+ int log_min_messages; /* saved value of the log_min_messages setting */
+ int message_level; /* elevel passed to elog() for the message */
+ char message[1024]; /* formatted message text; empty when none is staged */
+} message_on_worker_exit_t;
+
+static message_on_worker_exit_t message_on_worker_exit;
+/* Zero the exit-message state, save log_min_messages, register the exit callback. */
+#define INIT_MESSAGE_ON_WORKER_EXIT() \
+do \
+{ \
+ MemSet(&message_on_worker_exit, 0x00, sizeof(message_on_worker_exit)); \
+ message_on_worker_exit.log_min_messages = log_min_messages; \
+ on_proc_exit(callback_on_exit_worker, Int32GetDatum(0)); \
+} while (0)
+/* Stage a message for exit time; save log_min_messages and raise it to PANIC. */
+#define SET_MESSAGE_ON_WORKER_EXIT(elevel, ...) \
+do \
+{ \
+ message_on_worker_exit.message_level = (elevel); \
+ snprintf(message_on_worker_exit.message, sizeof(message_on_worker_exit.message), __VA_ARGS__); \
+ message_on_worker_exit.log_min_messages = log_min_messages; \
+ log_min_messages = PANIC; \
+} while (0)
+/* Restore the saved log_min_messages and discard the staged message. */
+#define RESET_MESSAGE_ON_WORKER_EXIT() \
+do \
+{ \
+ log_min_messages = message_on_worker_exit.log_min_messages; \
+ message_on_worker_exit.message_level = 0; \
+ message_on_worker_exit.message[0] = '\0'; \
+} while (0)
+
+static bool TryToOpenVCIRelations(Oid indexOid, LOCKMODE heapLock, LOCKMODE indexLock,
+ Relation *heapRel, Relation *indexRel);
+static void CheckRosControlWorkerCancel(void);
+static void callback_on_exit_worker(int code, Datum arg);
+
+/* Background worker names and type; each buffer is BGW_MAXLEN bytes. */
+/* If the ROS control worker name is changed then update the bgw_name check in LockAcquire() too. */
+static const char VCI_ROS_CONTROL_DAEMON_NAME[BGW_MAXLEN] = "vci:ROS control daemon";
+static const char VCI_ROS_CONTROL_WORKER_NAME_TEMP[BGW_MAXLEN] = "vci:ROS control worker(slot=%d)";
+static const char VCI_ROS_CONTROL_WORKER_TYPE[BGW_MAXLEN] = "vci:ROS control worker";
+
+/* flags set by signal handlers */
+static volatile sig_atomic_t gotSighup = false;
+static volatile sig_atomic_t gotSigterm = false;
+/* Per-slot worker bookkeeping; allocated in vci_ROS_control_daemon_main(). */
+static vci_workerslot_t *workerslot;
+/* Format strings for the DEBUG2 probe output, indexed by ROS command. */
+static char probeMessage[num_vci_rc][1024] =
+{
+ " data WOS count : %8d / %8d.",
+ " whiteout WOS count : %8d / %8d.",
+ " CDR : %8d / %8d (extent %d).",
+ " CDE : %8d / %8d (extent %d).",
+ " TIDCRID : %8d / %8d.",
+};
+
+/* ------------ daemon -------------- */
+
+/**
+ * Register the ROS control daemon with RegisterBackgroundWorker().
+ *
+ * Called from _PG_init.  When VciGuc.enable_ros_control_daemon is off this
+ * only emits a DEBUG1 message and registers nothing.
+ */
+void
+vci_ROS_control_daemon_setup(void)
+{
+	BackgroundWorker worker;
+
+	/* for internal use */
+	if (!VciGuc.enable_ros_control_daemon)
+	{
+		elog(DEBUG1, "vci: no daemon mode");
+		return;
+	}
+
+	memset(&worker, 0, sizeof(worker));
+	worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+		BGWORKER_BACKEND_DATABASE_CONNECTION;
+	worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	worker.bgw_restart_time = VCI_DAEMON_RESTART_TIME;
+	worker.bgw_notify_pid = 0;
+	worker.bgw_main_arg = (Datum) 0;
+
+	/* Bound every copy by its destination and never use a name as a format. */
+	snprintf(worker.bgw_name, sizeof(worker.bgw_name), "%s", VCI_ROS_CONTROL_DAEMON_NAME);
+	snprintf(worker.bgw_type, sizeof(worker.bgw_type), "%s", VCI_ROS_CONTROL_DAEMON_NAME);
+	snprintf(worker.bgw_library_name, sizeof(worker.bgw_library_name), "%s", VCI_STRING);
+	snprintf(worker.bgw_function_name, sizeof(worker.bgw_function_name), "%s", "vci_ROS_control_daemon_main");
+
+	RegisterBackgroundWorker(&worker);
+}
+
+/**
+ * Termination-request handler (installed for SIGTERM, SIGQUIT and SIGINT).
+ *
+ * @description
+ * Raise gotSigterm, then set the process latch (if MyProc exists) to wake the loop.
+ *
+ * @param[in] SIGNAL_ARGS
+ */
+static void
+vci_ROSControlDaemonSigterm(SIGNAL_ARGS)
+{
+	gotSigterm = true;
+	if (MyProc != NULL)
+		SetLatch(&MyProc->procLatch);
+}
+
+/**
+ * Signal handler for SIGHUP
+ *
+ * @description
+ * Raise gotSighup so the main loop rereads the configuration file, then set
+ * the process latch (if MyProc exists) to wake the loop.
+ *
+ * @param[in] SIGNAL_ARGS
+ */
+static void
+vci_ROSControlDaemonSighup(SIGNAL_ARGS)
+{
+	gotSighup = true;
+	if (MyProc != NULL)
+		SetLatch(&MyProc->procLatch);
+}
+
+/**
+ * ROS control daemon's entry point.
+ */
+void
+vci_ROS_control_daemon_main(Datum main_arg)
+{
+ /*
+ * XXX - VCI wants to pretend this worker is like an autovacuum launcher;
+ * Let's set the MyBackendType to achieve this.
+ */
+ MyBackendType = B_AUTOVAC_LAUNCHER;
+
+ pg_bindtextdomain(TEXTDOMAIN);
+
+ /* StringInfoData buf; */
+ elog(DEBUG1, "start initialize %s", MyBgworkerEntry->bgw_name);
+
+ /* Establish signal handlers before unblocking signals. */
+ pqsignal(SIGHUP, vci_ROSControlDaemonSighup);
+ pqsignal(SIGTERM, vci_ROSControlDaemonSigterm);
+ pqsignal(SIGQUIT, vci_ROSControlDaemonSigterm);
+ pqsignal(SIGINT, vci_ROSControlDaemonSigterm);
+
+ /* pqsignal(SIGUSR1, vci_ROSNotify); */
+
+ /* We're now ready to receive signals */
+ BackgroundWorkerUnblockSignals();
+
+ BackgroundWorkerInitializeConnection(NULL, NULL, 0); /* Connect to Shared
+ * database */
+
+ /* Connect DB to access common system catalog */
+ /* One zero-initialized bookkeeping slot per worker allowed by control_max_workers. */
+ workerslot = palloc0_array(vci_workerslot_t, VciGuc.control_max_workers);
+
+ /* Main loop */
+ while (!gotSigterm)
+ {
+ int rc;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Background workers mustn't call usleep() or any direct equivalent:
+ * instead, they may wait on their process latch, which sleeps as
+ * necessary, but is awakened if postmaster dies. That way the
+ * background process goes away immediately in an emergency.
+ */
+ rc = WaitLatch(&MyProc->procLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ VciGuc.control_naptime * INT64CONST(1000),
+ PG_WAIT_EXTENSION);
+ ResetLatch(&MyProc->procLatch);
+
+ /* emergency bailout if postmaster has died */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1); /* abnormal end */
+
+ if (gotSigterm)
+ goto done;
+
+ LWLockAcquire(VciShmemAddr->io_load_lock, LW_EXCLUSIVE);
+
+ /* Check that the databases of the registered VCIs still exist */
+ vci_RemoveMemoryEntryOnDroppedDatabase();
+
+ vci_update_memoryentry_in_devloadinfo();
+
+ if (gotSigterm)
+ {
+ LWLockRelease(VciShmemAddr->io_load_lock);
+ goto done;
+ }
+
+ VciShmemAddr->translated_dev_pos = 0;
+ /* Pass 1: clear the pid of every slot whose worker has stopped. */
+ elog(DEBUG2, ">>> 1. control_max_workers = %d", VciGuc.control_max_workers);
+ for (int i = 0; i < VciGuc.control_max_workers; i++)
+ {
+ elog(DEBUG2, ">>> 1. workerslot[%d].pid is %d", i, (int) workerslot[i].pid);
+ if (workerslot[i].pid != 0)
+ {
+ pid_t pid;
+ BgwHandleStatus status;
+
+ status = GetBackgroundWorkerPid(&workerslot[i].handle, &pid);
+ switch (status)
+ {
+ case BGWH_STOPPED:
+ workerslot[i].pid = 0;
+ break;
+ case BGWH_NOT_YET_STARTED:
+ case BGWH_POSTMASTER_DIED:
+ case BGWH_STARTED:
+ break;
+ default:
+ /* LCOV_EXCL_START */
+ elog(PANIC, "invalid BgwHandleStatus in vci_ROS_control_daemon_main");
+ /* LCOV_EXCL_STOP */
+ break;
+ }
+
+ if (gotSigterm)
+ {
+ LWLockRelease(VciShmemAddr->io_load_lock);
+ goto done;
+ }
+ }
+ }
+
+ LWLockAcquire(VciShmemAddr->memory_entries->lock, LW_SHARED);
+
+ vci_ResetDevloadCurrentPos();
+ /* When fullPageWrites is off, skip the launch pass for this cycle. */
+ if (!fullPageWrites)
+ goto reload_configuration;
+ /* Pass 2: for each free slot, pick a VCI and launch a worker unless one already runs on it. */
+ elog(DEBUG2, ">>> 2. control_max_workers = %d", VciGuc.control_max_workers);
+ for (int i = 0; i < VciGuc.control_max_workers; i++)
+ {
+ elog(DEBUG2, ">>> 2. workerslot[%d].pid is %d", i, (int) workerslot[i].pid);
+ if (workerslot[i].pid == 0)
+ {
+ bool worker_running = false;
+
+ if (!vci_GetWosRosConvertingVCI(&VciShmemAddr->worker_args_array[i]))
+ break;
+
+ Assert(OidIsValid(VciShmemAddr->worker_args_array[i].dbid));
+ Assert(OidIsValid(VciShmemAddr->worker_args_array[i].oid));
+
+ for (int j = 0; j < VciGuc.control_max_workers; j++)
+ {
+ if (workerslot[j].pid != 0 &&
+ workerslot[j].dbid == VciShmemAddr->worker_args_array[i].dbid &&
+ workerslot[j].oid == VciShmemAddr->worker_args_array[i].oid)
+ {
+ elog(DEBUG1, "a worker is running on VCI (oid=%d, dbid=%d)",
+ VciShmemAddr->worker_args_array[i].oid,
+ VciShmemAddr->worker_args_array[i].dbid);
+ worker_running = true;
+ break;
+ }
+ }
+
+ if (!worker_running)
+ {
+ workerslot[i] = vci_LaunchROSControlWorker(&VciShmemAddr->worker_args_array[i], i);
+ workerslot[i].oid = VciShmemAddr->worker_args_array[i].oid;
+ workerslot[i].dbid = VciShmemAddr->worker_args_array[i].dbid;
+ }
+ }
+
+ }
+
+ /*
+ * In case of a SIGHUP, just reload the configuration. (?)
+ */
+reload_configuration:
+ if (gotSighup)
+ {
+ gotSighup = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ vci_MoveTranslatedVCI2Tail();
+
+ LWLockRelease(VciShmemAddr->memory_entries->lock);
+
+ LWLockRelease(VciShmemAddr->io_load_lock);
+ }
+
+done:
+
+ /*
+ * Daemon terminate by exit code=1, restart by postmaster as necessary.
+ */
+ proc_exit(1);
+}
+
+/* ------------ Worker -------------- */
+
+/**
+ * Register one dynamic ROS control worker and wait until it has started.
+ *
+ * @param[in] vciinfo worker argument, passed to the worker as bgw_main_arg
+ * @param[in] slot_id slot number, embedded in the worker's bgw_name
+ * @return handle copy and pid; pid is 0 unless the worker was seen running.
+ *         oid and dbid are zeroed and left for the caller to fill in.
+ */
+vci_workerslot_t
+vci_LaunchROSControlWorker(vci_wosros_conv_worker_arg_t *vciinfo, int slot_id)
+{
+	BackgroundWorker worker;
+	BackgroundWorkerHandle *handle;
+	pid_t		pid = 0;	/* stays 0 unless a start is actually observed */
+	vci_workerslot_t result;
+
+	/* Leave no field of either struct indeterminate. */
+	memset(&worker, 0, sizeof(worker));
+	memset(&result, 0, sizeof(result));
+	worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+	worker.bgw_start_time = BgWorkerStart_ConsistentState;
+	worker.bgw_restart_time = BGW_NEVER_RESTART;
+	worker.bgw_main_arg = PointerGetDatum(vciinfo);
+
+	snprintf(worker.bgw_library_name, sizeof(worker.bgw_library_name), "%s", VCI_STRING);
+	snprintf(worker.bgw_function_name, sizeof(worker.bgw_function_name), "%s", "vci_ROS_control_worker_main");
+	snprintf(worker.bgw_name, sizeof(worker.bgw_name), VCI_ROS_CONTROL_WORKER_NAME_TEMP, slot_id);
+	snprintf(worker.bgw_type, sizeof(worker.bgw_type), "%s", VCI_ROS_CONTROL_WORKER_TYPE);
+
+	/*
+	 * Don't notify by SIGUSR1: it calls SetLatch and awakens the parent ROS
+	 * daemon, which would then spawn unnecessary extra ROS control workers.
+	 */
+	worker.bgw_notify_pid = 0;
+
+	if (!RegisterDynamicBackgroundWorker(&worker, &handle))
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("could not register background process"),
+				 errhint("You may need to increase max_worker_processes.")));
+
+	/* Poll (busy-wait) until the worker leaves NOT_YET_STARTED; SIGTERM aborts the wait. */
+	while (true)
+	{
+		BgwHandleStatus status;
+
+		status = GetBackgroundWorkerPid(handle, &pid);
+		if (gotSigterm)
+			break;
+
+		switch (status)
+		{
+			case BGWH_NOT_YET_STARTED:
+				continue;
+
+			case BGWH_STARTED:
+				goto done;
+
+			case BGWH_STOPPED:
+			case BGWH_POSTMASTER_DIED:
+				pid = 0;
+				goto done;
+
+			default:
+				/* LCOV_EXCL_START */
+				elog(PANIC, "should not reach here");
+				/* LCOV_EXCL_STOP */
+				goto done;
+		}
+	}
+
+done:
+	result.pid = pid;
+	result.handle = *handle;
+	pfree(handle);
+
+	return result;
+}
+
+/** Tell whether the bit for command_id is set in the command bitmask 'flag'. */
+static inline bool
+vci_GetRosCommandExecFlag(char flag, vci_ros_command_t command_id)
+{
+	const int	mask = 1 << command_id;
+
+	return (flag & mask) != 0;
+}
+
+static inline void
+vci_SetRosCommandExecFlag(char *flag, vci_ros_command_t command_id)
+{
+	*flag = *flag | (1 << command_id);	/* turn on the bit for command_id */
+}
+/* Probe one VCI and set the bit of each ROS command worth running; returns 0, or -1 if the relations cannot be opened. */
+static int
+determine_ExecCommand_and_Extent(const Oid vci_oid,
+ char *targetExecFlag,
+ int32 *targetExtentForCdr,
+ bool force_wosros_conv)
+{
+ Relation indexRel;
+ Relation heapRel;
+
+ /* Transaction Start */
+ SetCurrentStatementStartTimestamp();
+ StartTransactionCommand();
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ /* Try to open the heap relation & the index relation. */
+ if (!TryToOpenVCIRelations(vci_oid, AccessShareLock, AccessShareLock,
+ &heapRel, &indexRel))
+ {
+ AbortCurrentTransaction();
+ return -1;
+ }
+
+ /* Check request for ros control worker cancel. */
+ CheckRosControlWorkerCancel();
+ /* Start from "nothing to do": no command bit set, extent for CDR zeroed. */
+ MemSet(targetExecFlag, 0, sizeof(char));
+ MemSet(targetExtentForCdr, 0, sizeof(int32));
+
+ for (vci_ros_command_t command = 0; command < num_vci_rc; command++)
+ {
+ int32 count = 0;
+ vci_target_extent_info_t extent_info = {0, -1};
+ int32 targetExtentId;
+
+ switch (command)
+ {
+ case vci_rc_wos_ros_conv:
+ /* 1. count DataWOS */
+ count = vci_CountFreezedInDataWos(indexRel, MaxAllocSize);
+ break;
+
+ case vci_rc_update_del_vec:
+ /* 2. count WhiteoutWOS */
+ count = vci_CountFreezedInWhiteoutWos(indexRel, MaxAllocSize);
+ break;
+
+ case vci_rc_collect_deleted:
+ /* 3. count deleted rows in each extent */
+ extent_info = vci_CountDeletedRowsInROS(indexRel, (uint32) VciGuc.cdr_threshold);
+ break;
+
+ case vci_rc_update_tid_crid:
+ /* 5. count TID->CRID update list */
+ count = vci_CountTidCridUpdateListLength(indexRel, MaxAllocSize);
+ break;
+
+ case vci_rc_collect_extent:
+ /* 6. count unused extents */
+ extent_info = vci_CountUnusedExtents(indexRel);
+ break;
+
+ default:
+ /* LCOV_EXCL_START */
+ elog(ERROR, "unexpected ROS command");
+ /* LCOV_EXCL_STOP */
+ break;
+ }
+ /* Decide from the probe result whether this command should run. */
+ switch (command)
+ {
+ case vci_rc_wos_ros_conv:
+ elog(DEBUG2, &probeMessage[vci_rc_wos_ros_conv][0], count, VciGuc.wosros_conv_threshold);
+ if (force_wosros_conv || count >= VciGuc.wosros_conv_threshold)
+ vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_wos_ros_conv);
+ break;
+
+ case vci_rc_update_del_vec:
+ elog(DEBUG2, &probeMessage[vci_rc_update_del_vec][0], count, VCI_UPDATE_DELVEC_THRESHOLD);
+ if (force_wosros_conv || count >= VCI_UPDATE_DELVEC_THRESHOLD)
+ vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_update_del_vec);
+ break;
+
+ case vci_rc_update_tid_crid:
+ elog(DEBUG2, &probeMessage[vci_rc_update_tid_crid][0], count, VCI_UPDATE_TIDCRID_THRESHOLD);
+ if (count >= VCI_UPDATE_TIDCRID_THRESHOLD)
+ vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_update_tid_crid);
+ break;
+
+ case vci_rc_collect_extent:
+ case vci_rc_collect_deleted:
+ targetExtentId = VCI_INVALID_EXTENT_ID;
+ if (extent_info.num_fit_extents > 0)
+ {
+ targetExtentId = extent_info.best_extent_id;
+
+ if (command == vci_rc_collect_deleted)
+ *targetExtentForCdr = targetExtentId;
+
+ vci_SetRosCommandExecFlag(targetExecFlag, command);
+ }
+ break;
+
+ default:
+ /* LCOV_EXCL_START */
+ elog(ERROR, "unexpected ROS command");
+ /* LCOV_EXCL_STOP */
+ break;
+ }
+ }
+
+ /* unlock VCI main rel */
+ index_close(indexRel, AccessShareLock);
+
+ table_close(heapRel, AccessShareLock);
+
+ /* Transaction End */
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+
+ return 0;
+}
+
+/**
+ * Run every ROS command whose bit is set in targetExecCommandFlag.
+ *
+ * @param[in] targetIndexOid target index oid
+ * @param[in] targetExecCommandFlag bitmask of ROS commands to execute
+ * @param[in] targetExtentId extent id passed to vci_CollectDeletedRows()
+ * @param[out] num_converted_data_wos number of rows converted in Data WOS
+ * @param[out] num_converted_whiteout_wos number of rows converted in Whiteout WOS
+ */
+static void
+vci_executeROScommand(Oid targetIndexOid, char targetExecCommandFlag, int32 targetExtentId,
+ int *num_converted_data_wos, int *num_converted_whiteout_wos)
+{
+ /*
+ * Loop over the ROS commands; each selected command is executed in its
+ * own transaction.
+ */
+ for (vci_ros_command_t command = 0; command < num_vci_rc; command++)
+ {
+ if (vci_GetRosCommandExecFlag(targetExecCommandFlag, command))
+ {
+ Relation mainRel;
+ Relation heapRel;
+ Size workAreaSize = VciGuc.maintenance_work_mem * INT64CONST(1024);
+
+ instr_time s_time;
+ instr_time e_time;
+ volatile Snapshot snapshot;
+
+ /* Check request for ros control worker cancel. */
+ CheckRosControlWorkerCancel();
+
+ /* transaction start */
+ SetCurrentStatementStartTimestamp();
+ StartTransactionCommand();
+ snapshot = GetTransactionSnapshot();
+ PushActiveSnapshot(snapshot);
+ GetCurrentTransactionId();
+
+ /** Try to open the heap relation & the index relation,
+ * and get ShareUpdateExclusiveLock for the index relation. */
+ if (!TryToOpenVCIRelations(targetIndexOid, AccessShareLock, ShareUpdateExclusiveLock,
+ &heapRel, &mainRel))
+ {
+ /* Relations could not be opened: abort and skip the remaining commands. */
+ AbortCurrentTransaction();
+ return;
+ }
+
+ elog(LOG, "starts ROS command \"%s\"", vci_GetRosCommandName(command));
+ INSTR_TIME_SET_CURRENT(s_time);
+
+ switch (command)
+ {
+ case vci_rc_wos_ros_conv:
+ /* 1. WOS->ROS conversion */
+ *num_converted_data_wos = vci_ConvertWos2Ros(mainRel, workAreaSize, VciGuc.wosros_conv_threshold);
+ break;
+
+ case vci_rc_update_del_vec:
+ /* 2. update delete vector */
+ *num_converted_whiteout_wos = vci_UpdateDelVec(mainRel, workAreaSize, VCI_UPDATE_DELVEC_THRESHOLD);
+ break;
+
+ case vci_rc_collect_deleted:
+ /* 3. collect deleted rows */
+ vci_CollectDeletedRows(mainRel, workAreaSize, targetExtentId);
+ break;
+
+ case vci_rc_update_tid_crid:
+ /* 5. update TID->CRID update list to TID-CRID tree */
+ vci_UpdateTidCrid(mainRel, workAreaSize, 10000); /* NOTE(review): literal, not VCI_UPDATE_TIDCRID_THRESHOLD - confirm */
+ break;
+
+ case vci_rc_collect_extent:
+ /* 6. collect an unused extent */
+ vci_CollectUnusedExtent(mainRel, workAreaSize);
+ break;
+
+ default:
+ /* LCOV_EXCL_START */
+ elog(ERROR, "unexpected ROS command");
+ /* LCOV_EXCL_STOP */
+ break;
+ }
+
+ index_close(mainRel, ShareUpdateExclusiveLock);
+ table_close(heapRel, AccessShareLock);
+
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+
+ INSTR_TIME_SET_CURRENT(e_time);
+ INSTR_TIME_SUBTRACT(e_time, s_time);
+ elog(LOG, "finished ROS command \"%s\" (%.03f ms)", vci_GetRosCommandName(command),
+ INSTR_TIME_GET_MILLISEC(e_time));
+ }
+ }
+}
+
+/*
+ * Connect this background worker to the database identified by dboid.
+ *
+ * @param[in] dboid id of db to which the worker connects.
+ * @param[in] username user name
+ */
+static void
+BackgroundWorkerInitializeConnectionByOid1(Oid dboid, const char *username)
+{
+	bool		wants_db;
+
+	/* The worker must have requested a DB connection (XXX right errcode?). */
+	wants_db = (MyBgworkerEntry->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION) != 0;
+	if (!wants_db)
+		ereport(FATAL,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("database connection requirement not indicated during registration")));
+
+	InitPostgres(NULL, dboid, username, InvalidOid, 0, NULL);
+	if (!IsInitProcessingMode())	/* must still be in "init" mode here */
+		ereport(ERROR, (errmsg("invalid processing mode in background worker")));
+	SetProcessingMode(NormalProcessing);
+}
+
+#define RATIO_OF_INCREASE 1.1
+
+/**
+ * @param[in] main_arg pointer to the vci_wosros_conv_worker_arg_t naming the VCI (oid, dbid) to process.
+ */
+void
+vci_ROS_control_worker_main(Datum main_arg)
+{
+ Oid targetIndexOid = InvalidOid;
+ int32 targetExtentId = 01; /* NOTE(review): octal literal for 1; zeroed again by determine_ExecCommand_and_Extent() - confirm intent */
+ char targetExecCommandFlag = 0x00;
+
+ Oid dboid;
+ vci_wosros_conv_worker_arg_t *vciinfo;
+ int ret;
+ int num_converted_data_wos = INT_MAX;
+ int num_converted_whiteout_wos = INT_MAX;
+
+ pg_bindtextdomain(TEXTDOMAIN);
+
+ pqsignal(SIGHUP, vci_ROSControlDaemonSighup);
+ pqsignal(SIGTERM, vci_ROSControlDaemonSigterm);
+ pqsignal(SIGQUIT, vci_ROSControlDaemonSigterm);
+ pqsignal(SIGINT, vci_ROSControlDaemonSigterm);
+ /* pqsignal(SIGUSR1, vci_ROSNotify); */
+
+ /* Do nothing when fullPageWrites is off */
+ if (!fullPageWrites)
+ return;
+
+ /* We're now ready to receive signals */
+ BackgroundWorkerUnblockSignals();
+
+ INIT_MESSAGE_ON_WORKER_EXIT();
+
+ /*
+ * Checkout the Postmaster was rebooted. if
+ * (MyBgworkerEntry->bgw_notify_pid == 0) return;
+ */
+
+ /* Connect to DB corresponding to dbid */
+
+ vciinfo = (vci_wosros_conv_worker_arg_t *) DatumGetPointer(main_arg);
+ targetIndexOid = vciinfo->oid;
+ dboid = vciinfo->dbid;
+
+ SET_MESSAGE_ON_WORKER_EXIT(DEBUG1, "worker: Failed to connect '%d'.", dboid);
+ BackgroundWorkerInitializeConnectionByOid1(dboid, NULL);
+ RESET_MESSAGE_ON_WORKER_EXIT();
+
+ elog(DEBUG1, "worker: connect to %d is OK. do wos->ros conversion on vci %d", dboid, targetIndexOid);
+
+#if 0
+ /**
+ * TODO -- Put this call back again if/when Iwata-San's separate bgworker patch is accepted.
+ * See https://www.postgresql.org/message-id/OS7PR01MB11964335F36BE41021B62EAE8EAE4A%40OS7PR01MB11964.jpnprd01.prod.outlook.com
+ */
+
+ /* Accept cancel by admin commands. */
+ AcceptBackgroundWorkerCancel(MyDatabaseId, BGWORKER_CANCEL_ADMIN_COMMANDS);
+#endif
+
+ ret = determine_ExecCommand_and_Extent(targetIndexOid, &targetExecCommandFlag,
+ &targetExtentId, vciinfo->force_next_wosros_conv);
+
+ if (ret == 0)
+ vci_executeROScommand(targetIndexOid, targetExecCommandFlag, targetExtentId,
+ &num_converted_data_wos, &num_converted_whiteout_wos);
+ /* Clear the force flag once a forced run converted nothing from either WOS. */
+ if (vciinfo->force_next_wosros_conv &&
+ num_converted_data_wos == 0 &&
+ num_converted_whiteout_wos == 0)
+ {
+ vci_id_t vciid;
+
+ vciid.oid = targetIndexOid;
+ vciid.dbid = dboid;
+
+ vci_SetForceNextWosRosConvFlag(&vciid, false);
+ }
+
+}
+
+/**
+ * Open the heap relation and then the VCI index relation, if both still exist.
+ *
+ * The heap is opened before the index so that an AccessExclusiveLock held on
+ * the heap is detected first.  On success both relations are returned open
+ * and locked; on failure nothing is left open and a DEBUG1 message is logged.
+ */
+static bool
+TryToOpenVCIRelations(Oid indexOid, LOCKMODE heapLock, LOCKMODE indexLock,
+					  Relation *heapRel, Relation *indexRel)
+{
+	Oid			heapOid = IndexGetRelation(indexOid, true);
+
+	if (!OidIsValid(heapOid))
+		goto unavailable;
+
+	*heapRel = try_relation_open(heapOid, heapLock);
+	if (*heapRel == NULL)
+		goto unavailable;
+
+	*indexRel = try_relation_open(indexOid, indexLock);
+	if (*indexRel != NULL)
+	{
+		if (isVciIndexRelation(*indexRel))
+			return true;
+		relation_close(*indexRel, indexLock);	/* not a VCI index */
+	}
+	relation_close(*heapRel, heapLock);
+
+unavailable:
+	elog(DEBUG1, "worker: The relation the OID=%d indicates was deleted.", indexOid);
+
+	return false;
+}
+
+/**
+ * Terminate the ROS control worker if a termination signal has been received.
+ */
+static void
+CheckRosControlWorkerCancel(void)
+{
+#ifdef WIN32
+	if (UNBLOCKED_SIGNAL_QUEUE())
+		pgwin32_dispatch_queued_signals();
+#endif							/* WIN32 */
+
+	if (gotSigterm)
+	{
+		ereport(DEBUG1,
+				(errcode(ERRCODE_ADMIN_SHUTDOWN),
+				 errmsg_internal("terminating VCI worker process due to administrator command")));
+
+		/* Exit through proc_exit() so exit callbacks see the real exit code. */
+		proc_exit(1);
+	}
+}
+
+/**
+ * Exit callback: restore log_min_messages, emit any staged message, log the code.
+ */
+static void
+callback_on_exit_worker(int code, Datum arg)
+{
+	message_on_worker_exit_t *pending = &message_on_worker_exit;
+
+	log_min_messages = pending->log_min_messages;
+	if (pending->message[0] != '\0')
+	{
+		elog(pending->message_level, "%s", pending->message);
+		pending->message[0] = '\0';
+	}
+	elog(DEBUG1, "worker: ROS control worker exit code=%d.", code);
+}
--
1.8.3.1