From db24b449d9607f056f4e546fdc1492952d5b677d Mon Sep 17 00:00:00 2001 From: Peter Smith Date: Tue, 9 Dec 2025 15:17:23 +1100 Subject: [PATCH v20251209] VCI - main - part2 --- contrib/vci/include/postgresql_copy.h | 176 ++ contrib/vci/include/vci_chunk.h | 114 + contrib/vci/include/vci_columns.h | 319 +++ contrib/vci/include/vci_columns_data.h | 33 + contrib/vci/include/vci_fetch.h | 1007 ++++++++ contrib/vci/include/vci_freelist.h | 75 + contrib/vci/include/vci_mem.h | 177 ++ contrib/vci/include/vci_memory_entry.h | 118 + contrib/vci/include/vci_ros.h | 1085 +++++++++ contrib/vci/include/vci_ros_command.h | 214 ++ contrib/vci/include/vci_ros_daemon.h | 69 + contrib/vci/include/vci_tidcrid.h | 344 +++ contrib/vci/include/vci_wos.h | 29 + contrib/vci/include/vci_xact.h | 39 + contrib/vci/storage/Makefile | 34 + contrib/vci/storage/meson.build | 19 + contrib/vci/storage/vci_ros.c | 1674 +++++++++++++ contrib/vci/storage/vci_ros_command.c | 4160 ++++++++++++++++++++++++++++++++ contrib/vci/storage/vci_ros_daemon.c | 865 +++++++ 19 files changed, 10551 insertions(+) create mode 100644 contrib/vci/include/postgresql_copy.h create mode 100644 contrib/vci/include/vci_chunk.h create mode 100644 contrib/vci/include/vci_columns.h create mode 100644 contrib/vci/include/vci_columns_data.h create mode 100644 contrib/vci/include/vci_fetch.h create mode 100644 contrib/vci/include/vci_freelist.h create mode 100644 contrib/vci/include/vci_mem.h create mode 100644 contrib/vci/include/vci_memory_entry.h create mode 100644 contrib/vci/include/vci_ros.h create mode 100644 contrib/vci/include/vci_ros_command.h create mode 100644 contrib/vci/include/vci_ros_daemon.h create mode 100644 contrib/vci/include/vci_tidcrid.h create mode 100644 contrib/vci/include/vci_wos.h create mode 100644 contrib/vci/include/vci_xact.h create mode 100644 contrib/vci/storage/Makefile create mode 100644 contrib/vci/storage/meson.build create mode 100644 contrib/vci/storage/vci_ros.c create mode 100644 
contrib/vci/storage/vci_ros_command.c create mode 100644 contrib/vci/storage/vci_ros_daemon.c diff --git a/contrib/vci/include/postgresql_copy.h b/contrib/vci/include/postgresql_copy.h new file mode 100644 index 0000000..f302232 --- /dev/null +++ b/contrib/vci/include/postgresql_copy.h @@ -0,0 +1,176 @@ +/*------------------------------------------------------------------------- + * + * postgresql_copy.h + * Definitions copied from PostgreSQL core + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/postgresql_copy.h + * + *------------------------------------------------------------------------- + */ +#ifndef POSTGRESQL_COPY_H +#define POSTGRESQL_COPY_H + +/* + * src/backend/utils/adt/float.c + */ +#include "postgres.h" + +#include <math.h> + +#include "catalog/pg_type.h" +#include "datatype/timestamp.h" +#include "utils/array.h" +#include "utils/date.h" +#include "utils/elog.h" +#include "utils/errcodes.h" + +/* + * check to see if a float4/8 val has underflowed or overflowed + */ +#define CHECKFLOATVAL(val, inf_is_valid, zero_is_valid) \ +do { \ + if (isinf(val) && !(inf_is_valid)) \ + ereport(ERROR, \ + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \ + errmsg("value out of range: overflow"))); \ + \ + if ((val) == 0.0 && !(zero_is_valid)) \ + ereport(ERROR, \ + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \ + errmsg("value out of range: underflow"))); \ +} while(0) + +/* + * src/backend/utils/adt/float.c + */ +static inline float8 * +check_float8_array(ArrayType *transarray, const char *caller, int n) +{ + /* + * We expect the input to be an N-element float array; verify that. We + * don't need to use deconstruct_array() since the array data is just + * going to look like a C array of N float8 values.
+ */ + if (ARR_NDIM(transarray) != 1 || + ARR_DIMS(transarray)[0] != n || + ARR_HASNULL(transarray) || + ARR_ELEMTYPE(transarray) != FLOAT8OID) + elog(ERROR, "%s: expected %d-element float8 array", caller, n); + return (float8 *) ARR_DATA_PTR(transarray); +} + +typedef struct Int8TransTypeData +{ + int64 count; + int64 sum; +} Int8TransTypeData; + +#ifdef VCI_USE_CMP_FUNC +/* + * interval_relop - is interval1 relop interval2 + * + * collate invalid interval at the end + */ +static inline TimeOffset +interval_cmp_value(const Interval *interval) +{ + TimeOffset span; + + span = interval->time; + +#ifdef HAVE_INT64_TIMESTAMP + span += interval->month * INT64CONST(30) * USECS_PER_DAY; + span += interval->day * INT64CONST(24) * USECS_PER_HOUR; +#else + span += interval->month * ((double) DAYS_PER_MONTH * SECS_PER_DAY); + span += interval->day * ((double) HOURS_PER_DAY * SECS_PER_HOUR); +#endif + + return span; +} + +static int +interval_cmp_internal(Interval *interval1, Interval *interval2) +{ + TimeOffset span1 = interval_cmp_value(interval1); + TimeOffset span2 = interval_cmp_value(interval2); + + return ((span1 < span2) ? -1 : (span1 > span2) ? 1 : 0); +} + +static int +timetz_cmp_internal(TimeTzADT *time1, TimeTzADT *time2) +{ + TimeOffset t1, + t2; + + /* Primary sort is by true (GMT-equivalent) time */ +#ifdef HAVE_INT64_TIMESTAMP + t1 = time1->time + (time1->zone * USECS_PER_SEC); + t2 = time2->time + (time2->zone * USECS_PER_SEC); +#else + t1 = time1->time + time1->zone; + t2 = time2->time + time2->zone; +#endif + + if (t1 > t2) + return 1; + if (t1 < t2) + return -1; + + /* + * If same GMT time, sort by timezone; we only want to say that two + * timetz's are equal if both the time and zone parts are equal. 
+ */ + if (time1->zone > time2->zone) + return 1; + if (time1->zone < time2->zone) + return -1; + + return 0; +} +#endif + +/* taken from numeric.c */ + +typedef int16 NumericDigit; +struct NumericShort +{ + uint16 n_header; /* Sign + display scale + weight */ + NumericDigit n_data[1]; /* Digits */ +}; + +struct NumericLong +{ + uint16 n_sign_dscale; /* Sign + display scale */ + int16 n_weight; /* Weight of 1st digit */ + NumericDigit n_data[1]; /* Digits */ +}; + +union NumericChoice +{ + uint16 n_header; /* Header word */ + struct NumericLong n_long; /* Long form (4-byte header) */ + struct NumericShort n_short; /* Short form (2-byte header) */ +}; + +struct NumericData +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + union NumericChoice choice; /* choice of format */ +}; + +typedef struct NumericVar +{ + int ndigits; /* # of digits in digits[] - can be 0! */ + int weight; /* weight of first digit */ + int sign; /* NUMERIC_POS, NUMERIC_NEG, or NUMERIC_NAN */ + int dscale; /* display scale */ + NumericDigit *buf; /* start of palloc'd space for digits[] */ + NumericDigit *digits; /* base-NBASE digits */ +} NumericVar; + +#endif /* POSTGRESQL_COPY_H */ diff --git a/contrib/vci/include/vci_chunk.h b/contrib/vci/include/vci_chunk.h new file mode 100644 index 0000000..9c4f628 --- /dev/null +++ b/contrib/vci/include/vci_chunk.h @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * vci_chunk.h + * Definitions and Declarations of ROS chunk buffer strage. 
+ * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_chunk.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_CHUNK_H +#define VCI_CHUNK_H + +#include "postgres.h" + +#include "miscadmin.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" +#include "utils/uuid.h" + +#include "vci.h" +#include "vci_ros.h" + +/** + * @brief RosChunkBuffer is a buffer to store one chunk. + * + * We use RosChunkBuffer for two purposes. One is to store data obtained + * directly from PostgreSQL relation. For this purpose, we prepare this + * buffer to have enough space to store data even when all the attributes + * have the size of worst case, that never happens. Once the chunk is + * stored in this buffer, we inspect the size of each column in the chunk. + * Afterward, we copy all the chunk data into RosChunkStorage with removing + * unused spaces. Here, we use RosChunkBuffer for each chunk, but this + * time we prepare the buffer with the size suitable for each chunk. ROS + * without compression is built from RosChunkStorage directly. + */ +typedef struct RosChunkBuffer +{ + int16 numColumns; /* number of columns */ + int16 numNullableColumns; /* number of nullable columns */ + + /** number of columns which need offset data for each entry because they + * have variable-length fields or fields longer than eight bytes, say, + * reference Datum. + */ + int16 numColumnsWithIndex; + + int nullWidthInByte; /* The byte width of null bit vector. */ + int numRowsAtOnce; /* the maximum number of rows in the chunk */ + int numFilled; /* the number of rows actually contained here */ + vcis_compression_type_t *compType; /* Array of compression type for + * columns.
*/ + int16 *nullBitId; /* -1 for NOT NULLABLE */ + int16 *columnSizeList; /* the sizes of columns in the worst case */ + void *dataAllocPtr; /* pointer keeping allocated area */ + char **data; /* buffer for each column */ + vci_offset_in_extent_t **dataOffset; /* offset to each datum */ + char *nullData; /* pointer to array of null bit vector. */ + char *tidData; /* pointer to array of TID. */ + char *deleteData; /* pointer to array of delete information */ +} RosChunkBuffer; + +/** + * @brief Structure to keep buffers that keeps column-wise data built from WOS. + */ +typedef struct RosChunkStorage +{ + int numChunks; /* The length of allocated chunk. */ + int numFilled; /* The number of chunk actually used. */ + int numTotalRows; /* The sum of rows in registered chunks. */ + bool forAppending; /* True to append data to the shrunken extent. */ + + /** Array of pointers to RosChunkBuffer, which is copied in a compact + * manner to reduce the memory. + */ + RosChunkBuffer **chunk; +} RosChunkStorage; + +extern void + vci_InitOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer, + int numRowsAtOnce, + int16 *columnSizeList, + int numColumns, + bool useDeleteVector, + vci_MainRelHeaderInfo *info); +extern void + vci_InitRosChunkStorage(RosChunkStorage *rosChunkStorage, + int numRowsAtOnce, + bool forAppending); +extern void + vci_DestroyOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer); +extern void + vci_DestroyRosChunkStorage(RosChunkStorage *rosChunkStorage); +extern PGDLLEXPORT void + vci_ResetRosChunkStorage(RosChunkStorage *rosChunkStorage); +extern void + vci_FillOneRowInRosChunkBuffer(RosChunkBuffer *rosChunkBuffer, + vci_MainRelHeaderInfo *info, + ItemPointer tid, + HeapTuple tuple, + int16 *dstColumnIdList, + AttrNumber *heapAttrNumList, + TupleDesc tupleDesc); +extern void + vci_ResetRosChunkBufferCounter(RosChunkBuffer *buffer); +extern void + vci_RegisterChunkBuffer(RosChunkStorage *rosChunkStorage, RosChunkBuffer *src); +extern Size + 
vci_GetDataSizeInChunkStorage(RosChunkStorage *src, int columnId, bool asFixed); + +#endif /* #ifndef VCI_CHUNK_H */ diff --git a/contrib/vci/include/vci_columns.h b/contrib/vci/include/vci_columns.h new file mode 100644 index 0000000..18d6e9a --- /dev/null +++ b/contrib/vci/include/vci_columns.h @@ -0,0 +1,319 @@ +/*------------------------------------------------------------------------- + * + * vci_columns.h + * Definitions and declarations of VCI column store and extents + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_columns.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_COLUMNS_H +#define VCI_COLUMNS_H + +#include "postgres.h" + +#include "vci.h" +#include "vci_chunk.h" +#include "vci_ros.h" +#include "vci_tidcrid.h" + +/** header page ID of column meta data */ +#define VCI_COLUMN_META_HEADER_PAGE_ID (0) + +/** First page of Column data relations */ +#define VCI_COLUMN_DATA_FIRST_PAGE_ID (0) + +/** Column number of Column meta header page */ +#define VCI_NUM_COLUMN_META_HEADER_PAGE (1) + +/** Column ID of first Normal Column */ +#define VCI_FIRST_NORMALCOLUMN_ID (0) + +/** Column ID of special column */ +#define VCI_COLUMN_ID_TID (-1) +#define VCI_COLUMN_ID_NULL (-2) +#define VCI_COLUMN_ID_DELETE (-3) +#define VCI_COLUMN_ID_CRID (-4) /** @todo what is this? */ + +/** The data below are not column-stored data. + * We prepare them for convenience. 
+ */ +#define VCI_COLUMN_ID_TID_CRID (-5) +#define VCI_COLUMN_ID_TID_CRID_UPDATE (-6) +#define VCI_COLUMN_ID_TID_CRID_WRITE (-7) +#define VCI_COLUMN_ID_TID_CRID_CDR (-8) +#define VCI_COLUMN_ID_DATA_WOS (-9) +#define VCI_COLUMN_ID_WHITEOUT_WOS (-10) + +#define VCI_INVALID_COLUMN_ID ((int16) -11) + +/** Vector bit count in one item (tuple) for delete vector */ +#define VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE (1024) + +/** Item number in page for delete vector */ +#define VCI_ITEMS_IN_PAGE_FOR_DELETE (52) + +/** Page number in extent for delete vector */ +#define VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE (5) + +static inline BlockNumber +vci_CalcBlockNumberFromCrid64ForDelete(uint64 crid64) +{ + return (vci_CalcExtentIdFromCrid64(crid64) * + VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE) + + (vci_CalcRowIdInExtentFromCrid64(crid64) / + (VCI_ITEMS_IN_PAGE_FOR_DELETE * + VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE)); +} + +static inline OffsetNumber +vci_CalcOffsetNumberFromCrid64ForDelete(uint64 crid64) +{ + return ((vci_CalcRowIdInExtentFromCrid64(crid64) / + VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE) % + VCI_ITEMS_IN_PAGE_FOR_DELETE) + FirstOffsetNumber; +} + +static inline uint32 +vci_CalcByteFromCrid64ForDelete(uint64 crid64) +{ + return (crid64 % VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE) / BITS_PER_BYTE; +} + +static inline uint32 +vci_CalcBitFromCrid64ForDelete(uint64 crid64) +{ + return crid64 & (BITS_PER_BYTE - 1); +} + +/** + * Pointing extent position of each column in BlockNumber. + * + * @description + * This is used in vcis_column_meta_t.block_number_extent. + * The field is not defined in the definition of the structure, because + * we have the other variable length field "common_dict_info". + * This block_number_extent follows the field. + * + * @note + * unused entries have InvalidBlockNumber in block_number and + * zero in num_blocks. 
+ */ +typedef struct vcis_c_extent +{ + BlockNumber block_number; /* the position in the column data relation */ + BlockNumber num_blocks; /* the length in DB page unit */ + + bool enabled; /* block_number is enabled if true */ + + /* FIXME */ /* fill me */ + bool valid_min_max; /* size of min is + * vcis_column_meta_t.min_max_element_size */ + char min[1]; /* max follows min. */ +} vcis_c_extent_t; + +/** + * common dictionary info of each column + * + * @descriptions + *This is used in vcis_column_meta_t.common_dict_info + * + * @note + * unused entries have InvalidBlockNumber in block_number and + * zero in num_blocks. + */ +typedef struct vcis_c_common_dict +{ + BlockNumber block_number; /* the position in the column data relation */ + BlockNumber num_blocks; /* the length in DB page unit */ +} vcis_c_common_dict_t; + +typedef struct vcis_column_meta +{ + vcis_attribute_type_t vcis_attr_type; /* Attribute type */ + + Oid pgsql_atttypid; /* taken from FormData_pg_attribute.atttypid */ + int16 pgsql_attnum; /* taken from FormData_pg_attribute.attnum */ + int16 pgsql_attlen; /* taken from FormData_pg_attribute.attlen */ + int32 pgsql_atttypmod; /* taken from + * FormData_pg_attribute.atttypmod */ + uint32 num_extents; /* number of extents (for debug) */ + uint32 num_extents_old; /* previous number of extents (for + * recovery) */ + + BlockNumber free_page_begin_id; /* page ID of the first free area */ + + BlockNumber free_page_end_id; /* page ID of the last free area */ + + /** + * The DB page ID of free area that located in front of the added or + * deleted extent by the ROS command. (for recovery) + * This is used to recover free area list. + */ + BlockNumber free_page_prev_id; + + /** + * Same as free_page_prev_id, but just behind the added or deleted extent. 
+ */ + BlockNumber free_page_next_id; + + /** + * The freespace size of added or deleted extent by the ROS command (for recovery) + */ + uint32 free_page_old_size; + + /** + * The freespace position of added or deleted extent in BlockNumber + * by the ROS command (for recovery) + */ + BlockNumber new_data_head; + + BlockNumber num_free_pages; /* number of free DB pages in the listed free + * area */ + BlockNumber num_free_pages_old; /* for recovery */ + BlockNumber num_free_page_blocks; /* number of free areas, not number of + * free DB pages */ + BlockNumber num_free_page_blocks_old; /* for recovery */ + + /*--- Above must be same as vcis_tidcrid_meta_t ---*/ + + uint32 common_flag_0; /* vcis_column_meta_flag */ + + uint32 min_max_field_size; /* size of min_max field */ + uint32 min_max_content_size; /* size of min_max content */ + uint16 num_common_dicts; /* Number of common dictionaries */ + int16 latest_common_dict_id; /* Id of the latest common dictionary */ + uint32 common_dict_info_offset; /* offset of common_dict_info[0] */ + uint32 block_number_extent_offset; /* offset of extent_pointer[0] */ + + vcis_c_common_dict_t common_dict_info[1]; /* common dictionary + * information */ + /* block_number_extent follows common_dict_info[num_common_dicts - 1] */ +} vcis_column_meta_t; + +/** + * @brief Get pointer to vcis_extent_t in the given DB page.
+ */ +#define vci_GetExtentT(page) \ + ((vcis_extent_t *) &((page)[VCI_MIN_PAGE_HEADER])) + +/* + * Extent header + */ +typedef struct vcis_extent +{ + uint32 size; /* Size of extent */ + vcis_extent_type_t type; + uint32 id; /* Extent id */ + vcis_compression_type_t comp_type; /* Compression method */ + uint32 offset_offset; /* Offset to the offset */ + uint32 offset_size; /* Size of the offset data */ + uint32 data_offset; /* Offset to the data */ + uint32 data_size; /* Data size */ + uint16 compressed; /* 0 for not compressed, 1 for compressed */ + int16 dict_offset; /* or common dictionary ID (>= -1) when + * dict_size == 0 */ + uint32 dict_size; /* Size of the dictionary data */ + vcis_dict_type_t dict_type; /* The type of dictionary */ + char dict_body[1]; /* the main body of the dictionary */ + /* offset_body and data_body follow dict_body */ +} vcis_extent_t; + +typedef vci_RelationPair vci_ColumnRelations; + +extern PGDLLEXPORT vcis_column_meta_t *vci_GetColumnMeta(Buffer *buffer, Relation rel); +extern PGDLLEXPORT vcis_c_extent_t *vci_GetColumnExtent(Buffer *buffer, + BlockNumber *blockNumber, + Relation rel, + int32 extentId); + +extern PGDLLEXPORT void vci_OpenColumnRelations(vci_ColumnRelations *rel, + vci_MainRelHeaderInfo *info, + int16 columnId, + LOCKMODE lockmode); + +extern void vci_CloseColumnRelations(vci_ColumnRelations *rel, + LOCKMODE lockmode); + +extern void vci_InitializeColumnRelations(vci_MainRelHeaderInfo *info, + TupleDesc tupdesc, + Relation heapRel); + +extern void vci_WriteRawDataExtentInfo(Relation rel, + int32 extentId, + uint32 startPageID, + uint32 numBlocks, + char *minData, + char *maxData, + bool validMinMax, + bool checkOverwrite); + +extern void vci_WriteOneExtent(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int extentId, + TransactionId xgen, /* xgen in extent info */ + TransactionId xdel, /* xdel in extent info */ + TransactionId xid); /* in tuple header */ + +/* columns to fetcher Interface */ +extern void
vci_GetElementPosition(uint32 *offset, + BlockNumber *blockNumberBase, + uint32 *dataOffset, + vci_ColumnRelations *rel, + int32 extentId, + uint32 rowIdInExtent, + Form_pg_attribute attr); + +extern PGDLLEXPORT void vci_GetChunkPositionAndSize(uint32 *offset, + Size *totalSize, + BlockNumber *blockNumberBase, + uint32 *dataOffset, + vci_ColumnRelations *rel, + int32 extentId, + uint32 rowIdInExtent, + int32 numUnit, + Form_pg_attribute attr); + +extern uint16 + vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId); +extern void + vci_GetPositionForFixedColumn(BlockNumber *blockNumber, + uint32 *offset, + vci_MainRelHeaderInfo *info, + int16 columnId, + int32 extentId, + uint32 rowIdInExtent, + bool atEnd); + +extern PGDLLEXPORT void + vci_InitializeDictInfo(vci_DictInfo *dictInfo); + +/* *************************** + * Min-Max info + * *************************** + */ + +static inline void +vci_Initvci_ColumnRelations(vci_ColumnRelations *rel) +{ + rel->meta = NULL; + rel->data = NULL; +} + +/* function to write meta data header + * arguments + * Relation relMeta + * Buffer buffer + */ +static inline void +vci_WriteColumnMetaDataHeader(Relation relMeta, + Buffer buffer) +{ + vci_WriteOneItemPage(relMeta, buffer); +} + +#endif /* VCI_COLUMNS_H */ diff --git a/contrib/vci/include/vci_columns_data.h b/contrib/vci/include/vci_columns_data.h new file mode 100644 index 0000000..d21c0a1 --- /dev/null +++ b/contrib/vci/include/vci_columns_data.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * vci_columns_data.h + * Declarations of functions to check which columns are indexed.
+ * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_columns_data.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_COLUMNS_DATA_H +#define VCI_COLUMNS_DATA_H + +#include "access/tupdesc.h" +#include "access/attnum.h" +#include "nodes/bitmapset.h" +#include "storage/lock.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "vci_ros.h" + +extern TupleDesc vci_ExtractColumnDataUsingIds(const char *vci_column_ids, Relation heapRel); +extern PGDLLEXPORT TupleDesc vci_GetTupleDescr(vci_MainRelHeaderInfo *info); +extern Bitmapset *vci_MakeIndexedColumnBitmap(Oid mainRelationOid, MemoryContext sharedMemCtx, LOCKMODE lockmode); +extern Bitmapset *vci_MakeDroppedColumnBitmap(Relation indexRel); +extern char *vci_ConvertAttidBitmap2String(Bitmapset *attid_bitmap); +extern AttrNumber vci_GetAttNum(TupleDesc desc, const char *name); + +#endif /* VCI_COLUMNS_DATA_H */ diff --git a/contrib/vci/include/vci_fetch.h b/contrib/vci/include/vci_fetch.h new file mode 100644 index 0000000..60326ee --- /dev/null +++ b/contrib/vci/include/vci_fetch.h @@ -0,0 +1,1007 @@ +/*------------------------------------------------------------------------- + * + * vci_fetch.h + * Definitions and declarations of Column store fetch + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_fetch.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_FETCH_H +#define VCI_FETCH_H + +#include "postgres.h" + +#include "access/attnum.h" +#include "utils/guc.h" + +#include "vci.h" +#include "vci_columns.h" + +#include "vci_mem.h" +#include "vci_ros.h" + +/* Get function of virtual tuples may used to get the storage area. + * In that case, no rows may stored. + * So, skipping the assertion check by default. 
+ * To use the assertion check, define CHECK_VTUPLE_GET_RANGE. + */ +#define CHECK_VTUPLE_GET_RANGE + +/* + * memory image of data loaded by vci_CSFetchVirtualTuples(). + * The area is allocated by vci_CSCreateVirtualTuples(), and the addresses + * are fixed except for each data in "column N area"s. + * + * ADDRESS CONTENT + * allocated (palloc-ed address) + * (no padding) + * flags (or skip) uint8 of tuple[0] + * (bit 0 is a copy of uint8 of tuple[1] + * delete vector) . + * . + * uint8 of tuple[num_rows_read_at_once-1] + * uint8 of tuple[num_rows_read_at_once] (extra element) + * (no padding) + * isnull bool[0]--bool[num_columns-1] of tuple[0] + * bool[0]--bool[num_columns-1] of tuple[1] + * . + * . + * bool[0]--bool[num_columns-1] of tuple[num_..._once-1] + * (padding if necessary) + * crid (aligned) int64 of tuple[0] + * (when need_crid is int64 of tuple[1] + * true) . + * . + * int64 of tuple[num_rows_read_at_once-1] + * (no padding) + * tid (aligned) int64 of tuple[0] + * (when need_tid is int64 of tuple[1] + * true) . + * . + * . + * int64 of tuple[num_rows_read_at_once-1] + * (no padding) + * values (aligned) Datum[0]--Datum[num_columns-1] of tuple[0] + * Datum[0]--Datum[num_columns-1] of tuple[1] + * . + * . + * Datum[0]--Datum[num_columns-1] of tuple[num_..._once-1] + * (padding if necessary) + * column 0 area aligned data are stored when the element size is + * (aligned) larger than sizeof(Datum). Each datum are pointed + * by Datum[0] of tuples in the upper "data" area. + * The size for the area is calculated using worst + * case size. + * (free space) + * (padding if necessary) + * column 1 area aligned data are stored when the element size is + * (aligned) larger than sizeof(Datum). Each datum are pointed + * by Datum[1] of tuples in the upper "data" area. + * The size for the area is calculated using worst + * case size. + * . + * . 
+ * (free space) + * (padding if necessary) + * column (num_rows-1) area + * (aligned) + * + * + * usage: + * + * ---- in backend process ---- + * vci_CSQueryContext queryContext = vci_CSCreateQueryContext( mainRelationOid, + * numReadColumns, attrNum, sharedMemCtx); + * + * Size localRosSize = vci_CSEstimateLocalRosSize(queryContext); + * if (limitLocalRos <= localRosSize) + * goto PostgreSQLQueryExecution; + * vci_local_ros_t *localRos = vci_CSGenerateLocalRos(queryContext); + * + * vci_CSFetchContext fetchContext = vci_CSCreateFetchContext( queryContext, + * numRowsReadAtOnce, + * useColumnStore, + * numReadColumns, attrNum, + * returnTid, returnCrid); + * Size fetchContextSize = vci_CSGetFetchContextSize(fetchContext); + * if (limitFetchContext <= sumOfFetchContextSize) + * goto PostgreSQLQueryExecution; + * + * ---- in background worker ---- + * int lenVector = vci_CSGetActualNumRowsReadAtOnce(fetchContext); + * vci_CSFetchContext localContext = vci_CSLocalizeFetchContext(fetchContext); + * vci_virtual_tuples_t *vTuples = vci_CSCreateVirtualTuples(localContext); + * + * ** here you can make pointers to vTuples from PostgreSQL virtual tuples. 
** + * + * vci_extent_status_t *status =vci_CSCreateCheckExtent(fetchContext); + * + * for (extentID) + * { + * vci_CSCheckExtent(status, localContext, extentId, readMinMax); + * if (status->existence && status->visible) + * { + * ** loop of vectors and rows ** + * ** number of rows in the extent is in status->num_rows ** + * for (vectorID) + * { + * int readableRows = vci_CSFetchVirtualTuples(vTuples, + * vectroID * lenVector, + * lenVector); + * for (idInVector = 0; idInVector < readableRows; ++ idInVector) + * { + * + * ** normal style from here ** + * int8 *flags = vci_CSGetSkipOfVirtualTuple(vTuples); + * if ((* flags) & vcivtf_delete) + * continue; + * + * ** Row wise ** + * Datum *values = vci_CSGetValuesOfVirtualTuple(vTuples, + * idInVector); + * bool *isnull = vci_CSGetIsNullOfVirtualTuple(vTuples, + * idInVector); + * + * ** Column wise ** + * Datum *values = vci_CSGetValuesOfVirtualTupleColumnar(vTuples, + * columnId); + * bool *isnull = vci_CSGetIsNullOfVirtualTupleColumnar(vTuples, + * columnId); + * + * int64 *crid = vci_CSGetCridOfVirtualTuple(vTuples, + * idInVector); + * int64 *tid = vci_CSGetTidOfVirtualTuple(vTuples, + * idInVector); + * UpdateVirtualTupleLinks(); + * EvaluateQualsEtc(); + * ** normal style to here ** + * + * ** if you use fixed linked virtual tuples from here ** + * SelectPostgreSQLVirtualTuple(); + * EvaluateQualsEtc(); + * ** if you use fixed linked virtual tuples to here ** + * + * } + * } + * } + * } + * + * vci_CSDestroyCheckExtent(status) + * vci_CSDestroyVirtualTuples(vTuples); + * vci_CSDestroyFetchContext(localContext); + * + * ---- in backend process ---- + * vci_CSDestroyFetchContext(fetchContext); + * vci_CSDestroyLocalRos(localRos); + * vci_CSDestroyQueryContext(queryContext); + */ + +/** + * @brief Information to fetch data from one relation used in a query. + * + * When multiple relations are used in one query, + * multiple vci_CSQueryContextData should be created. 
+ */ +typedef struct vci_CSQueryContextData +{ + /** Number of columns of the relation used in the query. */ + int num_columns; + + /** Attribute number in original PostgreSQL relation. */ + AttrNumber *volatile attr_num; + + /** Column ID in VCI main relation. */ + int16 *volatile column_id; + + /* Maximum number of WOS entries */ + int64 num_data_wos_entries; + + /* Maximum number of whiteout WOS entries */ + int64 num_whiteout_wos_entries; + + /** + * Number of entries in delete_list, just a copy of + * vci_local_ros_t.local_delete_list->num_entry. + */ + int num_delete; + + /** + * Local delete list, containing whiteout WOS. + * CAUTION : THIS POINTER VALUE IS JUST A COPY OF + * vci_local_ros_t.local_delete_list->crid_list. + * NEVER pfree(). + */ + uint64 *delete_list; + + /** + * Number of extents of local ROS. + * To keep the extents of local ROS at reasonable size, + * they may contain fewer rows than 262,144 rows. + */ + int num_local_ros_extents; + + vci_local_ros_t *local_ros; /* pointer to the local ROS. */ + + /** Number of extents in ROS. */ + int num_ros_extents; + + /** + * Pointer to main relation information. + * The object is allocated in shared_memory_context, + * but info->rel cannot be accessed from processes other than the one that + * creates vci_CSFetchContext. + * In order to access main relation, open using main_relation_oid. + */ + vci_MainRelHeaderInfo *volatile info; + + /** Heap relation indexed by VCI to keep shared lock. */ + volatile Relation heap_rel; + + /** Oid of VCI main relation. */ + Oid main_relation_oid; + + uint32 num_nullable_columns; /* Number of nullable columns */ + uint32 null_width_in_byte; /* Size of null bit vector per row */ + + /** + * ROS version taken from current ROS version or last ROS version.
+ */ + TransactionId ros_version; + + /** + * @see inclusiveXid of struct vci_RosCommandContext + */ + TransactionId inclusive_xid; + + /** + * @see exclusiveXid of struct vci_RosCommandContext + */ + TransactionId exclusive_xid; + + uint32 tid_crid_diff_sel; /* Selection of TID CRID difference. */ + + /** + * Memory context where all the shared data are allocated, + * including the elements in this structure. + */ + MemoryContext shared_memory_context; + + /** lockmode of index relation (main relation) */ + LOCKMODE lockmode; + +} vci_CSQueryContextData; +typedef vci_CSQueryContextData *vci_CSQueryContext; + +/** + * @brief Buffer for decompression, + * + * and for concatenating data separated into multiple pages. + */ +typedef struct vci_seq_scan_buffer +{ + int num_buffers; +} vci_seq_scan_buffer_t; + +/** + * @brief Context to fetch vectors. + * + * Vector itself is in vci_virtual_tuples_t, + * and the running parameters are kept in it. + * A master instance of vci_CSFetchContextData is created by backend process, + * then background workers copy to have locally. + * Some member variables in local copy are overwritten, marked as + * \b LOCALIZED \b VARIABLE . + */ +typedef struct vci_CSFetchContextData +{ + uint32 size; /* Size of this structure. */ + + int32 extent_id; /* The extent ID of stored virtual tuples. */ + uint16 num_rows; /* Number of stored virtual tuples. */ + + int16 num_columns; /* Number of columns to fetch in this context. */ + + /** + * Number of rows for the context to read at once. + * The fetcher reads multiple lines at once and stores them into the + * virtual tuple storage. + */ + uint32 num_rows_read_at_once; + + bool use_column_store; /* Store data in columnar style (true) or + * not. */ + + bool need_crid; /* Fetch CRID or not. */ + bool need_tid; /* Fetch TID or not. */ + + /** Used in decompression or data concatenation.
*/ + vci_seq_scan_buffer_t *buffer; + + /** \b LOCALIZED \b VARIABLE \n + * The ROS data fetched are stored in this context. + * virtual tuple storage is located here. + */ + MemoryContext local_memory_context; + + /** The size of virtual Tuple storage. + * This is the sum of size_values, size_flags, and sizes of area pointed by + * vci_virtual_tuples_t->column_info[columnId].al_area. + */ + Size size_vector_memory_context; + + /** area where Datum and pointers are stored */ + Size size_values; + + /** The area where nulls, skip information, local skip information, + * TIDs, CRIDs, dictionaries, compression workarea and temporary + * area for row-wise mode are placed. + * The amount of dictionary sizes is in size_dictionary_area. + * The workarea size for compression and decompression is in + * size_decompression_area. + */ + Size size_flags; + + /** The memory size for dictionaries + * This is included in size_flags. + */ + Size size_dictionary_area; + + /** Workarea size to decompress one VCI_COMPACTION_UNIT_ROW. + * The size is calculated as + * MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW) + * when size_dictionary_area != 0, or zero. + * This is included in size_flags. + */ + Size size_decompression_area; + + /** The query context this fetch context belongs to. */ + vci_CSQueryContext query_context; + + /** \b LOCALIZED \b VARIABLE \n + * VCI main relation information used in localized fetch. + * Since the file descriptor or Relation structure must be obtained + * in each process, the main relation information is also calculated in + * each process. + */ + vci_MainRelHeaderInfo *info; + + /** \b LOCALIZED \b VARIABLE \n + * Relations of the delete vector. + */ + vci_ColumnRelations rel_delete; + + /** \b LOCALIZED \b VARIABLE \n + * Relations of the null bit vector. + */ + vci_ColumnRelations rel_null; + + /** \b LOCALIZED \b VARIABLE \n + * Relations of the TID vector.
+ */ + vci_ColumnRelations rel_tid; + + /** \b LOCALIZED \b VARIABLE \n + * Pointer to the array of relations of normal columns. + */ + vci_ColumnRelations *rel_column; + + /** + * The column ID translation table. + * Since the column IDs in fetch vector are differ from those of + * VCI main relations, + * we have the translation table from the former to the latter here. + */ + int16 column_link[1]; /* VARIABLE LENGTH ARRAY */ +} vci_CSFetchContextData; /* VARIABLE LENGTH STRUCT */ +typedef vci_CSFetchContextData *vci_CSFetchContext; + +/** + * @brief Structure to keep minimum and maximum value for a column. + */ +typedef struct vci_minmax +{ + bool valid; /* min and max are meaningful (true) or not + * (false). */ + char min[VCI_MAX_MIN_MAX_SIZE]; /* Minimum value. */ + char max[VCI_MAX_MIN_MAX_SIZE]; /* Maximum value. */ +} vci_minmax_t; + +/** + * @brief The extent information which is obtained before fetching the + * extent itself. + * + * It has information of existence, visibility of the extent, + * number of rows in the extent, + * and the minimum and maximum values of the extent. + */ +typedef struct vci_extent_status +{ + uint32 size; /* Size of this structure. */ + uint32 num_rows; /* Number of rows in the extent. */ + bool existence; /* Existence of the extent. */ + bool visible; /* Visibility of the extent. */ + + /** The minimum and the maximum values of columns to be fetched. */ + vci_minmax_t minmax[1]; /* VARIABLE LENGTH ARRAY */ +} vci_extent_status_t; /* VARIABLE LENGTH STRUCT */ + +/** + * @brief The status after reading vector. + */ +typedef enum vci_read_vector_status_t +{ + vcirvs_read_whole, /* Whole the data, that are required, are + * read. */ + vcirvs_out_of_memory, /* Partially read since out of memory. */ + vcirvs_end_of_extent, /* Reaches the end of extent. */ + + /** Failed to read since the parameter is out of range. */ + vcirvs_out_of_range, + + vcirvs_not_visible, /* Failed to read since the extent is + * invisible. 
*/ + vcirvs_not_exist, /* The specified extent is not exists. */ +} vci_read_vector_status_t; + +/** + * @brief Information of a fetched column in virtual tuple. + */ +typedef struct vci_virtual_tuples_column_info +{ + char *area; /* Aligned pointer of al_area. NEVER pfree() */ + + /** Allocated pointer, actual palloced() address is kept. */ + char *al_area; + + int32 null_bit_id; /* Null bit ID in null bit vector. */ + uint32 max_column_size; /* The maximum size of data in the column. */ + + /** true when the value is passed by the pointer (datum by reference). + * false when the value itself is contained in Datum (datum by value). + */ + bool strict_datum_type; + + vcis_compression_type_t comp_type; /* Compression method used. */ + Oid atttypid; /* Type ID of attribute. */ + bool *isnull; /* Pointer to the isnull flag area. */ + Datum *values; /* Pointer to the Datum array area. */ + + /** The information of the dictionary of LZVF compression. */ + vci_DictInfo *dict_info; +} vci_virtual_tuples_column_info_t; + +/** + * @brief Information of virtual tuple, a set of fetched data. + * + * In the form, both colum-wise and row-wise are supported. + */ +typedef struct vci_virtual_tuples +{ + uint32 size; /* Size of this instance. */ + uint16 num_columns; /* Number of columns to store. */ + int32 extent_id; /* The extent ID of stored data. */ + + /** Physically recorded number of rows in the target extent. */ + uint32 num_rows_in_extent; + + /** The row ID in extent of the stored first datum. */ + uint32 row_id_in_extent; + + uint32 num_rows; /* Number of stored rows in this structure. */ + + uint32 buffer_capacity; /* Capacity in unit of rows in this + * structure. */ + + vci_read_vector_status_t status; /* Read status. */ + + /** + * This keeps the position of first tuple of vector, + * since the first virtual tuple of the vector is not always the first + * entry of stored data. 
+ * At present, the upstream users requre that always the first data + * to be placed at the same address, this member variable is always + * set to zero. + */ + uint32 offset_of_first_tuple_of_vector; + + /** + * Number of rows for the context to read at once. + * The fetcher read multiple lines at once and store them into the + * virtual tuple storage. + */ + uint32 num_rows_read_at_once; + + /** The fetch context for this virtual tuple. */ + vci_CSFetchContext fetch_context; + + /** True for store in column-wise style. False for row-wise. */ + bool use_column_store; + + /** + * The size of virtual Tuple storage. + * This is sum of size_values, size_flags, and sizes of area pointed by + * vci_virtual_tuples_t->column_info[columnId].al_area. + */ + Size size_vector_memory_context; + + /** The size of the area where Datum and pointers are stores. */ + Size size_values; + + /** + * The size of the area where nulls, skip information, + * local skip information, TIDs, CRIDs, dictionaries, + * compression workarea and temporay area for wor-wise mode are placed. + * The amount of dictionary sizes is in size_dictionary_area. + * The workarea size for compression / decompression is in + * size_decompression_area. + */ + Size size_flags; + + /** + * The memory size for dictionaries. + * This is included in size_flags. + */ + Size size_dictionary_area; + + /** + * Workarea size to decompress one VCI_COMPACTION_UNIT_ROW. + * The size is calculated as + * MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW) + * when size_dictionary_area != 0, or zero. + * This is included in size_flags. + */ + Size size_decompression_area; + + int64 *crid; /* Aligned pointer to CRID list in al_flags */ + + /** Aligned pointer to TID list in al_flags. + * ItemPointerData are wrtten. + */ + int64 *tid; + + /** Aligned pointer to skip list. */ + uint16 *skip; + + /** Aligned pointer to skip list for local ROS. 
*/ + uint16 *local_skip; + + /** Aligned pointer to the area for isnull of all columns. */ + bool *isnull; + + /** + * In row-wise mode, the vector in local ROS is once built here. + * The area is allocated in local_memory_context. + * The size is + * num_rows_read_at_once * num_columns * (sizeof(Datum) + sizeof(bool)) + */ + char *row_wise_local_ros; + + /** + * Workarea to decompress data. + * Dictionaries follow work_decompression + */ + char *work_decompression; + + /** Aligned pointer to the area for values of all columns in al_values. */ + Datum *values; + + /** Aligned pointer to the area for meta information like skip, TID, + * NULL, and so on. + */ + char *flags; + + char *al_values; /* Allocated pointer for values. */ + char *al_flags; /* Allocated pointer for flags. */ + + /** Array of column informations. */ + vci_virtual_tuples_column_info_t column_info[1]; /* VARIABLE LENGTH ARRAY */ +} vci_virtual_tuples_t; /* VARIABLE LENGTH STRUCT */ + +extern PGDLLEXPORT vci_CSQueryContext vci_CSCreateQueryContextWLockMode(Oid mainRelationOid, + int numReadColumns, + /* attribute number in original relation */ + AttrNumber *attrNum, + MemoryContext sharedMemCtx, + LOCKMODE lockmode); + +/** + * @brief Create query context. + * + * @param[in] mainRelationOid Oid of VCI main relation. + * @param[in] numReadColumns The number of read columns in the part of query. + * @param[in] attrNum The attribute numbers in the original heap relation, + * not those of the VCI main relation. + * @param[in] sharedMemCtx The shared memory context to keep elements of + * query context, fetch context, local ROS. + * @param[in] recoveryInProgress true if recovery is still in progress. + * @param[in] estimatingLocalROSSize true if creating a local ROS. + * @return The pointer to the allocated vci_CSQueryContext. 
+ */
+static inline vci_CSQueryContext
+vci_CSCreateQueryContext(Oid mainRelationOid,
+		int numReadColumns,
+		AttrNumber *attrNum,
+	/* attribute number in original relation */
+		MemoryContext sharedMemCtx,
+		bool recoveryInProgress,
+		bool estimatingLocalROSSize)
+{
+	/*
+	 * ShareUpdateExclusiveLock is used for creating local ROS. But during
+	 * recovery (standby) or size estimation, AccessShareLock is used; queries
+	 * on the standby can only take RowExclusiveLock or weaker locks.
+	 */
+	LOCKMODE lockmode = (recoveryInProgress || estimatingLocalROSSize) ? AccessShareLock : ShareUpdateExclusiveLock;
+
+	return vci_CSCreateQueryContextWLockMode(mainRelationOid, numReadColumns,
+		attrNum, sharedMemCtx, lockmode);
+}
+
+extern PGDLLEXPORT void vci_CSDestroyQueryContext(vci_CSQueryContext queryContext);
+
+/* obtain the worst-case (largest) estimated size of local ROS */
+extern Size vci_CSEstimateLocalRosSize(vci_CSQueryContext queryContext);
+
+extern PGDLLEXPORT vci_local_ros_t *vci_CSGenerateLocalRos(vci_CSQueryContextData *queryContext);
+
+/**
+ * @brief Entry point to destroy local ROS; forwards to vci_DestroyLocalRos().
+ *
+ * @param[in] localRos Local ROS to be destroyed.
+ */
+static inline void
+vci_CSDestroyLocalRos(vci_local_ros_t *localRos)
+{
+	vci_DestroyLocalRos(localRos);
+}
+
+extern PGDLLEXPORT vci_CSFetchContext vci_CSCreateFetchContextBase(
+		vci_CSQueryContext queryContext,
+		uint32 numRowsReadAtOnce,
+		int16 numReadColumns,
+	/* attribute number in original relation */
+		AttrNumber *attrNum,
+		bool useColumnStore,
+		bool returnTid,
+		bool returnCrid,
+		bool useCompression);
+
+#define VCI_MAX_NUM_ROW_TO_FETCH (65536 - VCI_COMPACTION_UNIT_ROW)
+
+/**
+ * @brief The entry point to the function creating fetch context.
+ *
+ * The actual number of rows read at once is quantized
+ * by VCI_COMPACTION_UNIT_ROW by the formula,
+ * actualNumRowsReadAtOnce
+ * = TYPEALIGN(VCI_COMPACTION_UNIT_ROW, numRowsReadAtOnce),
+ * and numRowsReadAtOnce is an unsigned 16 bit integer; it should be smaller than
+ * or equal to VCI_MAX_NUM_ROW_TO_FETCH. Otherwise, it returns NULL.
+ *
+ * @param[in] queryContext The query context.
+ * @param[in] numRowsReadAtOnce The number of rows which are read at once and
+ * stored in the virtual tuples.
+ * @param[in] numReadColumns The number of columns to be read.
+ * @param[in] attrNum The pointer to the array which has the attribute numbers
+ * of the original heap relation, not VCI main relation.
+ * @param[in] useColumnStore True for column-wise store. False for row-wise.
+ * @param[in] returnTid True to get TID in virtual tuples.
+ * @param[in] returnCrid True to get CRID in virtual tuples.
+ * @return The pointer to the created fetch context.
+ * NULL if some parameters are invalid, in which case no fetch context is created.
+ */
+static inline vci_CSFetchContext
+vci_CSCreateFetchContext(vci_CSQueryContext queryContext,
+		uint16 numRowsReadAtOnce,
+		int16 numReadColumns,
+	/* attribute number in original relation */
+		AttrNumber *attrNum,
+		bool useColumnStore,
+		bool returnTid,
+		bool returnCrid)
+{
+	return vci_CSCreateFetchContextBase(queryContext,
+		numRowsReadAtOnce,
+		numReadColumns,
+		attrNum,
+		useColumnStore,
+		returnTid,
+		returnCrid,
+		false); /* useCompression is always false via this entry point */
+}
+
+extern PGDLLEXPORT void vci_CSDestroyFetchContext(vci_CSFetchContext fetchContext);
+extern PGDLLEXPORT vci_CSFetchContext vci_CSLocalizeFetchContext(
+		vci_CSFetchContext fetchContext,
+		MemoryContext memoryContext);
+extern PGDLLEXPORT vci_extent_status_t *vci_CSCreateCheckExtent(
+		vci_CSFetchContext localContext);
+extern PGDLLEXPORT void vci_CSDestroyCheckExtent(vci_extent_status_t *status);
+extern PGDLLEXPORT void vci_CSCheckExtent(vci_extent_status_t *status,
+		vci_CSFetchContext fetchContext,
+		int32 extentId,
+		bool readMinMax);
+
+extern PGDLLEXPORT vci_virtual_tuples_t *vci_CSCreateVirtualTuplesWithNumRows(vci_CSFetchContext fetchContext, uint32 numRows);
+
+/**
+ * @brief Create virtual tuples sized by the context's num_rows_read_at_once.
+ *
+ * @param[in] localContext The localized fetch context.
+ * @return The created virtual tuples.
+ */
+static inline vci_virtual_tuples_t *
+vci_CSCreateVirtualTuples(vci_CSFetchContext localContext)
+{
+	return vci_CSCreateVirtualTuplesWithNumRows(localContext,
+		localContext->num_rows_read_at_once);
+}
+
+extern PGDLLEXPORT void vci_CSDestroyVirtualTuples(vci_virtual_tuples_t *vTuples);
+
+/**
+ * @brief Get the address of the area where Datum of the specified column
+ * is stored.
+ *
+ * At present, the upstream requester requires the start address fixed.
+ * For better performance, it is better that the start address is modifiable,
+ * to fetch many rows at once, or to use local ROS directly.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId Target column ID.
+ * @return The pointer to the Datum array.
+ */
+static inline Datum *
+vci_CSGetValueAddrFromVirtualTuplesColumnwise(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+	return vTuples->column_info[columnId].values;
+}
+
+/**
+ * @brief Get the address of the area where isnull of the specified column
+ * is stored.
+ *
+ * At present, the upstream requester requires the start address fixed.
+ * For better performance, it is better that the start address is modifiable,
+ * to fetch many rows at once, or to use local ROS directly.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId Target column ID.
+ * @return The pointer to the bool array.
+ */
+static inline bool *
+vci_CSGetIsNullAddrFromVirtualTuplesColumnwise(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+	return vTuples->column_info[columnId].isnull;
+}
+
+/**
+ * @brief Get the address of the area where the skip information
+ * is stored (the start of the skip list, without any offset applied).
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @return The pointer to the skip information array.
+ */
+static inline uint16 *
+vci_CSGetSkipAddrFromVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+	return vTuples->skip;
+}
+
+/**
+ * @brief Get the vector of skip information, starting at the first tuple of the vector.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @return The pointer to the array of skip information.
+ *
+ * @note The instruction is the same as
+ * vci_CSGetValuesOfVirtualTupleColumnar().
+ */
+static inline uint16 *
+vci_CSGetSkipFromVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+	Assert((0 <= vTuples->offset_of_first_tuple_of_vector) &&
+		(vTuples->offset_of_first_tuple_of_vector < vTuples->num_rows));
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+	return &(vTuples->skip[vTuples->offset_of_first_tuple_of_vector]);
+}
+
+/**
+ * @brief Get the vector of TID.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @return The pointer to the array of TID information in int64* form.
+ *
+ * @note This function is available when the fetch context is created
+ * with the option returnTid is true.
+ * This function can be available independent of useColumnStore option.
+ */
+/* Each int64 slot holds an ItemPointerData; callers cast the result as needed. */
+static inline int64 *
+vci_CSGetTidFromVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+	Assert((0 <= vTuples->offset_of_first_tuple_of_vector) &&
+		(vTuples->offset_of_first_tuple_of_vector < vTuples->num_rows));
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+	return &(vTuples->tid[vTuples->offset_of_first_tuple_of_vector]);
+}
+
+/**
+ * @brief Get the TID of specified tuple.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] offsetInVector offset in the vector.
+ * @return TID information: an ItemPointerData by value on s390x, otherwise an ItemPointer into the TID vector.
+ *
+ * @note The instruction is the same as vci_CSGetTidFromVirtualTuples().
+ */
+#ifdef __s390x__
+static inline ItemPointerData
+vci_CSGetTidInItemPointerFromVirtualTuples(vci_virtual_tuples_t *vTuples,
+		int offsetInVector)
+{
+	ItemPointerData ipd;
+	int64 result = (vci_CSGetTidFromVirtualTuples(vTuples))[offsetInVector];
+#ifdef WORDS_BIGENDIAN
+	result = result << 16; /* NOTE(review): presumably moves the TID to the leading bytes of the int64 -- confirm */
+#else
+#endif
+	ipd = *((ItemPointer) &result); /* reinterpret the leading bytes of the int64 as ItemPointerData */
+	return ipd;
+}
+#else
+static inline ItemPointer
+vci_CSGetTidInItemPointerFromVirtualTuples(vci_virtual_tuples_t *vTuples,
+		int offsetInVector)
+{
+	return (ItemPointer) &(vci_CSGetTidFromVirtualTuples(vTuples)
+		[offsetInVector]);
+}
+#endif
+
+extern PGDLLEXPORT int vci_CSFetchVirtualTuples(vci_virtual_tuples_t *vTuples,
+		int64 cridStart,
+		uint32 numReadRows);
+
+/**
+ * @brief Get the tuple specified.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] offsetInVector offset in the vector.
+ * @return The pointer to the array of Datum.
+ *
+ * @note This function can be used when the fetch context is created in
+ * row-wise mode, i.e. useColumnStore = false.
+ * The column fetcher is read rows in unit of VCI_COMPACTION_UNIT_ROW.
+ * Therefore, at the start address of the buffer does not always have
+ * the specified data.
+ * The specified data is pointed by the offset of
+ * vTuples->offset_of_first_tuple_of_vector, actually.
+ * To have the data at the start address, always read rows of multiples
+ * of VCI_COMPACTION_UNIT_ROW at once.
+ * For example, when VCI_COMPACTION_UNIT_ROW = 128, then
+ * read 128 rows at once from the row ID in the extent, 0, 128, 256, 384, ....
+ * Or, read 256 rows at once from the row ID in the extent, 0, 256, 512, ...
+ */
+static inline Datum *
+vci_CSGetValuesOfVirtualTuple(vci_virtual_tuples_t *vTuples,
+		uint32 offsetInVector)
+{
+	offsetInVector += vTuples->offset_of_first_tuple_of_vector;
+
+#ifdef CHECK_VTUPLE_GET_RANGE
+	Assert(!vTuples->use_column_store);
+	Assert((0 <= offsetInVector) && (offsetInVector < vTuples->num_rows));
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+	return &(vTuples->values[vTuples->num_columns * offsetInVector]); /* row-major: num_columns entries per row */
+}
+
+/**
+ * @brief Get the isnull flags of the specified tuple.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] offsetInVector offset in the vector.
+ * @return The pointer to the array of bool (num_columns entries for the row).
+ *
+ * @note See instruction of vci_CSGetValuesOfVirtualTuple().
+ */
+static inline bool *
+vci_CSGetIsNullOfVirtualTuple(vci_virtual_tuples_t *vTuples,
+		int32 offsetInVector)
+{
+	offsetInVector += vTuples->offset_of_first_tuple_of_vector;
+
+#ifdef CHECK_VTUPLE_GET_RANGE
+	Assert(!vTuples->use_column_store);
+	Assert((0 <= offsetInVector) && ((uint32) offsetInVector < vTuples->num_rows));
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+	return &(vTuples->isnull[vTuples->num_columns * offsetInVector]); /* row-major: num_columns entries per row */
+}
+
+/**
+ * @brief Get the vector of specified column data.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId The column ID.
+ * @return The pointer to the array of Datum, starting at the first tuple of the vector.
+ *
+ * @note This function can be used when the fetch context is created in
+ * column-wise mode, i.e. useColumnStore = true.
+ * The other instruction is the same as vci_CSGetValuesOfVirtualTuple().
+ */
+static inline Datum *
+vci_CSGetValuesOfVirtualTupleColumnar(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+	Assert(vTuples->use_column_store);
+	Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns));
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+	return &(vTuples->column_info[columnId].values
+		[vTuples->offset_of_first_tuple_of_vector]);
+}
+
+/**
+ * @brief Get the vector of specified isnull information.
+ *
+ * @param[in] vTuples The virtual tuples.
+ * @param[in] columnId The column ID.
+ * @return The pointer to the array of bool, starting at the first tuple of the vector.
+ *
+ * @note The instruction is the same as
+ * vci_CSGetValuesOfVirtualTupleColumnar().
+ */
+static inline bool *
+vci_CSGetIsNullOfVirtualTupleColumnar(vci_virtual_tuples_t *vTuples, uint16 columnId)
+{
+#ifdef CHECK_VTUPLE_GET_RANGE
+	Assert(vTuples->use_column_store);
+	Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns));
+#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */
+
+	return &(vTuples->column_info[columnId].isnull[vTuples->offset_of_first_tuple_of_vector]);
+}
+
+/**
+ * @brief Obtains the column ID in the VCI main relation from the serial number
+ * in a set of read columns listed in vci_CSFetchContext.
+ *
+ * @param[in] fetchContext The fetch context.
+ * @param[in] serialNumber The serial number in a set of read columns.
+ * @return the columnID in the VCI main relation.
+ */ +static inline int16 +vci_GetColumnIdFromFetchContext(vci_CSFetchContext fetchContext, + int16 serialNumber) +{ + int cId; + + Assert((0 <= serialNumber) && (serialNumber < fetchContext->num_columns)); + cId = fetchContext->column_link[serialNumber]; + Assert((0 <= cId) && (cId < fetchContext->query_context->num_columns)); + + return fetchContext->query_context->column_id[cId]; +} + +extern void vci_FillCridInVirtualTuples(vci_virtual_tuples_t *vTuples); +extern void + vci_FillFixedWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + int16 columnId, + RosChunkStorage *rosChunkStorage); +extern void + vci_FillVariableWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + int16 columnId, + RosChunkStorage *rosChunkStorage); +extern int16 *vci_GetNullableColumnIds(vci_virtual_tuples_t *vTuples); + +#endif /* VCI_FETCH_H */ diff --git a/contrib/vci/include/vci_freelist.h b/contrib/vci/include/vci_freelist.h new file mode 100644 index 0000000..8cdfec7 --- /dev/null +++ b/contrib/vci/include/vci_freelist.h @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * vci_freelist.h + * Definitions and declarations of Free space link list + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_freelist.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_FREELIST_H +#define VCI_FREELIST_H + +#include "postgres.h" + +#include "vci.h" +#include "vci_columns.h" +#include "vci_ros.h" + +#define VCI_FREESPACE_ITEM_ID FirstOffsetNumber + +typedef struct vcis_free_space +{ + uint32 size; + + vcis_extent_type_t type; + + BlockNumber prev_pos; + + BlockNumber next_pos; +} vcis_free_space_t; + +#define vci_hasFreeLinkNode(freespace) \ + (vcis_free_space == (freespace)->type) \ + || (vcis_tidcrid_type_pagetag == (freespace)->type) + +extern PGDLLEXPORT vcis_free_space_t 
*vci_GetFreeSpace(vci_RelationPair *relPair, BlockNumber blk); + +extern int32 vci_MakeFreeSpace(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber *newFSBlockNumber, + vcis_free_space_t *newFS, + bool coalesce); + +extern void vci_AppendFreeSpaceToLinkList(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber prevFreeBlockNumber, + BlockNumber nextFreeBlockNumber, + BlockNumber size); + +extern BlockNumber vci_FindFreeSpaceForExtent(vci_RelationPair *relPair, + BlockNumber requiredSize); + +extern void vci_RemoveFreeSpaceFromLinkList(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber numExtentPages); + +/* *************** */ +/* Recovery */ +/* *************** */ + +extern void vci_InitRecoveryRecordForFreeSpace(vci_MainRelHeaderInfo *info); + +extern void vci_WriteRecoveryRecordForFreeSpace(vci_RelationPair *relPair, + int16 colId, + int16 dictId, + BlockNumber StartBlockNumber, + vcis_free_space_t *FS); + +extern void vci_RecoveryFreeSpace(vci_MainRelHeaderInfo *info, vci_ros_command_t command); + +#endif /* VCI_FREELIST_H */ diff --git a/contrib/vci/include/vci_mem.h b/contrib/vci/include/vci_mem.h new file mode 100644 index 0000000..3f455d5 --- /dev/null +++ b/contrib/vci/include/vci_mem.h @@ -0,0 +1,177 @@ +/*------------------------------------------------------------------------- + * + * vci_mem.h + * Definitions of on-memmory structures + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_mem.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_MEM_H +#define VCI_MEM_H + +#include "postgres.h" + +#include + +#include "lib/ilist.h" +#include "portability/instr_time.h" +#include "storage/lwlock.h" +#include "utils/palloc.h" + +#include "vci.h" +#include "vci_ros.h" +#include "vci_memory_entry.h" + +/*------------------------------------------------------------------------- + * 
START: Copied from include/vci_port.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_PORT_H
+#define VCI_PORT_H
+
+/*
+ * key for vci_devload_t
+ */
+#define VCI_PSEUDO_UNMONITORED_DEVICE ""
+
+#ifndef WIN32
+#define VCI_PATH_MAX PATH_MAX
+#else
+#define VCI_PATH_MAX MAX_PATH
+#endif
+
+/*
+ * Memory entries on each device
+ *
+ * * head is the actual list, link is used to track unused entries
+ */
+typedef struct
+{
+	dlist_head head;
+	dlist_node link;
+} vci_memory_entry_list_t;
+
+/*
+ * IO statistics, mount information, etc. for each device
+ */
+typedef struct
+{
+	char devname[VCI_PATH_MAX];
+
+	vci_memory_entry_list_t *memory_entry_queue;
+
+	/*
+	 * Next position from which memory entries are traced. NULL means there
+	 * are no entries to be seen.
+	 */
+	dlist_node *memory_entry_pos;
+} vci_devload_t;
+
+#endif /* VCI_PORT_H */
+
+/*-------------------------------------------------------------------------
+ * END: Copied from include/vci_port.h
+ *-------------------------------------------------------------------------
+ */
+
+typedef struct VciGucStruct
+{
+	bool have_loaded_postgresql_conf;
+
+	bool enable;
+
+	bool log_query;
+
+	int cost_threshold;
+
+	int table_scan_policy;
+
+	/* GUC parameters read from postgresql.conf */
+	int maintenance_work_mem;
+	int max_devices; /* max device num for storage */
+
+	/* ROS control worker/daemon */
+	int control_max_workers;
+	int control_naptime;
+
+	/* command thresholds */
+	int wosros_conv_threshold;
+	int cdr_threshold;
+
+	/* for custom plan execution */
+	int max_local_ros_size;
+
+	/* for parallel processing */
+	int table_rows_threshold;
+
+	bool enable_seqscan;
+	bool enable_indexscan;
+	bool enable_bitmapheapscan;
+	bool enable_sort;
+	bool enable_hashagg;
+	bool enable_sortagg;
+	bool enable_plainagg;
+	bool enable_hashjoin;
+	bool enable_nestloop;
+
+	/* GUC parameters for internal use */
+	bool enable_ros_control_daemon;
+
+} VciGucStruct;
+
+extern PGDLLEXPORT VciGucStruct VciGuc;
+
+/*
+ * Data structure on shared memory
+ *
+ * The instance is allocated in shared memory and can be accessed via
+ * VciShmemAddr.
+ */
+typedef struct VciShmemStruct
+{
+	/* --- ROS Control Daemon --- */
+
+	/* Attributes passed to a worker */
+
+	vci_wosros_conv_worker_arg_t *worker_args_array;
+
+	/** vci_memory_entries_t is defined in vci_memory_entry.h
+	 * It keeps information of VCI indices kept in memory.
+	 * Its lifetime is the same as that of the PostgreSQL instance.
+	 */
+	vci_memory_entries_t *memory_entries;
+
+	dlist_head memory_entry_device_unknown_list;
+
+	/* Standby server controller */
+	LWLock *standby_exec_loc;
+	int num_standby_exec_queries;
+
+	/* IO statistics */
+
+	vci_devload_t *devload_array;
+
+	vci_memory_entry_list_t *memory_entry_queue_array;
+
+	dlist_head free_memory_entry_queue_list; /** list of memory_entry_queue_array */
+	int num_devload_info; /* number of monitored devices + 1 (for
+						   * unmonitored devices) */
+	int max_devices; /* max device num for storage */
+	int translated_dev_pos; /* index of a device VCIs on which is to
+							 * be translated */
+	LWLock *io_load_lock;
+
+	/* Additional LWLocks used by various modules */
+	LWLock *vci_memory_entries_lock;
+	LWLock *vci_query_context_lock;
+	LWLock *vci_mnt_point2dev_lock;
+} VciShmemStruct;
+
+extern PGDLLEXPORT VciShmemStruct *VciShmemAddr;
+
+#endif /* VCI_MEM_H */
diff --git a/contrib/vci/include/vci_memory_entry.h b/contrib/vci/include/vci_memory_entry.h
new file mode 100644
index 0000000..7aba17e
--- /dev/null
+++ b/contrib/vci/include/vci_memory_entry.h
@@ -0,0 +1,118 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_memory_entry.h
+ * Definitions and declarations of on-memory structures per VCI index
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/vci/include/vci_memory_entry.h
+ *
+
*-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_MEMORY_ENTRY_H
+#define VCI_MEMORY_ENTRY_H
+
+#include "lib/ilist.h"
+#include "storage/lwlock.h"
+
+#include "vci_ros.h"
+
+/**
+ * The key used when searching for a vci_memory_entry_t value in its set.
+ */
+typedef struct
+{
+	Oid oid; /* Oid of VCI main relation */
+	Oid dbid; /* Oid of the database the VCI main relation
+			   * belongs to */
+} vci_id_t;
+
+/**
+ * VCI index placeholder to determine the target of ROS command by ROS daemon
+ */
+typedef struct
+{
+	vci_id_t id; /* identifier of vci_memory_entry_t */
+	Oid tsid; /* Oid of the tablespace the VCI main relation
+			   * belongs to */
+
+	/**
+	 * If tsid is equal to InvalidOid, the Oid corresponding to the default
+	 * tablespace. Otherwise, this is equal to tsid.
+	 */
+	Oid real_tsid;
+
+	/**
+	 * Timestamp used for least recent update.
+	 * We do nothing for the wraparound effect, aka "wraparound failures" in
+	 * the PostgreSQL manual.
+	 */
+	int32 time_stamp;
+
+	/**
+	 * Flag to force the ROS control daemon to do WOS->ROS conversion
+	 * at the next WOS->ROS conversion stage regardless of the WOS size.
+	 *
+	 * This flag is set to true when a local WOS->ROS conversion fails
+	 * on account of an out-of-memory error. This flag is set to false when
+	 * WOS->ROS conversion is done.
+	 */
+	bool force_next_wosros_conv;
+
+	dlist_node link; /* link of VCI indexes on the same device */
+
+} vci_memory_entry_t;
+
+/**
+ * @brief Contains the array of vci_memory_entry_t,
+ * and a lock.
+ *
+ * The lock must be used when the array is exclusively accessed, say
+ * add / remove entries to / from the array, or so.
+ *
+ * The instance of vci_memory_entries_t and the array of entries must
+ * be allocated in shared memory living throughout the PostgreSQL instance.
+ */
+typedef struct
+{
+	/**
+	 * Lock to update member variables of vci_memory_entries_t.
+	 */
+	LWLock *lock;
+
+	/**
+	 * Number of vci_memory_entry_t elements allocated in data[].
+	 */
+	uint32 capacity_hash_entries;
+
+	/**
+	 * Current time stamp value, used for the least-recently-updated method.
+	 * Instances of vci_memory_entry_t have the timestamp of last access,
+	 * for which we do not handle the wraparound effect, aka "wraparound failures" in
+	 * the PostgreSQL manual.
+	 */
+	int32 time_stamp;
+
+	/**
+	 * The array of vci_memory_entry_t (declared with one element, variable length).
+	 */
+	vci_memory_entry_t data[1]; /* VARIABLE LENGTH ARRAY */
+
+} vci_memory_entries_t;
+
+extern Size vci_GetSizeOfMemoryEntries(void);
+extern void vci_InitMemoryEntries(void);
+
+extern void vci_TouchMemoryEntry(vci_id_t *vciid, Oid tsid);
+extern bool vci_GetWosRosConvertingVCI(vci_wosros_conv_worker_arg_t *vci_info);
+extern void vci_freeMemoryEntry(vci_id_t *vciid);
+
+extern void vci_update_memoryentry_in_devloadinfo(void);
+extern void vci_MoveTranslatedVCI2Tail(void);
+extern void vci_ResetDevloadCurrentPos(void);
+extern void vci_RemoveMemoryEntryOnDroppedDatabase(void);
+extern void vci_SetForceNextWosRosConvFlag(vci_id_t *vciid, bool value);
+
+#endif /* VCI_MEMORY_ENTRY_H */
diff --git a/contrib/vci/include/vci_ros.h b/contrib/vci/include/vci_ros.h
new file mode 100644
index 0000000..16f561b
--- /dev/null
+++ b/contrib/vci/include/vci_ros.h
@@ -0,0 +1,1085 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros.h
+ *	  Definitions and declarations of VCI main relation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		contrib/vci/include/vci_ros.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/****************************************************************************
+ * ** CAUTION: THE STRUCTURES DEFINED IN THIS HEADER FILE WITH THE PREFIX **
+ * ** OF "vcis_" AND vci_MainRelVar, vcis_Crid DEFINE THE FORMAT OF THE ROS **
+ * ** DATA.
ANY MODIFICATION ON THEM MAY CAUSE FORMAT INCOMPATIBILITY. ** + * ** PLEASE BE SURE TO CHANGE THE VALUE OF EITHER MACRO ** + * ** VCI_ROS_VERSION_MAJOR OR VCI_ROS_VERSION_MINOR, TO DETECT FORMAT ** + * ** INCOMPATIBILITY. ** + * ************************************************************************** + */ + +#ifndef VCI_ROS_H +#define VCI_ROS_H + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "c.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_class.h" +#include "nodes/execnodes.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/itemptr.h" +#include "storage/lock.h" +#include "storage/off.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +#include "vci.h" + +#include "vci_utils.h" + +#if (!defined(WIN32)) +#define UINT uint +#endif + +#define VCI_ROS_VERSION_MAJOR ((uint32) 0x00000000) +#define VCI_ROS_VERSION_MINOR ((uint32) 0x0000000D) + +/** + * @brief IDs of ROS commands. + */ +typedef enum vci_ros_command +{ + vci_rc_invalid = -11, /* Invalid case. */ + + /** For vacuum with vci_mrlm_read_write_exclusive. */ + vci_rc_vacuum = -10, + + /** For normal query with vci_mrlm_read_share. */ + vci_rc_query = -9, + + /** For DROP command with vci_mrlm_read_write_exclusive. */ + vci_rc_drop_index = -8, + + /** For DELETE or UPDATE commands with vci_mrlm_read_share. */ + vci_rc_wos_delete = -7, + + /** For INSERT or UPDATE commands with vci_mrlm_read_share. */ + vci_rc_wos_insert = -6, + + /** For recovering ROS with vci_mrlm_read_share, assumed that this command + * is used in vci_mrlm_write_exclusive lock of ROS commands. */ + vci_rc_recovery = -5, + + /** For collecting VCI information with vci_mrlm_read_share. + * This is also used by vci_KeepMainRelHeader() and + * vci_KeepMainRelHeaderWOVersionCheck() automatically. 
+ * */ + vci_rc_probe = -4, + + /** For building ROS in initial index building with + * vci_mrlm_read_write_exclusive. */ + vci_rc_wos_ros_conv_build = -3, + + /** For building local ROS with vci_mrlm_read_write_exclusive, to serialize + * ROS commands. + */ + vci_rc_generate_local_ros = -2, + + /** For COPY command with vci_mrlm_write_share. */ + vci_rc_copy_command = -1, + + /** For WOS -> ROS conversion with vci_mrlm_write_exclusive */ + vci_rc_wos_ros_conv = 0, + + /** For updating delete vector with vci_mrlm_write_exclusive */ + vci_rc_update_del_vec, + + /** For collecting deleted rows with vci_mrlm_write_exclusive */ + vci_rc_collect_deleted, + + /** For collecting deleted extents, unable to access anymore, + * with vci_mrlm_write_exclusive + */ + vci_rc_collect_extent, + + /** For updating TID -> CRID relations with vci_mrlm_write_exclusive */ + vci_rc_update_tid_crid, + + /** For compaction with vci_mrlm_write_exclusive */ + /* vci_rc_compaction, */ + + num_vci_rc, /* anchor */ +} vci_ros_command_t; + +/** + * @brief function to obtain the size of the varlena headers. + * + * @param[in] ptr Pointer to the varlena. + * @return Header size of given varlena. + */ +static inline int32 +vci_VARHDSZ_ANY(void *ptr) +{ + return VARATT_IS_1B_E(ptr) ? VARHDRSZ_EXTERNAL + : ((VARATT_IS_1B(ptr) ? 
VARHDRSZ_SHORT : VARHDRSZ)); +} + +/** taken from src/backend/utils/adt/tid.c */ +#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X)) +/** taken from src/backend/utils/adt/tid.c */ +#define ItemPointerGetDatum(X) PointerGetDatum(X) + +typedef uint32 vci_offset_in_extent_t; /* offset to data */ + +/** bit width of maximum number of row ID in an extent */ +#define VCI_CRID_ROW_ID_BIT_WIDTH (18) + +/** Calculate CRID in int64 format from extentID and rowID in extent */ +static inline int64 +vci_CalcCrid64(int32 extentId, uint32 rowIdInExtent) +{ + return ((int64) extentId << VCI_CRID_ROW_ID_BIT_WIDTH) | + (rowIdInExtent & ((UINT64CONST(1) << VCI_CRID_ROW_ID_BIT_WIDTH) - 1)); +} + +/** Calculate extentID from CRID in int64 format */ +static inline int32 +vci_CalcExtentIdFromCrid64(int64 crid64) +{ + return (int32) (crid64 >> VCI_CRID_ROW_ID_BIT_WIDTH); +} + +/** Calculate rowID in extent from CRID in int64 format */ +static inline uint32 +vci_CalcRowIdInExtentFromCrid64(int64 crid64) +{ + return (uint32) (crid64 & ((UINT64CONST(1) << VCI_CRID_ROW_ID_BIT_WIDTH) - 1)); +} + +/** Maximum number of rows in an extent. (256 * 1024) for 18 bits */ +#define VCI_NUM_ROWS_IN_EXTENT (1 << VCI_CRID_ROW_ID_BIT_WIDTH) + +#define VCI_MAX_NUMBER_UNCONVERTED_ROS (128) + +#define VCI_INVALID_CRID_IN_48_BIT (UINT64CONST(0xFFFF800000000000)) +#define VCI_INVALID_CRID VCI_INVALID_CRID_IN_48_BIT + +#define VCI_MOVED_CRID_IN_48_BIT (UINT64CONST(0xFFFFC00000000000)) +#define VCI_MOVED_CRID VCI_MOVED_CRID_IN_48_BIT + +/** Value indicating invalid extent. The value is 0xE0000000 */ +#define VCI_INVALID_EXTENT_ID \ + ((int32) (VCI_INVALID_CRID_IN_48_BIT >> VCI_CRID_ROW_ID_BIT_WIDTH)) + +/** ID of the first extent stored in the storage. */ +#define VCI_FIRST_NORMAL_EXTENT_ID (0) + +/** Value indicating invalid dictionary. The value is -1 */ +#define VCI_INVALID_DICTIONARY_ID (-1) + +/** The number of rows converted at once by WOS->ROS converter. 
+ * Offset is assigned every VCI_COMPACTION_UNIT_ROW rows. + */ +#define VCI_COMPACTION_UNIT_ROW (128) + +/** The ratio to keep usage of work area in safe level */ +#define VCI_WOS_ROS_WORKAREA_SAFE_RATIO (0.5) + +/** Base alignment in storage. + * In the storage, normally VCI uses four-byte integers. + * Thus, we align the data in the storage by four bytes. + */ +#define VCI_DATA_ALIGNMENT_IN_STORAGE (4) + +/** Aligned values, rounded up */ +#define vci_RoundUpValue(value, unit) \ + ((((value) + (unit) - 1) / (unit)) * (unit)) +/** Aligned values, rounded down */ +#define vci_RoundDownValue(value, unit) \ + (((value) / (unit)) * (unit)) + +/** Get byte size of data in an item when a page contains multiple items. + * @param[in] numItem Number of items in a page (evaluated more than once). + * @return The size of data in an item in byte. + */ +#define VCI_ITEM_SPACE(numItem) \ + ((((BLCKSZ - offsetof(PageHeaderData, pd_linp) \ + - ((numItem) * (sizeof(HeapTupleHeaderData) + sizeof(ItemIdData)))) \ + / (numItem)) / VCI_DATA_ALIGNMENT_IN_STORAGE) \ + * VCI_DATA_ALIGNMENT_IN_STORAGE) + +/** Get byte size of an item including item header, + * when a page contains multiple items. + * @param[in] numItem Number of items in a page. + * @return The size of an item in byte. + */ +#define VCI_ITEM_SIZE(numItem) \ + (VCI_ITEM_SPACE(numItem) + sizeof(HeapTupleHeaderData)) + +/** Minimum header space in DB page with one item, normally 52 bytes */ +#define VCI_MIN_PAGE_HEADER \ + (SizeOfPageHeaderData + sizeof(HeapTupleHeaderData) \ + + sizeof(ItemIdData)) + +/** Available area in DB page with one item, normally 8140 bytes */ +#define VCI_MAX_PAGE_SPACE (BLCKSZ - VCI_MIN_PAGE_HEADER) + +/** + * @brief Return ID of the target page and offset in the target page + * calculated from the position. + * + * The position and offsetInPage is measured in data area in DB pages. We do + * not care the header of DB page in this macro. + * + * @param[out] blockNumber Block number for the given position.
+ * @param[out] offsetInPage Offset in page in byte, ignoring page header, + * for the given position. + * @param[in] position Byte offset in area formed by multiple DB pages. + */ +static inline void +vci_GetBlockNumberAndOffsetInPage(BlockNumber *blockNumber, + uint32 *offsetInPage, + uint32 position) +{ + *blockNumber = position / VCI_MAX_PAGE_SPACE; + *offsetInPage = position - (*blockNumber * VCI_MAX_PAGE_SPACE); +} + +/** + * @brief Get number of pages to write given data size. + * + * @param[in] size The data size. + * @return Number of pages to write; MaxBlockNumber when size equals MaxBlockNumber. + */ +static inline uint32 +vci_GetNumBlocks(Size size) +{ + if (size == MaxBlockNumber) + return MaxBlockNumber; + + return (size + VCI_MAX_PAGE_SPACE - 1) / VCI_MAX_PAGE_SPACE; +} + +/** Maximum data size of maximum and minimum values in extents. */ +#define VCI_MAX_MIN_MAX_SIZE (16) + +/* Accessing VCI main relation header + * Because the header of VCI main relation spans several pages, we can not map + * one structure of C on the header pages simply. + * Instead, we use access functions. + * + * To do so, first prepare a variable to keep page info and call the + * initialize function, with the relation opened already. + * vci_InitMainRelHeaderInfo(info, rel, command) + * + * Then use one of these two functions. + * vci_KeepReadingMainRelHeader() + * Read header pages for reading, pin and lock them. + * vci_KeepWritingMainRelHeader() + * Read header pages for writing, pin and lock them. + * + * We have to repair all VCI relations, if some of them are broken. + * Just call the next for the purpose. + * vci_RecoverOneVCIIfNecessary() + * + * Then, use the following two functions, + * + * vci_SetMainRelVar() + * To set the value to the field. + * vci_GetMainRelVar() + * To get the value of the field. + * + * Or, if you access column_info, use + * vci_GetMColumn() + * which gives the pointer to the vcis_m_column_t on the DB buffer directly. + * + * The field is defined in enum vci_MainRelVar.
+ * + * + * To write the updated data, use the function + * vci_WriteMainRelVar() + * + * After accessing the header, release the DB pages with the following + * function. + * + * vci_ReleaseMainRelHeader() + * Release header pages, pins and locks. + */ + +/** + * @brief Field names and addresses of VCI main relation. + * + * These enum values have the page ID at upper 16 bits, and offset for the + * field at lower 16 bits. + * The offset is measured from the top of DB page, not after the page header. + * + * This is for struct vcis_main_t. + * Because the header of VCI main relation spans several pages, we can not map + * one structure of C on the header pages. + * + * Minimum header in DB page is 52 bytes (0x34) + */ +typedef enum vci_MainRelVar +{ + /* page 0 */ + vcimrv_data_wos_oid = 0x00000034, + vcimrv_whiteout_wos_oid = 0x00000038, + /* vcimrv_cdr_tid_crid_data_oid = 0x0000003C, //reserved */ + vcimrv_tid_crid_meta_oid = 0x00000040, + vcimrv_tid_crid_data_oid = 0x00000044, + vcimrv_tid_crid_update_oid_0 = 0x00000048, + vcimrv_tid_crid_update_oid_1 = 0x0000004C, + /* vcimrv_tid_crid_write_oid = 0x00000050, //reserved */ + vcimrv_delete_meta_oid = 0x00000054, + vcimrv_delete_data_oid = 0x00000058, + vcimrv_null_meta_oid = 0x0000005C, + vcimrv_null_data_oid = 0x00000060, + vcimrv_tid_meta_oid = 0x00000064, + vcimrv_tid_data_oid = 0x00000068, + vcimrv_ros_version_major = 0x0000006C, /** MUST BE 0x0000006C */ + vcimrv_ros_version_minor = 0x00000070, /** MUST BE 0x00000070 */ + vcimrv_num_nullable_columns = 0x00000074, + vcimrv_null_width_in_byte = 0x00000078, /** byte size of null bit vector for one row.
*/ + vcimrv_column_info_offset = 0x0000007C, + vcimrv_num_columns = 0x00000080, + vcimrv_extent_info_offset = 0x00000084, + /* page 0 to 2 */ + vcimrv_column_info = 0x00000088, + /* page 3 */ + vcimrv_size_mr = 0x00030034, /** @todo Maybe not needed */ + vcimrv_size_mr_old = 0x00030038, /** @todo Maybe not needed */ + vcimrv_current_ros_version = 0x0003003C, + vcimrv_last_ros_version = 0x00030040, + vcimrv_tid_crid_diff_sel = 0x00030044, + vcimrv_tid_crid_diff_sel_old = 0x00030048, + vcimrv_xid_generation = 0x0003004C, + vcimrv_xid_gen_update_xid = 0x00030050, + /* vcimrv_xgen_tid_crid_write = 0x00030054, //reserved */ + /* vcimrv_num_tid_crid_update_oid_0 = 0x00030058, //reserved */ + /* vcimrv_num_tid_crid_update_oid_1 = 0x0003005C, //reserved */ + vcimrv_ros_command = 0x00030060, + /* vcimrv_ros_conv_extent_id = 0x00030064, //reserved */ + /* vcimrv_ros_conv_common_dict_id = 0x00030068, //reserved */ + vcimrv_old_extent_id = 0x0003006C, + vcimrv_new_extent_id = 0x00030070, + vcimrv_working_column_id = 0x00030074, + vcimrv_working_dictionary_id = 0x00030078, + vcimrv_tid_crid_operation = 0x0003007C, + vcimrv_tid_crid_target_blocknumber = 0x00030080, + vcimrv_tid_crid_target_info = 0x00030084, + vcimrv_tid_crid_free_blocknumber = 0x00030088, + /* vcimrv_compaction_colmn_id = 0x0003007C, //reserved */ + /* vcimrv_compaction_extent_id = 0x00030080, //reserved */ + /* vcimrv_compaction_old_block_number = 0x00030084, //reserved */ + /* vcimrv_compaction_new_block_number = 0x00030088, //reserved */ + vcimrv_num_unterminated_copy_cmd = 0x0003008C, + vcimrv_tid_crid_tag_bitmap = 0x00030090, + /* vcimrv_num_request_cdr = 0x00030090, //reserved */ + /* vcimrv_num_appendable_extents = 0x00030094, //reserved */ + /* vcimrv_num_compaction = 0x00030098, //reserved */ + /* vcimrv_extent_id_to_write = 0x0003009C, //reserved */ + vcimrv_num_extents = 0x000300A0, + vcimrv_num_extents_old = 0x000300A4, + vcimrv_extent_info = 0x000300A8, + + /* error code */ + vcimrv_invalid
= 0xFFFFFFFF, +} vci_MainRelVar; + +/** mask to get the offset of fields in VCI main relation header in DB page */ +#define VCI_MRV_MASK_OFFSET (0xFFFF) +/** bits to shift to get DB page ID for fields in VCI main relation header */ +#define VCI_MRV_PAGE_SHIFT (16) + +/** + * @brief Get block number for given field of main relation header. + * + * @param[in] value value defined in vci_MainRelVar. + * @return Block number containing given field. + */ +#define vci_MRVGetBlockNumber(value) ((value) >> VCI_MRV_PAGE_SHIFT) + +/** + * @brief Get offset in DB page for given field of main relation header. + * + * @param[in] value value defined in vci_MainRelVar. + * @return Offset of the given field from the page top, including the page header. + */ +#define vci_MRVGetOffset(value) ((value) & VCI_MRV_MASK_OFFSET) + +/** Number of header pages of VCI main relation */ +#define VCI_NUM_MAIN_REL_HEADER_PAGES (4) + +/** Struct to keep pointers to the header pages of VCI main relation */ +typedef struct vci_MainRelHeaderInfo +{ + Relation rel; /* Relation of VCI main relation */ + + /* + * VCI main relation header pages should be initialized with InvalidBuffer + */ + Buffer buffer[VCI_NUM_MAIN_REL_HEADER_PAGES]; /* Buffers for the main + * relation header + * pages. */ + vci_ros_command_t command; /* Command using this structure. */ + + /** number of extents that have the area to store their vcis_m_extent_t + * in main relation. + * This field is used in query execution, otherwise it has "-1". + */ + int32 num_extents_allocated; + /** To create VCI on more than 32 columns, creating TupleDesc by copying table's + * one is required. However, it is too heavy to repeat. So cache the created + * one to cached_tupledesc in initctx context. + */ + MemoryContext initctx; + TupleDesc cached_tupledesc; +} vci_MainRelHeaderInfo; + +/** Minimum size of an extent + * The extents of fixed field length columns has the size. + * The extents of the other types have larger size.
+ * Use vci_GetExtentFixedLengthRawDataHeaderSize() or something to obtain + * the size actually. + */ +#define VCI_EXTENT_HEADER_SIZE (offsetof(vcis_extent_t, dict_body)) + +/** This function returns the size of header of extent for fixed field length + * data. The size can be calculated from the format and the number of rows + * in an extent. Actually, it is independent of the number of rows, but that + * of variable length depends. + * @param[in] numRowsInExtent The number of rows in the extent. + * @return The size of extent header. + */ +#define vci_GetExtentFixedLengthRawDataHeaderSize(numRowsInExtent) \ + VCI_EXTENT_HEADER_SIZE + +/** Function to calculate necessary number of offset data to the chunks + * of VCI_COMPACTION_UNIT_ROW in ROS. + * @param[in] numRowsInExtent Number of rows in the extent. + * @return Number of necessary offsets. + */ +#define vci_GetOffsetArrayLength(numRowsInExtent) \ + (1 + (((numRowsInExtent) + VCI_COMPACTION_UNIT_ROW - 1) \ + / VCI_COMPACTION_UNIT_ROW)) + +/** Function to calculate data size of necessary offset data to the chunks + * of VCI_COMPACTION_UNIT_ROW in ROS. + * @param[in] numRowsInExtent Number of rows in the extent. + * @return Necessary data size in bytes (fully parenthesized expression). + */ +#define vci_GetOffsetArraySize(numRowsInExtent) \ + (vci_GetOffsetArrayLength(numRowsInExtent) \ + * sizeof(vci_offset_in_extent_t)) + +/** This function returns the size of header of extent for variable field + * length data, and compressed data. + * The size can be calculated from the format and the number of rows + * in an extent. Actually, it is independent of the number of rows, but that + * of variable length depends. + * @param[in] numRowsInExtent The number of rows in the extent. + * @return The size of extent header.
+ */ +#define vci_GetExtentVariableLengthRawDataHeaderSize(numRowsInExtent) \ + (VCI_EXTENT_HEADER_SIZE + vci_GetOffsetArraySize(numRowsInExtent)) + +/** One entry of column_info in VCI main relation + */ +typedef struct vcis_m_column +{ + Oid meta_oid; /** OID of metadata relation */ + Oid data_oid; /** OID of data relation */ + + /* + * int16 max_columns_size; + */ + /** AttrNumber original_attribute_number; */ + int16 max_columns_size; + int16 comp_type; /** vcis_compression_type_t */ +} vcis_m_column_t; + +/** One entry of extent_info in VCI main relation + */ +typedef struct vcis_m_extent +{ + /** number of rows recorded, including those marked as deleted. */ + uint32 num_rows; + uint32 num_deleted_rows; /* number of rows marked as deleted. */ + uint32 num_deleted_rows_old; /* num_deleted_rows for recovery */ + TransactionId xgen; /* like xmin */ + TransactionId xdel; /* like xmax */ + + uint16 flags; + uint16 recovered_colid; +} vcis_m_extent_t; + +#define VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID (0x0001) + +/** + * @brief VCI main relation header area to store by vci_WriteMainRelVar(). + * + * vci_wmrv_all is used when the VCI relation is built, since first two or + * three pages are defined in building time, then not modified at all. + * The last page has ROS command, current ROS version, and extent information + * so will be updated after creation. vci_wmrv_update is used when the last + * page is updated. + */ +typedef enum vci_wmrv_t +{ + vci_wmrv_update, /** Only the last header page will be written to storage */ + vci_wmrv_all, /** All the header pages will be written to storage */ +} vci_wmrv_t; + +/** I categorized ROS data like TID, NULL bit vector, normal column data + * as shown below.
+ */ +typedef enum vcis_attribute_type_t +{ + vcis_attribute_type_main = 0, /* data only */ + vcis_attribute_type_data_wos, /* data only */ + vcis_attribute_type_whiteout_wos, /* data only */ + vcis_attribute_type_tid_crid, /* special type, meta and data */ + vcis_attribute_type_tid_crid_update, /* data only */ /* two elements */ + vcis_attribute_type_delete_vec, /* normal column type */ + vcis_attribute_type_null_vec, /* normal column type */ + vcis_attribute_type_tid, /* normal column type */ + vcis_attribute_type_pgsql, /* normal column type */ + /* number of indexed columns */ + num_vcis_attribute_type, +} vcis_attribute_type_t; + +/** + * @brief Gives how many columns or data belong to the given category. + * + * Some categories, defined in vcis_attribute_type_t, have multiple elements. + * For example, vcis_attribute_type_pgsql category contains all the columns + * given in CREATE INDEX command. This function gives how many columns or data + * belong to the given category. + * + * @param[in] attrType Attribute type defined in vcis_attribute_type_t. + * For normal columns, it takes vcis_attribute_type_pgsql. + * @param[in] numColumns The number of columns, which is returned when + * attrType is vcis_attribute_type_pgsql. + */ +static inline int +vci_GetNumIndexForAttributeType(vcis_attribute_type_t attrType, + int16 numColumns) +{ + return (vcis_attribute_type_pgsql == attrType) ? numColumns + : ((vcis_attribute_type_tid_crid_update == attrType) ? 2 + : ((0 <= attrType) && (attrType < num_vcis_attribute_type)) ?
1 + : 0); +} + +extern PGDLLEXPORT int vci_GetSumOfAttributeIndices(int16 numColumns); +extern PGDLLEXPORT void vci_GetAttrTypeAndIndexFromSumOfIndices( + vcis_attribute_type_t *attrType, + int *index, + int16 numColumns, + int sumOfIndex); + +typedef enum vcis_compression_type_t +{ + vcis_compression_type_invalid = -1, + vcis_compression_type_fixed_raw = 0, + vcis_compression_type_variable_raw, + vcis_compression_type_fixed_comp, /* reserved */ + vcis_compression_type_auto, /* reserved */ + num_vcis_compression_type, +} vcis_compression_type_t; + +typedef enum vcis_extent_type_t +{ + /** initial value is zero, since newly created DB page is filled with zero. + */ + vcis_undef_space = 0, + + vcis_extent_type_data, + vcis_extent_type_dict, + vcis_free_space, + + vcis_tidcrid_type_leaf, + vcis_tidcrid_type_trunk, + vcis_tidcrid_type_pagetag, + + num_vcis_extent_type, +} vcis_extent_type_t , +vcis_tidcrid_item_type_t; + +/** Type(s) of dictionary. + */ +typedef enum vcis_dict_type_t +{ + /** initial value is zero, since newly created DB page is filled with zero. + */ + vcis_dict_type_none = 0, + vcis_dict_type_lzvf, + num_vcis_dict_type, +} vcis_dict_type_t; + +/** Type(s) of operations in updating TID-CRID tree. 
+ */ +typedef enum +{ + vcis_tid_crid_op_none = 0, + vcis_tid_crid_op_trunk, + vcis_tid_crid_op_leaf_add, + vcis_tid_crid_op_leaf_remove, +} vcis_tid_crid_op_type_t; + +#define vci_GetBlockNumberFromUint64(tId) \ + ((tId) >> (BITS_PER_BYTE * sizeof(OffsetNumber))) +#define vci_GetOffsetFromUint64(tId) \ + ((tId) & ((1U << (BITS_PER_BYTE * sizeof(OffsetNumber))) - 1)) +#define vci_MakeUint64FromBlockNumberAndOffset(blockNumber, offset) \ + (((uint64) (blockNumber) << (BITS_PER_BYTE * sizeof(OffsetNumber))) | (offset)) + +/** Local delete list */ +typedef struct vci_local_delete_list +{ + uint32 num_entry; /* the number of CRID stored */ + uint32 length; /* capacity of crid_list */ + uint64 *crid_list; /* actual values taken from whiteout WOS */ +} vci_local_delete_list; + +struct vci_CSFetchContextData; + +/** Local ROS */ +typedef struct vci_local_ros +{ + vci_local_delete_list local_delete_list; + + /** Number of extents of local ROS. + * The minimum extent ID of the local ROS is (-num_local_extents). + */ + uint32 num_local_extents; + + /** Pointer of the array of pointers to extent data. + * When release the data, first pfree(extent[i]) where i is from zero + * to (num_local_extents - 1), then pfree(extent). + */ + struct vci_virtual_tuples **extent; + + /* Memory context to store local ROS data */ + MemoryContext memory_context; + + /* not localized one */ + /** this fetch_context is allocated in shared memory context created + * in vci_GenerateLocalRos(), and destructed in vci_DestroyLocalRos(). + * In the latter function, the fetch_context is freed automatically. 
+ */ + struct vci_CSFetchContextData *fetch_context; +} vci_local_ros_t; + +typedef struct vci_RelationPair +{ + vci_MainRelHeaderInfo *info; + + Relation meta; + Relation data; + + Buffer bufMeta; + Buffer bufData; +} vci_RelationPair; + +extern PGDLLEXPORT void vci_InitMainRelHeaderInfo(vci_MainRelHeaderInfo *info, + Relation rel, + vci_ros_command_t command); +extern void vci_KeepMainRelHeaderWithoutVersionCheck(vci_MainRelHeaderInfo *info); +extern PGDLLEXPORT void vci_KeepMainRelHeader(vci_MainRelHeaderInfo *info); +extern void vci_ChangeCommand(vci_MainRelHeaderInfo *info, vci_ros_command_t command); + +extern PGDLLEXPORT void vci_ReleaseMainRelHeader(vci_MainRelHeaderInfo *info); + +extern void vci_SetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId, + uint32 value); +extern PGDLLEXPORT uint32 vci_GetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId); +extern void vci_WriteMainRelVar(vci_MainRelHeaderInfo *info, + vci_wmrv_t writeArea); + +extern void vci_InitPageCore(Buffer buffer, int16 numItem, bool locked); +extern void vci_InitPage(Relation rel, BlockNumber blockNumber, int16 numItem); + +extern Buffer vci_ReadBufferWithPageInit(Relation reln, BlockNumber blockNumber); +extern Buffer vci_ReadBufferWithPageInitDelVec(Relation reln, BlockNumber blockNumber); + +/* + * In order to keep the heap tuple plane, set 'p' to attstorage in + * FormData_pg_attribute. 
+ */ + +extern PGDLLEXPORT vci_MainRelVar vci_GetMColumnPosition(int16 columnId); +extern PGDLLEXPORT vcis_m_column_t *vci_GetMColumn(vci_MainRelHeaderInfo *info, int16 columnId); +extern PGDLLEXPORT vcis_m_extent_t *vci_GetMExtent(Buffer *buffer, vci_MainRelHeaderInfo *info, int32 extentId); + +extern void vci_GetExtentInfoPosition(BlockNumber *blockNumber, + OffsetNumber *offset, + int32 extentId); +extern bool vci_ExtentInfoExists(vci_MainRelHeaderInfo *info, int32 extentId); +extern bool vci_ExtentIsVisible(vcis_m_extent_t *mExtent, TransactionId xid); +extern bool vci_ExtentIsCollectable(vcis_m_extent_t *mExtent, TransactionId wos2rosXid); +extern bool vci_ExtentIsFree(vcis_m_extent_t *extentInfo); + +extern uint32 vci_GetFreeExtentId(vci_MainRelHeaderInfo *info); +extern PGDLLEXPORT int16 vci_GetColumnWorstSize(Form_pg_attribute attr); + +/* ************************************** + * ** CAUTION: AttrNumber is 1 origin. ** + * ************************************** + */ +extern Size vci_GetColumnIdsAndSizes(AttrNumber *heapAttrNumList, + int16 *indxColumnIdList, + int16 *columnSizeList, + int numColumn, + vci_MainRelHeaderInfo *info, + Oid heapOid); +extern void vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict( + vci_MainRelHeaderInfo *info, + int32 extentId, + int32 dictionaryId, + TransactionId xid, + vci_ros_command_t command); + +static inline void +vci_WriteExtentInfoInMainRosForWriteExtent(vci_MainRelHeaderInfo *info, + int32 extentId, + TransactionId xid, + vci_ros_command_t command) +{ + vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict(info, extentId, + VCI_INVALID_DICTIONARY_ID, + xid, command); +} + +static inline void +vci_SetItemPointerFromTid64(ItemPointer item, uint64 tId) +{ + ItemPointerSet(item, + vci_GetBlockNumberFromUint64(tId), + vci_GetOffsetFromUint64(tId)); +} + +static inline uint64 +vci_GetTid64FromItemPointer(ItemPointer item) +{ + uint64 blockNumber; + + Assert(NULL != item); + blockNumber = 
BlockIdGetBlockNumber(&(item->ip_blkid)); + + return vci_MakeUint64FromBlockNumberAndOffset(blockNumber, item->ip_posid); +} + +/* ************************************** + * ** CAUTION: AttrNumber is 1 origin. ** + * ************************************** + */ +extern Buffer vci_WriteOnePageIfNecessaryAndGetBuffer(Relation relation, + BlockNumber blockNumber, + BlockNumber blockNumberOld, + Buffer buffer); +extern void vci_WriteExtentInfo(vci_MainRelHeaderInfo *info, + int32 extentId, + uint32 numRows, + uint32 numDeletedRows, + uint32 numDeletedRowsOld, + TransactionId xgen, + TransactionId xdel); + +/* + * ********************************************************* + * functions to recover ROS + * ********************************************************* + */ +extern void vci_RecoverOneVCIIfNecessary(vci_MainRelHeaderInfo *info); + +extern PGDLLEXPORT void + vci_PreparePagesIfNecessaryCore(Relation rel, + BlockNumber blockNumber, + uint16 numItems, + bool forceInit, + bool logItems); + +/** + * @brief This function checks if the relation has the DB page with the page ID + * blockNumber. + * + * When it does not exists, the function extends the relation and initialize + * extended pages with one item per page. + * + * @param[in] rel The relation. + * @param[in] blockNumber The block number to be examined. + * @param[in] numItems The number of items the page is initialized with. 
+ */ +static inline void +vci_FormatPageWithItems(Relation rel, BlockNumber blockNumber, int16 numItems) +{ + vci_PreparePagesIfNecessaryCore(rel, blockNumber, numItems, true, false); +} + +static inline void +vci_PreparePagesIfNecessary(Relation rel, BlockNumber blockNumber, uint16 numItems) +{ + vci_PreparePagesIfNecessaryCore(rel, blockNumber, numItems, false, false); +} + +extern PGDLLEXPORT void vci_WriteItem(Relation rel, + Buffer buffer, + OffsetNumber itemId); + +extern void + vci_UpdateOldFieldsInMetaHeader(Relation rel, TransactionId xId); +extern PGDLLEXPORT uint16 + vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId); +extern PGDLLEXPORT void + vci_GetPositionForFixedColumn(BlockNumber *blockNumber, + uint32 *offset, + vci_MainRelHeaderInfo *info, + int16 columnId, + int32 extentId, + uint32 rowIdInExtent, + bool atEnd); + +extern int vci_GetNumberOfNullableColumn(TupleDesc tupleDesc); +extern PGDLLEXPORT int16 vci_GetBitIdInNullBits(TupleDesc tupleDesc, int16 columnId); + +extern PGDLLEXPORT Snapshot vci_GetCurrentSnapshot(void); +extern void vci_FinalizeCopyCommand(void); + +struct vci_CSQueryContextData; +extern struct vci_local_ros *vci_GenerateLocalRos( + struct vci_CSQueryContextData *queryContext, + + /* maximum memory size to generate and keep local ROS */ + Size workareaSize, + + /* the number of rows from data WOS to local ROS */ + int64 numDataWosRows, + + /* the number of rows from whiteout WOS to local delete list */ + int64 numWhiteoutWosRows); + +static inline unsigned int +vci_GetNumRowsInLocalRosExtent(int numColumns) +{ + unsigned int numRowsInExtent = MaxAllocSize / Max( + + /* + * The size of area to store pointers to larger data or values of small + * fixed length directly, say each size is smaller than or equal to + * sizeof(Datum). We allocate one are for all columns to support both row + * wise and column wise access. 
+ */ + sizeof(Datum) * numColumns, + + /* + * The size of area to store with larger size than sizeof(Datum). The data + * in the area is pointed from pointers stored in above area, so we can + * allocate separately. + */ + MaxHeapTupleSize); + + return 1U << vci_GetHighestBit(Min(numRowsInExtent, VCI_NUM_ROWS_IN_EXTENT)); +} + +extern void vci_DestroyLocalRos(vci_local_ros_t *localRos); + +#define vci_WriteExtentInfoInMainRosForWosRosConvInit(info, extentId, xid) \ + vci_WriteExtentInfoInMainRosForWriteExtent((info), \ + (extentId), \ + (xid), \ + vci_rc_wos_ros_conv) + +#define vci_WriteExtentInfoInMainRosForCopyInit(info, extentId, xid) \ + vci_WriteExtentInfoInMainRosForWriteExtent((info), \ + (extentId), \ + (xid), \ + vci_rc_copy_command) + +/* + * + */ +static inline void +vci_PreparePagesWithOneItemIfNecessary(Relation relation, + BlockNumber blockNumber) +{ + vci_PreparePagesIfNecessary(relation, blockNumber, 1); +} + +/* this function set the dirty bit, and write all the items in the page + * to the WAL. 
+ * arguments + * Relation rel + * Buffer buffer + */ +static inline void +vci_WriteOneItemPage(Relation rel, + Buffer buffer) +{ + vci_WriteItem(rel, buffer, FirstOffsetNumber); +} + +/* Initialize a DB page with one item format + * arguments + * Relation relation + * BlockNumber blockNumber + */ +static inline void +vci_InitOneItemPage(Relation relation, BlockNumber blockNumber) +{ + vci_InitPage(relation, blockNumber, 1); +} + +static inline void +vci_FormatPageWithOneItem(Relation rel, BlockNumber blockNumber) +{ + vci_FormatPageWithItems(rel, blockNumber, 1); +} + +static inline uint32 +vci_VarSizeAny(char *ptr) +{ + if (!VARATT_IS_1B(ptr)) + { + varattrib_4b tmp; /* aligned scratch copy; only the 4-byte header is read */ + + MemCpy(&tmp, ptr, VARHDRSZ); + + return VARSIZE_4B(&tmp); + } + + return VARSIZE_ANY(ptr); +} + +static inline bool +vci_PassByRefForFixed(Form_pg_attribute attr) +{ +#ifndef USE_FLOAT8_BYVAL + if (8 == attr->attlen) + return true; +#endif /* #ifndef USE_FLOAT8_BYVAL */ + + return sizeof(Datum) < (unsigned long) attr->attlen; +} + +static inline void * +vci_repalloc(void *ptr, size_t size) +{ + return ptr ? repalloc(ptr, size) : palloc(size); +} + +static inline bool +vci_GetBit(uint8 *bitArray, int bitId) +{ + return (bitArray[bitId >> 3] >> (bitId & 7)) & 1; +} + +typedef struct vci_DictInfo +{ + /* + * Memory area to read dictionary. This is not used when creating new + * dictionaries. + */ + unsigned char *dictionary_storage; + + Size storage_size; /* byte size of dictionary_storage */ + + /* + * The extent ID for individual dictionary. VCI_INVALID_EXTENT_ID for + * common dictionaries.
+ */ + int32 extent_id; + + /* VCI_INVALID_DICTIONARY_ID for individual dictionary */ + int16 common_dict_id; + + vcis_dict_type_t dict_type; + +} vci_DictInfo; + +Buffer + vci_WriteDataIntoMultiplePages(Relation rel, + BlockNumber *blockNumber, + BlockNumber *blockNumberOld, + uint32 *offsetInPage, + Buffer buffer, + const void *data_, + Size size); + +typedef struct vci_meta_item_scanner +{ + bool inited; + + Relation rel; + int index; + + BlockNumber end_block; /* inclusive */ + BlockNumber start_block; + + Buffer buffer; + BlockNumber current_block; + + int max_item; + int max_item_in_page; + int item_size; + + int buf_lockmode; + +} vci_meta_item_scanner_t; + +typedef struct +{ + Oid oid; /* Oid of VCI main relation */ + Oid dbid; /* Oid of database to which a VCI main + * relation belongs */ + bool force_next_wosros_conv; /* flag to force WOS->ROS conversion + * on next time */ +} vci_wosros_conv_worker_arg_t; + +extern vcis_m_extent_t *vci_GetMExtentNext(vci_MainRelHeaderInfo *info, vci_meta_item_scanner_t *scan); +extern vci_meta_item_scanner_t *vci_BeginMetaItemScan(Relation rel, int buf_lock); +extern void vci_EndMetaItemScan(vci_meta_item_scanner_t *scan); + +/* recovery functions for command */ +extern void vci_UpdateLastRosVersionAndOthers(vci_MainRelHeaderInfo *info); +extern void vci_RecoveryDone(vci_MainRelHeaderInfo *info); +extern void vci_WriteRecoveryRecordDone(vci_MainRelHeaderInfo *info, vci_ros_command_t command, TransactionId xid); + +extern void vci_WriteRecoveryRecordForExtentInfo(vci_MainRelHeaderInfo *info, + int32 newExtentId, int32 oldExtentId); +extern void vci_RecoveryExtentInfo(vci_MainRelHeaderInfo *info, vci_ros_command_t command); + +extern void vci_WriteRecoveryRecordForUpdateDelVec(vci_MainRelHeaderInfo *info); +extern void vci_RecoveryUpdateDelVec(vci_MainRelHeaderInfo *info); +extern const char *vci_GetRosCommandName(vci_ros_command_t command); + +/* ---------------- + * vci_index.c + * ---------------- + */ + +extern bool 
vci_isVciAdditionalRelation(Relation rel);
+extern bool vci_isVciAdditionalRelationTuple(Oid reloid, Form_pg_class reltuple);
+
+/* ----------------
+ *	 vci_internal_view.c
+ * ----------------
+ */
+
+extern void vci_check_prohibited_operation(Node *parseTree, bool *creating_vci_extension);
+
+#endif							/* VCI_ROS_H */
diff --git a/contrib/vci/include/vci_ros_command.h b/contrib/vci/include/vci_ros_command.h
new file mode 100644
index 0000000..8c2cb5c
--- /dev/null
+++ b/contrib/vci/include/vci_ros_command.h
@@ -0,0 +1,214 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_command.h
+ *	  Definitions and declarations of ROS control commands
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		  contrib/vci/include/vci_ros_command.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_ROS_COMMAND_H
+#define VCI_ROS_COMMAND_H
+
+#include "postgres.h"
+#include "c.h"
+#include "utils/tuplesort.h"
+#include "access/genam.h"
+
+#include "vci_ros.h"
+#include "vci_chunk.h"
+
+typedef struct
+{
+	ItemPointerData *orig_tids;
+
+	ItemPointerData *wos_tids;
+
+	int			max;
+
+	int			num;
+
+	int			offset;
+} vci_tid_array_t;
+
+typedef struct
+{
+	BlockNumber *orig_blknos;
+
+	int			max;
+
+	int			num;
+
+} vci_blk_array_t;
+
+/**
+ * @brief Context for ROS commands, containing TID list read from data WOS or
+ * whiteout WOS, data read from the PostgreSQL heap relation or from ROS,
+ * related attribute numbers, OIDs, number of rows, and so on.
+ */
+typedef struct vci_RosCommandContext
+{
+	vci_ros_command_t command;	/* command using this context */
+
+	RosChunkBuffer buffer;		/* data are stored primarily here */
+	RosChunkStorage storage;	/* data are compacted and copied here */
+	vci_MainRelHeaderInfo info; /* VCI main relation header */
+
+	/** numRowsToConvert is somewhat tricky.
+	 * set VCI_NUM_ROWS_IN_EXTENT in index building phase.
+	 * set number of rows (up to VCI_NUM_ROWS_IN_EXTENT) to convert after
+	 * building.
+	 */
+	int			numRowsToConvert;
+
+	int			numRowsAtOnce;	/* maximum number of rows in a chunk */
+	Relation	heapRel;		/* the original relation indexed by VCI */
+	Oid			heapOid;		/* OID of the original relation indexed by VCI */
+	Oid			indexOid;		/* OID of the VCI index relation */
+
+	int			numColumns;		/* number of columns in VCI index */
+
+	/** the extent ID being processed. negative IDs for local ROSes */
+	int32		extentId;
+	int32		extentIdSrc;	/* source extentId in copy operation (wos2ros,
+								 * cdr) */
+
+	struct vci_local_ros *local_ros;	/* local ROS */
+
+	/** list of worst case column size */
+	int16	   *columnSizeList;
+
+	/** attribute number (1-origin) in the original relation */
+	AttrNumber *heapAttrNumList;
+
+	/** index ID (0-origin) in the VCI relation */
+	int16	   *indxColumnIdList;
+
+	/** transaction ID using this context */
+	TransactionId xid;
+
+	TransactionId oldestXmin;
+
+	TransactionId wos2rosXid;
+
+	TransactionId inclusiveXid;
+
+	TransactionId exclusiveXid;
+
+	vci_tid_array_t wos2ros_array;
+
+	vci_tid_array_t delvec_array;
+
+	vci_blk_array_t utility_array;
+
+	/**
+	 * TID on "WOS Relation" list to convert in Item Pointer format (NOTE(review): no field follows this comment; possibly a leftover -- confirm)
+	 */
+
+	bool		done;			/* true if all records are read */
+
+	/**
+	 * Number of rows in the relation estimated by analyze or vacuum command.
+	 * This is used to build ROS in CREATE INDEX command.
+	 */
+	double		estimatedNumRows;
+
+	/**
+	 * Number of converted rows.
+	 * This is used to build ROS in CREATE INDEX command.
+	 */
+	uint64		numConvertedRows;
+
+	/**
+	 * The name of index relation built.
+	 * This is used to build ROS in CREATE INDEX command.
+	 */
+	char		relName[NAMEDATALEN];
+
+	/**
+	 * scan context.
+	 * This is used only in initial building to scan the original relation
+	 * sequentially.
+	 */
+	HeapScanDesc scan;
+
+	TupleDesc	tid_tid_tupdesc;
+
+	TupleTableSlot *tid_tid_slot;
+
+	/**
+	 * a sorted TID list to be converted into ROS extents
+	 */
+	Tuplesortstate *wos2ros_tid_list;
+	int64		num_wos2ros_tids;
+
+	/**
+	 * a sorted TID list to be converted into a delete vector
+	 */
+	Tuplesortstate *delvec_tid_list;
+	int64		num_delvec_tids;
+
+	Tuplesortstate *data_wos_del_list;
+
+	Tuplesortstate *whiteout_wos_del_list;
+
+} vci_RosCommandContext;
+
+typedef struct
+{
+	int32		num_fit_extents;
+	int32		best_extent_id;
+} vci_target_extent_info_t;
+
+/*
+ * *********************************************************
+ * Command context operations
+ * *********************************************************
+ */
+extern void vci_InitRosCommandContext0(vci_RosCommandContext *context,
+									   Relation rel, vci_ros_command_t command);
+extern void vci_InitRosCommandContext1(vci_RosCommandContext *comContext,
+									   Size workareaSize,
+									   int numInsertRows,
+									   int numDeleteRows,
+									   bool readOriginalData);
+extern void vci_InitRosCommandContext2(vci_RosCommandContext *comContext, Size workareaSize);
+
+extern void vci_InitRosChunkStroageAndBuffer(vci_RosCommandContext *comContext, bool forAppending);	/* NOTE(review): "Stroage" is misspelled in the identifier */
+
+extern void vci_CleanRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite);
+extern void vci_FinRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite);
+
+extern void vci_ReleaseMainRelInCommandContext(vci_RosCommandContext *comContext);
+extern void vci_CloseHeapRelInCommandContext(vci_RosCommandContext *comContext);
+
+/*
+ * *********************************************************
+ * Functions for ROS commands
+ * *********************************************************
+ */
+extern PGDLLEXPORT int vci_ConvertWos2Ros(Relation mainRel, Size workareaSize, int numRows);
+extern double vci_ConvertWos2RosForBuild(Relation mainRel, Size workarea, IndexInfo *indexInfo);
+extern PGDLLEXPORT int vci_UpdateDelVec(Relation mainRel, Size workareaSize,
int numRows);
+extern PGDLLEXPORT int vci_CollectDeletedRows(Relation mainRel, Size workareaSize, int32 extentId);
+extern PGDLLEXPORT int vci_UpdateTidCrid(Relation mainRel, Size workareaSize, int numPages);
+extern PGDLLEXPORT int vci_CollectUnusedExtent(Relation mainRel, Size workareaSize);
+
+extern void vci_VacuumRos(Relation mainRel, IndexVacuumInfo *vacuumInfo);
+
+/*
+ * *********************************************************
+ * Probing functions to decide whether to execute the command
+ * *********************************************************
+ */
+extern PGDLLEXPORT uint32 vci_CountFreezedInDataWos(Relation mainRel, Size workarea);
+extern PGDLLEXPORT uint32 vci_CountFreezedInWhiteoutWos(Relation mainRel, Size workarea);
+extern PGDLLEXPORT vci_target_extent_info_t vci_CountDeletedRowsInROS(Relation mainRel, uint32 threshold);
+extern vci_target_extent_info_t vci_CountUnusedExtents(Relation mainRel);
+extern int32 vci_CountTidCridUpdateListLength(Relation mainRel, Size workarea);
+
+#endif							/* #ifndef VCI_ROS_COMMAND_H */
diff --git a/contrib/vci/include/vci_ros_daemon.h b/contrib/vci/include/vci_ros_daemon.h
new file mode 100644
index 0000000..8def778
--- /dev/null
+++ b/contrib/vci/include/vci_ros_daemon.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_daemon.h
+ *	  Definitions and declarations of ROS Control Daemon and Worker
+ *
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		  contrib/vci/include/vci_ros_daemon.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VCI_ROS_DAEMON_H
+#define VCI_ROS_DAEMON_H
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "postmaster/bgworker.h"
+#include "utils/relcache.h"
+
+#include "vci_ros.h"
+
+/**
+ * The threshold of tid->crid update list item counts to execute tid->crid update
+ */
+#define VCI_UPDATE_TIDCRID_THRESHOLD	(1024)
+
+/**
+ *
The threshold of Whiteout WOS rows to update Delete Vector
+ */
+#define VCI_UPDATE_DELVEC_THRESHOLD		(256 * 1024)
+
+/**
+ * @see src/backend/postmaster/bgworker.c (NOTE(review): this looks like a copy of the struct that is private to that file; confirm the two layouts are kept in sync)
+ */
+struct BackgroundWorkerHandle
+{
+	int			slot;
+	uint64		generation;
+};
+
+typedef struct vci_workerslot
+{
+	pid_t		pid;
+
+	BackgroundWorkerHandle handle;
+
+	Oid			dbid;
+	Oid			oid;
+} vci_workerslot_t;
+
+/* ************************* */
+/*   daemon functions        */
+/* ************************* */
+
+extern void vci_ROS_control_daemon_setup(void);
+PGDLLEXPORT void vci_ROS_control_daemon_main(Datum main_arg);
+
+extern PGDLLEXPORT vci_workerslot_t vci_LaunchROSControlWorker(vci_wosros_conv_worker_arg_t *vciinfo, int slot_id);
+PGDLLEXPORT void vci_ROS_control_worker_main(Datum main_arg);
+
+extern BackgroundWorkerHandle vci_LaunchROSControlMaintainer(int mode);
+extern void vci_ROS_control_maintainer_main(Datum main_arg);
+
+extern void vci_InitDbPriorityList(void);
+
+#endif							/* VCI_ROS_DAEMON_H */
diff --git a/contrib/vci/include/vci_tidcrid.h b/contrib/vci/include/vci_tidcrid.h
new file mode 100644
index 0000000..6728a60
--- /dev/null
+++ b/contrib/vci/include/vci_tidcrid.h
@@ -0,0 +1,344 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_tidcrid.h
+ *	  Definitions and Declarations of TIDCRID update list and
+ *	  TIDCRID Tree relation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		  contrib/vci/include/vci_tidcrid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_TIDCRID_H
+#define VCI_TIDCRID_H
+
+#include "postgres.h"
+
+#include "utils/tuplesort.h"
+
+#include "vci.h"
+#include "vci_ros.h"
+#include "vci_chunk.h"
+
+/** header page ID of TID->CRID update (differential) list */
+#define VCI_TID_CRID_UPDATE_HEADER_PAGE_ID	 (0)
+
+/** first body page ID of TID->CRID update (differential) list */
+#define VCI_TID_CRID_UPDATE_BODY_PAGE_ID	 (1)
+
+/** First page of
tidcrid tree meta relation */
+#define VCI_TID_CRID_META_FIRST_PAGE_ID		 (0)
+
+/** First page of tidcrid tree data relation */
+#define VCI_TID_CRID_DATA_FIRST_PAGE_ID		 (0)
+
+/** Number of items per page for tidcrid tree relation */
+#define VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE	 (18)
+
+/** Offset number of page tag */
+#define VCI_TID_CRID_PAGETAG_ITEM_ID	(VCI_FREESPACE_ITEM_ID)
+
+/** Capacity of tidcrid leaf node, expressed as a number of bits (log2 of the slot count) */
+#define VCI_TID_CRID_LEAF_CAPACITY_BITS	 (6)
+
+/** Capacity of tidcrid leaf node, as a number of slots (1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) */
+#define VCI_TID_CRID_LEAF_CAPACITY		 (1 << VCI_TID_CRID_LEAF_CAPACITY_BITS)
+
+/** Capacity of tidcrid trunk node, expressed as a number of bits (log2 of the slot count) */
+#define VCI_TID_CRID_TRUNK_CAPACITY_BITS (6)
+
+/** Capacity of tidcrid trunk node, as a number of slots (1 << VCI_TID_CRID_TRUNK_CAPACITY_BITS) */
+#define VCI_TID_CRID_TRUNK_CAPACITY		 (1 << VCI_TID_CRID_TRUNK_CAPACITY_BITS)
+
+/** Index of trunk node */
+#define VCI_TID_CRID_TRUNKNODE			 (-1)
+
+/** The number of items in DB page of TID-CRID Update List, normally 678 */
+#define VCI_TID_CRID_UPDATE_PAGE_ITEMS	(VCI_MAX_PAGE_SPACE / sizeof(vcis_tidcrid_pair_item_t))
+
+/** Available area in DB page of TID-CRID Update List, normally 8136 bytes */
+#define VCI_TID_CRID_UPDATE_PAGE_SPACE	(VCI_TID_CRID_UPDATE_PAGE_ITEMS * sizeof(vcis_tidcrid_pair_item_t))
+
+#define VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES	 (1353)
+
+/*
+ * On-disk data structure for CRID
+ *
+ * vci_GetUint64FromCrid() can be used to convert to uint64
+ *
+ * Some values of v2 are reserved: they mark a special CRID.
+ */
+typedef struct vcis_Crid
+{
+	uint16		v0;
+	uint16		v1;
+	uint16		v2;
+}
+#ifdef __arm__
+			__attribute__((packed))
+#endif
+vcis_Crid;
+
+/*
+ * Convert vcis_Crid (on-disk form) to uint64 (in-memory form)
+ */
+static inline uint64
+vci_GetUint64FromCrid(vcis_Crid crid)
+{
+	/* v2 values 0x8000 and 0xc000 mark special CRIDs, not numbers */
+	if (crid.v2 == 0x8000)
+		return VCI_INVALID_CRID;
+	if (crid.v2 == 0xc000)
+		return VCI_MOVED_CRID;
+
+	return ((uint64) crid.v2 << 32) | ((uint64) crid.v1 << 16) | crid.v0;	/* v0 holds the low 16 bits */
+}
+
+/*
+ * Convert uint64 (in-memory form) to vcis_Crid (on-disk form); only the low 48 bits are kept
+ */
+static inline vcis_Crid
+vci_GetCridFromUint64(uint64 crid_uint64)
+{
+	vcis_Crid	crid;
+
+	crid.v0 = crid_uint64 & ((uint64) 0xFFFF);
+	crid.v1 = (crid_uint64 >> 16) & ((uint64) 0xFFFF);
+	crid.v2 = (crid_uint64 >> 32) & ((uint64) 0xFFFF);
+
+	return crid;
+}
+
+/*
+ * TID-CRID tree relation
+ *
+ * The relation for the TID-CRID tree holds 18 tuples per page. In more detail,
+ * each tuple can use only 424 bytes.
+ *
+ * Each node of the tree has 64 slots, and each slot has 6 bytes, so 384 bytes
+ * are used to represent the tree. The remaining part is used for maintenance.
+ * Also, the initial tuple of each page is used for maintaining the page.
+ */
+
+/*
+ * Entries of flexible array in vcis_tidcrid_meta
+ */
+typedef struct vcis_tidcrid_meta_item
+{
+	BlockNumber block_number;	/* block number in TID-CRID tree relation */
+	BlockNumber block_number_old;	/* previous block_number, used for
+									 * recovery purpose */
+	int16		item_id;		/* item id on TID-CRID tree relation */
+	int16		item_id_old;	/* previous item_id, used for recovery purpose */
+} vcis_tidcrid_meta_item_t;
+
+/*
+ * Meta relation for TID-CRID tree
+ *
+ * XXX: Several attributes are not used but retained, to be consistent with
+ * Column Meta Relation.
+ */
+typedef struct vcis_tidcrid_meta
+{
+	vcis_attribute_type_t vcis_attr_type;	/* Attribute type */
+
+	Oid			pgsql_atttypid; /* taken from FormData_pg_attribute.atttypid */
+	int16		pgsql_attnum;	/* taken from FormData_pg_attribute.attnum */
+	int16		pgsql_attlen;	/* taken from FormData_pg_attribute.attlen */
+	int32		pgsql_atttypmod;	/* taken from
+									 * FormData_pg_attribute.atttypmod */
+	uint32		num_extents;	/* number of extents (for debug) */
+	uint32		num_extents_old;	/* previous number of extents (for
+									 * recovery) */
+
+	BlockNumber free_page_begin_id; /* page ID of the first free area */
+	BlockNumber free_page_begin_id_old; /* previous free_page_begin_id (for
+										 * recovery) */
+
+	BlockNumber free_page_end_id;	/* page ID of the last free area */
+	BlockNumber free_page_end_id_old;	/* previous free_page_end_id (for
+										 * recovery) */
+
+	/**
+	 * The DB page ID of the free area that is located in front of the extent
+	 * added or deleted by the ROS command. (for recovery)
+	 * This is used to recover free area list.
+	 */
+	BlockNumber free_page_prev_id;
+
+	/**
+	 * Same as free_page_prev_id, but just behind the added or deleted extent.
+	 */
+	BlockNumber free_page_next_id;
+
+	/**
+	 * The freespace size of the extent added or deleted by the ROS command (for recovery)
+	 */
+	uint32		free_page_old_size;
+
+	/**
+	 * The freespace position, as a BlockNumber, of the extent added or
+	 * deleted by the ROS command (for recovery)
+	 */
+	BlockNumber new_data_head;
+	BlockNumber new_freespace_head; /* @todo unused field */
+
+	BlockNumber num_free_pages; /* number of free DB pages in the listed free
+								 * area */
+	BlockNumber num_free_pages_old; /* for recovery */
+	BlockNumber num_free_page_blocks;	/* number of free areas, not number of
+										 * free DB pages */
+	BlockNumber num_free_page_blocks_old;	/* for recovery */
+
+	/*--- Above must be same as column Meta ---*/
+
+	BlockNumber num;			/* number of stored items */
+	BlockNumber num_old;		/* previous num, used for recovery purpose */
+	BlockNumber free_block_number;	/* number of free blocks */
+	int32		offset;			/* Offset from the head */
+	vcis_tidcrid_meta_item_t body[1];	/* Variable-length array of
+										 * vcis_tidcrid_meta_item_t (declared [1]) */
+} vcis_tidcrid_meta_t;
+
+/*
+ * Metadata at the initial tuple
+ */
+typedef struct vcis_tidcrid_pagetag
+{
+	uint32		size;
+	vcis_extent_type_t type;
+	BlockNumber prev_pos;
+	BlockNumber next_pos;
+
+	uint32		num;
+	uint32		free_size;
+	uint32		bitmap;
+	char		rsv[4];
+} vcis_tidcrid_pagetag_t;
+
+/*
+ * Leaf in the TID-CRID tree
+ */
+typedef struct vcis_tidcrid_leaf
+{
+	uint32		size;
+	vcis_tidcrid_item_type_t type;
+
+	uint64		bitmap;
+	uint64		unused;
+
+	/* Sum of above must be less than 40 bytes */
+
+	vcis_Crid	crid[VCI_TID_CRID_LEAF_CAPACITY];	/* CRIDs related to TIDs, one per slot */
+} vcis_tidcrid_leaf_t;
+
+/*
+ * Intermediate (trunk) node in TID-CRID tree
+ */
+typedef struct vcis_tidcrid_trunk
+{
+	uint32		size;
+	vcis_tidcrid_item_type_t type;
+
+	uint64		bitmap;
+	uint64		unused;
+
+	/* Sum of above must be less than 40 bytes */
+
+	ItemPointerData leaf_item[VCI_TID_CRID_TRUNK_CAPACITY]; /* Pointers to the leaves, one per slot */
+} vcis_tidcrid_trunk_t;
+
+/*
+ * TID-CRID
pair used for TIDCRID update list
+ */
+typedef struct vcis_tidcrid_pair_item
+{
+	ItemPointerData page_item_id;	/* TID on the original relation */
+	vcis_Crid	crid;			/* CRID */
+} vcis_tidcrid_pair_item_t;
+
+/*
+ * TID-CRID Update List
+ */
+typedef struct vcis_tidcrid_pair_list
+{
+	uint64		num;			/* Number of items in the list */
+
+	uint16		blocks_per_samp;	/* Number of blocks each entry in
+									 * sample_tids[] handles */
+	uint16		num_samples;	/* Number of entries in sample_tids[] */
+
+	/*
+	 * TID samples from update list. Sampling condition:
+	 *
+	 * 1. Initial entries in each blocks_per_samp blocks 2. Final entry
+	 */
+	ItemPointerData sample_tids[VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES + 1];
+
+	vcis_tidcrid_pair_item_t body[1];	/* Variable-length array of
+										 * vcis_tidcrid_pair_item_t (declared [1]) */
+} vcis_tidcrid_pair_list_t;
+
+typedef struct vci_TidCridUpdateListContext
+{
+	vci_MainRelHeaderInfo *info;	/* Parent VCI main relation */
+
+	Relation	rel;
+
+	/* Number of vcis_tidcrid_pair_item_t entries in the rel */
+	uint64		count;
+
+	/* Number of blocks of the rel */
+	BlockNumber nblocks;
+
+	/* Header of the TID-CRID Update List (embedded struct, not a pointer) */
+	vcis_tidcrid_pair_list_t header;
+
+} vci_TidCridUpdateListContext;
+
+typedef vci_RelationPair vci_TidCridRelations;
+
+/* initialization functions */
+extern void vci_InitializeTidCridUpdateLists(vci_MainRelHeaderInfo *info);
+extern void vci_InitializeTidCridTree(vci_MainRelHeaderInfo *info);
+
+/* TIDCRID Update List access functions */
+
+extern PGDLLEXPORT vci_TidCridUpdateListContext *vci_OpenTidCridUpdateList(vci_MainRelHeaderInfo *info, int sel);
+extern PGDLLEXPORT void vci_CloseTidCridUpdateList(vci_TidCridUpdateListContext *context);
+
+extern PGDLLEXPORT void vci_ReadOneBlockFromTidCridUpdateList(vci_TidCridUpdateListContext *context, BlockNumber blkno, vcis_tidcrid_pair_item_t *array);
+
+extern int32 vci_GetTidCridUpdateListLength(vci_MainRelHeaderInfo *info, int sel);
+extern void
vci_MergeAndWriteTidCridUpdateList(vci_MainRelHeaderInfo *info, int newSel, int oldSel, Tuplesortstate *newList, vcis_Crid crid); + +/* TIDCRID Tree access functions */ +extern void vci_OpenTidCridRelations(vci_TidCridRelations *rel, + vci_MainRelHeaderInfo *info, + LOCKMODE lockmode); +extern void vci_CloseTidCridRelations(vci_TidCridRelations *rel, LOCKMODE lockmode); + +extern void vci_GetTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig, + ItemPointer retPtr); +extern void vci_CreateTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig, + ItemPointer retPtr); +extern void vci_UpdateTidCridSubTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr, + vcis_tidcrid_pair_list_t *newItems); + +/* TID->CRID Conversion */ +extern PGDLLEXPORT uint64 vci_GetCridFromTid(vci_TidCridUpdateListContext *context, ItemPointer tId, bool *fromTree); + +/* Recovery functions */ + +extern void vci_RecoveryFreeSpaceForTidCrid(vci_MainRelHeaderInfo *info); +extern void vci_RecoveryTidCrid(vci_MainRelHeaderInfo *info); +extern void vci_InitRecoveryRecordForTidCrid(vci_MainRelHeaderInfo *info); + +extern void vci_AddTidCridUpdateList(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int32 extentId); + +#endif /* VCI_TIDCRID_H */ diff --git a/contrib/vci/include/vci_wos.h b/contrib/vci/include/vci_wos.h new file mode 100644 index 0000000..7bc302b --- /dev/null +++ b/contrib/vci/include/vci_wos.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * vci_wos.h + * Declarations of WOS functions + * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_wos.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_WOS_H +#define VCI_WOS_H + +#include "postgres.h" + +#include "storage/itemptr.h" +#include "lib/rbtree.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +extern Snapshot 
vci_GetSnapshotForWos2Ros(void);
+extern Snapshot vci_GetSnapshotForLocalRos(TransactionId inclusive_xid, TransactionId exclusive_xid);
+
+extern PGDLLEXPORT uint64 vci_EstimateNumEntriesInHeapRelation(Oid oid);
+
+#endif							/* VCI_WOS_H */
diff --git a/contrib/vci/include/vci_xact.h b/contrib/vci/include/vci_xact.h
new file mode 100644
index 0000000..67fe1e4
--- /dev/null
+++ b/contrib/vci/include/vci_xact.h
@@ -0,0 +1,39 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_xact.h
+ *	  Transaction control
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		  contrib/vci/include/vci_xact.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef VCI_XACT_H
+#define VCI_XACT_H
+
+#include "access/xact.h"
+
+struct vci_MainRelHeaderInfo;
+
+/*
+ * States of transactions
+ */
+enum vci_xact_status_kind
+{
+	VCI_XACT_INVALID,			/* invalid transaction ID */
+	VCI_XACT_SELF,				/* my transaction */
+	VCI_XACT_IN_PROGRESS,		/* in-progress transaction (not mine) */
+	VCI_XACT_DID_COMMIT,		/* committed transaction */
+	VCI_XACT_DID_ABORT,			/* aborted transaction */
+	VCI_XACT_DID_CRASH			/* a crash happened during the transaction */
+};
+
+extern enum vci_xact_status_kind vci_transaction_get_type(TransactionId xid);
+
+extern int64 vci_GenerateXid64(TransactionId target_xid, struct vci_MainRelHeaderInfo *info);
+extern void vci_UpdateXidGeneration(struct vci_MainRelHeaderInfo *info);
+
+#endif							/* VCI_XACT_H */
diff --git a/contrib/vci/storage/Makefile b/contrib/vci/storage/Makefile
new file mode 100644
index 0000000..2ea8365
--- /dev/null
+++ b/contrib/vci/storage/Makefile
@@ -0,0 +1,34 @@
+# contrib/vci/storage/Makefile
+
+SUBOBJS = \
+	vci_ros.o \
+	vci_ros_command.o \
+	vci_ros_daemon.o
+#	vci_chunk.o
+#	vci_columns.o
+#	vci_columns_data.o
+#	vci_fetch.o
+#	vci_freelist.o
+#	vci_index.o
+#	vci_internal_view.o
+#	vci_low_utils.o
+#	vci_memory_entry.o
+#	vci_tidcrid.o
+#	vci_wos.o
+#	vci_xact.o
+
+EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
+
+PG_CPPFLAGS = -I $(top_srcdir)/contrib/vci/include
+
+ifdef USE_PGXS
+PGXS := $(shell pg_config --pgxs)
+include $(PGXS)
+else
+subdir = contrib/vci/storage
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+override CFLAGS += $(CFLAGS_SL)
diff --git a/contrib/vci/storage/meson.build b/contrib/vci/storage/meson.build
new file mode 100644
index 0000000..fefe15b
--- /dev/null
+++ b/contrib/vci/storage/meson.build
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+vci_storage_sources = files(
+#  'vci_chunk.c',
+#  'vci_columns.c',
+#  'vci_columns_data.c',
+#  'vci_fetch.c',
+#  'vci_freelist.c',
+#  'vci_index.c',
+#  'vci_internal_view.c',
+#  'vci_low_utils.c',
+#  'vci_memory_entry.c',
+  'vci_ros.c',
+  'vci_ros_command.c',
+  'vci_ros_daemon.c',
+#  'vci_tidcrid.c',
+#  'vci_wos.c',
+#  'vci_xact.c',
+)
diff --git a/contrib/vci/storage/vci_ros.c b/contrib/vci/storage/vci_ros.c
new file mode 100644
index 0000000..869491c
--- /dev/null
+++ b/contrib/vci/storage/vci_ros.c
@@ -0,0 +1,1674 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros.c
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		  contrib/vci/storage/vci_ros.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include
+
+#include "access/heapam_xlog.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "catalog/pg_type.h"
+#include "mb/pg_wchar.h"		/* for MAX_MULTIBYTE_CHAR_LEN */
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/procarray.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/varbit.h"
+
+#include
"vci.h" +#include "vci_columns.h" +#include "vci_freelist.h" +#include "vci_ros.h" +#include "vci_mem.h" +#include "vci_wos.h" + +/* + * This file has four parts. + * 1. Accessing VCI main relation header + * 2. Relation and buffer control + * 3. Attributes (columns) + * 4. VCI "columns" + */ + +/* + * ********************************************************* + * Accessing VCI main relation header + * ********************************************************* + */ +/* Accessing VCI main relation header + * Because the header of VCI main relation has three pages, we can not map + * one structure of C on the header pages simply. + * Instead, we use access functions. + * + * In order to, first use one of these two * functions, + * + * vci_KeepReadingMainRelHeader() + * Read header pages for reading, and pin them. + * vci_KeepWritingMainRelHeader() + * Read header pages for writing, and pin them. + * + * Then, use the following two functions, + * + * vci_SetMainRelVar() + * To set the value to the field. + * vci_GetMainRelVar() + * To get the value of the field. + * + * The field is defined in enum enum vci_MainRelVar. + * The format is, page ID is in upper 16 bits, and offset from + * the page top is in lower 16 bits. + * + * To write the header pages out to storage, use the next function. + * + * vci_WriteMainRelVar() + * + * After accessing the header, release the DB pages with the following + * function. + * + * vci_ReleaseMainRelHeader() + * Release header pages. + * + * Other helper functions. + * + * vci_GetMColumnPosition() + * Gives the position of vcis_m_column_t. + * + * vci_GetMColumn() + * Gives vcis_m_column_t. + * + * vci_GetExtentInfoPosition() + * Get the position of vcis_m_extent_t structure for the target + * extentId. + * + * FIXME Lock check function necessary? + * Memo: I think the functions to check the lock status of the VCI main + * relation may be convenient, in order to determine if it is possible to + * start a ROS command. 
It will be used to avoid conflict between building + * local ROS, the vacuum operation, and other ROS commands. For other ROS + * commands, we do not need to use such functions, just try to lock and + * wait. Vacuum, too. For local ROS conversion, we have to determine if + * other ROS command is running when we evaluate the cost of plans. + */ + +/** + * @brief Initialize the structure info to access the header of VCI main + * relation. + * + * This function "just" initializes the given object. + * To access the information in the header, keep the DB pages in buffer + * using vci_KeepMainRelHeader(). + * The accessors are vci_GetMainRelVar() and vci_SetMainRelVar(). + * After modifying the information, call vci_WriteMainRelVar() to write + * the page back to the storage. + * Finally to release the buffer, call vci_ReleaseMainRelHeader(). + * + * @param[out] info Pointer to the target vci_MainRelHeaderInfo, + * which will be initialized + * @param[in] rel VCI main relation. + * @param[in] command ROS command which uses this structure. 
+ */ +void +vci_InitMainRelHeaderInfo(vci_MainRelHeaderInfo *info, + Relation rel, + vci_ros_command_t command) +{ + int aId; + + Assert(NULL != info); + info->rel = rel; + for (aId = 0; aId < lengthof(info->buffer); ++aId) + info->buffer[aId] = InvalidBuffer; + info->command = command; + info->num_extents_allocated = -1; + info->initctx = CurrentMemoryContext; + info->cached_tupledesc = NULL; +} + +static void +KeepMainRelHeader(vci_MainRelHeaderInfo *info) +{ + int blockNum; + + Assert(NULL != info); + Assert(NULL != info->rel); + for (blockNum = 0; blockNum < lengthof(info->buffer); ++blockNum) + info->buffer[blockNum] = vci_ReadBufferWithPageInit(info->rel, blockNum); +} + +static void +CheckRosVersion(vci_MainRelHeaderInfo *info) +{ + uint32 major = vci_GetMainRelVar(info, vcimrv_ros_version_major, 0); + uint32 minor = vci_GetMainRelVar(info, vcimrv_ros_version_minor, 0); + + if ((major == 0) && (minor == 0)) + ereport(ERROR, (errmsg("ROS has not been formatted yet."), + errhint("This might happen when CREATE INDEX fails. " + "\"DROP INDEX %s;\" and CREATE INDEX again may help.", + RelationGetRelationName(info->rel)))); + + if ((VCI_ROS_VERSION_MAJOR != major) || (VCI_ROS_VERSION_MINOR != minor)) + ereport(ERROR, (errmsg("incompatible VCI version: expected (%d, %d), stored (%d, %d).", VCI_ROS_VERSION_MAJOR, VCI_ROS_VERSION_MINOR, major, minor), + errhint("This can happen when accessing old database with newer VCI modules. 
DROP and CREATE INDEX may help."))); +} + +static int32 +GetNumberOfExtentsFromSizeOfMainRelation(Relation rel) +{ + const int headerBlockNumber = vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT; + const int maxExtentInfoInFirstPage = (BLCKSZ - + (vcimrv_extent_info & VCI_MRV_MASK_OFFSET)) / + sizeof(vcis_m_extent_t); + const int maxExtentInfoInPage = VCI_MAX_PAGE_SPACE / + sizeof(vcis_m_extent_t); + int numBlocks = RelationGetNumberOfBlocks(rel); + + if (numBlocks <= headerBlockNumber) + return -1; + + return ((numBlocks - (headerBlockNumber + 1)) * maxExtentInfoInPage) + + maxExtentInfoInFirstPage; +} + +static void +UpdateNumberOfExtentsInMainRelHeader(vci_MainRelHeaderInfo *info) +{ + if (vci_rc_query == info->command) + info->num_extents_allocated = GetNumberOfExtentsFromSizeOfMainRelation( + info->rel); + else + info->num_extents_allocated = -1; +} + +/** + * @brief Keep DB pages of VCI header in buffer. + * + * This function acquire one read lock with AccessShareLock. + * This is called only by vci_inner_build(). + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + */ +void +vci_KeepMainRelHeaderWithoutVersionCheck(vci_MainRelHeaderInfo *info) +{ + Assert(info); + Assert(RelationIsValid(info->rel)); + elog(DEBUG3, "open VCI \"%s\" ignoring ROS version", + RelationGetRelationName(info->rel)); + KeepMainRelHeader(info); +} + +/** + * @brief Change command ID stored in vci_MainRelHeaderInfo. + * + * @param[in] info pointer to the target vci_MainRelHeaderInfo. + * @param[in] command new command ID. + */ +void +vci_ChangeCommand(vci_MainRelHeaderInfo *info, vci_ros_command_t command) +{ + Assert(info); + info->command = command; + UpdateNumberOfExtentsInMainRelHeader(info); +} + +/** + * @brief Keep DB pages of VCI header in buffer after checking the ROS version. + * + * This function acquire one read lock with AccessShareLock. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. 
+ */
+void
+vci_KeepMainRelHeader(vci_MainRelHeaderInfo *info)
+{
+	Assert(info);
+	Assert(RelationIsValid(info->rel));
+	elog(DEBUG3, "open VCI \"%s\"",
+		 RelationGetRelationName(info->rel));
+	KeepMainRelHeader(info);
+	CheckRosVersion(info);		/* raises ERROR on an unformatted or mismatching ROS version */
+	UpdateNumberOfExtentsInMainRelHeader(info);
+}
+
+/**
+ * @brief Write header pages of VCI main relation.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] writeArea Give vci_wmrv_update to write only the last header
+ * page (the pages for recovery), or vci_wmrv_all for all pages.  The latter
+ * should only be used in building the index.
+ */
+void
+vci_WriteMainRelVar(vci_MainRelHeaderInfo *info,
+					vci_wmrv_t writeArea)
+{
+	int			blockNum;
+	int			start = 0;
+
+	Assert(NULL != info);
+	Assert(NULL != info->rel);
+
+	elog(DEBUG3, "flush header pages of VCI \"%s\" main relation",
+		 RelationGetRelationName(info->rel));
+
+	switch (writeArea)
+	{
+		case vci_wmrv_update:
+			start = lengthof(info->buffer) - 1;	/* last header page only */
+			break;
+		case vci_wmrv_all:
+			start = 0;			/* every header page */
+			break;
+		default:
+			ereport(ERROR, (errmsg("internal error.  unsupported parameter."), errhint("Disable VCI by 'SELECT vci_disable();'")));
+	}
+
+	for (blockNum = start; blockNum < lengthof(info->buffer); ++blockNum)
+	{
+		LockBuffer(info->buffer[blockNum], BUFFER_LOCK_EXCLUSIVE);	/* held only around this page's write */
+		MarkBufferDirty(info->buffer[blockNum]);
+		vci_WriteOneItemPage(info->rel, info->buffer[blockNum]);
+		LockBuffer(info->buffer[blockNum], BUFFER_LOCK_UNLOCK);
+	}
+}
+
+/**
+ * @brief Release buffer for the VCI header.
+ *
+ * This function releases one read lock with AccessShareLock.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ */ +void +vci_ReleaseMainRelHeader(vci_MainRelHeaderInfo *info) +{ + int blockNum; + + Assert(NULL != info); + Assert(NULL != info->rel); + + elog(DEBUG3, "release VCI \"%s\"", + RelationGetRelationName(info->rel)); + for (blockNum = 0; blockNum < lengthof(info->buffer); ++blockNum) + { + ReleaseBuffer(info->buffer[blockNum]); + info->buffer[blockNum] = InvalidBuffer; + } + /* the info no longer refers to the relation or to a cached tuple descriptor */ + info->rel = NULL; + info->cached_tupledesc = NULL; +} + +/** + * @brief Set values in the header part of VCI main relation. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] var "virtual address" of the variable, defined in + * enum vci_MainRelVar. + * @param[in] elemId Give 0 normally. + * When the target variable has multiple elements, say an array, + * the element ID should be given. + * @param[in] value The value to write. + * + * @note This function only stores the value in the page held in + * info->buffer[]; it does not mark the buffer dirty or write it out itself. + */ +void +vci_SetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId, + uint32 value) +{ + Page page; + unsigned int blockNumber = vci_MRVGetBlockNumber(var); + unsigned int offset = vci_MRVGetOffset(var); + + Assert(blockNumber < lengthof(info->buffer)); + Assert(offset < BLCKSZ); + + page = BufferGetPage(info->buffer[blockNumber]); + /* treat the bytes at offset as a uint32 array and store element elemId */ + ((uint32 *) &(((char *) page)[offset]))[elemId] = value; +} + +/** + * @brief Get values in the header part of VCI main relation. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] var "virtual address" of the variable, defined in + * enum vci_MainRelVar. + * @param[in] elemId Give 0 normally. + * When the target variable has multiple elements, say an array, + * the element ID should be given. + * @return The value read from the header page.
+ */ +uint32 +vci_GetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId) +{ + Page page; + unsigned int blockNumber = vci_MRVGetBlockNumber(var); + unsigned int offset = vci_MRVGetOffset(var); + + Assert(blockNumber < lengthof(info->buffer)); + Assert(offset < BLCKSZ); + page = BufferGetPage(info->buffer[blockNumber]); + + /* treat the bytes at offset as a uint32 array and read element elemId */ + return ((uint32 *) &(((char *) page)[offset]))[elemId]; +} + +/** + * @brief Get the position of column information in the VCI main relation. + * + * @param[in] columnId The column ID in the VCI index. + * @return The "virtual address" of the column information: the block number + * shifted by VCI_MRV_PAGE_SHIFT plus the offset in the page, which includes + * the DB page header part. + */ +vci_MainRelVar +vci_GetMColumnPosition(int16 columnId) +{ + const int firstBlockNumber = vci_MRVGetBlockNumber(vcimrv_column_info); + const int numInFirstPage = (BLCKSZ - vci_MRVGetOffset(vcimrv_column_info)) / + sizeof(vcis_m_column_t); + const int numInPage = VCI_MAX_PAGE_SPACE / sizeof(vcis_m_column_t); + int blockNumber; + + Assert(VCI_FIRST_NORMALCOLUMN_ID <= columnId); + /* the column entry lies in the first column-info page */ + if (columnId < numInFirstPage) + { + return (firstBlockNumber << VCI_MRV_PAGE_SHIFT) + + vci_MRVGetOffset(vcimrv_column_info) + + (columnId * sizeof(vcis_m_column_t)); + } + + /* otherwise locate the following page and the index inside that page */ + columnId -= numInFirstPage; + blockNumber = columnId / numInPage; + columnId -= blockNumber * numInPage; + blockNumber += 1 + firstBlockNumber; + Assert(blockNumber < (VCI_NUM_MAIN_REL_HEADER_PAGES - 1)); + + return (blockNumber << VCI_MRV_PAGE_SHIFT) + + VCI_MIN_PAGE_HEADER + + (columnId * sizeof(vcis_m_column_t)); +} + +/** + * @brief Get the column information in the VCI main relation. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] columnId The column ID in the VCI index. + * @return The pointer to the column information in the header page of + * VCI main relation.
+ * + * @note + * The returned pointer refers to a page held in info->buffer[]; this function + * does not read or pin any additional buffer. + * NOTE(review): an earlier version of this note asked callers to call + * ReleaseBuffer(buffer) after accessing vcis_m_column_t, which does not match + * the code below - confirm against the callers. + */ +vcis_m_column_t * +vci_GetMColumn(vci_MainRelHeaderInfo *info, int16 columnId) +{ + Page page; + vci_MainRelVar mrv = vci_GetMColumnPosition(columnId); + + page = BufferGetPage(info->buffer[vci_MRVGetBlockNumber(mrv)]); + + return (vcis_m_column_t *) &(((char *) page)[vci_MRVGetOffset(mrv)]); +} + +/** + * @brief Obtain the position of vcis_m_extent_t structure for + * the target extentId. + * + * vcis_m_extent_t is the information of extents in VCI main relation. + * + * @param[out] blockNumber The number of the block that contains the + * information is written in *blockNumber. + * @param[out] offset The offset in that block, page header included, is + * written in *offset. + * @param[in] extentId The target extent ID. + */ +void +vci_GetExtentInfoPosition(BlockNumber *blockNumber, + OffsetNumber *offset, + int32 extentId) +{ + const int maxExtentInfoInFirstPage = (BLCKSZ - + (vcimrv_extent_info & VCI_MRV_MASK_OFFSET)) / + sizeof(vcis_m_extent_t); + const int maxExtentInfoInPage = VCI_MAX_PAGE_SPACE / + sizeof(vcis_m_extent_t); + + Assert(blockNumber); + Assert(offset); + + /* the entry lies in the first extent-info page */ + if (extentId < maxExtentInfoInFirstPage) + { + *blockNumber = vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT; + *offset = (vcimrv_extent_info & VCI_MRV_MASK_OFFSET) + + (extentId * sizeof(vcis_m_extent_t)); + } + else + { + /* index of the entry counted from the start of the second page */ + int32 extentIdRem = extentId - maxExtentInfoInFirstPage; + + *blockNumber = extentIdRem / maxExtentInfoInPage; + extentIdRem -= *blockNumber * maxExtentInfoInPage; + *blockNumber += 1 + (vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT); + *offset = VCI_MIN_PAGE_HEADER + + (extentIdRem * sizeof(vcis_m_extent_t)); + } +} + +/* + * Call vci_WriteItem() for each of the first numItems items of the page in + * buffer. + */ +static void +WriteAllItemsInPage(Relation rel, + Buffer buffer, + uint16 numItems) +{ + uint16 iId; + + for (iId = 0; iId < numItems; ++iId) + vci_WriteItem(rel, buffer, iId + FirstOffsetNumber); +} + +/* + * ********************************************************* + * Relation and
buffer control + * ********************************************************* + */ +/* + * vci_PreparePagesWithOneItemIfNecessary() + * This function checks if the relation has the DB page pointed + * by an argument. If it does not exist, the function extends + * the relation and initializes the extended pages with one item per + * page. Mind that this function does not touch existing pages. + * If you need to format existing pages, use vci_InitPage(). + * + * vci_InitPage() + * Low level function. + * + * This function formats the existing DB page, pointed by + * relation and page ID (block number), with empty items. + * The number of items is also passed by an argument. + * + * vci_PreparePagesWithOneItemIfNecessary() is more convenient. + * For pages with one item, the macro vci_InitOneItemPage() is + * defined. + * + * vci_WriteItem() + * Mark the buffer dirty, and write out WAL from the pointed + * item in the buffer. + * + * vci_WriteOnePageIfNecessaryAndGetBuffer() + * A utility function. + * This function takes new page ID and old page ID in the + * arguments. If they are different, write out the old page, + * which is assumed to be loaded in the given buffer, and read + * the new page. + * If the page IDs are the same, do nothing. + * + */ + +/** + * @brief This function checks if the relation has the DB page with the page ID + * blockNumber. + * + * When it does not exist, the function extends the relation and initializes + * the extended pages with numItems items per page. + * + * @param[in] rel The relation. + * @param[in] blockNumber The block number to be examined. + * @param[in] numItems The number of items the page is initialized with. + * @param[in] forceInit If true, the block is initialized anyway. + * @param[in] logItems If true, write all items in the pages into WAL.
+ */ +void +vci_PreparePagesIfNecessaryCore(Relation rel, + BlockNumber blockNumber, + uint16 numItems, + bool forceInit, + bool logItems) +{ + BlockNumber existingPages = RelationGetNumberOfBlocks(rel); + + Assert(0 < numItems); + + if (!BlockNumberIsValid(blockNumber)) + ereport(ERROR, (errmsg("data relation full"), errhint("Normally relations of VCI index are smaller than the table relation, therefore this error must not happen. Disable VCI by 'SELECT vci_disable();'"))); + + if (existingPages <= blockNumber) + { + BlockNumber pId; + + /* + * The block does not exist yet: extend the relation one block at a + * time up to blockNumber and format every new page. The new buffer + * is passed to vci_InitPageCore() as already locked. + */ + for (pId = existingPages; pId <= blockNumber; ++pId) + { + Buffer buffer = ReadBufferExtended(rel, MAIN_FORKNUM, + P_NEW, RBM_ZERO_AND_LOCK, NULL); + + vci_InitPageCore(buffer, numItems, true); + if (logItems) + WriteAllItemsInPage(rel, buffer, numItems); + UnlockReleaseBuffer(buffer); + } + } + else + { + /* the block exists: format it only when it is new or forceInit is set */ + Buffer buffer = ReadBuffer(rel, blockNumber); + Page page = BufferGetPage(buffer); + bool needUnlock = false; + + if (PageIsNew(page) || forceInit) + { + /* + * NOTE(review): vci_InitPageCore() takes and drops the exclusive + * lock by itself here, and the lock is taken again below before + * the items are written - confirm the unlocked window is fine. + */ + vci_InitPageCore(buffer, numItems, false); + + if (logItems) + { + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + WriteAllItemsInPage(rel, buffer, numItems); + needUnlock = true; + } + } + if (needUnlock) + UnlockReleaseBuffer(buffer); + else + ReleaseBuffer(buffer); + } +} + +/** + * @brief This function formats the page in the buffer with a given number of + * empty items. + * + * @param[in] buffer Postgres DB buffer to be initialized. + * @param[in] numItems The number of items the page is initialized with. + * @param[in] locked true if the buffer is locked, false otherwise.
+ */ +void +vci_InitPageCore(Buffer buffer, int16 numItems, bool locked) +{ + /* take the exclusive lock here only when the caller does not hold it */ + if (!locked) + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + { + uint32 size; + uint32 itemSize; + int32 aId; + Page page = BufferGetPage(buffer); + PageHeader pageHeader = (PageHeader) page; + + PageInit(page, BLCKSZ, 0); + /* reserve numItems line pointers */ + pageHeader->pd_lower += sizeof(ItemIdData) * numItems; + /* split the remaining space evenly, rounded down to the storage alignment */ + size = pageHeader->pd_upper - pageHeader->pd_lower; + itemSize = vci_RoundDownValue(size / numItems, + VCI_DATA_ALIGNMENT_IN_STORAGE); + /* lay the items out from the last one down to the first one */ + for (aId = numItems; aId--;) + { + HeapTupleHeader hTup; + + pageHeader->pd_upper -= itemSize; + pageHeader->pd_linp[aId].lp_off = pageHeader->pd_upper; + pageHeader->pd_linp[aId].lp_len = itemSize; + pageHeader->pd_linp[aId].lp_flags = LP_NORMAL; + hTup = (HeapTupleHeader) PageGetItem(page, &(pageHeader->pd_linp[aId])); + hTup->t_infomask2 = 0; + hTup->t_infomask = HEAP_XMIN_FROZEN | HEAP_XMAX_INVALID; + hTup->t_hoff = vci_RoundUpValue(offsetof(HeapTupleHeaderData, t_bits), + VCI_DATA_ALIGNMENT_IN_STORAGE); + } + MarkBufferDirty(buffer); + Assert(pageHeader->pd_lower <= pageHeader->pd_upper); + } + + if (!locked) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); +} + +/** + * @brief This function reads the existing DB page blockNumber and formats it + * with numItems empty items by vci_InitPageCore(). + * + * NOTE(review): an earlier description said the page gets only one item of + * 8140 bytes with data type bytea; the code takes the item count from + * numItems - confirm which is intended. + * + * @param[in] rel The relation. + * @param[in] blockNumber The block number to be initialized. + * @param[in] numItems The number of items the page is initialized with. + */ +/* + * dead code + * LCOV_EXCL_START + */ +void +vci_InitPage(Relation rel, BlockNumber blockNumber, int16 numItems) +{ + Buffer buffer; + + Assert(BlockNumberIsValid(blockNumber)); + buffer = ReadBuffer(rel, blockNumber); + vci_InitPageCore(buffer, numItems, false); + ReleaseBuffer(buffer); +} + +/* LCOV_EXCL_STOP */ + +/** + * @brief This function marks the buffer dirty, and makes WAL from the item + * in the buffer.
+ * + * We assume that the relation is only modified by ROS command exclusively. + * So, we do not put strict lock here. + * + * @param[in] rel The relation. + * @param[in] buffer PostgreSQL DB buffer having the page data. + * @param[in] offsetNumber The offset number of the item in the page to write. + */ +void +vci_WriteItem(Relation rel, + Buffer buffer, + OffsetNumber offsetNumber) +{ + Page page = BufferGetPage(buffer); + ItemId tup = PageGetItemId(page, offsetNumber); + HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, tup); + + Assert(BufferIsValid(buffer)); + Assert(OffsetNumberIsValid(offsetNumber)); + + MarkBufferDirty(buffer); + + /* a WAL record is built only for relations that need WAL */ + if (RelationNeedsWAL(rel)) + { + xl_heap_inplace xlrec; + XLogRecPtr recptr; + uint8 info = 0; + uint32 newlen; + + xlrec.offnum = offsetNumber; + xlrec.dbId = MyDatabaseId; + xlrec.tsId = MyDatabaseTableSpace; + xlrec.relcacheInitFileInval = false; + xlrec.nmsgs = 0; + + /* + * originally taken from heap_inplace_update() in + * src/backend/access/heap/heapam.c + * + * NOTE(review): MarkBufferDirty() above and the XLogBeginInsert() / + * XLogRegister*() calls below run before START_CRIT_SECTION() - + * confirm this ordering against the WAL coding rules. + */ + XLogBeginInsert(); + XLogRegisterData(&xlrec, MinSizeOfHeapInplace); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + /* the item data after the tuple header is registered as buffer data */ + newlen = VCI_ITEM_SPACE(PageGetMaxOffsetNumber(page)); + XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen); + + START_CRIT_SECTION(); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE | info); + + PageSetLSN(page, recptr); + + END_CRIT_SECTION(); + } +} + +/** + * @brief This function first compares blockNumber and blockNumberOld. + * + * If they differ, write out the buffer in the DB page of + * blockNumberOld, and read the DB page of blockNumber. + * If they are the same, do nothing. + * + * @param[in] relation The relation. + * @param[in] blockNumber New page ID. + * @param[in] blockNumberOld Old page ID. The data is in buffer. + * @param[in] buffer The buffer contains the old page. + * @return buffer contains new page, exclusively locked.
+ */ +Buffer +vci_WriteOnePageIfNecessaryAndGetBuffer(Relation relation, + BlockNumber blockNumber, + BlockNumber blockNumberOld, + Buffer buffer) +{ + /* same page: the given buffer is returned as it is */ + if (blockNumber == blockNumberOld) + return buffer; + /* write out and release the old page, if there was one */ + if (BlockNumberIsValid(blockNumberOld)) + { + vci_WriteOneItemPage(relation, buffer); + UnlockReleaseBuffer(buffer); + } + buffer = vci_ReadBufferWithPageInit(relation, blockNumber); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + return buffer; +} + +/* + * ********************************************************* + * PostgreSQL Attributes (columns) + * ********************************************************* + */ + +/* + * ********************************************************* + * VCI "columns" + * Here, a "column" may have only one data relation, + * or a pair of meta data relation and data relation. + * It includes delete vector, null vector, TID relation, + * + * ********************************************************* + */ +/* + * vci_GetSumOfAttributeIndices() + * This function counts up all the VCI "columns" defined + * in num_vcis_attribute_type. + * + * vci_GetAttrTypeAndIndexFromSumOfIndices() + * Get vcis_attribute_type_t and index from given + * sequential index. + */ + +/** + * @brief This function counts up all the VCI "columns" defined + * in num_vcis_attribute_type. + * + * @param[in] numColumns Number of normal columns in VCI index. + * @return number of total columns, not only of indexed columns, but also + * auxiliary columns. + */ +int +vci_GetSumOfAttributeIndices(int16 numColumns) +{ + int result = 0; + int aId; + + /* add up the number of indices of every attribute type */ + for (aId = 0; aId < num_vcis_attribute_type; ++aId) + result += vci_GetNumIndexForAttributeType(aId, numColumns); + + return result; +} + +/** + * @brief Get Attribute type defined in vcis_attribute_type_t and + * index of the target category. + * + * @param[out] attrType The attribute type is written in *attrType. + * @param[out] index The index is written in *index.
+ * If no corresponding attribute exists, *index set to -1. + * @param[in] numColumns The number of normal columns in VCI index. + * @param[in] sumOfIndex The sequential index of target column. + */ +void +vci_GetAttrTypeAndIndexFromSumOfIndices(vcis_attribute_type_t *attrType, + int *index, + int16 numColumns, + int sumOfIndex) +{ + int sum = 0; + + *index = 0; + /* walk the attribute types until the range that contains sumOfIndex is found */ + for (*attrType = 0; *attrType < num_vcis_attribute_type; ++*attrType) + { + int inc = vci_GetNumIndexForAttributeType(*attrType, numColumns); + + if ((sum <= sumOfIndex) && (sumOfIndex < (sum + inc))) + { + *index = sumOfIndex - sum; + + return; + } + sum += inc; + } + *index = -1; +} + +/** + * @brief Calculate the bit ID of null bit vector for given column ID. + * + * @param[in] tupleDesc The tuple descriptor of VCI main relation. + * @param[in] columnId Target column ID. + * @return The bit ID in null bit vector. The current implementation returns + * columnId unchanged and does not look at tupleDesc. + */ +int16 +vci_GetBitIdInNullBits(TupleDesc tupleDesc, int16 columnId) +{ + return columnId; +} + +/** + * @brief Get the column widths in the worst case. + * + * Fixed length attributes return attlen. Variable length attributes with a + * non-negative typmod return a size derived from the typmod when it is + * smaller than MaxHeapTupleSize. Everything else returns MaxHeapTupleSize. + * + * @param attr Attribute information of the columns. + * @return The width in the worst case. + */ +int16 +vci_GetColumnWorstSize(Form_pg_attribute attr) +{ + if (0 <= attr->attlen) /* fixed length data */ + return attr->attlen; + + /* variable or long length data */ + if (0 <= attr->atttypmod) + { + int32 columnSize; + + switch (attr->atttypid) + { + /* for bit(n), varbit(n). */ + case BITOID: + case VARBITOID: + columnSize = VARBITTOTALLEN(attr->atttypmod); + break; + + /* for numeric(p,q), return 'p' + VARHDRSZ.
*/ + case NUMERICOID: + columnSize = (attr->atttypmod >> 16) + VARHDRSZ; + break; + + case BPCHAROID: + case VARCHAROID: + /* + * For char(n) and varchar(n) the typmod is n + VARHDRSZ, so the + * worst case is n characters of MAX_MULTIBYTE_CHAR_LEN bytes + * plus the varlena header. VARHDRSZ is subtracted only when + * the typmod is at least VARHDRSZ; otherwise the result would + * be negative. + */ + if (attr->atttypmod >= VARHDRSZ) + columnSize = (attr->atttypmod - VARHDRSZ) * MAX_MULTIBYTE_CHAR_LEN + VARHDRSZ; + else + columnSize = attr->atttypmod * MAX_MULTIBYTE_CHAR_LEN; + break; + + default: + { +#ifdef VCI_USE_COMPACT_VARLENA + if (attr->atttypmod < VARATT_SHORT_MAX) + columnSize = attr->atttypmod - VARHDRSZ + VARHDRSZ_SHORT; + else + columnSize = attr->atttypmod; +#else + columnSize = attr->atttypmod; +#endif + } + break; + } + + if (columnSize < MaxHeapTupleSize) + return (int16) columnSize; + } + + /* worst size -> MaxHeapTupleSize(8k) */ + /* unlimited data size */ + return MaxHeapTupleSize; + + /* + * Large data are externally toasted and the size of tuple including the + * large attribute is limited to TOAST_TUPLE_TARGET, which is BLCKSZ / 4 + * normally. But, UN-TOASTED -> MaxHeapTupleSize. + */ +} + +/** + * @brief from vci_MainRelHeaderInfo, column IDs in original heap relation + * and VCI index relation are collected. + * + * This function also collects the worst-case sizes of the column + * attributes, just packed. + * + * @param[out] heapAttrNumList Pointer to an array of AttrNumber. + * The attribute numbers (column ID) in the heap relation are stored here. + * The AttrNumber is one-origin. + * The length of array must be at least numColumn. + * + * @param[out] indxColumnIdList Pointer to an array of int16. + * The column IDs in the VCI main relation are stored here. + * This is zero-origin. + * The length of array must be at least numColumn. + * + * @param[out] columnSizeList Pointer to an array of int16. + * The worst-case widths are stored here. + * The length of array must be at least numColumn. + * + * @param[in] numColumn Number of columns defined in VCI index. + * @param[in] info VCI main relation header information. + * @param[in] heapOid OID of original PostgreSQL tables.
+ * @return sum of columnSizeList. + */ +Size +vci_GetColumnIdsAndSizes(AttrNumber *heapAttrNumList, + int16 *indxColumnIdList, + int16 *columnSizeList, + int numColumn, + vci_MainRelHeaderInfo *info, + Oid heapOid) +{ + LOCKMODE lockmode = AccessShareLock; + /* + * NOTE(review): heapOid is not referenced in this function; the heap + * relation is taken from the index relation instead - confirm. + */ + Oid tableOid = info->rel->rd_index->indrelid; + Relation tableRel; + TupleDesc tupleDesc; + Size result = 0; + int colId; + + tableRel = table_open(tableOid, lockmode); + tupleDesc = RelationGetDescr(tableRel); + + for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < numColumn; ++colId) + { + Form_pg_attribute attr; + vcis_m_column_t *mColumn = vci_GetMColumn(info, colId); + Buffer buffer; + /* the heap attribute number is read from the column meta relation */ + Relation rel = table_open(mColumn->meta_oid, lockmode); + vcis_column_meta_t *metaHeader = vci_GetColumnMeta(&buffer, rel); + int16 attnum = metaHeader->pgsql_attnum; + + heapAttrNumList[colId] = attnum; + /* + * NOTE(review): attnum is used as an index here before it is + * validated by AttributeNumberIsValid() below - confirm. + */ + attr = TupleDescAttr(tupleDesc, attnum - 1); + + ReleaseBuffer(buffer); + table_close(rel, lockmode); + + /* + * Previously, "attr->attnum - 1" was used for the right value instead + * of the simple sequential number, colId (The attr is extracted from + * indexRel). This was for future expanding to enable to add columns + * to or delete ones from VCI after creating. But this is not + * implemented. And then, the attr is no longer reliable because real + * columns information is stored in the vci_column_ids option not in + * indexRel when using vci_create(). + */ + indxColumnIdList[colId] = colId; + + if (!AttributeNumberIsValid(heapAttrNumList[colId])) + elog(ERROR, "column not found."); /* FIXME */ + + result += columnSizeList[colId] = vci_GetColumnWorstSize(attr); + } + + table_close(tableRel, lockmode); + + return result; +} + +/** + * @brief Count number of nullable columns in a tuple descriptor. + * + * Every attribute of the descriptor is counted; an assertion checks that + * none of them is declared NOT NULL. + * + * @param[in] tupleDesc tuple descriptor + * @return Number of nullable columns in the relation.
+ */ +int +vci_GetNumberOfNullableColumn(TupleDesc tupleDesc) +{ + int result = 0; + int aId; + + for (aId = 0; aId < tupleDesc->natts; ++aId) + { + Assert(!((TupleDescAttr(tupleDesc, aId)->attnotnull))); + ++result; + } + + return result; +} + +/** + * @brief Search for free extent and return the extent ID. + * + * This function reads extent information in the ROS main relation and checks + * if the extent has both its xgen and xdel set to InvalidTransactionId. + * The check is done in vci_ExtentIsFree(). + * + * When no free extent is found among the existing ones, the positions + * starting from numExtents are examined, preparing their pages if necessary, + * until a free one is found. + */ +static uint32 +SearchFreeExtent(vci_MainRelHeaderInfo *info) +{ + int32 numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0); + int32 extentId = numExtents; + BlockNumber blockNumber; + OffsetNumber offset; + Buffer buffer = InvalidBuffer; + Page pageHeader = NULL; + + /* search deleted extent first */ + + vcis_m_extent_t *extentInfo; + vci_meta_item_scanner_t *scan = + vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE); + + while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL) + { + if (vci_ExtentIsFree(extentInfo)) + { + extentId = scan->index; + break; + } + } + vci_EndMetaItemScan(scan); + + /* if no deleted extent, create a new extent */ + if (extentId == numExtents) + { + while (true) + { + vcis_m_extent_t *extentInfo_new; + bool extentIsFree; + + vci_GetExtentInfoPosition(&blockNumber, &offset, extentId); + vci_PreparePagesWithOneItemIfNecessary(info->rel, blockNumber); + buffer = ReadBuffer(info->rel, blockNumber); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + pageHeader = BufferGetPage(buffer); + extentInfo_new = (vcis_m_extent_t *) &(((char *) pageHeader)[offset]); + Assert(extentInfo_new->xgen == InvalidTransactionId); + Assert((extentInfo_new->xdel == InvalidTransactionId) || (extentInfo_new->xdel == FrozenTransactionId)); + extentIsFree = vci_ExtentIsFree(extentInfo_new); + + UnlockReleaseBuffer(buffer); + + /* try the next extent ID when this one is not free */ + if (extentIsFree) + break; + else + ++extentId; + } + } + + return extentId; +} + +/** + * @brief Get free extent Id.
+ * + * This function first checks the extent with ID 0. + * If it is not a free extent, the main relation is scanned to find a free one. + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @return ID of a free extent. + */ +uint32 +vci_GetFreeExtentId(vci_MainRelHeaderInfo *info) +{ + Buffer buffer; + int32 extentId; + vcis_m_extent_t *extentInfo; + bool isFreeExtent; + + /* first, check the pointed extent */ + extentId = 0; + { + extentInfo = vci_GetMExtent(&buffer, info, extentId); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + isFreeExtent = vci_ExtentIsFree(extentInfo); + UnlockReleaseBuffer(buffer); + + if (isFreeExtent) + return extentId; + } + + /* scan the VCI main relation to find free extent */ + extentId = SearchFreeExtent(info); + extentInfo = vci_GetMExtent(&buffer, info, extentId); + + /* the extent is read again only to assert that it is free */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + Assert(vci_ExtentIsFree(extentInfo)); + UnlockReleaseBuffer(buffer); + + return extentId; +} + +/* + * ************* + * ** CAUTION ** + * ************* + * USE vci_WriteExtentInfoInMainRosForWosRosConvInit() IN SOME TRANSACTION. + * GetCurrentTransactionId() IS USED. + * + * NOTE(review): the function below is named + * vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict() and receives the + * transaction ID as an argument - confirm whether this caution still applies. + */ + +/** + * @brief The function to call before starting WOS -> ROS conversion to write + * recovery information. + * + * This function writes the number of extents, the new current ROS version + * and the ROS command to the header area of ROS main relation. The page for + * the target extent information is prepared when the extent ID is not smaller + * than the recorded number of extents. + * + * NOTE(review): an earlier description said that the target extent ID and + * InvalidTransactionId at the target extent info are also written; the + * function body does not show that - confirm. + * + * @param[in] info pointer to the target vci_MainRelHeaderInfo. + * @param[in] extentId target extent ID. + * @param[in] dictionaryId target common dictionary ID. + * @param[in] xid transaction ID of this write operation. + * @param[in] command command of this operation.
+ */ +void +vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict( + vci_MainRelHeaderInfo *info, + int32 extentId, + int32 dictionaryId, + TransactionId xid, + vci_ros_command_t command) +{ + int32 numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0); + + Assert(0 <= numExtents); + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + /* grow the recorded number of extents and prepare the page when needed */ + if (numExtents <= extentId) + { + BlockNumber blockNumber; + OffsetNumber offset; + + numExtents = extentId + 1; + vci_GetExtentInfoPosition(&blockNumber, &offset, extentId); + vci_PreparePagesWithOneItemIfNecessary(info->rel, blockNumber); + } + vci_SetMainRelVar(info, vcimrv_num_extents, 0, numExtents); + vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, xid); + vci_SetMainRelVar(info, vcimrv_ros_command, 0, command); + vci_WriteMainRelVar(info, vci_wmrv_update); +} + +/** + * @brief Locate the vcis_m_extent_t entry of extentId in the main relation. + * + * The page is read with vci_ReadBufferWithPageInit() and stored in *buffer. + * Callers in this file lock the buffer as needed and release it when done. + * + * @param[out] buffer Buffer holding the page of the returned entry. + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] extentId The target extent ID. + * @return Pointer to the extent information inside the page of *buffer. + */ +vcis_m_extent_t * +vci_GetMExtent(Buffer *buffer, vci_MainRelHeaderInfo *info, int32 extentId) +{ + BlockNumber blockNumber; + OffsetNumber offset; + Page page; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + vci_GetExtentInfoPosition(&blockNumber, &offset, extentId); + + /* + * info->num_extents_allocated is normally -1. When vci_rc_query == + * info->command, it has the expected number of extents calculated from + * number of blocks in VCI main relation.
+ */ + if (info->num_extents_allocated <= extentId) + vci_PreparePagesWithOneItemIfNecessary(info->rel, blockNumber); + + *buffer = vci_ReadBufferWithPageInit(info->rel, blockNumber); + page = BufferGetPage(*buffer); + + return (vcis_m_extent_t *) &(((char *) page)[offset]); +} + +/** + * @brief Advance a meta item scan and return the next extent information. + * + * On the first call the scan is set up from vcimrv_num_extents and the first + * extent-info page is read and locked with scan->buf_lockmode. When the scan + * moves to another block, the previous buffer is written out (only for + * BUFFER_LOCK_EXCLUSIVE scans), unlocked and released. + * + * Whenever this function releases the buffer before returning NULL, it also + * resets scan->buffer and scan->current_block, so that vci_EndMetaItemScan() + * does not unlock and release the same buffer a second time. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in,out] scan Scanner created by vci_BeginMetaItemScan(). + * @return Pointer to the next extent information, or NULL when the scan is + * exhausted or a new page is found. + */ +vcis_m_extent_t * +vci_GetMExtentNext(vci_MainRelHeaderInfo *info, vci_meta_item_scanner_t *scan) +{ + OffsetNumber offset; + BlockNumber block; + + if (!scan->inited) + { + Page page; + + scan->max_item = vci_GetMainRelVar(info, vcimrv_num_extents, 0); + vci_GetExtentInfoPosition(&scan->start_block, &offset, 0); + vci_GetExtentInfoPosition(&scan->end_block, &offset, scan->max_item); + scan->item_size = sizeof(vcis_m_extent_t); + scan->current_block = scan->start_block; + + scan->buffer = ReadBuffer(scan->rel, scan->current_block); + LockBuffer(scan->buffer, scan->buf_lockmode); + + page = BufferGetPage(scan->buffer); + if (PageIsNew(page)) + { + UnlockReleaseBuffer(scan->buffer); + /* forget the released buffer; the scan stays uninitialized */ + scan->buffer = InvalidBuffer; + scan->current_block = InvalidBlockNumber; + return NULL; + } + + Assert(scan->index == -1); + Assert(scan->max_item >= 0); + + scan->inited = true; + } + + scan->index++; + + if (scan->index >= scan->max_item) + return NULL; + + vci_GetExtentInfoPosition(&block, &offset, scan->index); + + /* switch to the page that holds the next item */ + if (scan->current_block != block) + { + Page page; + + Assert(BufferIsValid(scan->buffer)); + + if (scan->buf_lockmode == BUFFER_LOCK_EXCLUSIVE) + vci_WriteOneItemPage(scan->rel, scan->buffer); + + UnlockReleaseBuffer(scan->buffer); + + scan->buffer = ReadBuffer(scan->rel, block); + scan->current_block = block; + + LockBuffer(scan->buffer, scan->buf_lockmode); + + page = BufferGetPage(scan->buffer); + if (PageIsNew(page)) + { + UnlockReleaseBuffer(scan->buffer); + /* forget the released buffer so that it is not released twice */ + scan->buffer = InvalidBuffer; + scan->current_block = InvalidBlockNumber; + return NULL; + } + } + + return (vcis_m_extent_t *) &(((char *) BufferGetPage(scan->buffer))[offset]); +} + +/** + * @brief Allocate and set up a scanner for meta items of a relation. + * + * @param[in] rel The relation to scan. + * @param[in] buf_lock BUFFER_LOCK_SHARE or BUFFER_LOCK_EXCLUSIVE, used to + * lock the pages read by the scan. + * @return The allocated scanner, to be freed by vci_EndMetaItemScan(). + */ +vci_meta_item_scanner_t * +vci_BeginMetaItemScan(Relation rel, int buf_lock) +{ + vci_meta_item_scanner_t *scan = palloc0(sizeof(vci_meta_item_scanner_t)); + + Assert((buf_lock == BUFFER_LOCK_SHARE) ||
(buf_lock == BUFFER_LOCK_EXCLUSIVE)); + + scan->inited = false; + + scan->rel = rel; + scan->index = -1; + + scan->end_block = InvalidBlockNumber; + scan->start_block = InvalidBlockNumber; + scan->buffer = InvalidBuffer; + scan->current_block = InvalidBlockNumber; + scan->max_item = 0; + scan->max_item_in_page = 0; + scan->item_size = 0; + scan->buf_lockmode = buf_lock; + + return scan; +} + +/** + * @brief Finish a meta item scan and free the scanner. + * + * When the scan still holds a buffer, the buffer is written out (only for + * BUFFER_LOCK_EXCLUSIVE scans), unlocked and released. + * + * @param[in] scan Scanner created by vci_BeginMetaItemScan(). + */ +void +vci_EndMetaItemScan(vci_meta_item_scanner_t *scan) +{ + Assert(scan); + + if (BufferIsValid(scan->buffer)) + { + if (scan->buf_lockmode == BUFFER_LOCK_EXCLUSIVE) + vci_WriteOneItemPage(scan->rel, scan->buffer); + + UnlockReleaseBuffer(scan->buffer); + } + + pfree(scan); +} + +/** + * @brief Overwrite the extent information of extentId in the main relation. + * + * The fields are set under an exclusive buffer lock, flags is reset to 0 and + * the page is written out by vci_WriteOneItemPage(). + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] extentId The target extent ID. + * @param[in] numRows Value stored in num_rows. + * @param[in] numDeletedRows Value stored in num_deleted_rows. + * @param[in] numDeletedRowsOld Value stored in num_deleted_rows_old. + * @param[in] xgen Value stored in xgen. + * @param[in] xdel Value stored in xdel. + */ +void +vci_WriteExtentInfo(vci_MainRelHeaderInfo *info, + int32 extentId, + uint32 numRows, + uint32 numDeletedRows, + uint32 numDeletedRowsOld, + TransactionId xgen, + TransactionId xdel) +{ + Buffer buffer; + vcis_m_extent_t *extentInfo = vci_GetMExtent(&buffer, info, extentId); + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + extentInfo->num_rows = numRows; + extentInfo->num_deleted_rows = numDeletedRows; + extentInfo->num_deleted_rows_old = numDeletedRowsOld; + extentInfo->xgen = xgen; + extentInfo->xdel = xdel; + extentInfo->flags = 0; + vci_WriteOneItemPage(info->rel, buffer); + UnlockReleaseBuffer(buffer); +} + +/** + * @brief This function checks if extentId is smaller than numExtents + * written in header part of main relation. + * + * If it passes, check the existence of the DB page where the extent ID + * information is written. + * It might happen that the page has vanished in some trouble...? + * In recovery process, the record of the number of extents should be + * corrected. If so, elog is better... + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] extentId The target extent ID. + * @retval true The DB page is allocated for the information with given + * extent ID.
+ * @retval false Need to allocate new DB page for the information. + */ +bool +vci_ExtentInfoExists(vci_MainRelHeaderInfo *info, int32 extentId) +{ + BlockNumber blockNumber; + OffsetNumber offset; + int32 numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0); + + Assert(0 <= numExtents); + if (numExtents <= extentId) + return false; + + /* use the cached number of allocated extents when it is available */ + if (0 <= info->num_extents_allocated) + return extentId < info->num_extents_allocated; + + /* otherwise compare the block of the entry with the relation size */ + vci_GetExtentInfoPosition(&blockNumber, &offset, extentId); + + return blockNumber < RelationGetNumberOfBlocks(info->rel); +} + +/* + * Visibility test of an object generated at objectXidMin and deleted at + * objectXidMax for a reader with readerXid. + * + * The object is visible when objectXidMin is valid and is either + * FrozenTransactionId or precedes or equals readerXid, and objectXidMax is + * either invalid or a normal transaction ID that readerXid precedes. + */ +static bool +VisibilityCheck(TransactionId objectXidMin, + TransactionId objectXidMax, + TransactionId readerXid) +{ + /* visibility from generation */ + bool result = TransactionIdIsValid(objectXidMin) && + (TransactionIdEquals(objectXidMin, FrozenTransactionId) || + /* objectXidMin <= readerXid */ + TransactionIdPrecedesOrEquals(objectXidMin, readerXid)); + + if (!result) + return false; + + /* visibility from deletion */ + return (!TransactionIdIsValid(objectXidMax)) || + (TransactionIdIsNormal(objectXidMax) && + NormalTransactionIdPrecedes(readerXid, objectXidMax)); +} + +/** + * @brief Test if the extent is visible. + * + * @param[in] mExtent Pointer to the extent information. + * @param[in] xid The transaction ID to access the information. + * @retval true Visible. + * @retval false Invisible.
+ */ +bool +vci_ExtentIsVisible(vcis_m_extent_t *mExtent, TransactionId xid) +{ + return VisibilityCheck(mExtent->xgen, mExtent->xdel, xid); +} + +/** + * @brief Test if the extent can be collected. + * + * @param[in] mExtent Pointer to the extent information. + * @param[in] wos2rosXid The transaction ID compared with xdel of the extent. + * @retval true xdel is valid and is either FrozenTransactionId or precedes + * wos2rosXid. + * @retval false Otherwise. + */ +bool +vci_ExtentIsCollectable(vcis_m_extent_t *mExtent, TransactionId wos2rosXid) +{ + bool result = false; + + if (TransactionIdIsValid(mExtent->xdel)) + { + result = TransactionIdEquals(mExtent->xdel, FrozenTransactionId) || + /* mExtent->xdel < wos2rosXid */ + TransactionIdPrecedes(mExtent->xdel, wos2rosXid); + } + + return result; +} + +/** + * @brief Test if the extent is free, that is, both xgen and xdel are invalid. + * + * @param[in] extentInfo Pointer to the extent information. + */ +bool +vci_ExtentIsFree(vcis_m_extent_t *extentInfo) +{ + return !TransactionIdIsValid(extentInfo->xdel) && !TransactionIdIsValid(extentInfo->xgen); +} + +/* -------------------------------------------------- */ +/* Recovery function around VCI Main Relation */ +/* -------------------------------------------------- */ + +/** + * @brief Copy the current ROS version, size_mr and tid_crid_diff_sel to their + * "last" / "old" counterparts in the header, then write the header out. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + */ +void +vci_UpdateLastRosVersionAndOthers(vci_MainRelHeaderInfo *info) +{ + uint32 val; + + val = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0); + vci_SetMainRelVar(info, vcimrv_last_ros_version, 0, val); + val = vci_GetMainRelVar(info, vcimrv_size_mr, 0); + vci_SetMainRelVar(info, vcimrv_size_mr_old, 0, val); + val = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0); + vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel_old, 0, val); + + vci_WriteMainRelVar(info, vci_wmrv_update); +} + +/** + * @brief Copy the "last" / "old" values of the ROS version, size_mr and + * tid_crid_diff_sel back to the current ones, then write the header out. + * + * This is the reverse copy of vci_UpdateLastRosVersionAndOthers(). + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + */ +void +vci_RecoveryDone(vci_MainRelHeaderInfo *info) +{ + uint32 val; + + val = vci_GetMainRelVar(info, vcimrv_last_ros_version, 0); + vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, val); + + val = vci_GetMainRelVar(info, vcimrv_size_mr_old, 0); + vci_SetMainRelVar(info, vcimrv_size_mr, 0, val); + + val = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel_old, 0); + vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0, val); + + vci_WriteMainRelVar(info, vci_wmrv_update); +} + +/** + * @brief Store xid as the current ROS version and command as the ROS command + * in the header, then write the header out. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] command Command stored in vcimrv_ros_command. + * @param[in] xid Transaction ID stored in vcimrv_current_ros_version. + */ +void +vci_WriteRecoveryRecordDone(vci_MainRelHeaderInfo *info, vci_ros_command_t command, + TransactionId xid) +{ + vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, xid); +
vci_SetMainRelVar(info, vcimrv_ros_command, 0, command); + vci_WriteMainRelVar(info, vci_wmrv_update); +} + +/** + * @brief Store the old and new extent IDs of the running command in the + * header pages held in buffer. + * + * This function only sets the values; it does not write the header out. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] newExtentId Value stored in vcimrv_new_extent_id. + * @param[in] oldExtentId Value stored in vcimrv_old_extent_id. + */ +void +vci_WriteRecoveryRecordForExtentInfo(vci_MainRelHeaderInfo *info, int32 newExtentId, int32 oldExtentId) +{ + /* + * Expected arguments for each command: + * + * ConvertWos2Ros: oldExtentId = VCI_INVALID_EXTENT_ID, + * newExtentId = New Extent + * + * CollectDeletedRows: oldExtentId = Src Extent ( -> Unused Extent), + * newExtentId = New Extent + * + * CollectUnusedExtent: oldExtentId = Unused Extent, + * newExtentId = VCI_INVALID_EXTENT_ID + */ + vci_SetMainRelVar(info, vcimrv_old_extent_id, 0, oldExtentId); + vci_SetMainRelVar(info, vcimrv_new_extent_id, 0, newExtentId); +} + +void +vci_RecoveryExtentInfo(vci_MainRelHeaderInfo *info, vci_ros_command_t command) +{ + int32 numExtents; + int32 oldExtentId; + int32 newExtentId; + Buffer s_buffer = InvalidBuffer; + Buffer d_buffer = InvalidBuffer; + vcis_m_extent_t *extentInfo; + int16 colId; + + numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0); + oldExtentId = vci_GetMainRelVar(info, vcimrv_old_extent_id, 0); + newExtentId = vci_GetMainRelVar(info, vcimrv_new_extent_id, 0); + colId = vci_GetMainRelVar(info, vcimrv_working_column_id, 0); + + if (oldExtentId != VCI_INVALID_EXTENT_ID) + { + TransactionId recovery_xdel; + + switch (command) + { + case vci_rc_collect_deleted: + Assert(oldExtentId < numExtents); + recovery_xdel = InvalidTransactionId; + break; + case vci_rc_collect_extent: + /* unuse extent Xdel -> Frozen(2) */ + recovery_xdel = FrozenTransactionId; + break; + default: + Assert(0); + recovery_xdel = InvalidTransactionId; + break; + } + + extentInfo = vci_GetMExtent(&s_buffer, info, oldExtentId); /* from */ + + LockBuffer(s_buffer, BUFFER_LOCK_EXCLUSIVE); + extentInfo->xdel = recovery_xdel; + vci_WriteOneItemPage(info->rel, s_buffer); + UnlockReleaseBuffer(s_buffer); + } + + if ((newExtentId != VCI_INVALID_EXTENT_ID) && (newExtentId < numExtents)) + { + Assert((command == vci_rc_wos_ros_conv) || (command == vci_rc_collect_deleted)); +
extentInfo = vci_GetMExtent(&d_buffer, info, newExtentId); /* to */ + + LockBuffer(d_buffer, BUFFER_LOCK_EXCLUSIVE); + extentInfo->xgen = InvalidTransactionId; + Assert((extentInfo->xdel == InvalidTransactionId) || (extentInfo->xdel == FrozenTransactionId)); + extentInfo->xdel = FrozenTransactionId; + extentInfo->flags |= VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID; + extentInfo->recovered_colid = colId; + vci_WriteOneItemPage(info->rel, d_buffer); + UnlockReleaseBuffer(d_buffer); + } +} + +void +vci_WriteRecoveryRecordForUpdateDelVec(vci_MainRelHeaderInfo *info) +{ + vcis_m_extent_t *extentInfo; + vci_meta_item_scanner_t *scan; + + scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_EXCLUSIVE); + while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL) + { + extentInfo->num_deleted_rows_old = extentInfo->num_deleted_rows; + } + vci_EndMetaItemScan(scan); +} + +void +vci_RecoveryUpdateDelVec(vci_MainRelHeaderInfo *info) +{ + vcis_m_extent_t *extentInfo; + vci_meta_item_scanner_t *scan; + + scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_EXCLUSIVE); + while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL) + { + extentInfo->num_deleted_rows = extentInfo->num_deleted_rows_old; + } + vci_EndMetaItemScan(scan); +} + +const char * +vci_GetRosCommandName(vci_ros_command_t command) +{ + switch (command) + { + case vci_rc_invalid: + return "invalid"; + + case vci_rc_vacuum: + return "vacuum"; + + case vci_rc_query: + return "query"; + + case vci_rc_drop_index: + return "drop index"; + + case vci_rc_wos_delete: + return "wos delete"; + + case vci_rc_wos_insert: + return "wos insert"; + + case vci_rc_recovery: + return "recovery"; + + case vci_rc_probe: + return "probe"; + + case vci_rc_wos_ros_conv_build: + return "wos ros conv build"; + + case vci_rc_generate_local_ros: + return "generate local ros"; + + case vci_rc_copy_command: + return "copy command"; + + case vci_rc_wos_ros_conv: + return "wos2ros conversion"; + + case vci_rc_update_del_vec: + return 
"update delete vector"; + + case vci_rc_collect_deleted: + return "collect deleted rows"; + + case vci_rc_collect_extent: + return "collect extent"; + + case vci_rc_update_tid_crid: + return "update tid-crid tree"; + + default: + return "unknown"; + } +} + +static Buffer +ReadBufferWithPageInitCore(Relation reln, BlockNumber blockNumber, int16 numItem) +{ + Buffer buffer; + Page page; + + Assert((reln->rd_rel->relkind == 'i') || (reln->rd_rel->relkind == 'm')); + buffer = ReadBuffer(reln, blockNumber); + + page = BufferGetPage(buffer); + if (PageIsNew(page)) + { + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + if (PageIsNew(page)) + vci_InitPageCore(buffer, numItem, true); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + + return buffer; +} + +/** + * @brief Read a buffer containing the requested block of the requested VCI + * relation. + * + * Same as ReadBuffer(), but initialize new page. + * + * We must generally use this function instead of ReadBuffer(), to access a kind + * of VCI relations except Data WOS, Whiteout WOS, and delete vector. But we + * don't need to replace ReadBuffer() immediately after vci_PreparePagesIfNecessaryCore(). + * + * @param[in] reln The relation. + * @param[in] blockNumber The block number to be read. + */ +Buffer +vci_ReadBufferWithPageInit(Relation reln, BlockNumber blockNumber) +{ + return ReadBufferWithPageInitCore(reln, blockNumber, 1); +} + +/** + * @brief Read a buffer containing the requested block of the requested delete + * vector. + * + * Same as ReadBuffer(), but initialize new page. + * + * We must generally use this function instead of ReadBuffer(), to access a + * delete vector. But we don't need to replace ReadBuffer() immediately after + * vci_PreparePagesIfNecessaryCore(). + * + * @param[in] reln The relation. + * @param[in] blockNumber The block number to be read. 
+ */ +Buffer +vci_ReadBufferWithPageInitDelVec(Relation reln, BlockNumber blockNumber) +{ + return ReadBufferWithPageInitCore(reln, blockNumber, VCI_ITEMS_IN_PAGE_FOR_DELETE); +} diff --git a/contrib/vci/storage/vci_ros_command.c b/contrib/vci/storage/vci_ros_command.c new file mode 100644 index 0000000..d986d53 --- /dev/null +++ b/contrib/vci/storage/vci_ros_command.c @@ -0,0 +1,4160 @@ +/*------------------------------------------------------------------------- + * + * vci_ros_command.c + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_ros_command.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#ifndef WIN32 +#include +#endif + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/relscan.h" +#include "access/tupdesc.h" +#include "access/genam.h" +#include "access/visibilitymap.h" /* for visibilitymap_set() */ +#include "access/xact.h" +#include "access/tableam.h" +#include "catalog/index.h" +#include "catalog/pg_operator.h" /* for TIDLessOperator */ +#include "catalog/storage.h" +#include "commands/vacuum.h" +#include "storage/freespace.h" +#include "storage/itemptr.h" +#include "storage/lmgr.h" +#include "storage/procarray.h" +#include "storage/smgr.h" /* for RelationSetTargetBlock() */ +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/tuplesort.h" + +#include "postgresql_copy.h" + +#include "vci.h" +#include "vci_chunk.h" + +#include "vci_columns.h" +#include "vci_columns_data.h" + +#include "vci_fetch.h" +#include "vci_freelist.h" +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_ros_command.h" +#include "vci_tidcrid.h" +#include "vci_wos.h" +#include "vci_xact.h" + +extern bool HeapTupleSatisfiesWos2Ros(HeapTuple htup, Snapshot snapshot, Buffer buffer); +extern bool HeapTupleSatisfiesLocalRos(HeapTuple htup, Snapshot snapshot, Buffer buffer); 
+bool VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer); + +typedef enum +{ + CEK_CountDeletedRows, + CEK_CountUnusedExtents, +} CEKind; + +typedef enum +{ + WOS_Data, + WOS_Whiteout, +} WosKind; + +typedef struct +{ + ItemPointerData orig_tid; + + ItemPointerData wos_tid; + + bool movable; + + int64 xid64; + +} vci_tid_tid_xid64_t; + +static bool WaitTransactionEndOfLastRosCommand(vci_MainRelHeaderInfo *info); +static void fillTidListFromTidSortState(vci_RosCommandContext *comContext, int numRows); +static int ConvertWos2Ros(vci_RosCommandContext *comContext); +static void FillValuesColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void FillIsNullColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void FillIsNullRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void FillValuesRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void AppendDataToLocalRos(vci_local_ros_t *localRos, RosChunkStorage *storage, vci_MainRelHeaderInfo *info); +static Size ConvertWos2LocalRos(vci_RosCommandContext *comContext); +static void FillOneRosChunkBuffer(vci_RosCommandContext *comContext, int rowId, int numRowsToConvert); +static void ReadOneExtentAndStoreInChunkStorage(vci_RosCommandContext *comContext); +static Size ConvertWhiteOut2LocalDeleteList(vci_RosCommandContext *comContext, int sel); +static bool NeedMainRelHeaderUpdate(vci_ros_command_t command); +static int CmpUint64(const void *pa, const void *pb); +static void FlushTidCridPairListToTreeForBuild(vci_TidCridRelations *relPair, vcis_tidcrid_pair_list_t *appList, BlockNumber blockNumber); +static void UpdateTidCridForBuild(vci_RosCommandContext *comContext); +static void vci_build_callback(Relation rel, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); +static void 
FinalizeBuild(vci_RosCommandContext *comContext); +static double GetEstimatedNumRows(Oid relid); +static void RemoveWosEntries(vci_RosCommandContext *comContext, WosKind wos_kind); +static uint64 cleanUpWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType); +static uint64 UpdateDelVec(vci_RosCommandContext *comContext, Size workareaSize, uint64 numRowsAtOnce); +static void writeNumDeleteRowsIntoExntetInfo(vci_MainRelHeaderInfo *info, int32 topExtentId, uint32 numExtents, uint32 *numDeletedRows); +static vci_target_extent_info_t CountExtents(Relation mainRel, uint32 threshold, CEKind kind); +static HeapTuple getTupleFromVector(int offset, TupleDesc tupleDesc, vci_virtual_tuples_t *vecSet); +static void FillOneRosChunkBufferFromExtent(vci_RosCommandContext *comContext, int32 extentId, uint32 *rowIdInExtent); +static bool isCdrTargetExtentValid(vci_RosCommandContext *comContext); +static int32 CollectDeletedRows(vci_RosCommandContext *comContext, Snapshot snapshot); +static uint32 SearchUnusedExtent(vci_MainRelHeaderInfo *info); +static void CollectUnusedExtent(vci_RosCommandContext *comContext); +static void UpdateTidCrid(vci_RosCommandContext *comContext, Size workareaSize); +static void collectBlockNumberToMove(vci_RosCommandContext *comContext, int numPages); +static void freezeMainAndRos(vci_RosCommandContext *comContext); +static void freezeWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType, Snapshot snapshot); +static void truncateRos(vci_RosCommandContext *comContext); +static void truncateWos(vci_RosCommandContext *comContext); +static void constructTidArray(vci_RosCommandContext *comContext, int max_data_wos_entries, int max_whiteout_wos_entries); +static int comparator_orig_tid_xid64(const void *pa, const void *pb); +static bool can_select_candidate_for_wos2ros_conv(vci_tid_tid_xid64_t *data_wos_item, vci_RosCommandContext *comContext, ItemPointer last_whiteout_orig_tid); +static bool 
can_select_candidate_for_update_delvec(vci_tid_tid_xid64_t *whiteout_wos_item, vci_RosCommandContext *comContext); +static void put_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid); +static bool get_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid); +static int readTidListFromWosIntoTidArray(Oid wos_od, WosKind wos_kind, vci_tid_tid_xid64_t *wos_entris, int max_wos_entries, Snapshot snapshot); +static void constructTidSortState(vci_RosCommandContext *comContext); +static void readTidListFromWosIntoTidSortState(Oid wos_oid, WosKind wos_kind, TupleTableSlot *slot, Tuplesortstate *sortstate, Snapshot snapshot, TransactionId wosros_xid); +static bool getValidTidSortState(Tuplesortstate *sortstate, TupleTableSlot *slot, vci_tid_tid_xid64_t *item); +static int32 compareXid64(int64 data_wos_xid64, int64 whiteout_wos_xid64); + +/* + * WOS -> ROS conversion + * We have two situations of WOS -> ROS conversion. + * 1. conversion process to reduce WOS and move data into ROS. + * In this case, all columns registered to the VCI are converted into + * ROS style and stored each relation. The column meta data relations + * are also updated. We normally convert one full extent at a time. + * The precise description is, + * A. take an exclusive lock to the main relation header. + * B. recover ROS if broken. + * C. scan WOS with care of freeze condition and deleted condition + * and collect live TID, up to 256 K rows. + * D. sort TID. + * E. write conversion information into VCI main relation header and + * extent info. + * F. collect target tuples and build ROS data. Here we have chunk + * the data, since the work area might be limited. + * G. Find extent and free spaces to write the data. + * H. Write meta data. + * I. Write extent. + * J. Finalize meta data and VCI main relation. + * K. release the main relation header. 
+ * For this purpose, we need VCI main relation, size of workarea. + * + * 2. local ROS conversion. + * In this case, given columns are converted into ROS style and stored + * in memory. All the visible data are converted. + * The precise description is, + * A. scan WOS with care of visibility and deleted condition and collect + * visible TID. + * B. sort TID. + * C. take an exclusive lock to the main relation header. + * D. recover ROS if broken. + * E. collect target tuples and build local ROS data. + * F. release the main relation header. + * For this purpose, we need VCI main relation, size of area to store, + * necessary column ID list. + * + */ + +/* -------------------------------------------------------------- */ + +#define PERIOD_TO_CHECK_TRANSACTION_END (INT64CONST(1000)) /* 1 ms */ +#define DURATION_TO_CHECK_TRANSACTION_END (100000) /* 100 s */ + +/* + * Copy from vacuumlazy.c + */ +#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */ +#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ +#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ + +/** + * @brief This function is designed to detect transaction end after VCI + * exclusive write lock is released. + * + * If the transaction of the previous ROS command is neither committed nor + * aborted, wait for it to end for the time specified by the macro + * DURATION_TO_CHECK_TRANSACTION_END (originally 100 seconds). + * We expect that normally the ROS command is committed soon + * after the lock is released. + * When the end is not detected, the function returns false, + * otherwise true. + * + * @param[in] info Pointer to vci_MainRelHeaderInfo whose VCI index is + * determined. + * @retval true The end of the transaction of the previous ROS command is + * detected within the wait time. + * @retval false The transaction end is not detected.
+ */ +static bool +WaitTransactionEndOfLastRosCommand(vci_MainRelHeaderInfo *info) +{ + /* + * current ROS version is the transaction ID of last ROS command + */ + TransactionId curRosVer = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0); + int checkCount; + + if (!TransactionIdIsValid(curRosVer)) + return true; + + if (TransactionIdIsCurrentTransactionId(curRosVer)) + return true; + + for (checkCount = 0; + (checkCount < DURATION_TO_CHECK_TRANSACTION_END) && + (!ConditionalXactLockTableWait(curRosVer, false)); + ++checkCount) + { + pg_usleep(PERIOD_TO_CHECK_TRANSACTION_END); /* wait 1 ms */ + } + + return checkCount < DURATION_TO_CHECK_TRANSACTION_END; +} + +/** + * @brief This function determine the result of the transaction status + * of the previous ROS command. + * + * First, it waits the end of the transaction of the previous if necessary. + * When it is committed successfully, just update current ROS version. + * Otherwise, tries to recover VCI relations. + * + * @param[in] info Pointer to vci_MainRelHeaderInfo whose VCI index is + * determined. + * + * @note Assuming that this function is called under main relation is locked + * exclusively. + */ +void +vci_RecoverOneVCIIfNecessary(vci_MainRelHeaderInfo *info) +{ + TransactionId curRosVer; + TransactionId lastRosVer; + vci_ros_command_t commandSave = info->command; + + Assert(info); + + vci_ChangeCommand(info, vci_rc_recovery); + + /* + * Since the transaction is commited or abort after the lock is released, + * we have to wait for it. 
+ */ + if (!WaitTransactionEndOfLastRosCommand(info)) + elog(ERROR, "unterminated ROS command"); + + curRosVer = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0); + lastRosVer = vci_GetMainRelVar(info, vcimrv_last_ros_version, 0); + + if (!TransactionIdEquals(curRosVer, lastRosVer)) + { + switch (vci_transaction_get_type(curRosVer)) + { + case VCI_XACT_SELF: + /* The last ROS version has been already updated */ + break; + + case VCI_XACT_IN_PROGRESS: + elog(PANIC, "internal error. multiple ROS command running"); + break; + + case VCI_XACT_DID_COMMIT: + /* update last ROS version and others */ + vci_UpdateLastRosVersionAndOthers(info); + break; + + case VCI_XACT_DID_ABORT: + case VCI_XACT_DID_CRASH: + { + vci_ros_command_t command; + + command = vci_GetMainRelVar(info, vcimrv_ros_command, 0); + + elog(DEBUG1, "crash recovery: previous command=\"%s\"(%d)", + vci_GetRosCommandName(command), command); + + switch (command) + { + case vci_rc_update_del_vec: + vci_RecoveryUpdateDelVec(info); + break; + + case vci_rc_wos_ros_conv: + case vci_rc_collect_deleted: + case vci_rc_collect_extent: + vci_RecoveryExtentInfo(info, command); + vci_RecoveryFreeSpace(info, command); + break; + + case vci_rc_update_tid_crid: + vci_RecoveryTidCrid(info); + vci_RecoveryFreeSpaceForTidCrid(info); + break; + + default: + elog(PANIC, "last recorded ros command is fatally broken."); + break; + } + + vci_RecoveryDone(info); + } + break; + + case VCI_XACT_INVALID: + elog(PANIC, "should not reach here"); + break; + } + } + + vci_ChangeCommand(info, commandSave); +} + +static void +fillTidListFromTidSortState(vci_RosCommandContext *comContext, int numRows) +{ + int i; + int count = 0; + + Assert(numRows <= VCI_NUM_ROWS_IN_EXTENT); + + for (i = 0; i < numRows; i++) + { + Assert(count < comContext->wos2ros_array.max); + + if (!get_entry_into_tid_list(comContext, WOS_Data, + &comContext->wos2ros_array.orig_tids[i], + &comContext->wos2ros_array.wos_tids[i])) + break; + + count++; + } + + 
comContext->wos2ros_array.num = count; + comContext->numRowsToConvert = count; +} + +static int +ConvertWos2Ros(vci_RosCommandContext *comContext) +{ + int result = 0; + + if (comContext->numRowsToConvert < 1) + { + elog(DEBUG2, "stop WOS to ROS conversion numRowsToConvert = %d", comContext->numRowsToConvert); + return 0; + } + + elog(DEBUG2, "start to convert WOS to ROS"); + + /* obtain target extent ID */ + /* comContext->extentId = vci_GetFreeExtentId(&(comContext->info)); */ + elog(DEBUG2, + "WOS -> ROS conversion: index: %s extent ID: " INT64_FORMAT, + RelationGetRelationName(comContext->info.rel), + (int64) comContext->extentId); + + /* + * Set WOS->ROS conversion data and write main relation for recovery. + * Header and extent info. Here, we also put current ROS version to the + * actual current transaction ID. + */ + vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info), + comContext->extentId, + comContext->xid); + + vci_ResetRosChunkStorage(&(comContext->storage)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + + /* read data for one extent */ + ReadOneExtentAndStoreInChunkStorage(comContext); + + /* write one extent into ROS */ + vci_AddTidCridUpdateList(&(comContext->info), + &(comContext->storage), + comContext->extentId); + vci_WriteOneExtent(&(comContext->info), + &(comContext->storage), + comContext->extentId, + comContext->xid, + InvalidTransactionId, + comContext->xid); + + result = comContext->storage.numTotalRows; + + elog(DEBUG2, "converted %d rows into ROS", result); + + return result; +} + +static void +FillValuesColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + int16 columnId; + + for (columnId = 0; columnId < vTuples->num_columns; ++columnId) + { + switch (vTuples->column_info[columnId].comp_type) + { + case vcis_compression_type_fixed_raw: + vci_FillFixedWidthColumnarFromRosChunkStorage(vTuples, + columnId, rosChunkStorage); + break; + case 
vcis_compression_type_variable_raw: + vci_FillVariableWidthColumnarFromRosChunkStorage(vTuples, + columnId, rosChunkStorage); + break; + default: + Assert(false); + elog(ERROR, "internal error: unsupported compression type"); + } + } +} + +static void +FillIsNullColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + const int16 strideR = 64; + int sId; + int baseOffset = 0; + int16 *nullableColumnId = vci_GetNullableColumnIds(vTuples); + + if (vTuples->num_columns < 1) + return; + + Assert(0 < rosChunkStorage->numFilled); + Assert(vTuples->num_columns <= rosChunkStorage->chunk[0]->numColumns); + Assert(vTuples->fetch_context->query_context->num_nullable_columns <= rosChunkStorage->chunk[0]->numNullableColumns); + Assert(rosChunkStorage->numTotalRows <= vTuples->num_rows_in_extent); + + MemSet(vTuples->isnull, 0, vTuples->num_columns * vTuples->num_rows_in_extent); + + for (sId = 0; sId < rosChunkStorage->numFilled; ++sId) + { + RosChunkBuffer *chunk = rosChunkStorage->chunk[sId]; + int rId; + + for (rId = 0; rId < chunk->numFilled; rId += strideR) + { + int pIdMax = Min(rId + strideR, chunk->numFilled); + int bitId; + + for (bitId = 0; bitId < chunk->numNullableColumns; ++bitId) + { + int colId = nullableColumnId[bitId]; + + if (VCI_FIRST_NORMALCOLUMN_ID <= colId) + { + uint8 *dst = (uint8 *) &(vTuples->isnull[(vTuples->num_rows_in_extent * colId) + baseOffset]); + int pId; + + for (pId = rId; pId < pIdMax; ++pId) + dst[pId] = vci_GetBit((uint8 *) &(chunk->nullData[chunk->nullWidthInByte * pId]), bitId); + } + } + } + baseOffset += chunk->numFilled; + } + Assert(rosChunkStorage->numTotalRows == baseOffset); +} + +static void +FillIsNullRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + abort(); +} + +static void +FillValuesRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + abort(); +} + +static void 
+AppendDataToLocalRos(vci_local_ros_t *localRos, + RosChunkStorage *storage, + vci_MainRelHeaderInfo *info) +{ + MemoryContext oldMemCtx; + struct vci_virtual_tuples *vTuples; + int32 extentId; + + oldMemCtx = MemoryContextSwitchTo(localRos->memory_context); + + ++(localRos->num_local_extents); + extentId = -(localRos->num_local_extents); + + localRos->extent = vci_repalloc(localRos->extent, + sizeof(vci_virtual_tuples_t *) * + localRos->num_local_extents); + vTuples = vci_CSCreateVirtualTuplesWithNumRows(localRos->fetch_context, + storage->numTotalRows); + localRos->extent[localRos->num_local_extents - 1] = vTuples; + + /* + * Originally, localRos->size_vector_memory_context has the total size of + * vector sets. The third parameter of vci_CSInitializeVectorSet() is the + * size for one vector set. Normally, we give up when many data are stored + * in ROS. So, we can fix the maximum number of extents. + */ + + vTuples->num_rows = storage->numTotalRows; + vTuples->extent_id = extentId; + vTuples->num_rows_in_extent = storage->numTotalRows; + vTuples->row_id_in_extent = 0; + vTuples->status = vcirvs_read_whole; + + if (vTuples->crid) + vci_FillCridInVirtualTuples(vTuples); + + MemSet(vTuples->skip, 0, sizeof(uint16) * vTuples->num_rows_in_extent); + + if (vTuples->tid) + vci_FillFixedWidthColumnarFromRosChunkStorage(vTuples, VCI_COLUMN_ID_TID, storage); + + if (vTuples->use_column_store) + { + FillIsNullColumnwiseFromRosChunkStorage(vTuples, storage); + FillValuesColumnwiseFromRosChunkStorage(vTuples, storage); + } + else + { + FillIsNullRowwiseFromRosChunkStorage(vTuples, storage); + FillValuesRowwiseFromRosChunkStorage(vTuples, storage); + } + + MemoryContextSwitchTo(oldMemCtx); +} + +static Size +ConvertWos2LocalRos(vci_RosCommandContext *comContext) +{ + Size result = 0; + + if (comContext->numRowsToConvert < 1) + return 0; + + elog(DEBUG2, "start to generate local ROS"); + + for (comContext->extentId = -1; (!comContext->done); + comContext->extentId -= 1) + { + 
elog(DEBUG3, + "WOS -> local ROS conversion: index: %s extent ID:%d\n", + RelationGetRelationName(comContext->info.rel), + comContext->extentId); + + vci_ResetRosChunkStorage(&(comContext->storage)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + + /* read data for one extent */ + ReadOneExtentAndStoreInChunkStorage(comContext); + + /* write one extent into ROS */ + if (0 < comContext->storage.numTotalRows) + AppendDataToLocalRos(comContext->local_ros, + &(comContext->storage), + &(comContext->info)); + + result += comContext->storage.numTotalRows; + elog(DEBUG2, "converted %llu rows into local ROS", + (unsigned long long) result); + } + + return result; +} + +/* ************************************** + * ** CAUTION: AttrNumber is 1 origin. ** + * ************************************** + */ +/** + * assuming when tIdList != NULL, TID list in tIdList to be read. + * not sequential scan, so scan is NULL. + * when tIdList == NULL, scan != NULL, sequential scan. + * + * @retval true some data remain + * @retval false no data remain + */ +static void +FillOneRosChunkBuffer(vci_RosCommandContext *comContext, + int rowId, + int numRowsToConvert) +{ + int offset; + TupleDesc tupleDesc = RelationGetDescr(comContext->heapRel); + Snapshot snapshot = GetActiveSnapshot(); + + if (comContext->wos2ros_array.max > 0) + { + uint32 sel PG_USED_FOR_ASSERTS_ONLY; + vci_ros_command_t command = comContext->command; + +#ifdef USE_ASSERT_CHECKING + vci_TidCridUpdateListContext *oldListContext = NULL; +#endif + + if ((command == vci_rc_wos_ros_conv) || + (command == vci_rc_collect_deleted)) + { + sel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0); + +#ifdef USE_ASSERT_CHECKING + oldListContext = vci_OpenTidCridUpdateList(&comContext->info, sel); +#endif + } + else if (command == vci_rc_generate_local_ros) + { + sel = comContext->local_ros->fetch_context->query_context->tid_crid_diff_sel; + } + + for (offset = 0; offset < numRowsToConvert; ++offset) + { + 
HeapTupleData tuple; + Buffer buffer; + int actualOffset = rowId + comContext->wos2ros_array.offset + offset; + + if (comContext->wos2ros_array.num <= actualOffset) + { + comContext->done = true; + break; + } + + CHECK_FOR_INTERRUPTS(); + + tuple.t_self = comContext->wos2ros_array.orig_tids[actualOffset]; + + if (!heap_fetch(comContext->heapRel, snapshot, &tuple, &buffer, true)) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)), + errdetail("TID (%d,%d) has been deleted from table \"%s\"", + ItemPointerGetBlockNumber(&tuple.t_self), + ItemPointerGetOffsetNumber(&tuple.t_self), + RelationGetRelationName(comContext->heapRel)), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel)))); + } + +#ifdef USE_ASSERT_CHECKING + if (oldListContext) + { + uint64 cridUint = vci_GetCridFromTid(oldListContext, &tuple.t_self, NULL); + + if (cridUint != VCI_INVALID_CRID) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)), + errdetail("try to insert TID (%d,%d) into ROS twice: extentId=%d, index=%d", + ItemPointerGetBlockNumber(&tuple.t_self), + ItemPointerGetOffsetNumber(&tuple.t_self), + vci_CalcExtentIdFromCrid64(cridUint), + vci_CalcRowIdInExtentFromCrid64(cridUint)), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel)))); + } +#endif + + vci_FillOneRowInRosChunkBuffer(&(comContext->buffer), + &(comContext->info), + &tuple.t_self, + &tuple, + comContext->indxColumnIdList, + comContext->heapAttrNumList, + tupleDesc); + + if (comContext->data_wos_del_list) + { + tuplesort_putdatum(comContext->data_wos_del_list, + ItemPointerGetDatum(&comContext->wos2ros_array.wos_tids[actualOffset]), false); + } + + ReleaseBuffer(buffer); + } + +#ifdef USE_ASSERT_CHECKING + if (oldListContext) + vci_CloseTidCridUpdateList(oldListContext); +#endif + } +} + +static 
void
+ReadOneExtentAndStoreInChunkStorage(vci_RosCommandContext *comContext)
+{
+	Size		rowId;
+
+	/* collect data for one extent */
+	for (rowId = 0;
+		 rowId < comContext->numRowsToConvert;
+		 rowId += comContext->numRowsAtOnce)
+	{
+		/* the number of rows in one chunk */
+		int			numRowsToConvert = comContext->numRowsToConvert - rowId;
+
+		/* never ask for more rows than the chunk buffer still has room for */
+		if (comContext->numRowsAtOnce - comContext->buffer.numFilled < numRowsToConvert)
+			numRowsToConvert = comContext->numRowsAtOnce - comContext->buffer.numFilled;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* fetch the data from original relation */
+		FillOneRosChunkBuffer(comContext, rowId, numRowsToConvert);
+		if (0 < comContext->buffer.numFilled)
+		{
+			/* copy chunk buffer in a compact manner */
+			vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+			vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+		}
+	}
+
+	/* the next extent starts after the rows consumed here */
+	comContext->wos2ros_array.offset += comContext->numRowsToConvert;
+}
+
+/**
+ * Translate the original TIDs in comContext->delvec_array into CRIDs and
+ * append them to the local delete list of comContext->local_ros.
+ *
+ * Each TID is looked up in the TID-CRID update list selected by sel.  A TID
+ * without a CRID is reported as index corruption (ERRCODE_DATA_CORRUPTED).
+ *
+ * @param[in,out] comContext  command context; local_ros->local_delete_list
+ *                            receives the CRIDs
+ * @param[in]     sel         selector passed to vci_OpenTidCridUpdateList()
+ * @return number of entries in the local delete list after appending
+ */
+static Size
+ConvertWhiteOut2LocalDeleteList(vci_RosCommandContext *comContext,
+								int sel)
+{
+	vci_local_delete_list *list = &(comContext->local_ros->local_delete_list);
+	int			cId;
+	vci_TidCridUpdateListContext *tidCridListContext;
+
+	Assert(list);
+	Assert(list->num_entry < list->length);
+
+	tidCridListContext = vci_OpenTidCridUpdateList(&comContext->info, sel);
+
+	for (cId = 0; cId < comContext->delvec_array.num; cId++)
+	{
+		ItemPointerData orig_tid;
+		uint64		crid;
+
+		orig_tid = comContext->delvec_array.orig_tids[cId];
+
+		crid = vci_GetCridFromTid(tidCridListContext, &orig_tid, NULL);
+
+		/* BlockNumber is unsigned 32-bit, so print the TID with %u */
+		if (crid == VCI_INVALID_CRID)
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+					 errdetail("try to delete TID (%u,%u) into local delete list",
+							   ItemPointerGetBlockNumber(&orig_tid),
+							   ItemPointerGetOffsetNumber(&orig_tid)),
+					 errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+
+		/* every store must stay inside the allocated crid_list */
+		Assert(list->num_entry < list->length);
+		list->crid_list[list->num_entry] = crid;
+		list->num_entry++;
+	}
+
+	vci_CloseTidCridUpdateList(tidCridListContext);
+
+	return list->num_entry;
+}
+
+/**
+ * Tell whether the given ROS command requires the main relation header to
+ * be written back when the command context is cleaned up.
+ *
+ * An unknown command trips an assertion and raises an internal error.
+ */
+static bool
+NeedMainRelHeaderUpdate(vci_ros_command_t command)
+{
+	switch (command)
+	{
+		case vci_rc_recovery:
+		case vci_rc_wos_ros_conv:
+		case vci_rc_update_del_vec:
+		case vci_rc_collect_deleted:
+			/* case vci_rc_compaction: */
+		case vci_rc_update_tid_crid:
+		case vci_rc_collect_extent:
+		case vci_rc_copy_command:
+		case vci_rc_wos_ros_conv_build:
+
+			return true;
+
+		case vci_rc_wos_delete:
+		case vci_rc_wos_insert:
+		case vci_rc_probe:
+		case vci_rc_query:
+		case vci_rc_generate_local_ros:
+		case vci_rc_drop_index:
+		case vci_rc_vacuum:
+
+			return false;
+
+		default:
+			Assert(false);
+			elog(ERROR, "internal error: unexpected ROS command");
+	}
+
+	return false;
+}
+
+/** Release the main relation header kept in the command context. */
+void
+vci_ReleaseMainRelInCommandContext(vci_RosCommandContext *comContext)
+{
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(&(comContext->info));
+}
+
+/**
+ * Close the original heap relation if the command context has it open, and
+ * clear the pointer so a second call is harmless.
+ */
+void
+vci_CloseHeapRelInCommandContext(vci_RosCommandContext *comContext)
+{
+	if (RelationIsValid(comContext->heapRel))
+		table_close(comContext->heapRel, AccessShareLock);
+	comContext->heapRel = NULL;
+}
+
+static int
+CmpUint64(const void *pa, const void *pb)
+{
+	uint64		a = *(uint64 *) pa;
+	uint64		b = *(uint64 *) pb;
+
+	return (a < b) ? -1 : ((b < a) ?
1 : 0); +} + +void +vci_InitRosCommandContext0(vci_RosCommandContext *context, + Relation rel, vci_ros_command_t command) +{ + Assert(context); + + MemSet(context, 0, sizeof(*context)); + + context->command = command; + context->indexOid = RelationGetRelid(rel); + + vci_InitMainRelHeaderInfo(&(context->info), rel, command); + vci_KeepMainRelHeader(&(context->info)); +} + +void +vci_InitRosCommandContext1(vci_RosCommandContext *comContext, + Size workareaSize, + int numInsertRows, + int numDeleteRows, + bool readOriginalData) +{ + Size worstCaseTupleSize; + int numColumns; + + Assert(comContext); + + comContext->xid = ((vci_rc_query == comContext->command) || + (vci_rc_generate_local_ros == comContext->command)) ? + InvalidTransactionId : GetCurrentTransactionId(); + + comContext->heapOid = IndexGetRelation(comContext->info.rel->rd_id, false); + + comContext->local_ros = NULL; + comContext->done = false; + + switch (comContext->command) + { + case vci_rc_generate_local_ros: + comContext->wos2ros_array.orig_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numInsertRows); + comContext->wos2ros_array.max = numInsertRows; + comContext->delvec_array.orig_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numDeleteRows); + comContext->delvec_array.max = numDeleteRows; + break; + + case vci_rc_wos_ros_conv: + case vci_rc_collect_deleted: + comContext->wos2ros_array.orig_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numInsertRows); + comContext->wos2ros_array.wos_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numInsertRows); + comContext->wos2ros_array.max = numInsertRows; + break; + + default: + break; + } + + comContext->numRowsToConvert = Min(Max(numInsertRows, numDeleteRows), VCI_NUM_ROWS_IN_EXTENT); + + /* + * Column sizes + */ + numColumns = vci_GetMainRelVar(&(comContext->info), vcimrv_num_columns, 0); + + /* + * get column size in worst case and column ID lists for both original + * relation and VCI relation + */ + 
comContext->numColumns = numColumns; + + if (readOriginalData) + { + Size allocatableSize = Min(workareaSize, MaxAllocSize); + int numRowsAtOnce; + int largestTupleSize; + + comContext->heapAttrNumList = (AttrNumber *) palloc(sizeof(AttrNumber) * numColumns); + comContext->indxColumnIdList = (int16 *) palloc(sizeof(int16) * numColumns); + comContext->columnSizeList = (int16 *) palloc(sizeof(int16) * numColumns); + worstCaseTupleSize = vci_GetColumnIdsAndSizes( + comContext->heapAttrNumList, + comContext->indxColumnIdList, + comContext->columnSizeList, + numColumns, + &(comContext->info), + comContext->heapOid); + + comContext->heapRel = table_open(comContext->heapOid, AccessShareLock); + + /* + * PostgreSQL limits the tuple size by TOAST_TUPLE_TARGET, normally. + * The upper limit of the tuple size is smaller than BLCKSZ. We use + * other area to keep the offset or data size in the chunk buffers or + * ROS. Here, we assume the type of offset is uint32. + */ + largestTupleSize = worstCaseTupleSize + + (comContext->numColumns * sizeof(uint32)); + + /* The number of rows in one chunk */ + numRowsAtOnce = (int) (allocatableSize * VCI_WOS_ROS_WORKAREA_SAFE_RATIO / + largestTupleSize); + numRowsAtOnce = (numRowsAtOnce / VCI_COMPACTION_UNIT_ROW) * VCI_COMPACTION_UNIT_ROW; + numRowsAtOnce = Max(numRowsAtOnce, VCI_COMPACTION_UNIT_ROW); + numRowsAtOnce = Min(numRowsAtOnce, VCI_NUM_ROWS_IN_EXTENT); + + comContext->numRowsAtOnce = numRowsAtOnce; + } + else + { + comContext->heapAttrNumList = NULL; + comContext->indxColumnIdList = NULL; + comContext->columnSizeList = NULL; + comContext->heapRel = NULL; + comContext->numRowsAtOnce = VCI_COMPACTION_UNIT_ROW; + } + + comContext->scan = NULL; + + switch (comContext->command) + { + case vci_rc_wos_ros_conv: + case vci_rc_collect_deleted: + case vci_rc_update_del_vec: + case vci_rc_vacuum: + comContext->oldestXmin = GetOldestNonRemovableTransactionId(comContext->info.rel); + comContext->wos2rosXid = comContext->oldestXmin; + break; + 
+ case vci_rc_generate_local_ros: + default: + comContext->oldestXmin = InvalidTransactionId; + comContext->wos2rosXid = InvalidTransactionId; + break; + } + +} + +void +vci_InitRosCommandContext2(vci_RosCommandContext *comContext, Size workareaSize) +{ + bool make_wos2ros_tid_list = false; + bool make_delvec_tid_list = false; + + comContext->data_wos_del_list = + tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false, + Min(workareaSize / 1024 / 3, INT_MAX), NULL, TUPLESORT_NONE); + comContext->whiteout_wos_del_list = + tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false, + Min(workareaSize / 1024 / 3, INT_MAX), NULL, TUPLESORT_NONE); + + switch (comContext->command) + { + case vci_rc_wos_ros_conv: + make_wos2ros_tid_list = true; + break; + + case vci_rc_collect_deleted: + make_wos2ros_tid_list = true; + break; + + case vci_rc_update_del_vec: + make_delvec_tid_list = true; + break; + + default: + break; + } + + if (make_wos2ros_tid_list || make_delvec_tid_list) + { + TupleDesc tupDesc; + AttrNumber sortKeys[] = {1}; + Oid sortOperators[] = {TIDLessOperator}; + Oid sortCollations[] = {InvalidOid}; + bool nullsFirstFlags[] = {false}; + + tupDesc = CreateTemplateTupleDesc(2); + + TupleDescInitEntry(tupDesc, (AttrNumber) 1, "orig_tid", TIDOID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 2, "wos_tid", TIDOID, -1, 0); + + comContext->tid_tid_tupdesc = tupDesc; + comContext->tid_tid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + + if (make_wos2ros_tid_list) + { + comContext->wos2ros_tid_list = + tuplesort_begin_heap(tupDesc, 1, + sortKeys, sortOperators, sortCollations, nullsFirstFlags, + Min(workareaSize / 1024 / 3, INT_MAX), NULL, TUPLESORT_NONE); + } + + if (make_delvec_tid_list) + { + comContext->delvec_tid_list = + tuplesort_begin_heap(tupDesc, 1, + sortKeys, sortOperators, sortCollations, nullsFirstFlags, + Min(workareaSize / 1024 / 3, INT_MAX), NULL, TUPLESORT_NONE); + } + } +} + +void 
+vci_InitRosChunkStroageAndBuffer(vci_RosCommandContext *comContext, bool forAppending)
+{
+	int			numRowsAtOnce;
+
+	Assert(RelationIsValid(comContext->heapRel));
+
+	numRowsAtOnce = comContext->numRowsAtOnce;
+
+	/* Initialize the buffers for building chunks of ROS data */
+	vci_InitOneRosChunkBuffer(&(comContext->buffer),
+							  numRowsAtOnce,
+							  comContext->columnSizeList,
+							  comContext->numColumns,
+							  false,
+							  &(comContext->info));
+
+	vci_InitRosChunkStorage(&(comContext->storage), numRowsAtOnce, forAppending);
+}
+
+/**
+ * Release the work resources held by a ROS command context.
+ *
+ * Drops the TID/TID tuple slot, ends the four tuplesort states, frees the
+ * TID/TID tuple descriptor, closes the heap relation, and frees the column
+ * lists and TID/block-number work arrays.  Every released member is reset
+ * to NULL.  The chunk buffer and chunk storage are destroyed only for the
+ * vci_rc_wos_ros_conv command.
+ *
+ * Unless neverWrite is true, the main relation header is written back for
+ * the commands for which NeedMainRelHeaderUpdate() returns true.
+ *
+ * @param[in,out] comContext  command context to clean
+ * @param[in]     neverWrite  true to skip writing the main relation header
+ */
+void
+vci_CleanRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite)
+{
+	if (comContext->tid_tid_slot)
+	{
+		/*
+		 * The slot was made with MakeSingleTupleTableSlot(); use the
+		 * matching destructor, which clears the slot and frees it, instead
+		 * of a raw pfree().
+		 */
+		ExecDropSingleTupleTableSlot(comContext->tid_tid_slot);
+		comContext->tid_tid_slot = NULL;
+	}
+
+	if (comContext->data_wos_del_list)
+	{
+		tuplesort_end(comContext->data_wos_del_list);
+		comContext->data_wos_del_list = NULL;
+	}
+
+	if (comContext->whiteout_wos_del_list)
+	{
+		tuplesort_end(comContext->whiteout_wos_del_list);
+		comContext->whiteout_wos_del_list = NULL;
+	}
+
+	if (comContext->wos2ros_tid_list)
+	{
+		tuplesort_end(comContext->wos2ros_tid_list);
+		comContext->wos2ros_tid_list = NULL;
+	}
+
+	if (comContext->delvec_tid_list)
+	{
+		tuplesort_end(comContext->delvec_tid_list);
+		comContext->delvec_tid_list = NULL;
+	}
+
+	/* freed after the slot and the sorts that were created from it */
+	if (comContext->tid_tid_tupdesc)
+	{
+		FreeTupleDesc(comContext->tid_tid_tupdesc);
+		comContext->tid_tid_tupdesc = NULL;
+	}
+
+	/* Close original heap relation if it is opened. */
+	vci_CloseHeapRelInCommandContext(comContext);
+
+	/*
+	 * Release chunk buffers - WOS ROS Conv.
+	 */
+	if (comContext->command == vci_rc_wos_ros_conv)
+	{
+		vci_DestroyOneRosChunkBuffer(&(comContext->buffer));
+		vci_DestroyRosChunkStorage(&(comContext->storage));
+	}
+
+	/* the three column lists are allocated together, so free them together */
+	if (NULL != comContext->heapAttrNumList)
+	{
+		/* release local work area */
+		pfree(comContext->heapAttrNumList);
+		pfree(comContext->indxColumnIdList);
+		pfree(comContext->columnSizeList);
+		comContext->heapAttrNumList = NULL;
+		comContext->indxColumnIdList = NULL;
+		comContext->columnSizeList = NULL;
+	}
+
+	/* release local work area */
+	if (comContext->wos2ros_array.orig_tids)
+	{
+		pfree(comContext->wos2ros_array.orig_tids);
+		comContext->wos2ros_array.orig_tids = NULL;
+	}
+
+	if (comContext->wos2ros_array.wos_tids)
+	{
+		pfree(comContext->wos2ros_array.wos_tids);
+		comContext->wos2ros_array.wos_tids = NULL;
+	}
+
+	if (comContext->delvec_array.orig_tids)
+	{
+		pfree(comContext->delvec_array.orig_tids);
+		comContext->delvec_array.orig_tids = NULL;
+	}
+
+	if (comContext->utility_array.orig_blknos)
+	{
+		pfree(comContext->utility_array.orig_blknos);
+		comContext->utility_array.orig_blknos = NULL;
+	}
+
+	if (neverWrite)
+		return;
+
+	/* write header of the main relation */
+	if (NeedMainRelHeaderUpdate(comContext->command))
+		vci_WriteMainRelVar(&(comContext->info),
+							vci_wmrv_update);
+}
+
+/**
+ * Finish a ROS command: clean the command context, release the main
+ * relation header, and mark the context as no longer bound to an index or
+ * command.
+ *
+ * @param[in,out] comContext  command context to finish
+ * @param[in]     neverWrite  passed through to vci_CleanRosCommandContext()
+ */
+void
+vci_FinRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite)
+{
+	vci_CleanRosCommandContext(comContext, neverWrite);
+
+	/* release the main relation */
+	vci_ReleaseMainRelInCommandContext(comContext);
+
+	comContext->indexOid = InvalidOid;
+	comContext->command = vci_rc_invalid;
+}
+
+/**
+ * numRows is from 1 to VCI_NUM_ROWS_IN_EXTENT
+ * workareaSize should be taken from the configuration parameter
+ * in postgresql.conf.
+ * It just convert one extent.
+ */ +int +vci_ConvertWos2Ros(Relation mainRel, Size workareaSize, int numRows) +{ + vci_RosCommandContext comContext; + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + int result = -1; + + Assert((0 < numRows) && (numRows <= VCI_NUM_ROWS_IN_EXTENT)); + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_wos_ros_conv); + + /* recover ROS if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* prepare local work area */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "WOS->ROS conversion", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + vci_InitRosCommandContext1(&comContext, + workareaSize / 3 * 2, + numRows, 0, + true); + + vci_InitRosCommandContext2(&comContext, workareaSize / 3); + + if (TransactionIdPrecedes(GetCurrentTransactionId(), + (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0))) + goto done; + + GetActiveSnapshot(); + + /* obtain new extent ID */ + comContext.extentIdSrc = VCI_INVALID_EXTENT_ID; + comContext.extentId = vci_GetFreeExtentId(&(comContext.info)); + + /* Write Recovery Information of this command. 
*/ + vci_WriteRecoveryRecordForExtentInfo(&comContext.info, comContext.extentId, comContext.extentIdSrc); + vci_InitRecoveryRecordForFreeSpace(&comContext.info); + + vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid); + + constructTidSortState(&comContext); + + /* call Main routine */ + fillTidListFromTidSortState(&comContext, numRows); + + vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ ); + + result = ConvertWos2Ros(&comContext); + + /* remove WOS entries */ + cleanUpWos(&comContext, vcimrv_data_wos_oid); + cleanUpWos(&comContext, vcimrv_whiteout_wos_oid); + + /* Xmax WOS entry */ + RemoveWosEntries(&comContext, WOS_Data); + RemoveWosEntries(&comContext, WOS_Whiteout); + +done: + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); + + return result; +} + +static void +FlushTidCridPairListToTreeForBuild(vci_TidCridRelations *relPair, + vcis_tidcrid_pair_list_t *appList, + BlockNumber blockNumber) +{ + if (0 < appList->num) + { + ItemPointerData treeNode; + + vci_GetTidCridSubTree(relPair, blockNumber, &treeNode); + if (!ItemPointerIsValid(&treeNode)) + vci_CreateTidCridSubTree(relPair, blockNumber, &treeNode); + vci_UpdateTidCridSubTree(relPair, &treeNode, appList); + } + appList->num = 0; +} + +static void +UpdateTidCridForBuild(vci_RosCommandContext *comContext) +{ + RosChunkStorage *src = &(comContext->storage); + vci_TidCridRelations relPair; + const LOCKMODE lockmode = ExclusiveLock; + BlockNumber blockNumber = InvalidBlockNumber; + int32 offset = offsetof(vcis_tidcrid_pair_list_t, body); + int chunkId; + int rowIdInExt = 0; + vcis_tidcrid_pair_list_t *appList = palloc(offset + + (sizeof(vcis_tidcrid_pair_item_t) * src->numTotalRows)); + + vci_OpenTidCridRelations(&relPair, &comContext->info, lockmode); + appList->num = 0; + + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + int rowId; + + for (rowId = 0; 
rowId < src->chunk[chunkId]->numFilled; ++rowId) + { + ItemPointer itemPtr = (ItemPointer) &(src->chunk[chunkId]-> + tidData[sizeof(ItemPointerData) * rowId]); + + if (blockNumber != ItemPointerGetBlockNumber(itemPtr)) + { + if (BlockNumberIsValid(blockNumber)) + FlushTidCridPairListToTreeForBuild(&relPair, appList, + blockNumber); + blockNumber = ItemPointerGetBlockNumber(itemPtr); + } + + Assert(appList->num < src->numTotalRows); + appList->body[appList->num].crid = vci_GetCridFromUint64( + vci_CalcCrid64(comContext->extentId, rowIdInExt)); + ItemPointerCopy(itemPtr, &appList->body[appList->num].page_item_id); + (appList->num)++; + + Assert(rowIdInExt < src->numTotalRows); + rowIdInExt++; + } + } + if (BlockNumberIsValid(blockNumber)) + FlushTidCridPairListToTreeForBuild(&relPair, appList, blockNumber); + pfree(appList); + vci_CloseTidCridRelations(&relPair, lockmode); +} + +/* Implementation of callback interface:IndexBuildCallback */ +static void +vci_build_callback(Relation rel, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + vci_RosCommandContext *comContext = (vci_RosCommandContext *) state; + + Assert(comContext); + + if (tupleIsAlive) + { + Assert((0 <= comContext->buffer.numFilled) && + (comContext->buffer.numFilled < comContext->numRowsAtOnce)); + + vci_FillOneRowInRosChunkBuffer(&(comContext->buffer), + &(comContext->info), + &IndexHeapTuple->t_self, /* use the original heap + * tuple saved in + * heapam_index_build_range_scan() */ + IndexHeapTuple, /* use the original heap + * tuple saved in + * heapam_index_build_range_scan() */ + comContext->indxColumnIdList, + comContext->heapAttrNumList, + RelationGetDescr(comContext->heapRel)); + + if (comContext->numRowsAtOnce <= comContext->buffer.numFilled) + { + vci_RegisterChunkBuffer(&(comContext->storage), + &(comContext->buffer)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + } + + if (VCI_NUM_ROWS_IN_EXTENT <= + (comContext->storage.numTotalRows + 
comContext->buffer.numFilled)) + { + Assert(TransactionIdIsValid(comContext->xid)); + if (0 < comContext->buffer.numFilled) + { + vci_RegisterChunkBuffer(&(comContext->storage), + &(comContext->buffer)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + } + vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info), + comContext->extentId, + comContext->xid); + UpdateTidCridForBuild(comContext); + vci_WriteOneExtent(&(comContext->info), + &(comContext->storage), + comContext->extentId, + comContext->xid, + InvalidTransactionId, + comContext->xid); + vci_ResetRosChunkStorage(&(comContext->storage)); + comContext->extentId++; + } + } +} + +static void +FinalizeBuild(vci_RosCommandContext *comContext) +{ + if (0 < comContext->buffer.numFilled) + vci_RegisterChunkBuffer(&(comContext->storage), + &(comContext->buffer)); + + if (0 < comContext->storage.numTotalRows) + { + Assert(TransactionIdIsValid(comContext->xid)); + vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info), + comContext->extentId, + comContext->xid); + UpdateTidCridForBuild(comContext); + vci_WriteOneExtent(&(comContext->info), + &(comContext->storage), + comContext->extentId, + comContext->xid, + InvalidTransactionId, + comContext->xid); + comContext->extentId++; + } +} + +/** + * @brief Obtain number of rows in the relation estimated by ANALYZE or + * VACUUM commands. + * + * @param[in] relid The Oid of the relation. + * @return The estimated number of rows. + */ +static double +GetEstimatedNumRows(Oid relid) +{ + HeapTuple tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + + if (HeapTupleIsValid(tp)) + { + Form_pg_class reltup = (Form_pg_class) GETSTRUCT(tp); + double result = Max(reltup->reltuples, 0); + + ReleaseSysCache(tp); + + return result; + } + else + return 0.0; +} + +/** + * This function is assumed when the VCI index is newly built, and + * it converts all the data in the relation of PostgreSQL into ROS. 
+ */
+double
+vci_ConvertWos2RosForBuild(Relation mainRel,
+						   Size workareaSize,
+						   IndexInfo *indexInfo)
+{
+	vci_RosCommandContext comContext;
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	double		result = 0;
+
+	vci_InitRosCommandContext0(&comContext, mainRel,
+							   vci_rc_wos_ros_conv_build);
+
+	/* prepare local work area */
+	memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+										  "WOS->ROS conversion",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize,
+							   VCI_NUM_ROWS_IN_EXTENT, 0,
+							   true);
+
+	vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ );
+
+	comContext.extentId = VCI_FIRST_NORMAL_EXTENT_ID;
+
+	/*
+	 * Initialize information for printing progress
+	 */
+	comContext.estimatedNumRows = GetEstimatedNumRows(
+													  RelationGetRelid(comContext.heapRel));
+	if (comContext.estimatedNumRows < 1)
+		comContext.estimatedNumRows = 1;
+	comContext.numConvertedRows = 0;
+
+	/*
+	 * Bounded copy so an over-long name can never overrun the buffer.
+	 * NOTE(review): relies on relName being a char array member, not a
+	 * pointer - confirm against vci_ros_command.h.
+	 */
+	strlcpy(comContext.relName, RelationGetRelationName(mainRel),
+			sizeof(comContext.relName));
+
+	result = table_index_build_scan(comContext.heapRel,
+									mainRel,
+									indexInfo,
+									true,	/* allow syncscan */
+									true,
+									vci_build_callback,
+									(void *) &comContext, NULL);
+	indexInfo->ii_BrokenHotChain = true;
+
+	/* flush the rows that did not fill a whole extent */
+	FinalizeBuild(&comContext);
+
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/**
+ * Delete from one WOS relation every tuple whose TID was queued in the
+ * matching delete list of the command context.
+ *
+ * The data WOS uses comContext->data_wos_del_list and the whiteout WOS uses
+ * comContext->whiteout_wos_del_list.  The list is sorted first, then each
+ * TID is removed with simple_heap_delete().
+ *
+ * @param[in,out] comContext  command context holding the delete lists
+ * @param[in]     wos_kind    WOS_Data or WOS_Whiteout; any other value
+ *                            raises an error
+ */
+static void
+RemoveWosEntries(vci_RosCommandContext *comContext, WosKind wos_kind)
+{
+	Datum		value;
+	bool		isnull;
+	Relation	rel;
+	Oid			wos_oid;
+	Tuplesortstate *sortstate = NULL;
+
+	switch (wos_kind)
+	{
+		case WOS_Data:
+			wos_oid = vci_GetMainRelVar(&comContext->info, vcimrv_data_wos_oid, 0);
+			sortstate = comContext->data_wos_del_list;
+			break;
+
+		case WOS_Whiteout:
+			wos_oid = vci_GetMainRelVar(&comContext->info, vcimrv_whiteout_wos_oid, 0);
+			sortstate = comContext->whiteout_wos_del_list;
+			break;
+		default:
+
+			/*
+			 * Fail here rather than fall through with a NULL sort state and
+			 * an invalid relation OID.
+			 */
+			wos_oid = InvalidOid;
+			elog(ERROR, "unrecognized WOS kind: %d", (int) wos_kind);
+			break;
+	}
+
+	tuplesort_performsort(sortstate);
+
+	rel = relation_open(wos_oid, RowExclusiveLock);
+
+	while (tuplesort_getdatum(sortstate, true, true, &value, &isnull, NULL))
+	{
+		ItemPointer tid;
+
+		tid = DatumGetItemPointer(value);
+
+		simple_heap_delete(rel, tid);
+	}
+
+	RelationSetTargetBlock(rel, InvalidBlockNumber);
+
+	relation_close(rel, RowExclusiveLock);
+}
+
+static uint64
+cleanUpWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType)
+{
+	const LOCKMODE lockmode = ShareUpdateExclusiveLock;
+	vci_MainRelHeaderInfo *info;
+	BlockNumber nblocks;
+	BlockNumber blkno;
+	OffsetNumber offnum;
+	ItemPointer dead_tuples;
+	int			max_dead_tuples;
+	int			tupindex;
+	uint64		total_live = 0;
+
+	HeapTupleData tuple;
+
+	Oid			oidWosType;
+	TransactionId oldestXmin;
+	Relation	rel;
+
+	info = &comContext->info;
+
+	oldestXmin = comContext->oldestXmin;
+
+	oidWosType = vci_GetMainRelVar(info, wosType, 0);
+
+	rel = table_open(oidWosType, lockmode);
+
+	max_dead_tuples = MaxHeapTuplesPerPage;
+	dead_tuples = palloc0(sizeof(ItemPointerData) * max_dead_tuples);
+
+	nblocks = RelationGetNumberOfBlocks(rel);
+	for (blkno = 0; blkno < nblocks; blkno++)
+	{
+		Size		freespace;
+		int			num_dead_tuples = 0;
+		TransactionId snapshotConflictHorizon = InvalidTransactionId;
+
+		Buffer		buffer;
+		Buffer		vmbuffer = InvalidBuffer;
+		Page		page;
+		OffsetNumber maxoff;
+
+		OffsetNumber unused[MaxOffsetNumber];
+		int			nunused = 0;
+		bool		is_visible_page = true;
+
+		/* Get a buffer containing the target block. */
+		buffer = ReadBuffer(rel, blkno);
+		page = BufferGetPage(buffer);
+
+		if (!ConditionalLockBufferForCleanup(buffer))
+		{
+			ReleaseBuffer(buffer);
+			continue;
+		}
+
+		/* Collect removable dead tuples in the target block.
*/ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + /* Unused items require no processing, but we count 'em */ + if (!ItemIdIsUsed(itemid)) + continue; + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + continue; + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting (at + * least in the common case where heap_page_prune() just freed up + * a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + dead_tuples[num_dead_tuples++] = tuple.t_self; + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, oldestXmin, buffer)) + { + case HEAPTUPLE_DEAD: + dead_tuples[num_dead_tuples++] = tuple.t_self; + HeapTupleHeaderAdvanceConflictHorizon(tuple.t_data, + &snapshotConflictHorizon); + break; + case HEAPTUPLE_LIVE: + ++total_live; + break; + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + case HEAPTUPLE_DELETE_IN_PROGRESS: + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } + + if (num_dead_tuples == 0) + { + /* + * Skip repair of a fragmentation, because dead tuple is not + * exist. + */ + UnlockReleaseBuffer(buffer); + continue; + } + + visibilitymap_pin(rel, blkno, &vmbuffer); + + /* + * this routine is copied from lazy_vacuum_heap_rel() & + * lazy_vacuum_heap_page(), and modified. 
+ */ + + START_CRIT_SECTION(); + + for (tupindex = 0; tupindex < num_dead_tuples; tupindex++) + { + BlockNumber tblk; + OffsetNumber toff; + ItemId itemid; + + HeapTupleHeader htup; + + tblk = ItemPointerGetBlockNumber(&dead_tuples[tupindex]); + if (tblk != blkno) + break; /* past end of tuples for this block */ + toff = ItemPointerGetOffsetNumber(&dead_tuples[tupindex]); + + itemid = PageGetItemId(page, toff); + if (!ItemIdHasStorage(itemid)) + continue; + if (!ItemIdIsDead(itemid)) + continue; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + dead_tuples[tupindex] = *(ItemPointer) ((char *) htup + htup->t_hoff); + + Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); + ItemIdSetUnused(itemid); + unused[nunused++] = toff; + } + + /* Attempt to truncate line pointer array now */ + if (nunused > 0) + PageTruncateLinePointerArray(page); + + + /* Mark buffer dirty before we write WAL. */ + MarkBufferDirty(buffer); + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemid)) + { + is_visible_page = false; + break; + } + } + + if (BufferIsValid(vmbuffer)) + { + if (is_visible_page) + { + PageSetAllVisible(page); + MarkBufferDirty(buffer); + visibilitymap_set(rel, blkno, buffer, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, VISIBILITYMAP_ALL_VISIBLE); + } + + ReleaseBuffer(vmbuffer); + } + + /* XLOG stuff */ + if (nunused > 0 && RelationNeedsWAL(rel)) + { + /* + * Commit add323d added the vmbuffer/vmflags parameters. + * A quick fix was needed to allow build to proceed. + * + * TODO Confirm if passing InvalidBuffer, 0 is OK here. 
+ */ + log_heap_prune_and_freeze(rel, buffer, + InvalidBuffer, /* vmbuffer */ + 0, /* vmflags */ + snapshotConflictHorizon, + false, /* no cleanup lock required */ + PRUNE_ON_ACCESS, + NULL, 0, /* frozen */ + NULL, 0, /* redirected */ + NULL, 0, /* dead */ + unused, nunused); + } + + END_CRIT_SECTION(); + + freespace = PageGetHeapFreeSpace(page); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + + RecordPageWithFreeSpace(rel, blkno, freespace); + + /* + * in vci_UnregisterTIDFromTIDTree(), TidTree in memory will be + * rebuild when the size was too large to store in memory, and the + * size is reduced to fit to the memory size. At that time, data WOS + * will be scan to obtain TID list. So, vci_UnregisterTIDFromTIDTree() + * can not be called in the critical section above. + */ + } + + pfree(dead_tuples); + table_close(rel, lockmode); + + return total_live; +} + +/** + * generate local ROS. + * This function is assumed to be called in backend process, not parallel + * background worker. Here, vci_CSFetchContext is used unlocalized. 
+ */ +vci_local_ros_t * +vci_GenerateLocalRos(vci_CSQueryContext queryContext, + Size workareaSize, + int64 numDataWosRows, + int64 numWhiteoutWosRows) +{ + vci_RosCommandContext comContext; + int numRowsInExtent; + MemoryContext localMemCtx; + MemoryContext sharedMemCtx; + MemoryContext oldMemCtx; + vci_local_ros_t *result; + Size partedWorkareaSize = workareaSize / 4; + int64 numLocalDeleteListRows; + + numRowsInExtent = vci_GetNumRowsInLocalRosExtent(queryContext->num_columns); + + sharedMemCtx = AllocSetContextCreate(queryContext->shared_memory_context, + "Work for Local ROS generation", + ALLOCSET_DEFAULT_SIZES); + + result = MemoryContextAllocZero(sharedMemCtx, sizeof(vci_local_ros_t)); + result->num_local_extents = 0; + result->extent = NULL; + result->memory_context = sharedMemCtx; + result->fetch_context = vci_CSCreateFetchContextBase(queryContext, + Min(numRowsInExtent, numDataWosRows), + queryContext->num_columns, + queryContext->attr_num, + true, + true, + true, + false); /* no compression */ + + numRowsInExtent = result->fetch_context->num_rows_read_at_once; + + Assert(queryContext == result->fetch_context->query_context); + + /* + * Local Delete List + */ + numLocalDeleteListRows = numDataWosRows + numWhiteoutWosRows; + + result->local_delete_list.crid_list = + MemoryContextAllocZero(result->memory_context, + sizeof(*(result->local_delete_list.crid_list)) * numLocalDeleteListRows); + result->local_delete_list.num_entry = 0; + result->local_delete_list.length = numLocalDeleteListRows; + + Assert(0 == ((uintptr_t) (result->local_delete_list.crid_list) & (MAXIMUM_ALIGNOF - 1))); + + localMemCtx = AllocSetContextCreate(TopTransactionContext, + "Work for Local ROS generation", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(localMemCtx); + + vci_InitRosCommandContext0(&comContext, queryContext->info->rel, + vci_rc_generate_local_ros); + vci_InitRosCommandContext1(&comContext, + partedWorkareaSize, + numDataWosRows, numWhiteoutWosRows, + 
true); + + vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ ); + + comContext.inclusiveXid = queryContext->inclusive_xid; + comContext.exclusiveXid = queryContext->exclusive_xid; + + Assert(queryContext->num_data_wos_entries <= VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS); + Assert(queryContext->num_whiteout_wos_entries <= VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS); + + constructTidArray(&comContext, + (int) queryContext->num_data_wos_entries, + (int) queryContext->num_whiteout_wos_entries); + + comContext.numRowsToConvert = Min(comContext.numRowsToConvert, + numRowsInExtent); + comContext.local_ros = result; + queryContext->local_ros = result; + + MemoryContextSwitchTo(sharedMemCtx); + + PG_TRY(); + { + ConvertWos2LocalRos(&comContext); + + comContext.local_ros = result; + + ConvertWhiteOut2LocalDeleteList(&comContext, + result->fetch_context->query_context->tid_crid_diff_sel); + + qsort(result->local_delete_list.crid_list, + result->local_delete_list.num_entry, + sizeof(uint64), + CmpUint64); + + queryContext->local_ros = result; + queryContext->num_local_ros_extents = result->num_local_extents; + queryContext->delete_list = comContext.local_ros->local_delete_list.crid_list; + queryContext->num_delete = comContext.local_ros->local_delete_list.num_entry; + } + PG_CATCH(); + { + if (geterrcode() == ERRCODE_OUT_OF_MEMORY) + { + vci_FinRosCommandContext(&comContext, true /* never write */ ); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(localMemCtx); + } + + PG_RE_THROW(); + } + PG_END_TRY(); + + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(localMemCtx); + + return result; +} + +/** + * in vci_DestroyLocalRos(), release the memory context allocated to the + * local ros pointed by localRos. + * We have not need to pfree() each element. 
+ */ +void +vci_DestroyLocalRos(vci_local_ros_t *localRos) +{ + MemoryContext memCtx; + + Assert(localRos); + memCtx = localRos->memory_context; + MemoryContextDelete(memCtx); +} + +uint32 +vci_CountFreezedInDataWos(Relation mainRel, Size workareaSize) +{ + uint32 count = 0; + vci_MainRelHeaderInfo infoData = {0}; + vci_MainRelHeaderInfo *info = &infoData; + + Oid dataWosOid; + Relation dataWosRel; + + TableScanDesc scan; + HeapTuple tuple; + Snapshot snapshot; + + vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe); + vci_KeepMainRelHeader(info); + + dataWosOid = (Oid) vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0); + dataWosRel = table_open(dataWosOid, AccessShareLock); + + snapshot = vci_GetSnapshotForWos2Ros(); + + scan = table_beginscan(dataWosRel, snapshot, 0, NULL); + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + count++; + } + table_endscan(scan); + + PopActiveSnapshot(); + + /* release the data WOS relation */ + table_close(dataWosRel, AccessShareLock); + + /* release the main relation */ + vci_ReleaseMainRelHeader(info); + + return count; +} + +/* --------------------------------------------------------------*/ +/* Update Delete Lists */ +/* --------------------------------------------------------------*/ + +uint32 +vci_CountFreezedInWhiteoutWos(Relation mainRel, Size workareaSize) +{ + uint32 count = 0; + vci_MainRelHeaderInfo infoData = {0}; + vci_MainRelHeaderInfo *info = &infoData; + + Oid whiteoutWosOid; + Relation whiteoutWosRel; + + TableScanDesc scan; + HeapTuple tuple; + Snapshot snapshot; + + vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe); + vci_KeepMainRelHeader(info); + + whiteoutWosOid = (Oid) vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0); + whiteoutWosRel = table_open(whiteoutWosOid, AccessShareLock); + + snapshot = vci_GetSnapshotForWos2Ros(); + + scan = table_beginscan(whiteoutWosRel, snapshot, 0, NULL); + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + while 
((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		count++;
+	}
+	table_endscan(scan);
+
+	PopActiveSnapshot();
+
+	/* release the whiteout WOS relation */
+	table_close(whiteoutWosRel, AccessShareLock);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(info);
+
+	return count;
+}
+
+/*
+ * UpdateDelVec
+ *		Move up to numRowsAtOnce whiteout entries into the delete vector.
+ *
+ * Phase 1 takes entries from the whiteout TID list, translates each original
+ * TID into a CRID through the currently selected TID-CRID update list, and
+ * writes a merged update list (into the other selector) in which those TIDs
+ * map to VCI_INVALID_CRID.  A TID that already maps to VCI_INVALID_CRID is
+ * reported as index corruption.
+ *
+ * Phase 2 walks the collected CRIDs in sorted order, sets the matching bit in
+ * the delete-vector column, and adds the per-extent number of newly deleted
+ * rows to the extent info in the main relation.
+ *
+ * workareaSize is in bytes; each of the two sorts gets half of it (the
+ * tuplesort work-memory argument is in kB).
+ *
+ * Returns the number of whiteout entries processed.
+ */
+static uint64
+UpdateDelVec(vci_RosCommandContext *comContext, Size workareaSize, uint64 numRowsAtOnce)
+{
+	uint32		numExtents;
+	Tuplesortstate *cridList;
+	uint64		result = 0;
+
+	if (comContext->num_delvec_tids == 0)
+		return 0;
+
+	numExtents = vci_GetMainRelVar(&comContext->info, vcimrv_num_extents, 0);
+
+	cridList =
+		tuplesort_begin_datum(INT8OID, Int8LessOperator, InvalidOid, false,
+							  Min(workareaSize / 1024 / 2, INT_MAX), NULL, TUPLESORT_NONE);
+
+	/*
+	 * Phase 1. Convert TID List -> CRID List
+	 */
+	do
+	{
+		vci_TidCridUpdateListContext *oldListContext;
+		Tuplesortstate *addList;
+
+		uint32		oldSel;
+		uint32		newSel;
+
+		/* the update list is double-buffered: write into the other slot */
+		oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+		newSel = 1 ^ oldSel;
+
+		oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+		addList =
+			tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+								  Min(workareaSize / 1024 / 2, INT_MAX), NULL, TUPLESORT_NONE);
+		while (result < numRowsAtOnce)
+		{
+			ItemPointerData orig_tid;
+			ItemPointerData wos_tid;
+			uint64		cridUint;
+
+			if (!get_entry_into_tid_list(comContext, WOS_Whiteout, &orig_tid, &wos_tid))
+				break;
+
+			/* remember the whiteout WOS entry so it can be removed later */
+			if (comContext->whiteout_wos_del_list)
+				tuplesort_putdatum(comContext->whiteout_wos_del_list, ItemPointerGetDatum(&wos_tid), false);
+
+			cridUint = vci_GetCridFromTid(oldListContext, &orig_tid, NULL);
+
+			/*
+			 * BlockNumber is an unsigned 32-bit value, so print the TID with
+			 * %u; %d would show large block numbers as negative.
+			 */
+			if (cridUint == VCI_INVALID_CRID)
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+						 errdetail("try to delete TID (%u,%u) into delete vector twice",
+								   ItemPointerGetBlockNumber(&orig_tid),
+								   ItemPointerGetOffsetNumber(&orig_tid)),
+						 errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+
+			/* list for storage */
+			tuplesort_putdatum(addList, ItemPointerGetDatum(&orig_tid), false);
+
+			/* list for operation */
+			tuplesort_putdatum(cridList, Int64GetDatum((int64) cridUint), false);
+
+			result++;
+		}
+
+		vci_CloseTidCridUpdateList(oldListContext);
+
+		tuplesort_performsort(addList);
+
+		/* Insert TID->CRID(Invalid) List */
+		vci_MergeAndWriteTidCridUpdateList(&comContext->info, newSel, oldSel, addList, vci_GetCridFromUint64(VCI_INVALID_CRID));
+
+		tuplesort_end(addList);
+
+	} while (false);			/* phase1 */
+
+	elog(DEBUG2, "CRID List OK");
+
+	/*
+	 * Phase 2. loop for crid
+	 */
+	do
+	{
+		LOCKMODE	lockmode = RowExclusiveLock;
+		vci_ColumnRelations delvecCol;
+
+		BlockNumber prevBlkno = InvalidBlockNumber;
+		OffsetNumber prevOffset = InvalidOffsetNumber;
+
+		Buffer		buffer = InvalidBuffer;
+		Page		page = NULL;
+
+		bool		readFirstBlock = false;
+		Datum		value;
+		bool		isnull;
+
+		/* per-extent counters for the extent-info page being accumulated */
+		uint32		numDeletedRows[VCI_MAX_PAGE_SPACE / sizeof(vcis_m_extent_t)];
+		int32		topExtentId = -1;
+		BlockNumber topBlockNumber = InvalidBlockNumber;
+
+		memset(numDeletedRows, 0, sizeof(numDeletedRows));
+
+		tuplesort_performsort(cridList);
+
+		vci_OpenColumnRelations(&delvecCol, &comContext->info,
+								VCI_COLUMN_ID_DELETE, lockmode);
+
+		while (tuplesort_getdatum(cridList, true, true, &value, &isnull, NULL))
+		{
+			HeapTupleHeader htup;
+			int32		extentId;
+			BlockNumber blkno;
+			OffsetNumber offset;
+			uint32		byte_num;
+			uint32		setBitPos;
+			uint64		crid;
+			BlockNumber extentInfoBlkno;
+			OffsetNumber extentInfoOffset;
+
+			crid = (uint64) DatumGetInt64(value);
+
+			extentId = vci_CalcExtentIdFromCrid64(crid);
+			blkno = vci_CalcBlockNumberFromCrid64ForDelete(crid);
+			offset = vci_CalcOffsetNumberFromCrid64ForDelete(crid);
+			byte_num = vci_CalcByteFromCrid64ForDelete(crid);
+			setBitPos = vci_CalcBitFromCrid64ForDelete(crid);
+
+			/* moving on to another item: flush the one modified so far */
+			if ((blkno != prevBlkno) || (offset != prevOffset))
+			{
+				if (readFirstBlock)
+				{
+					/* write Tuple & WAL */
+					vci_WriteItem(delvecCol.data, buffer, prevOffset);
+				}
+			}
+
+			/* moving on to another page: swap the locked buffer */
+			if (blkno != prevBlkno)
+			{
+				if (readFirstBlock)
+					UnlockReleaseBuffer(buffer);
+
+				buffer = vci_ReadBufferWithPageInitDelVec(delvecCol.data, blkno);
+				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+				page = BufferGetPage(buffer);
+
+				readFirstBlock = true;
+			}
+
+			/* Calc bits & overwrite */
+			htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offset));
+			*((char *) htup + htup->t_hoff + byte_num) |= 1 << setBitPos;
+
+			vci_GetExtentInfoPosition(&extentInfoBlkno, &extentInfoOffset, extentId);
+
+			/*
+			 * The extent info lives on another main-relation page: write the
+			 * counters gathered for the previous page and start over.
+			 */
+			if (topBlockNumber != extentInfoBlkno)
+			{
+				writeNumDeleteRowsIntoExntetInfo(&comContext->info, topExtentId, numExtents, numDeletedRows);
+
+				memset(numDeletedRows, 0, sizeof(numDeletedRows));
+
+				topExtentId = extentId;
+				topBlockNumber = extentInfoBlkno;
+			}
+
+			numDeletedRows[extentId - topExtentId]++;
+
+			prevBlkno = blkno;
+			prevOffset = offset;
+		}
+
+		/* write remaining Tuple & WAL, and release buffer */
+		if (readFirstBlock)
+		{
+			Assert(BufferIsValid(buffer));
+			vci_WriteItem(delvecCol.data, buffer, prevOffset);
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/* Close Column */
+		vci_CloseColumnRelations(&delvecCol, lockmode);
+
+		if (BlockNumberIsValid(topBlockNumber))
+			writeNumDeleteRowsIntoExntetInfo(&comContext->info, topExtentId, numExtents, numDeletedRows);
+
+	} while (false);			/* phase 2 */
+
+	tuplesort_end(cridList);
+
+	elog(DEBUG2, "update delvec OK");
+
+	return result;
+}
+
+/*
+ * writeNumDeleteRowsIntoExntetInfo
+ *		Add numDeletedRows[] to num_deleted_rows of the extents stored on the
+ *		same main-relation page as topExtentId.
+ *
+ * numDeletedRows[i] belongs to extent (topExtentId + i).  Nothing is done
+ * when topExtentId is negative.
+ */
+static void
+writeNumDeleteRowsIntoExntetInfo(vci_MainRelHeaderInfo *info, int32 topExtentId, uint32 numExtents, uint32 *numDeletedRows)
+{
+	BlockNumber topBlockNumber;
+	OffsetNumber topOffsetNumber;
+	Buffer		buffer;
+	Page		page;
+	int32		extentId;
+
+	if (topExtentId < 0)
+		return;
+
+	vci_GetExtentInfoPosition(&topBlockNumber, &topOffsetNumber, topExtentId);
+
+	buffer = vci_ReadBufferWithPageInit(info->rel, topBlockNumber);
+
+	/* LockBuffer(buffer, BUFFER_LOCK_SHARE); */
+
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+	page = BufferGetPage(buffer);
+
+	/* walk the extents that share this page, starting at topExtentId */
+	for (extentId = topExtentId; (uint32) extentId < numExtents; extentId++)
+	{
+		BlockNumber curBlockNumber;
+		OffsetNumber curOffsetNumber;
+		vcis_m_extent_t *extentInfo;
+
+		vci_GetExtentInfoPosition(&curBlockNumber, &curOffsetNumber, extentId);
+
+		if (curBlockNumber != topBlockNumber)
+			break;				/* next extent lives on another page */
+
+		/* curOffsetNumber is used as a byte offset into the page here */
+		extentInfo = (vcis_m_extent_t *) &(((char *) page)[curOffsetNumber]);
+
+		extentInfo->num_deleted_rows += numDeletedRows[extentId - topExtentId];
+	}
+
+	vci_WriteOneItemPage(info->rel, buffer);
+
+	UnlockReleaseBuffer(buffer);
+}
+/*
+ * vci_UpdateDelVec
+ *		ROS command entry point that applies whiteout WOS entries to the
+ *		delete vector.
+ *
+ * Work is done in a private memory context under TopTransactionContext that
+ * is deleted before returning.  Half of workareaSize is given to the command
+ * context setup and half to UpdateDelVec().
+ *
+ * Returns the value of UpdateDelVec(), or -1 when the current transaction ID
+ * precedes the stored current ROS version and nothing is done.
+ */
+int
+vci_UpdateDelVec(Relation mainRel, Size workareaSize, int numRows)
+{
+	int			result = -1;
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	vci_RosCommandContext comContext;
+
+	vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_update_del_vec);
+
+	/* recover ROS if necessary */
+	vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+	/* Change Mem Context */
+	memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+										  "Delete Vector Update.",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	/* Create TID List from Whiteout WOS */
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize / 2,
+							   0, numRows,
+							   false);
+
+	vci_InitRosCommandContext2(&comContext, workareaSize / 2);
+
+	if (TransactionIdPrecedes(GetCurrentTransactionId(),
+							  (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+		goto done;
+
+	GetActiveSnapshot();		/* NOTE(review): the returned snapshot is
+								 * discarded */
+
+	/* Write Recovery Information */
+	vci_WriteRecoveryRecordForUpdateDelVec(&comContext.info);
+
+	vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+	constructTidSortState(&comContext);
+
+	/* call Main routine */
+	result = UpdateDelVec(&comContext, workareaSize / 2, Min(numRows, VCI_NUM_ROWS_IN_EXTENT));
+
+	/* Clean up WOS entry */
+	cleanUpWos(&comContext, vcimrv_data_wos_oid);
+	cleanUpWos(&comContext,
vcimrv_whiteout_wos_oid);
+
+	/* Xmax WOS entry */
+	RemoveWosEntries(&comContext, WOS_Data);
+	RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+	/* Finalize ROS */
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/* -------------------------------------------------------------- */
+/*  Collect Deleted Rows                                          */
+/* -------------------------------------------------------------- */
+/*
+ * CountExtents
+ *		Scan the extent entries of the main relation and count those that
+ *		qualify for the requested kind of collection.
+ *
+ * kind == CEK_CountDeletedRows: an extent qualifies when it is not free, is
+ * visible for the oldest non-removable XID, has no valid xdel and has at
+ * least "threshold" deleted rows.  best_extent_id is the qualifying extent
+ * with the most deleted rows (the later one on a tie).
+ *
+ * Any other kind: an extent qualifies when it is not free and
+ * vci_ExtentIsCollectable() accepts it; best_extent_id is the last such
+ * extent.  threshold is not used in this case.
+ */
+static vci_target_extent_info_t
+CountExtents(Relation mainRel, uint32 threshold, CEKind kind)
+{
+	TransactionId wos2rosXid;
+
+	vci_MainRelHeaderInfo infoData = {0};
+	vci_MainRelHeaderInfo *info = &infoData;
+
+	vcis_m_extent_t *extentInfo;
+	vci_meta_item_scanner_t *scan;
+
+	vci_target_extent_info_t result = {0, -1 /* not-found-value */ };
+	uint32		max_deleted_rows = 0;
+
+	wos2rosXid = GetOldestNonRemovableTransactionId(mainRel);
+
+	vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(info);
+	vci_GetMainRelVar(info, vcimrv_num_extents, 0);	/* NOTE(review): result is
+													 * discarded */
+
+	scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE);
+	while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL)
+	{
+		if (kind == CEK_CountDeletedRows)
+		{
+			if (vci_ExtentIsFree(extentInfo))
+				continue;
+
+			if (!vci_ExtentIsVisible(extentInfo, wos2rosXid))
+				continue;
+
+			if (TransactionIdIsValid(extentInfo->xdel))
+				continue;
+
+			if (extentInfo->num_deleted_rows >= threshold)
+			{
+				/* "<=" makes a later extent win a tie */
+				if (max_deleted_rows <= extentInfo->num_deleted_rows)
+				{
+					result.best_extent_id = scan->index;
+					max_deleted_rows = extentInfo->num_deleted_rows;
+				}
+				result.num_fit_extents++;
+			}
+		}
+		else
+		{
+			if (vci_ExtentIsFree(extentInfo))
+				continue;
+
+			if (vci_ExtentIsCollectable(extentInfo, wos2rosXid))
+			{
+				result.best_extent_id = scan->index;
+				result.num_fit_extents++;
+			}
+		}
+	}
+	vci_EndMetaItemScan(scan);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(info);
+
+	return result;
+}
+
+vci_target_extent_info_t
+vci_CountDeletedRowsInROS(Relation mainRel, uint32 threshold)
+{
+	return CountExtents(mainRel, threshold, CEK_CountDeletedRows);	/* thin wrapper */
+}
+/*
+ * getTupleFromVector
+ *		Build a HeapTuple from row "offset" of the fetched vector set.
+ *
+ * Each fetched column is stored at the position given by
+ * queryContext->column_id[fetchContext->column_link[cId]], and the TID of
+ * the row is copied into t_self of the result.
+ */
+static HeapTuple
+getTupleFromVector(int offset,
+				   TupleDesc tupleDesc,
+				   vci_virtual_tuples_t *vecSet)
+{
+	HeapTuple	result;
+	vci_CSFetchContext fetchContext = vecSet->fetch_context;
+	vci_CSQueryContext queryContext = fetchContext->query_context;
+	Datum		values[MaxAttrNumber];	/* NOTE(review): MaxAttrNumber-sized
+										 * stack arrays are large; confirm
+										 * this size is really needed */
+	bool		isnull[MaxAttrNumber];
+	int			cId;
+
+	Assert((0 <= offset) && (offset < vecSet->num_rows));
+	Assert(tupleDesc->natts == vecSet->num_columns);
+	for (cId = 0; cId < vecSet->num_columns; ++cId)
+	{
+		int			tgtId = queryContext->column_id[fetchContext->column_link[cId]];
+
+		Assert((0 <= tgtId) && (tgtId < queryContext->num_columns));
+		values[tgtId] = vci_CSGetValuesOfVirtualTupleColumnar(vecSet, cId)[offset];
+		isnull[tgtId] = vci_CSGetIsNullOfVirtualTupleColumnar(vecSet, cId)[offset];
+	}
+	result = heap_form_tuple(tupleDesc, values, isnull);
+#ifdef __s390x__
+	result->t_self = vci_CSGetTidInItemPointerFromVirtualTuples(vecSet, offset);
+#else
+	result->t_self = *(vci_CSGetTidInItemPointerFromVirtualTuples(vecSet, offset));
+#endif
+
+	return result;
+}
+/*
+ * FillOneRosChunkBufferFromExtent
+ *		Read rows of extent "extentId", starting at *rowIdInExtent, and append
+ *		them to comContext->buffer until it holds numRowsAtOnce rows.
+ *
+ * *rowIdInExtent is advanced past every row read or skipped.
+ * comContext->done is set when the extent does not exist or is not visible,
+ * when the row position reaches VCI_NUM_ROWS_IN_EXTENT, or when a fetch
+ * returns no rows.  The query and fetch contexts are created and destroyed
+ * on every call, and comContext->info.command is restored on exit.
+ */
+static void
+FillOneRosChunkBufferFromExtent(vci_RosCommandContext *comContext,
+								int32 extentId,
+								uint32 *rowIdInExtent)
+{
+	vci_CSQueryContext queryContext;
+	vci_CSFetchContext fetchContext;
+	vci_CSFetchContext localContext;
+	vci_virtual_tuples_t *vectorSet = NULL;
+
+	TupleDesc	tupleDesc;
+	AttrNumber *tableAttrNumList;
+	AttrNumber *fetchAttrNumList;
+	int			colId;
+	int			numFetchRowsAtOnce = Min(comContext->numRowsAtOnce, VCI_MAX_NUM_ROW_TO_FETCH);
+	vci_ros_command_t saveCommand1;
+
+	saveCommand1 = comContext->info.command;
+
+	/* Get a descriptor of the index relation (VCI main relation). */
+	/* This is not a descriptor of the table relation. */
+	/* It includes only the columns targeted by the VCI. */
+	tupleDesc = vci_GetTupleDescr(&comContext->info);
+	Assert(comContext->numColumns == tupleDesc->natts);
+
+	/* Create the pg_attribute::attnum list of the table relation for setup, */
+	/* and the 1-based serial numbers of the ROS columns for fetching. */
+	tableAttrNumList = palloc(sizeof(AttrNumber) * comContext->numColumns);
+	fetchAttrNumList = palloc(sizeof(AttrNumber) * comContext->numColumns);
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < comContext->numColumns; ++colId)
+	{
+		tableAttrNumList[colId] = comContext->heapAttrNumList[colId];
+		fetchAttrNumList[colId] = (AttrNumber) (comContext->indxColumnIdList[colId] + 1);
+	}
+
+	/* queryContext */
+	queryContext = vci_CSCreateQueryContext(RelationGetRelid(comContext->info.rel),
+											comContext->numColumns,
+											tableAttrNumList,
+											TopTransactionContext,
+											false,
+											false);
+
+	/* fetchContext */
+	fetchContext = vci_CSCreateFetchContext(queryContext,
+											numFetchRowsAtOnce,
+											comContext->numColumns,
+											tableAttrNumList,
+											true,	/* use ColumnStore */
+											true,	/* return Tid */
+											false); /* NOT return CRID */
+
+	localContext = vci_CSLocalizeFetchContext(fetchContext,
+											  CurrentMemoryContext);
+
+	{
+		vci_extent_status_t *status = vci_CSCreateCheckExtent(localContext);
+		bool		extent_ok;
+
+		vci_CSCheckExtent(status, localContext, extentId, true);
+
+		elog(DEBUG2, "status: %d, %d, %d, %d", status->size, status->num_rows,
+			 status->existence, status->visible);
+
+		extent_ok = status->existence && status->visible;
+
+		vci_CSDestroyCheckExtent(status);
+
+		if (!extent_ok)
+		{
+			comContext->done = true;
+			goto done;
+		}
+	}
+
+	/* VectorSet */
+	vectorSet = vci_CSCreateVirtualTuples(localContext);
+
+	{
+		while (comContext->buffer.numFilled < comContext->numRowsAtOnce)
+		{
+			/* int numFetchRows; */
+			int			numRead;
+			int			offset;
+
+			if ((*rowIdInExtent) >= VCI_NUM_ROWS_IN_EXTENT)
+			{
+				comContext->done = true;
+				goto done;
+			}
+
+			/*
+			 * if (((*rowIdInExtent) + numFetchRowsAtOnce) <=
+			 * VCI_NUM_ROWS_IN_EXTENT) numFetchRows = numFetchRowsAtOnce; else
+			 * numFetchRows = VCI_NUM_ROWS_IN_EXTENT - (*rowIdInExtent);
+			 */
+
+			/* FIXME: Does it need to use numFetchRows?? */
+			/* let the vci_CSFetchVirtualTuples optimize the number of rows */
+			numRead = vci_CSFetchVirtualTuples(vectorSet,
+											   vci_CalcCrid64(extentId, *rowIdInExtent),
+											   numFetchRowsAtOnce);
+
+			if (numRead < 1)
+			{
+				comContext->done = true;
+				goto done;
+			}
+
+			/* Read fetched data as HeapTuple */
+			for (offset = 0; offset < numRead; ++offset)
+			{
+				HeapTuple	tuple = NULL;
+				uint16		skip = vci_CSGetSkipFromVirtualTuples(vectorSet)[offset];
+
+				/* jump over "skip" rows; the loop's ++offset adds the last one */
+				if (0 < skip)
+				{
+					(*rowIdInExtent) += skip;
+					offset += skip - 1;
+					continue;
+				}
+
+				tuple = getTupleFromVector(offset, tupleDesc, vectorSet);
+				(*rowIdInExtent) += 1;
+
+				if (tuple != NULL)
+				{
+					/* ... and register to ROS Chunk. */
+					vci_FillOneRowInRosChunkBuffer(&(comContext->buffer),
+												   &(comContext->info),
+												   &tuple->t_self,
+												   tuple,
+												   comContext->indxColumnIdList,
+												   fetchAttrNumList,
+												   tupleDesc);
+					/* buffer full: remaining rows are re-fetched by the next call */
+					if (comContext->buffer.numFilled == comContext->numRowsAtOnce)
+						break;
+				}
+				else
+				{
+					Assert(false);
+					elog(LOG, "internal error: CDR command failed");
+				}
+			}
+		}
+	}
+
+done:
+	if (vectorSet)
+		vci_CSDestroyVirtualTuples(vectorSet);
+	vci_CSDestroyFetchContext(localContext);
+	vci_CSDestroyFetchContext(fetchContext);
+	vci_CSDestroyQueryContext(queryContext);
+
+	pfree(fetchAttrNumList);
+	pfree(tableAttrNumList);
+
+	comContext->info.command = saveCommand1;
+}
+/*
+ * isCdrTargetExtentValid
+ *		Check whether comContext->extentIdSrc can be used as the source extent
+ *		of a collect-deleted-rows command.
+ *
+ * Returns false when source and destination extent IDs are equal or when the
+ * source ID is not below the number of extents; otherwise the source extent
+ * must be visible for comContext->wos2rosXid and must have no valid xdel.
+ */
+static bool
+isCdrTargetExtentValid(vci_RosCommandContext *comContext)
+{
+	bool		result;
+	uint32		numExtents;
+	vcis_m_extent_t *extentInfo;
+	Buffer		buffer = InvalidBuffer;
+
+	if (comContext->extentId == comContext->extentIdSrc)
+		return false;
+
+	numExtents = vci_GetMainRelVar(&comContext->info, vcimrv_num_extents, 0);
+	if (numExtents <= comContext->extentIdSrc)
+		return false;
+
+	extentInfo = vci_GetMExtent(&buffer, &comContext->info, comContext->extentIdSrc);
+	LockBuffer(buffer, BUFFER_LOCK_SHARE);
+	result =
vci_ExtentIsVisible(extentInfo, comContext->wos2rosXid) && !TransactionIdIsValid(extentInfo->xdel);
+	UnlockReleaseBuffer(buffer);
+
+	return result;
+}
+/*
+ * CollectDeletedRows
+ *		Copy the rows read from extent comContext->extentIdSrc into the new
+ *		extent comContext->extentId, topping it up with rows taken from the
+ *		WOS.
+ *
+ * The source extent gets xdel = comContext->xid once it has been read.
+ * Returns the number of rows written to the new extent, or 0 when there was
+ * nothing to write (vcimrv_new_extent_id is then reset to
+ * VCI_INVALID_EXTENT_ID).  The snapshot argument is not referenced here.
+ */
+static int32
+CollectDeletedRows(vci_RosCommandContext *comContext, Snapshot snapshot)
+{
+	uint32		rowIdInExtent;
+
+	vcis_m_extent_t *extentInfo;
+	Buffer		buffer = InvalidBuffer;
+
+	int			numRows;
+
+	Assert(0 == (comContext->numRowsAtOnce % VCI_COMPACTION_UNIT_ROW));
+
+	/*
+	 * Set CDR data and write main relation for recovery. Header and extent
+	 * info. Here, we also put current ROS version to the actual current
+	 * transaction ID.
+	 */
+	vci_WriteExtentInfoInMainRosForWriteExtent(&comContext->info,
+											   comContext->extentId,
+											   comContext->xid,
+											   vci_rc_collect_deleted);
+
+	/* Create ROS Chunk from target Extent */
+	vci_ResetRosChunkStorage(&(comContext->storage));
+	vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+	/* collect data from old extent for new extent */
+	rowIdInExtent = 0;
+	while (!comContext->done)
+	{
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* fetch the data from old extents for one chunk */
+		FillOneRosChunkBufferFromExtent(comContext,
+										comContext->extentIdSrc, &rowIdInExtent);
+
+		if (comContext->buffer.numFilled == comContext->numRowsAtOnce)
+		{
+			/* copy chunk buffer in a compact manner */
+			vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+			vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+			Assert(comContext->storage.numTotalRows <= VCI_NUM_ROWS_IN_EXTENT);
+		}
+		else
+		{
+			Assert(comContext->done);
+
+			/*
+			 * We read and fill data in unit of VCI_COMPACTION_UNIT_ROW. The
+			 * remaining data is read outside this loop to merge data read
+			 * newly from WOS.
+			 */
+		}
+	}
+	comContext->done = false;	/* reset the flag for the WOS read below */
+
+	elog(DEBUG2, "... collected deleted extent %d -> %d", comContext->extentIdSrc,
+		 comContext->extentId);
+
+	/*
+	 * Now, reading from old extent was completed. Write Current ROS Version
+	 * to VCI main relation as the XDel of old extent.
+	 */
+	extentInfo = vci_GetMExtent(&buffer, &(comContext->info),
+								comContext->extentIdSrc);
+	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	extentInfo->xdel = comContext->xid;
+	vci_WriteOneItemPage(comContext->info.rel, buffer);
+	UnlockReleaseBuffer(buffer);
+
+	/* Append data from WOS */
+
+	/* room left in the new extent, capped by the rows available to convert */
+	numRows = Min((VCI_NUM_ROWS_IN_EXTENT - comContext->storage.numTotalRows
+				   - comContext->buffer.numFilled),
+				  comContext->numRowsToConvert);
+
+	if (numRows > 0)
+	{
+		fillTidListFromTidSortState(comContext, numRows);
+
+		ReadOneExtentAndStoreInChunkStorage(comContext);
+	}
+
+	/* Copy the remaining data to chunk buffer in a compact manner */
+	if (0 < comContext->buffer.numFilled)
+	{
+		vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+		vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+		Assert(comContext->storage.numTotalRows <= VCI_NUM_ROWS_IN_EXTENT);
+	}
+
+	/*
+	 * Update TID-CRID List, and Write Ros Chunk into new extent.
+	 */
+	comContext->numRowsToConvert = comContext->storage.numTotalRows;
+
+	if (comContext->numRowsToConvert == 0)
+	{
+
+		vci_SetMainRelVar(&comContext->info, vcimrv_new_extent_id, 0, VCI_INVALID_EXTENT_ID);
+
+		return 0;
+	}
+
+	vci_AddTidCridUpdateList(&(comContext->info),
+							 &(comContext->storage),
+							 comContext->extentId);
+	vci_WriteOneExtent(&(comContext->info),
+					   &(comContext->storage),
+					   comContext->extentId,	/* to */
+					   comContext->xid,
+					   InvalidTransactionId,
+					   comContext->xid);
+
+	return comContext->storage.numTotalRows;
+}
+/*
+ * vci_CollectDeletedRows
+ *		ROS command entry point that rewrites extent "extentId" into a free
+ *		extent.
+ *
+ * Returns the value of CollectDeletedRows(), or -1 when the command is not
+ * run (current XID precedes the stored ROS version, or the source extent is
+ * rejected by isCdrTargetExtentValid()).
+ */
+int
+vci_CollectDeletedRows(Relation mainRel, Size workareaSize, int32 extentId)
+{
+	int			result = -1;
+
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	vci_RosCommandContext comContext;
+	Snapshot	snapshot;
+
+	vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_collect_deleted);
+
+	/* execute recovery of the previous ROS command if necessary */
+	vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+	/* Change Mem Context */
+	memCtxWos2Ros =
AllocSetContextCreate(TopTransactionContext,
+										  "Collect Deleted Rows",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	/* CommandContext */
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize / 3 * 2,
+							   VCI_NUM_ROWS_IN_EXTENT, 0,
+							   true);
+
+	vci_InitRosCommandContext2(&comContext, workareaSize / 3);
+
+	if (TransactionIdPrecedes(GetCurrentTransactionId(),
+							  (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+		goto done;
+
+	snapshot = GetActiveSnapshot();
+
+	/* obtain new extent ID */
+	comContext.extentIdSrc = extentId;
+	comContext.extentId = vci_GetFreeExtentId(&(comContext.info));
+
+	if (!isCdrTargetExtentValid(&comContext))
+		goto done;
+
+	/* Write Recovery Information of this command. */
+	vci_WriteRecoveryRecordForExtentInfo(&comContext.info, comContext.extentId, comContext.extentIdSrc);
+	vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+	vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+	constructTidSortState(&comContext);
+
+	vci_InitRosChunkStroageAndBuffer(&comContext, true /* append */ );
+
+	/* call Main routine */
+	result = CollectDeletedRows(&comContext, snapshot);
+
+	cleanUpWos(&comContext, vcimrv_data_wos_oid);
+	cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+
+	/* Xmax WOS entry */
+	RemoveWosEntries(&comContext, WOS_Data);
+	RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+	/* Finalize ROS */
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/* -------------------------------------------------------------- */
+/*  Collect Unused Extent                                         */
+/* -------------------------------------------------------------- */
+/*
+ * vci_CountUnusedExtents
+ *		Count the collectable extents of mainRel; wrapper around
+ *		CountExtents() with CEK_CountUnusedExtents and a threshold of 0.
+ */
+vci_target_extent_info_t
+vci_CountUnusedExtents(Relation mainRel)
+{
+	return CountExtents(mainRel, 0, CEK_CountUnusedExtents);
+}
+/*
+ * SearchUnusedExtent
+ *		Return the index of the first extent accepted by
+ *		vci_ExtentIsCollectable(), or VCI_INVALID_EXTENT_ID if there is none.
+ *
+ * NOTE(review): the result is kept in an int32 but returned as uint32.
+ */
+static uint32
+SearchUnusedExtent(vci_MainRelHeaderInfo *info)
+{
+	int32
extentIdFirstFound = VCI_INVALID_EXTENT_ID; + TransactionId OldestXmin; + vcis_m_extent_t *extentInfo; + vci_meta_item_scanner_t *scan; + + OldestXmin = GetOldestNonRemovableTransactionId(info->rel); + + /* search deleted extent */ + scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE); + while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL) + { + if (vci_ExtentIsCollectable(extentInfo, OldestXmin)) + { + extentIdFirstFound = scan->index; + break; + } + } + vci_EndMetaItemScan(scan); + + return extentIdFirstFound; +} + +static void +CollectUnusedExtent(vci_RosCommandContext *comContext) +{ + int16 colId; + int16 numColumns = vci_GetMainRelVar(&comContext->info, vcimrv_num_columns, 0); + int16 recoveredColId = VCI_INVALID_COLUMN_ID; + vcis_m_extent_t *extentInfo; + Buffer buffer = InvalidBuffer; + + extentInfo = vci_GetMExtent(&buffer, &comContext->info, comContext->extentId); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + if (extentInfo->flags & VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID) + recoveredColId = extentInfo->recovered_colid; + UnlockReleaseBuffer(buffer); + + for (colId = VCI_COLUMN_ID_NULL; colId < numColumns; ++colId) + { + vci_ColumnRelations relPairData; + vci_ColumnRelations *relPair = &relPairData; + vcis_c_extent_t *extentPointer; + + LOCKMODE lockmode = RowExclusiveLock; + + Buffer bufData; + Buffer bufMeta; + BlockNumber blockNumber; + BlockNumber startBlockNumber; + + Page page; + + vcis_extent_t *extentHead; + + vci_OpenColumnRelations(relPair, &comContext->info, colId, lockmode); + + /* target column-extent pointer */ + extentPointer = vci_GetColumnExtent(&bufMeta, &blockNumber, + relPair->meta, + comContext->extentId); + startBlockNumber = extentPointer->enabled ? 
extentPointer->block_number : InvalidBlockNumber; + ReleaseBuffer(bufMeta); + + if (!BlockNumberIsValid(startBlockNumber)) + { + /* Close Column */ + elog(DEBUG2, "this is invalid extent pointer!!"); + vci_CloseColumnRelations(relPair, lockmode); + continue; + } + + /* get extent Header */ + bufData = vci_ReadBufferWithPageInit(relPair->data, startBlockNumber); + page = BufferGetPage(bufData); + extentHead = vci_GetExtentT(page); + + if (colId == recoveredColId) + goto skip_collect_freelist; + + /* Freelist link node */ + { + bool isFixedLength; + + isFixedLength = true; + if (VCI_FIRST_NORMALCOLUMN_ID <= colId) + { + vcis_m_column_t *colInfo; + + colInfo = vci_GetMColumn(&comContext->info, colId); + if (colInfo->comp_type != vcis_compression_type_fixed_raw) + isFixedLength = false; + } + + if (!isFixedLength) + { + vcis_free_space_t newFS; + BlockNumber newFSBlockNumber; + + vci_MakeFreeSpace(relPair, startBlockNumber, &newFSBlockNumber, &newFS, true); + + /* FIXME */ /* The common dictionary should be collected? 
*/ + vci_WriteRecoveryRecordForFreeSpace(relPair, + colId, VCI_INVALID_DICTIONARY_ID, + newFSBlockNumber, + &newFS); + + ReleaseBuffer(bufData); + vci_AppendFreeSpaceToLinkList(relPair, + newFSBlockNumber, + newFS.prev_pos, + newFS.next_pos, + newFS.size); + } + else + { + LockBuffer(bufData, BUFFER_LOCK_EXCLUSIVE); + extentHead->type = vcis_free_space; + vci_WriteOneItemPage(relPair->data, bufData); + UnlockReleaseBuffer(bufData); + } + } + +skip_collect_freelist: + vci_WriteRawDataExtentInfo(relPair->meta, + comContext->extentId, + InvalidBlockNumber, + 0, + NULL, /* min */ + NULL, /* max */ + false, + false); + + /* Close Column */ + vci_CloseColumnRelations(relPair, lockmode); + } + /* loop for each column */ + + vci_WriteExtentInfo(&comContext->info, + comContext->extentId, + 0, + 0, + 0, + InvalidTransactionId, + InvalidTransactionId); +} + +int +vci_CollectUnusedExtent(Relation mainRel, Size workareaSize) +{ + int result = -1; + + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + vci_RosCommandContext comContext; + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_collect_extent); + + /* excute recovery previous ROS command if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* Change Mem Context */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "Collect Deleted Extent", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + /* CommandContext */ + vci_InitRosCommandContext1(&comContext, + workareaSize, + 0, 0, + false); + + if (TransactionIdPrecedes(GetCurrentTransactionId(), + (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0))) + goto done; + + comContext.extentIdSrc = VCI_INVALID_EXTENT_ID; + comContext.extentId = SearchUnusedExtent(&comContext.info); + + if (comContext.extentId == VCI_INVALID_EXTENT_ID) + goto done; + + /* Write Recovery Infomation of this command. 
*/
+	vci_WriteRecoveryRecordForExtentInfo(&comContext.info, VCI_INVALID_EXTENT_ID, comContext.extentId);
+	vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+	vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+	/* call Main routine */
+	CollectUnusedExtent(&comContext);
+
+	result = comContext.extentId;	/* report which extent was collected */
+
+done:
+	/* Finalize ROS */
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/* -------------------------------------------------------------- */
+/*  Update TID-CRID Tree                                          */
+/* -------------------------------------------------------------- */
+/*
+ * vci_CountTidCridUpdateListLength
+ *		Return vci_GetTidCridUpdateListLength() for the TID-CRID update list
+ *		currently selected by vcimrv_tid_crid_diff_sel.
+ *
+ * workarea is not referenced by this function.
+ */
+int32
+vci_CountTidCridUpdateListLength(Relation mainRel, Size workarea)
+{
+	int32		result;
+	vci_MainRelHeaderInfo infoData = {0};
+	vci_MainRelHeaderInfo *info = &infoData;
+	int32		oldSel;
+
+	vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(info);
+
+	oldSel = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0);
+	result = vci_GetTidCridUpdateListLength(info, oldSel);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(info);
+
+	return result;
+}
+
+/**
+ * Move the update-list entries of the heap blocks listed in
+ * comContext->utility_array into the TID-CRID tree, then write a merged
+ * update list (into the other selector) in which the moved TIDs map to
+ * VCI_MOVED_CRID.
+ *
+ * @param[in] comContext Conv Context
+ * @param[in] workareaSize size in bytes; converted to kB for the sort of
+ *            the moved TIDs
+ */
+static void
+UpdateTidCrid(vci_RosCommandContext *comContext, Size workareaSize)
+{
+	const LOCKMODE lockmode = RowExclusiveLock;
+	uint32		toMove;
+	int			i;
+
+	vci_TidCridRelations relPairData;
+	vci_TidCridRelations *relPair = &relPairData;
+
+	vci_TidCridUpdateListContext *oldListContext = NULL;
+	BlockNumber prevOldListBlkno = InvalidBlockNumber;
+	vcis_tidcrid_pair_item_t *array;
+
+	vcis_tidcrid_pair_list_t *moveList;
+	Tuplesortstate *deleteList;
+
+	uint32		oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+	uint32		newSel = 1 ^ oldSel;
+
+	oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+	moveList = palloc(offsetof(vcis_tidcrid_pair_list_t, body) +
(sizeof(vcis_tidcrid_pair_item_t) * MaxHeapTuplesPerPage));
+	moveList->num = 0;
+
+	deleteList =
+		tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+							  Min(workareaSize / 1024, INT_MAX), NULL, TUPLESORT_NONE);
+	array = palloc(VCI_TID_CRID_UPDATE_PAGE_SPACE);
+
+	vci_OpenTidCridRelations(relPair, &comContext->info, lockmode);
+
+	i = 0;						/* position in the old update list; it is
+								 * never rewound between blocks */
+
+	for (toMove = 0; toMove < comContext->utility_array.num; toMove++)
+	{
+		ItemPointerData treeNodeData;
+		ItemPointer treeNode = &treeNodeData;
+
+		BlockNumber blkToMove;
+
+		blkToMove = comContext->utility_array.orig_blknos[toMove];
+
+		moveList->num = 0;
+
+		/* gather the consecutive entries that belong to blkToMove */
+		for (; i < oldListContext->count; i++)
+		{
+			BlockNumber blkno = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + (i / VCI_TID_CRID_UPDATE_PAGE_ITEMS);
+			vcis_tidcrid_pair_item_t item;
+
+			/* load the list page holding entry i only when it changes */
+			if (prevOldListBlkno != blkno)
+			{
+				vci_ReadOneBlockFromTidCridUpdateList(oldListContext, blkno, array);
+				prevOldListBlkno = blkno;
+			}
+
+			item = array[i % VCI_TID_CRID_UPDATE_PAGE_ITEMS];
+
+			if (ItemPointerGetBlockNumber(&item.page_item_id) != blkToMove)
+				break;
+
+			Assert(moveList->num < MaxHeapTuplesPerPage);
+
+			moveList->body[moveList->num] = item;
+			moveList->num++;
+
+			tuplesort_putdatum(deleteList, ItemPointerGetDatum(&item.page_item_id), false);
+		}
+
+		if (moveList->num == 0)
+			continue;
+
+		vci_GetTidCridSubTree(relPair, blkToMove, treeNode);
+
+		/* create the subtree for this heap block if it does not exist yet */
+		if (!ItemPointerIsValid(treeNode))
+			vci_CreateTidCridSubTree(relPair, blkToMove, treeNode);
+
+		vci_UpdateTidCridSubTree(relPair, treeNode, moveList);
+	}
+
+	pfree(array);
+	pfree(moveList);
+
+	vci_CloseTidCridRelations(relPair, lockmode);
+
+	vci_CloseTidCridUpdateList(oldListContext);
+
+	tuplesort_performsort(deleteList);
+
+	vci_MergeAndWriteTidCridUpdateList(&comContext->info, newSel, oldSel, deleteList, vci_GetCridFromUint64(VCI_MOVED_CRID));
+
+	tuplesort_end(deleteList);
+}
+
+/**
+ * Fill comContext->utility_array.orig_blknos with the distinct consecutive
+ * heap block numbers found in the current TID-CRID update list, stopping
+ * once numPages blocks have been stored.
+ *
+ * @param[in,out] comContext Conv Context
+ * @param[in] numPages maximum number of block numbers to store
+ */
+static void
+collectBlockNumberToMove(vci_RosCommandContext
*comContext, int numPages)
+{
+	uint32		oldSel;
+	vci_TidCridUpdateListContext *oldListContext;
+	BlockNumber prevblk = InvalidBlockNumber;
+	vcis_tidcrid_pair_item_t *array;
+	BlockNumber blockNumber = VCI_TID_CRID_UPDATE_BODY_PAGE_ID;
+	uint64		count = 0;
+
+	oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+	oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+	comContext->utility_array.num = 0;
+
+	array = palloc(VCI_TID_CRID_UPDATE_PAGE_SPACE);
+
+	/*
+	 * Nothing may be stored when the caller asked for no pages: the output
+	 * array then has no room, and the "numPages == num" stop test below would
+	 * never fire because num is already past it after the first store.
+	 */
+	if (numPages <= 0)
+		goto done;
+
+	while (blockNumber < oldListContext->nblocks)
+	{
+		int			i;
+
+		vci_ReadOneBlockFromTidCridUpdateList(oldListContext, blockNumber, array);
+
+		/* "count" stops the scan at the last valid entry of the list */
+		for (i = 0; (i < VCI_TID_CRID_UPDATE_PAGE_ITEMS) && (count < oldListContext->count); i++)
+		{
+			BlockNumber blkno = ItemPointerGetBlockNumber(&array[i].page_item_id);
+
+			/* record a block number each time it changes */
+			if (prevblk != blkno)
+			{
+				comContext->utility_array.orig_blknos[comContext->utility_array.num++] = blkno;
+				prevblk = blkno;
+
+				if (numPages == comContext->utility_array.num)
+					goto done;
+			}
+
+			count++;
+		}
+
+		blockNumber++;
+	}
+
+done:
+	pfree(array);
+
+	vci_CloseTidCridUpdateList(oldListContext);
+}
+
+/*
+ * vci_UpdateTidCrid
+ *		ROS command entry point that moves update-list entries of up to
+ *		numPages heap blocks into the TID-CRID tree.
+ *
+ * Returns the number of heap blocks selected for moving, or 0 when the
+ * current XID precedes the stored ROS version and nothing is done.
+ */
+int
+vci_UpdateTidCrid(Relation mainRel, Size workareaSize, int numPages)
+{
+	int			result = 0;
+
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	vci_RosCommandContext comContext;
+
+	vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_update_tid_crid);
+
+	/* execute recovery of the previous ROS command if necessary */
+	vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+	/* Change Mem Context */
+	memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+										  "TIDCRID Tree Update",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	/* CommandContext */
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize,
+							   0, 0,
+							   false);
+
+	if (TransactionIdPrecedes(GetCurrentTransactionId(),
+							  (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+		goto done;
+
+
comContext.utility_array.orig_blknos = (BlockNumber *) palloc(sizeof(BlockNumber) * numPages); + comContext.utility_array.max = numPages; + + collectBlockNumberToMove(&comContext, numPages); + + result = comContext.utility_array.num; + + /* Write Recovery Information of this command. */ + vci_InitRecoveryRecordForTidCrid(&comContext.info); + vci_InitRecoveryRecordForFreeSpace(&comContext.info); + + vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid); + + /* call Main routine */ + UpdateTidCrid(&comContext, workareaSize); + +done: + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); + + return result; +} + +/* -------------------------------------------------------------- */ +/* Vacuum and Freeze */ +/* -------------------------------------------------------------- */ + +static void +freezeMainAndRos(vci_RosCommandContext *comContext) +{ + vcis_m_extent_t *mExtent; + TransactionId wos2rosXid = comContext->wos2rosXid; + vci_meta_item_scanner_t *scan; + TransactionId lastRosVer; + + lastRosVer = vci_GetMainRelVar(&comContext->info, vcimrv_last_ros_version, 0); + if (TransactionIdIsNormal(lastRosVer) && TransactionIdPrecedes(lastRosVer, wos2rosXid)) + vci_SetMainRelVar(&comContext->info, vcimrv_last_ros_version, 0, FrozenTransactionId); + + scan = vci_BeginMetaItemScan(comContext->info.rel, BUFFER_LOCK_EXCLUSIVE); + while ((mExtent = vci_GetMExtentNext(&comContext->info, scan)) != NULL) + { + if (TransactionIdIsNormal(mExtent->xgen) && + TransactionIdPrecedes(mExtent->xgen, wos2rosXid)) /* mExtent->xgen < + * wos2rosXid */ + mExtent->xgen = FrozenTransactionId; + + if (TransactionIdIsNormal(mExtent->xdel) && + TransactionIdPrecedes(mExtent->xdel, wos2rosXid)) /* mExtent->xdel < + * wos2rosXid */ + mExtent->xdel = FrozenTransactionId; + } + vci_EndMetaItemScan(scan); +} + +/* + * VCITupleSatisfiesVisibility + * True iff heap tuple satisfies a time 
qual. + * + * Notes: + * Assumes heap tuple is valid, and buffer at least share locked. + * + * Copy of OSS HeapTupleSatisfiesVisibility() for VCI snapshot types + * + */ +bool +VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + switch (snapshot->snapshot_type) + { + case SNAPSHOT_VCI_WOS2ROS: + return HeapTupleSatisfiesWos2Ros(htup, snapshot, buffer); + case SNAPSHOT_VCI_LOCALROS: + return HeapTupleSatisfiesLocalRos(htup, snapshot, buffer); + default: + return HeapTupleSatisfiesVisibility(htup, snapshot, buffer); + } + return false; /* not reached: every switch arm above returns */ +} + +static void +freezeWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType, Snapshot snapshot) +{ + LOCKMODE lockmode = ShareUpdateExclusiveLock; + Oid oid; + HeapTupleFreeze *frozen; + Relation rel; + BlockNumber nblocks, + blkno; + /* room for one freeze plan per possible line pointer on a heap page */ + frozen = palloc0(sizeof(HeapTupleFreeze) * MaxHeapTuplesPerPage); + + oid = vci_GetMainRelVar(&comContext->info, wosType, 0); + + rel = table_open(oid, lockmode); + + nblocks = RelationGetNumberOfBlocks(rel); + + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buffer; + Page page; + OffsetNumber offnum, + maxoff; + int nfrozen = 0; + + buffer = ReadBuffer(rel, blkno); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buffer); + + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData loctup; + + itemid = PageGetItemId(page, offnum); + + if (ItemIdIsNormal(itemid)) + { + bool valid; + TransactionId xmin; + + loctup.t_tableOid = RelationGetRelid(rel); + loctup.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + loctup.t_len = ItemIdGetLength(itemid); + ItemPointerSet(&loctup.t_self, blkno, offnum); + + valid = VCITupleSatisfiesVisibility(&loctup, snapshot, buffer); + + HeapCheckForSerializableConflictOut(valid, rel, &loctup, buffer, snapshot); + + xmin = HeapTupleHeaderGetXmin(loctup.t_data); + + if (valid && + 
!TransactionIdEquals(xmin, FrozenTransactionId) && + TransactionIdPrecedes(xmin, comContext->oldestXmin)) + { + HeapTupleFreeze *frz = &frozen[nfrozen]; + HeapTupleHeader tuple = loctup.t_data; + + frz->frzflags = 0; + frz->t_infomask2 = tuple->t_infomask2; + frz->t_infomask = tuple->t_infomask | HEAP_XMIN_FROZEN; + frz->xmax = HeapTupleHeaderGetRawXmax(tuple); + frz->offset = offnum; + + nfrozen++; + } + } + } + + if (nfrozen > 0) + { + heap_pre_freeze_checks(buffer, frozen, nfrozen); + START_CRIT_SECTION(); + heap_freeze_prepared_tuples(buffer, frozen, nfrozen); + MarkBufferDirty(buffer); + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(rel)) + { + /* + * Commit add323d added the vmbuffer/vmflags parameters. + * A quick fix was needed to allow build to proceed. + * + * TODO Confirm if passing InvalidBuffer, 0 is OK here. + */ + log_heap_prune_and_freeze(rel, buffer, + InvalidBuffer, /* vmbuffer */ + 0, /* vmflags */ + comContext->oldestXmin, + false, /* no cleanup lock + * required */ + PRUNE_VACUUM_SCAN, + frozen, nfrozen, + NULL, 0, /* redirected */ + NULL, 0, /* dead */ + NULL, 0); /* unused */ + } + END_CRIT_SECTION(); + } + UnlockReleaseBuffer(buffer); + } + + table_close(rel, lockmode); + + pfree(frozen); +} + +/** + * @param[in] comContext Conv Context + * + * @note + * This is not transaction-safe, because the truncation is done immediately + * and cannot be rolled back later. Caller is responsible for having + * checked permissions etc, and must have obtained AccessExclusiveLock. 
+ */ +static void +truncateRos(vci_RosCommandContext *comContext) +{ + const LOCKMODE lockmode = ShareUpdateExclusiveLock; /* NOTE(review): the header comment asks for AccessExclusiveLock, but the column relations are opened with this mode -- confirm */ + int colId; + + vci_meta_item_scanner_t *scan; + vcis_m_extent_t *extentInfo; + int32 lastAvailableExtentId = -1; + /* Find the highest extent index whose xgen or xdel is still a valid xid; everything after it can go. */ + scan = vci_BeginMetaItemScan(comContext->info.rel, BUFFER_LOCK_SHARE); + while ((extentInfo = vci_GetMExtentNext(&comContext->info, scan)) != NULL) + { + if (TransactionIdIsValid(extentInfo->xgen) || + TransactionIdIsValid(extentInfo->xdel)) + lastAvailableExtentId = scan->index; + } + vci_EndMetaItemScan(scan); + + vci_SetMainRelVar(&comContext->info, vcimrv_num_extents, 0, lastAvailableExtentId + 1); + + for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < comContext->numColumns; ++colId) + { + vcis_m_column_t *colInfo; + + vci_ColumnRelations relPairData; + vci_ColumnRelations *relPair = &relPairData; + + BlockNumber nblocks; + + colInfo = vci_GetMColumn(&comContext->info, colId); + + vci_OpenColumnRelations(relPair, &comContext->info, colId, lockmode); + + nblocks = RelationGetNumberOfBlocks(relPair->data); + + if (colInfo->comp_type != vcis_compression_type_fixed_raw) + { + BlockNumber sentinelBlockNumber; + vcis_column_meta_t *columnMeta; + + elog(DEBUG2, " -- colId %d ,variable column ", colId); + + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + sentinelBlockNumber = columnMeta->free_page_end_id; + ReleaseBuffer(relPair->bufMeta); + + Assert(sentinelBlockNumber + 1 <= nblocks); + + RelationTruncate(relPair->data, sentinelBlockNumber + 1); + elog(DEBUG2, " end"); + } + else + { + int16 columnSize; + int extentHeaderSize; + Size dataSize; + int numExtentPages; + BlockNumber startBlockNumber; + + elog(DEBUG2, " -- colId %d ,fixed column ", colId); + + columnSize = vci_GetFixedColumnSize(&comContext->info, colId); + extentHeaderSize = vci_GetExtentFixedLengthRawDataHeaderSize(VCI_NUM_ROWS_IN_EXTENT); + dataSize = (Size) columnSize * VCI_NUM_ROWS_IN_EXTENT; + numExtentPages = vci_GetNumBlocks(dataSize + 
extentHeaderSize); + startBlockNumber = (lastAvailableExtentId + 1) * numExtentPages; + + if (startBlockNumber < nblocks) + RelationTruncate(relPair->data, startBlockNumber); + + elog(DEBUG2, " end"); + + } + + vci_CloseColumnRelations(relPair, lockmode); + } +} + +/** + * @param[in] comContext Conv Context + */ +static void +truncateWos(vci_RosCommandContext *comContext) +{ + LOCKMODE lockmode = ShareUpdateExclusiveLock; + + Oid oid[2] = { + vci_GetMainRelVar(&comContext->info, vcimrv_data_wos_oid, 0), + vci_GetMainRelVar(&comContext->info, vcimrv_whiteout_wos_oid, 0) + }; + + int i; + + for (i = 0; i < 2; i++) + { + Relation rel = table_open(oid[i], lockmode); + int lock_retry = 0; + BlockNumber old_rel_pages; + BlockNumber new_rel_pages; + BlockNumber blkno; + + while (true) + { + if (ConditionalLockRelation(rel, AccessExclusiveLock)) + break; + + /* + * * Check for interrupts while trying to (re-)acquire the + * exclusive * lock. + */ + CHECK_FOR_INTERRUPTS(); + + if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / + VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) + { + table_close(rel, lockmode); + return; + } + + pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL); + } + + blkno = old_rel_pages = new_rel_pages = RelationGetNumberOfBlocks(rel); + + while (blkno > 0) + { + Buffer buffer; + Page page; + OffsetNumber offnum, + maxoff; + + blkno--; + + buffer = ReadBuffer(rel, blkno); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + + if (PageIsNew(page) || PageIsEmpty(page)) + { + UnlockReleaseBuffer(buffer); + + new_rel_pages = blkno; + continue; + } + + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemid)) + { + UnlockReleaseBuffer(buffer); + goto found_use_item; + } + } + + UnlockReleaseBuffer(buffer); + + new_rel_pages = blkno; + } + +found_use_item: + if (new_rel_pages < 
old_rel_pages) + RelationTruncate(rel, new_rel_pages); + + UnlockRelation(rel, AccessExclusiveLock); + + table_close(rel, lockmode); + } +} + +void +vci_VacuumRos(Relation mainRel, IndexVacuumInfo *vacuumInfo) +{ + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + vci_RosCommandContext comContext; + Snapshot snapshot; + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_vacuum); + + /* recover ROS if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* Change Mem Context */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "Vacuum", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + vci_InitRosCommandContext1(&comContext, 0, 0, 0, false); + + snapshot = GetActiveSnapshot(); + + /* remove WOS entries */ + elog(DEBUG2, " -- wos"); + cleanUpWos(&comContext, vcimrv_data_wos_oid); + cleanUpWos(&comContext, vcimrv_whiteout_wos_oid); + freezeWos(&comContext, vcimrv_data_wos_oid, snapshot); + freezeWos(&comContext, vcimrv_whiteout_wos_oid, snapshot); + truncateWos(&comContext); + + elog(DEBUG2, " -- ros"); + freezeMainAndRos(&comContext); + truncateRos(&comContext); + + elog(DEBUG2, " -- end"); + + vci_UpdateXidGeneration(&comContext.info); + + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, true /* never write */ ); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); +} + +static void +constructTidArray(vci_RosCommandContext *comContext, int max_data_wos_entries, int max_whiteout_wos_entries) +{ + vci_MainRelHeaderInfo *info; + Snapshot snapshot; + Oid data_wos_oid; + Oid whiteout_wos_oid; + vci_tid_tid_xid64_t *data_wos_entries; + vci_tid_tid_xid64_t *whiteout_wos_entries; + int num_data_wos_entries = 0; + int num_whiteout_wos_entries = 0; + int data_wos_entries_pos = 0; + int whiteout_wos_entries_pos = 0; + + info = &comContext->info; + + data_wos_oid = vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0); + whiteout_wos_oid = vci_GetMainRelVar(info, 
vcimrv_whiteout_wos_oid, 0); + + data_wos_entries = palloc(max_data_wos_entries * sizeof(vci_tid_tid_xid64_t)); + whiteout_wos_entries = palloc(max_whiteout_wos_entries * sizeof(vci_tid_tid_xid64_t)); + + snapshot = vci_GetSnapshotForLocalRos(comContext->inclusiveXid, comContext->exclusiveXid); + + num_data_wos_entries = + readTidListFromWosIntoTidArray(data_wos_oid, WOS_Data, + data_wos_entries, max_data_wos_entries, + snapshot); + + num_whiteout_wos_entries = + readTidListFromWosIntoTidArray(whiteout_wos_oid, WOS_Whiteout, + whiteout_wos_entries, max_whiteout_wos_entries, + snapshot); + + Assert(num_data_wos_entries <= max_data_wos_entries); + Assert(num_whiteout_wos_entries <= max_whiteout_wos_entries); + + qsort(data_wos_entries, num_data_wos_entries, sizeof(vci_tid_tid_xid64_t), comparator_orig_tid_xid64); + qsort(whiteout_wos_entries, num_whiteout_wos_entries, sizeof(vci_tid_tid_xid64_t), comparator_orig_tid_xid64); + + while ((data_wos_entries_pos < num_data_wos_entries) && + (whiteout_wos_entries_pos < num_whiteout_wos_entries)) + { + int32 res; + vci_tid_tid_xid64_t data_wos_item; + vci_tid_tid_xid64_t whiteout_wos_item; + + data_wos_item = data_wos_entries[data_wos_entries_pos]; + whiteout_wos_item = whiteout_wos_entries[whiteout_wos_entries_pos]; + + res = ItemPointerCompare(&data_wos_item.orig_tid, &whiteout_wos_item.orig_tid); + + if (res == 0) + res = compareXid64(data_wos_item.xid64, whiteout_wos_item.xid64); + + if (res < 0) + { + comContext->wos2ros_array.orig_tids[comContext->wos2ros_array.num] = + data_wos_item.orig_tid; + + comContext->wos2ros_array.num++; + data_wos_entries_pos++; + } + else if (res > 0) + { + comContext->delvec_array.orig_tids[comContext->delvec_array.num] = + whiteout_wos_item.orig_tid; + + comContext->delvec_array.num++; + whiteout_wos_entries_pos++; + } + else + { + data_wos_entries_pos++; + whiteout_wos_entries_pos++; + } + } + + while (data_wos_entries_pos < num_data_wos_entries) + { + 
comContext->wos2ros_array.orig_tids[comContext->wos2ros_array.num] = + data_wos_entries[data_wos_entries_pos].orig_tid; + + comContext->wos2ros_array.num++; + data_wos_entries_pos++; + } + + while (whiteout_wos_entries_pos < num_whiteout_wos_entries) + { + comContext->delvec_array.orig_tids[comContext->delvec_array.num] = + whiteout_wos_entries[whiteout_wos_entries_pos].orig_tid; + + comContext->delvec_array.num++; + whiteout_wos_entries_pos++; + } + + PopActiveSnapshot(); + + pfree(data_wos_entries); + pfree(whiteout_wos_entries); +} + +static int +comparator_orig_tid_xid64(const void *pa, const void *pb) +{ + vci_tid_tid_xid64_t *a = (vci_tid_tid_xid64_t *) pa; + vci_tid_tid_xid64_t *b = (vci_tid_tid_xid64_t *) pb; + int res; + /* qsort() comparator: order by orig_tid first, ties broken by ascending xid64 */ + res = ItemPointerCompare(&a->orig_tid, &b->orig_tid); + + if (res == 0) + { + if (a->xid64 == b->xid64) + res = 0; + else if (a->xid64 > b->xid64) + res = 1; + else + res = -1; + } + + return res; +} + +/** + * @param[in,out] comContext Conv Context + * (there is no snapshot parameter: the snapshot is a local variable of this function) + */ +static void +constructTidSortState(vci_RosCommandContext *comContext) +{ + vci_MainRelHeaderInfo *info; + Snapshot snapshot; + Oid data_wos_oid; + Oid whiteout_wos_oid; + MemoryContext workcontext; + MemoryContext oldcontext; + TupleDesc tupDesc; + Tuplesortstate *data_wos_valid_tid_sortstate; + Tuplesortstate *whiteout_wos_valid_tid_sortstate; + AttrNumber sortKeys[2] = {1, 3}; + Oid sortOperators[2] = {TIDLessOperator, Int8LessOperator}; + Oid sortCollations[2] = {InvalidOid, InvalidOid,}; + bool nullsFirstFlags[2] = {false, false}; + TupleTableSlot *data_wos_valid_slot; + TupleTableSlot *whiteout_wos_valid_slot; + vci_tid_tid_xid64_t data_wos_item; + vci_tid_tid_xid64_t whiteout_wos_item; + bool is_terminated_data_wos = false; + bool is_terminated_whiteout_wos = false; + int64 numInsertRows = 0; + int64 numDeleteRows = 0; + ItemPointerData last_whiteout_orig_tid; + + info = &comContext->info; + + data_wos_oid = vci_GetMainRelVar(info, 
vcimrv_data_wos_oid, 0); + whiteout_wos_oid = vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0); + + workcontext = AllocSetContextCreate(CurrentMemoryContext, + "Construct Tid Sort State", + ALLOCSET_DEFAULT_SIZES); + + oldcontext = MemoryContextSwitchTo(workcontext); + + tupDesc = CreateTemplateTupleDesc(4); + + TupleDescInitEntry(tupDesc, (AttrNumber) 1, "orig_tid", TIDOID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 2, "wos_tid", TIDOID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 3, "xid64", INT8OID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 4, "movable", BOOLOID, -1, 0); + + data_wos_valid_tid_sortstate = + tuplesort_begin_heap(tupDesc, 2, + sortKeys, sortOperators, sortCollations, nullsFirstFlags, + VciGuc.maintenance_work_mem / 8 * 3, NULL, + TUPLESORT_NONE); + + whiteout_wos_valid_tid_sortstate = + tuplesort_begin_heap(tupDesc, 2, + sortKeys, sortOperators, sortCollations, nullsFirstFlags, + VciGuc.maintenance_work_mem / 8 * 3, NULL, + TUPLESORT_NONE); + + data_wos_valid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + whiteout_wos_valid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + + snapshot = vci_GetSnapshotForWos2Ros(); + + readTidListFromWosIntoTidSortState(data_wos_oid, WOS_Data, + data_wos_valid_slot, + data_wos_valid_tid_sortstate, + snapshot, + comContext->wos2rosXid); + + readTidListFromWosIntoTidSortState(whiteout_wos_oid, WOS_Whiteout, + whiteout_wos_valid_slot, + whiteout_wos_valid_tid_sortstate, + snapshot, + comContext->wos2rosXid); + + tuplesort_performsort(data_wos_valid_tid_sortstate); + tuplesort_performsort(whiteout_wos_valid_tid_sortstate); + + if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item)) + is_terminated_data_wos = true; + + if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item)) + is_terminated_whiteout_wos = true; + + ItemPointerSetInvalid(&last_whiteout_orig_tid); + + while 
(!is_terminated_data_wos && !is_terminated_whiteout_wos) + { + int32 res; + + res = ItemPointerCompare(&data_wos_item.orig_tid, &whiteout_wos_item.orig_tid); + + if (res == 0) + res = compareXid64(data_wos_item.xid64, whiteout_wos_item.xid64); + + if (res < 0) + { + if (can_select_candidate_for_wos2ros_conv(&data_wos_item, comContext, &last_whiteout_orig_tid)) + { + put_entry_into_tid_list(comContext, WOS_Data, &data_wos_item.orig_tid, &data_wos_item.wos_tid); + + numInsertRows++; + } + + if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item)) + is_terminated_data_wos = true; + } + else if (res > 0) + { + last_whiteout_orig_tid = whiteout_wos_item.orig_tid; + + if (can_select_candidate_for_update_delvec(&whiteout_wos_item, comContext)) + { + put_entry_into_tid_list(comContext, WOS_Whiteout, &whiteout_wos_item.orig_tid, &whiteout_wos_item.wos_tid); + + numDeleteRows++; + } + + if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item)) + is_terminated_whiteout_wos = true; + } + else + { + if (data_wos_item.movable && whiteout_wos_item.movable) + { + if (comContext->data_wos_del_list) + tuplesort_putdatum(comContext->data_wos_del_list, + ItemPointerGetDatum(&data_wos_item.wos_tid), false); + + if (comContext->whiteout_wos_del_list) + tuplesort_putdatum(comContext->whiteout_wos_del_list, + ItemPointerGetDatum(&whiteout_wos_item.wos_tid), false); + } + + if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item)) + is_terminated_data_wos = true; + + if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item)) + is_terminated_whiteout_wos = true; + } + } + + if (!is_terminated_data_wos && comContext->wos2ros_tid_list) + { + do + { + if (can_select_candidate_for_wos2ros_conv(&data_wos_item, comContext, &last_whiteout_orig_tid)) + { + put_entry_into_tid_list(comContext, WOS_Data, &data_wos_item.orig_tid, 
&data_wos_item.wos_tid); + numInsertRows++; + } + } while (getValidTidSortState(data_wos_valid_tid_sortstate, + data_wos_valid_slot, &data_wos_item)); + } + + if (!is_terminated_whiteout_wos && comContext->delvec_tid_list) + { + do + { + if (can_select_candidate_for_update_delvec(&whiteout_wos_item, comContext)) + { + put_entry_into_tid_list(comContext, WOS_Whiteout, &whiteout_wos_item.orig_tid, &whiteout_wos_item.wos_tid); + + numDeleteRows++; + } + } while (getValidTidSortState(whiteout_wos_valid_tid_sortstate, + whiteout_wos_valid_slot, &whiteout_wos_item)); + } + + tuplesort_end(whiteout_wos_valid_tid_sortstate); + tuplesort_end(data_wos_valid_tid_sortstate); + + FreeTupleDesc(tupDesc); + + PopActiveSnapshot(); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(workcontext); + + if (comContext->wos2ros_tid_list) + { + tuplesort_performsort(comContext->wos2ros_tid_list); + comContext->num_wos2ros_tids = numInsertRows; + } + + if (comContext->delvec_tid_list) + { + tuplesort_performsort(comContext->delvec_tid_list); + comContext->num_delvec_tids = numDeleteRows; + } +} + +static bool +can_select_candidate_for_wos2ros_conv(vci_tid_tid_xid64_t *data_wos_item, vci_RosCommandContext *comContext, ItemPointer last_whiteout_orig_tid) +{ + if (!data_wos_item->movable) + return false; + + if (!comContext->wos2ros_tid_list) + return false; + + if (!comContext->delvec_tid_list) + if (ItemPointerIsValid(last_whiteout_orig_tid) && + ItemPointerEquals(last_whiteout_orig_tid, &data_wos_item->orig_tid)) + return false; + + return true; +} + +static bool +can_select_candidate_for_update_delvec(vci_tid_tid_xid64_t *whiteout_wos_item, vci_RosCommandContext *comContext) +{ + if (!whiteout_wos_item->movable) + return false; + + if (!comContext->delvec_tid_list) + return false; + + return true; +} + +static void +put_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid) +{ + TupleTableSlot *slot; + Tuplesortstate 
*sortstate; + + slot = comContext->tid_tid_slot; + + ExecClearTuple(slot); + + if (wos_kind == WOS_Data) + sortstate = comContext->wos2ros_tid_list; + else + sortstate = comContext->delvec_tid_list; + + Assert(sortstate != NULL); + + slot->tts_values[0] = ItemPointerGetDatum(orig_tid); + slot->tts_values[1] = ItemPointerGetDatum(wos_tid); + slot->tts_isnull[0] = false; + slot->tts_isnull[1] = false; + + slot->tts_flags |= TTS_FLAG_EMPTY; + + ExecStoreVirtualTuple(slot); + + tuplesort_puttupleslot(sortstate, slot); +} + +static bool +get_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid) +{ + bool isnull; + TupleTableSlot *slot; + Tuplesortstate *sortstate; + + slot = MakeSingleTupleTableSlot(comContext->tid_tid_slot->tts_tupleDescriptor, &TTSOpsMinimalTuple); + + if (wos_kind == WOS_Data) + sortstate = comContext->wos2ros_tid_list; + else + sortstate = comContext->delvec_tid_list; + + Assert(sortstate != NULL); + + if (!tuplesort_gettupleslot(sortstate, true, false, slot, NULL)) + { + ExecDropSingleTupleTableSlot(slot); + return false; + } + + slot_getsomeattrs(slot, 2); + + *orig_tid = *DatumGetItemPointer(slot_getattr(slot, 1, &isnull)); + *wos_tid = *DatumGetItemPointer(slot_getattr(slot, 2, &isnull)); + + ExecDropSingleTupleTableSlot(slot); + return true; +} + +static int +readTidListFromWosIntoTidArray(Oid wos_oid, WosKind wos_kind, vci_tid_tid_xid64_t *wos_entris, int max_wos_entries, Snapshot snapshot) +{ + LOCKMODE lockmode = AccessShareLock; + TableScanDesc scan; + HeapTuple tuple; + Relation rel; + TupleDesc tupleDesc; + int num_rows = 0; + + rel = relation_open(wos_oid, lockmode); + + tupleDesc = RelationGetDescr(rel); + + CHECK_FOR_INTERRUPTS(); + + scan = table_beginscan(rel, snapshot, 0, NULL); + + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + bool isnull; + + if (max_wos_entries <= num_rows) + ereport(ERROR, + 
(errcode(ERRCODE_DATA_CORRUPTED), + errmsg("too many WOS rows over estimation"))); + + wos_entris[num_rows].orig_tid = *DatumGetItemPointer(heap_getattr(tuple, 1, tupleDesc, &isnull)); /* original_tid in WOS */ + wos_entris[num_rows].wos_tid = tuple->t_self; + wos_entris[num_rows].xid64 = DatumGetInt64(heap_getattr(tuple, 2, tupleDesc, &isnull)); /* xid64 in WOS */ + + wos_entris[num_rows].movable = true; + + Assert(ItemPointerIsValid(&wos_entris[num_rows].orig_tid)); + + CHECK_FOR_INTERRUPTS(); + + num_rows++; + } + table_endscan(scan); + + table_close(rel, lockmode); + + return num_rows; +} + +static void +readTidListFromWosIntoTidSortState(Oid wos_oid, WosKind wos_kind, + TupleTableSlot *slot, Tuplesortstate *sortstate, + Snapshot snapshot, + TransactionId wos2ros_xid) +{ + LOCKMODE lockmode = AccessShareLock; + TableScanDesc scan; + HeapTuple tuple; + Relation rel; + TupleDesc tupleDesc; + + rel = relation_open(wos_oid, lockmode); + tupleDesc = RelationGetDescr(rel); + + CHECK_FOR_INTERRUPTS(); + + scan = table_beginscan(rel, snapshot, 0, NULL); + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + TransactionId xmin; + bool isnull; + bool movable; + + xmin = HeapTupleHeaderGetXmin(tuple->t_data); + movable = TransactionIdPrecedes(xmin, wos2ros_xid); + ExecClearTuple(slot); + + slot->tts_values[0] = heap_getattr(tuple, 1, tupleDesc, &isnull); /* original_tid in WOS */ + slot->tts_values[1] = ItemPointerGetDatum(&tuple->t_self); + slot->tts_values[2] = heap_getattr(tuple, 2, tupleDesc, &isnull); /* xid64 in WOS */ + slot->tts_values[3] = BoolGetDatum(movable); + + slot->tts_isnull[0] = false; + slot->tts_isnull[1] = false; + slot->tts_isnull[2] = false; + slot->tts_isnull[3] = false; + + slot->tts_flags |= TTS_FLAG_EMPTY; + + ExecStoreVirtualTuple(slot); + + tuplesort_puttupleslot(sortstate, slot); + + CHECK_FOR_INTERRUPTS(); + } + table_endscan(scan); + + relation_close(rel, lockmode); +} + +static 
bool +getValidTidSortState(Tuplesortstate *sortstate, TupleTableSlot *slot, vci_tid_tid_xid64_t *item) +{ + bool isnull; + TupleTableSlot *tempslot; + + tempslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor, &TTSOpsMinimalTuple); /* 'slot' only supplies the tuple descriptor */ + + if (!tuplesort_gettupleslot(sortstate, true, false, tempslot, NULL)) + { + ExecDropSingleTupleTableSlot(tempslot); + return false; + } + /* Deform all four columns and copy them into *item before the temporary slot is dropped. */ + slot_getsomeattrs(tempslot, 4); + + item->orig_tid = *DatumGetItemPointer(slot_getattr(tempslot, 1, &isnull)); + item->wos_tid = *DatumGetItemPointer(slot_getattr(tempslot, 2, &isnull)); + item->xid64 = DatumGetInt64(slot_getattr(tempslot, 3, &isnull)); + item->movable = DatumGetBool(slot_getattr(tempslot, 4, &isnull)); + + ExecDropSingleTupleTableSlot(tempslot); + return true; +} + +static int32 +compareXid64(int64 data_wos_xid64, int64 whiteout_wos_xid64) +{ + Assert((data_wos_xid64 > 0) && (whiteout_wos_xid64 > 0)); + + if (data_wos_xid64 == whiteout_wos_xid64) + { + return 0; + } + else if (data_wos_xid64 > whiteout_wos_xid64) + { + return +1; + } + else + { + /* NOTE(review): a smaller data-WOS xid also yields 0, not -1; the merge loops in this file drop both entries on 0 -- presumably an insert cancelled by a later delete of the same TID, confirm this asymmetry is intended */ + return 0; + } +} diff --git a/contrib/vci/storage/vci_ros_daemon.c b/contrib/vci/storage/vci_ros_daemon.c new file mode 100644 index 0000000..5976170 --- /dev/null +++ b/contrib/vci/storage/vci_ros_daemon.c @@ -0,0 +1,865 @@ +/*------------------------------------------------------------------------- + * + * vci_ros_daemon.c + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_ros_daemon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/htup_details.h" /* NOTE(review): duplicate of the previous include */ +#include "access/xact.h" +#include "access/xlog.h" +#include "c.h" +#include "catalog/index.h" +#include "catalog/pg_database.h" +#include "fmgr.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker.h" 
+#include "storage/bufpage.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procarray.h" /* for TransactionIdIsInProgress() */ +/* #include "storage/shmem.h" */ +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "pgstat.h" + +#include "vci.h" +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_ros_daemon.h" +#include "vci_ros_command.h" + +#include "vci_memory_entry.h" + +/** + * message on worker exit. + */ +typedef struct message_on_worker_exit +{ + int log_min_messages; + int message_level; + char message[1024]; +} message_on_worker_exit_t; + +static message_on_worker_exit_t message_on_worker_exit; + +#define INIT_MESSAGE_ON_WORKER_EXIT() \ +do \ +{ \ + MemSet(&message_on_worker_exit, 0x00, sizeof(message_on_worker_exit)); \ + message_on_worker_exit.log_min_messages = log_min_messages; \ + on_proc_exit(callback_on_exit_worker, Int32GetDatum(0)); \ +} while (0) + +#define SET_MESSAGE_ON_WORKER_EXIT(elevel, ...) 
\ +do \ +{ \ + message_on_worker_exit.message_level = (elevel); \ + snprintf(message_on_worker_exit.message, sizeof(message_on_worker_exit.message), __VA_ARGS__); \ + message_on_worker_exit.log_min_messages = log_min_messages; \ + log_min_messages = PANIC; \ +} while (0) + +#define RESET_MESSAGE_ON_WORKER_EXIT() \ +do \ +{ \ + log_min_messages = message_on_worker_exit.log_min_messages; \ + message_on_worker_exit.message_level = 0; \ + message_on_worker_exit.message[0] = '\0'; \ +} while (0) + +static bool TryToOpenVCIRelations(Oid indexOid, LOCKMODE heapLock, LOCKMODE indexLock, + Relation *heapRel, Relation *indexRel); +static void CheckRosControlWorkerCancel(void); +static void callback_on_exit_worker(int code, Datum arg); + +/* The worker names below are stored in BGW_MAXLEN-sized buffers. */ +/* If the ROS control worker name is changed then update the bgw_name check in LockAcquire() too. */ +static const char VCI_ROS_CONTROL_DAEMON_NAME[BGW_MAXLEN] = "vci:ROS control daemon"; +static const char VCI_ROS_CONTROL_WORKER_NAME_TEMP[BGW_MAXLEN] = "vci:ROS control worker(slot=%d)"; +static const char VCI_ROS_CONTROL_WORKER_TYPE[BGW_MAXLEN] = "vci:ROS control worker"; + +/* flags set by signal handlers */ +static volatile sig_atomic_t gotSighup = false; +static volatile sig_atomic_t gotSigterm = false; + +static vci_workerslot_t *workerslot; + +static char probeMessage[num_vci_rc][1024] = +{ + " data WOS count : %8d / %8d.", + " whiteout WOS count : %8d / %8d.", + " CDR : %8d / %8d (extent %d).", + " CDE : %8d / %8d (extent %d).", + " TIDCRID : %8d / %8d.", +}; + +/* ------------ daemon -------------- */ + +/** + * Register the ROS control daemon as a background worker; called from _PG_init() + */ +void +vci_ROS_control_daemon_setup(void) +{ + BackgroundWorker worker; + + /* for internal use: the daemon can be switched off through this GUC */ + if (VciGuc.enable_ros_control_daemon == false) + { + elog(DEBUG1, "vci: no daemon mode"); + return; + } + + memset(&worker, 0, sizeof(worker)); + /* set up common data for all our workers */ + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | + 
BGWORKER_BACKEND_DATABASE_CONNECTION; + /* worker.bgw_start_time = BgWorkerStart_ConsistentState; */ + worker.bgw_start_time = BgWorkerStart_RecoveryFinished; + /* worker.bgw_start_time = BgWorkerStart_PostmasterStart; */ + + worker.bgw_restart_time = VCI_DAEMON_RESTART_TIME; + worker.bgw_notify_pid = 0; + /* Pass the name as an argument, never as the format string itself. */ + snprintf(worker.bgw_name, BGW_MAXLEN, "%s", VCI_ROS_CONTROL_DAEMON_NAME); + snprintf(worker.bgw_type, BGW_MAXLEN, "%s", VCI_ROS_CONTROL_DAEMON_NAME); + strcpy(worker.bgw_library_name, VCI_STRING); + strcpy(worker.bgw_function_name, "vci_ROS_control_daemon_main"); + worker.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&worker); +} + +/** + * Signal handler for SIGTERM + * + * @description + * Set a flag to tell the main loop to terminate, and set our latch to wake it up. + * + * @param[in] SIGNAL_ARGS + */ +static void +vci_ROSControlDaemonSigterm(SIGNAL_ARGS) +{ + gotSigterm = true; + if (MyProc) + SetLatch(&MyProc->procLatch); +} + +/** + * Signal handler for SIGHUP + * + * @description + * Set a flag to tell the main loop to reread the config file, and set + * our latch to wake it up. + * + * @param[in] SIGNAL_ARGS + */ +static void +vci_ROSControlDaemonSighup(SIGNAL_ARGS) +{ + gotSighup = true; + if (MyProc) + SetLatch(&MyProc->procLatch); +} + +/** + * ROS control daemon's entry point. + */ +void +vci_ROS_control_daemon_main(Datum main_arg) +{ + /* + * XXX - VCI wants to pretend this worker is like an autovacuum launcher; + * Let's set the MyBackendType to achieve this. + */ + MyBackendType = B_AUTOVAC_LAUNCHER; + + pg_bindtextdomain(TEXTDOMAIN); + + /* StringInfoData buf; */ + elog(DEBUG1, "start initialize %s", MyBgworkerEntry->bgw_name); + + /* Establish signal handlers before unblocking signals. 
*/
+	pqsignal(SIGHUP, vci_ROSControlDaemonSighup);
+	pqsignal(SIGTERM, vci_ROSControlDaemonSigterm);
+	pqsignal(SIGQUIT, vci_ROSControlDaemonSigterm);
+	pqsignal(SIGINT, vci_ROSControlDaemonSigterm);
+
+	/* pqsignal(SIGUSR1, vci_ROSNotify); */
+
+	/* We're now ready to receive signals */
+	BackgroundWorkerUnblockSignals();
+
+	BackgroundWorkerInitializeConnection(NULL, NULL, 0);	/* No database name or
+															 * user name is given:
+															 * connect to the shared
+															 * database */
+
+	/* Connect DB to access common system catalog */
+
+	workerslot = (vci_workerslot_t *) palloc0(sizeof(vci_workerslot_t) *
+											  VciGuc.control_max_workers);
+
+	/*
+	 * Main loop: repeats until one of the SIGTERM/SIGQUIT/SIGINT handlers
+	 * installed above has set gotSigterm.
+	 */
+	while (!gotSigterm)
+	{
+		int			rc;
+		int			i;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Background workers mustn't call usleep() or any direct equivalent:
+		 * instead, they may wait on their process latch, which sleeps as
+		 * necessary, but is awakened if postmaster dies. That way the
+		 * background process goes away immediately in an emergency.
+		 *
+		 * The timeout is VciGuc.control_naptime multiplied by 1000.
+		 */
+		rc = WaitLatch(&MyProc->procLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+					   VciGuc.control_naptime * INT64CONST(1000),
+					   PG_WAIT_EXTENSION);
+		ResetLatch(&MyProc->procLatch);
+
+		/* emergency bailout if postmaster has died */
+		if (rc & WL_POSTMASTER_DEATH)
+			proc_exit(1);		/* abnormal end */
+
+		if (gotSigterm)
+			goto done;
+
+		LWLockAcquire(VciShmemAddr->io_load_lock, LW_EXCLUSIVE);
+
+		/* Check that the database of each VCI still exists */
+		vci_RemoveMemoryEntryOnDroppedDatabase();
+
+		vci_update_memoryentry_in_devloadinfo();
+
+		if (gotSigterm)
+		{
+			LWLockRelease(VciShmemAddr->io_load_lock);
+			goto done;
+		}
+
+		VciShmemAddr->translated_dev_pos = 0;
+
+		elog(DEBUG2, ">>> 1. control_max_workers = %d", VciGuc.control_max_workers);
+		for (i = 0; i < VciGuc.control_max_workers; i++)
+		{
+			elog(DEBUG2, ">>> 1. workerslot[%d].pid is %d", i, (int) workerslot[i].pid);
+			if (workerslot[i].pid != 0)
+			{
+				pid_t		pid;
+				BgwHandleStatus status;
+
+				status = GetBackgroundWorkerPid(&workerslot[i].handle, &pid);
+				switch (status)
+				{
+					case BGWH_STOPPED:
+						workerslot[i].pid = 0;	/* slot becomes free again */
+						break;
+					case BGWH_NOT_YET_STARTED:
+					case BGWH_POSTMASTER_DIED:
+					case BGWH_STARTED:
+						break;
+					default:
+						/* LCOV_EXCL_START */
+						elog(PANIC, "invalid BgwHandleStatus in vci_ROS_control_daemon_main");
+						/* LCOV_EXCL_STOP */
+						break;
+				}
+
+				if (gotSigterm)
+				{
+					LWLockRelease(VciShmemAddr->io_load_lock);
+					goto done;
+				}
+			}
+		}
+
+		LWLockAcquire(VciShmemAddr->memory_entries->lock, LW_SHARED);
+
+		vci_ResetDevloadCurrentPos();
+
+		if (!fullPageWrites)
+			goto reload_configuration;	/* launch no workers in this cycle */
+
+		elog(DEBUG2, ">>> 2. control_max_workers = %d", VciGuc.control_max_workers);
+		for (i = 0; i < VciGuc.control_max_workers; i++)
+		{
+			elog(DEBUG2, ">>> 2. workerslot[%d].pid is %d", i, (int) workerslot[i].pid);
+			if (workerslot[i].pid == 0)
+			{
+				int			j;
+				bool		worker_running = false;
+
+				if (!vci_GetWosRosConvertingVCI(&VciShmemAddr->worker_args_array[i]))
+					break;		/* stop filling slots for this cycle */
+
+				Assert(OidIsValid(VciShmemAddr->worker_args_array[i].dbid));
+				Assert(OidIsValid(VciShmemAddr->worker_args_array[i].oid));
+
+				for (j = 0; j < VciGuc.control_max_workers; j++)
+				{
+					if (workerslot[j].pid != 0 &&
+						workerslot[j].dbid == VciShmemAddr->worker_args_array[i].dbid &&
+						workerslot[j].oid == VciShmemAddr->worker_args_array[i].oid)
+					{
+						elog(DEBUG1, "a worker is running on VCI (oid=%d, dbid=%d)",
+							 VciShmemAddr->worker_args_array[i].oid,
+							 VciShmemAddr->worker_args_array[i].dbid);
+						worker_running = true;
+						break;
+					}
+				}
+
+				if (!worker_running)
+				{
+					workerslot[i] = vci_LaunchROSControlWorker(&VciShmemAddr->worker_args_array[i], i);
+					workerslot[i].oid = VciShmemAddr->worker_args_array[i].oid;
+					workerslot[i].dbid = VciShmemAddr->worker_args_array[i].dbid;
+				}
+			}
+
+		}
+
+		/*
+		 * In case of a SIGHUP, just reload the configuration. (?)
+ */ +reload_configuration: + if (gotSighup) + { + gotSighup = false; + ProcessConfigFile(PGC_SIGHUP); + } + + vci_MoveTranslatedVCI2Tail(); + + LWLockRelease(VciShmemAddr->memory_entries->lock); + + LWLockRelease(VciShmemAddr->io_load_lock); + } + +done: + + /* + * Daemon terminate by exit code=1, restart by postmaster as necessary. + */ + proc_exit(1); +} + +/* ------------ Worker -------------- */ + +vci_workerslot_t +vci_LaunchROSControlWorker(vci_wosros_conv_worker_arg_t *vciinfo, int slot_id) +/* vci_database_priority_t *item, */ +{ + BackgroundWorker worker; + BackgroundWorkerHandle *handle; + pid_t pid; + + vci_workerslot_t result; + + /* Assert(MyDatabaseId == InvalidOid); */ + + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + + sprintf(worker.bgw_library_name, VCI_STRING); + sprintf(worker.bgw_function_name, "vci_ROS_control_worker_main"); + snprintf(worker.bgw_name, BGW_MAXLEN, VCI_ROS_CONTROL_WORKER_NAME_TEMP, slot_id); + snprintf(worker.bgw_type, BGW_MAXLEN, VCI_ROS_CONTROL_WORKER_TYPE); + +/* + worker.bgw_main_arg = PointerGetDatum(item); +*/ + worker.bgw_main_arg = PointerGetDatum(vciinfo); + worker.bgw_notify_pid = 0; /* don't notify by SIG_USR1 since it calls + * SetLatch and and awakens the parent process + * ROS daemon. That results ROS daemon + * spawning unnecessary multiple ROS control + * workers. */ + + if (!RegisterDynamicBackgroundWorker(&worker, &handle)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("could not register background process"), + errhint("You may need to increase max_worker_processes."))); + + /* Wait for workers to become ready. 
*/ + while (true) + { + BgwHandleStatus status; + + status = GetBackgroundWorkerPid(handle, &pid); + if (gotSigterm) + break; + + switch (status) + { + case BGWH_NOT_YET_STARTED: + continue; + + case BGWH_STARTED: + goto done; + + case BGWH_STOPPED: + pid = 0; + goto done; + + case BGWH_POSTMASTER_DIED: + pid = 0; + goto done; + + default: + /* LCOV_EXCL_START */ + elog(PANIC, "should not reach here"); + /* LCOV_EXCL_STOP */ + goto done; + } + } + +done: + result.pid = pid; + result.handle = *handle; + + pfree(handle); + + return result; +} + +/** + * + */ +static inline bool +vci_GetRosCommandExecFlag(char flag, vci_ros_command_t command_id) +{ + return (flag & (1 << command_id)) != 0; +} + +static inline void +vci_SetRosCommandExecFlag(char *flag, vci_ros_command_t command_id) +{ + *flag |= (1 << command_id); +} + +static int +determine_ExecCommand_and_Extent(const Oid vci_oid, + char *targetExecFlag, + int32 *targetExtentForCdr, + bool force_wosros_conv) +{ + Relation indexRel; + Relation heapRel; + vci_ros_command_t command; + + /* Transaction Start */ + SetCurrentStatementStartTimestamp(); + StartTransactionCommand(); + PushActiveSnapshot(GetTransactionSnapshot()); + + /* Try to open the heap relation & the index relation. */ + if (!TryToOpenVCIRelations(vci_oid, AccessShareLock, AccessShareLock, + &heapRel, &indexRel)) + { + AbortCurrentTransaction(); + return -1; + } + + /* Check request for ros control worker cancel. */ + CheckRosControlWorkerCancel(); + + MemSet(targetExecFlag, 0, sizeof(char)); + MemSet(targetExtentForCdr, 0, sizeof(int32)); + + for (command = 0; command < num_vci_rc; command++) + { + int32 count = 0; + vci_target_extent_info_t extent_info = {0, -1}; + int32 targetExtentId; + + switch (command) + { + case vci_rc_wos_ros_conv: + /* 1. count DataWOS */ + count = vci_CountFreezedInDataWos(indexRel, MaxAllocSize); + break; + + case vci_rc_update_del_vec: + /* 2. 
count WhiteoutWOS */
+				count = vci_CountFreezedInWhiteoutWos(indexRel, MaxAllocSize);
+				break;
+
+			case vci_rc_collect_deleted:
+				/* 3. count deleted rows in each extent */
+				extent_info = vci_CountDeletedRowsInROS(indexRel, (uint32) VciGuc.cdr_threshold);
+				break;
+
+			case vci_rc_update_tid_crid:
+				/* 5. count TID->CRID update list */
+				count = vci_CountTidCridUpdateListLength(indexRel, MaxAllocSize);
+				break;
+
+			case vci_rc_collect_extent:
+				/* 6. count unused extents */
+				extent_info = vci_CountUnusedExtents(indexRel);
+				break;
+
+			default:
+				/* LCOV_EXCL_START */
+				elog(ERROR, "unexpected ROS command");
+				/* LCOV_EXCL_STOP */
+				break;
+		}
+
+		switch (command)
+		{
+			case vci_rc_wos_ros_conv:
+				elog(DEBUG2, &probeMessage[vci_rc_wos_ros_conv][0], count, VciGuc.wosros_conv_threshold);
+				if (force_wosros_conv || count >= VciGuc.wosros_conv_threshold)
+					vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_wos_ros_conv);
+				break;
+
+			case vci_rc_update_del_vec:
+				elog(DEBUG2, &probeMessage[vci_rc_update_del_vec][0], count, VCI_UPDATE_DELVEC_THRESHOLD);
+				if (force_wosros_conv || count >= VCI_UPDATE_DELVEC_THRESHOLD)
+					vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_update_del_vec);
+				break;
+
+			case vci_rc_update_tid_crid:
+				elog(DEBUG2, &probeMessage[vci_rc_update_tid_crid][0], count, VCI_UPDATE_TIDCRID_THRESHOLD);
+				if (count >= VCI_UPDATE_TIDCRID_THRESHOLD)
+					vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_update_tid_crid);
+				break;
+
+			case vci_rc_collect_extent:
+			case vci_rc_collect_deleted:
+				targetExtentId = VCI_INVALID_EXTENT_ID;
+				if (extent_info.num_fit_extents > 0)
+				{
+					targetExtentId = extent_info.best_extent_id;
+
+					if (command == vci_rc_collect_deleted)
+						*targetExtentForCdr = targetExtentId;	/* only CDR reports its extent */
+
+					vci_SetRosCommandExecFlag(targetExecFlag, command);
+				}
+				break;
+
+			default:
+				/* LCOV_EXCL_START */
+				elog(ERROR, "unexpected ROS command");
+				/* LCOV_EXCL_STOP */
+				break;
+		}
+	}
+
+	/* unlock VCI main rel, then the heap rel */
+	index_close(indexRel, AccessShareLock);
+	table_close(heapRel, AccessShareLock);
+
+	/* Transaction End */
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+
+	return 0;
+}
+
+/**
+ * Execute the selected ROS commands, each one in a transaction of its own.
+ *
+ * For every command whose bit is set in targetExecCommandFlag, the heap
+ * relation is opened with AccessShareLock and the index relation with
+ * ShareUpdateExclusiveLock, the command is run and the transaction is
+ * committed.  If the relations cannot be opened, the current transaction is
+ * aborted and the function returns without running the remaining commands.
+ *
+ * @param[in] targetIndexOid target index oid
+ * @param[in] targetExecCommandFlag target exec commands
+ * @param[in] targetExtentId target extent id, passed to vci_CollectDeletedRows()
+ * @param[out] num_converted_data_wos number of rows converted in Data WOS;
+ *             written only when vci_rc_wos_ros_conv is executed
+ * @param[out] num_converted_whiteout_wos number of rows converted in Whiteout WOS;
+ *             written only when vci_rc_update_del_vec is executed
+ */
+static void
+vci_executeROScommand(Oid targetIndexOid, char targetExecCommandFlag, int32 targetExtentId,
+					  int *num_converted_data_wos, int *num_converted_whiteout_wos)
+{
+	vci_ros_command_t command;
+
+	/*
+	 * Loop for executing ROS commands; each command is executed in another
+	 * transaction.
+	 */
+	for (command = 0; command < num_vci_rc; command++)
+	{
+		if (vci_GetRosCommandExecFlag(targetExecCommandFlag, command))
+		{
+			Relation	mainRel;
+			Relation	heapRel;
+			Size		workAreaSize = VciGuc.maintenance_work_mem * INT64CONST(1024);
+
+			instr_time	s_time;
+			instr_time	e_time;
+			volatile Snapshot snapshot;
+
+			/* Check request for ros control worker cancel. */
+			CheckRosControlWorkerCancel();
+
+			/* transaction start */
+			SetCurrentStatementStartTimestamp();
+			StartTransactionCommand();
+			snapshot = GetTransactionSnapshot();
+			PushActiveSnapshot(snapshot);
+			GetCurrentTransactionId();	/* return value is not used */
+
+			/*
+			 * Try to open the heap relation & the index relation, and get
+			 * ShareUpdateExclusiveLock for the index relation.
+			 */
+			if (!TryToOpenVCIRelations(targetIndexOid, AccessShareLock, ShareUpdateExclusiveLock,
+									   &heapRel, &mainRel))
+			{
+				/* Give up: abort this transaction and skip the remaining commands. */
+				AbortCurrentTransaction();
+				return;
+			}
+
+			elog(LOG, "starts ROS command \"%s\"", vci_GetRosCommandName(command));
+			INSTR_TIME_SET_CURRENT(s_time);
+
+			switch (command)
+			{
+				case vci_rc_wos_ros_conv:
+					/* 1. WOS->ROS conversion */
+					*num_converted_data_wos = vci_ConvertWos2Ros(mainRel, workAreaSize, VciGuc.wosros_conv_threshold);
+					break;
+
+				case vci_rc_update_del_vec:
+					/* 2. update delete vector */
+					*num_converted_whiteout_wos = vci_UpdateDelVec(mainRel, workAreaSize, VCI_UPDATE_DELVEC_THRESHOLD);
+					break;
+
+				case vci_rc_collect_deleted:
+					/* 3. collect deleted rows */
+					vci_CollectDeletedRows(mainRel, workAreaSize, targetExtentId);
+					break;
+
+				case vci_rc_update_tid_crid:
+					/* 5. update TID->CRID update list to TID-CRID tree */
+					vci_UpdateTidCrid(mainRel, workAreaSize, 10000);
+					break;
+
+				case vci_rc_collect_extent:
+					/* 6. collect an unused extent */
+					vci_CollectUnusedExtent(mainRel, workAreaSize);
+					break;
+
+				default:
+					/* LCOV_EXCL_START */
+					elog(ERROR, "unexpected ROS command");
+					/* LCOV_EXCL_STOP */
+					break;
+			}
+
+			index_close(mainRel, ShareUpdateExclusiveLock);
+			table_close(heapRel, AccessShareLock);
+
+			PopActiveSnapshot();
+			CommitTransactionCommand();
+
+			INSTR_TIME_SET_CURRENT(e_time);
+			INSTR_TIME_SUBTRACT(e_time, s_time);
+			elog(LOG, "finished ROS command \"%s\" (%.03f ms)", vci_GetRosCommandName(command),
+				 INSTR_TIME_GET_MILLISEC(e_time));
+		}
+	}
+}
+
+/*
+ * Connect this background worker to the database identified by OID.
+ *
+ * @param[in] dboid id of db to which the worker connects.
+ * @param[in] username user name
+ */
+static void
+BackgroundWorkerInitializeConnectionByOid1(Oid dboid, const char *username)
+{
+	BackgroundWorker *worker = MyBgworkerEntry;
+
+	/* XXX is this the right errcode?
*/ + if (!(worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION)) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database connection requirement not indicated during registration"))); + + InitPostgres(NULL, dboid, username, InvalidOid, 0, NULL); + + /* it had better not gotten out of "init" mode yet */ + if (!IsInitProcessingMode()) + ereport(ERROR, + (errmsg("invalid processing mode in background worker"))); + SetProcessingMode(NormalProcessing); +} + +#define RATIO_OF_INCREASE 1.1 + +/** + * @param[in] main_arg id of vci, a WOS->ROS transfomation of which is performed. + */ +void +vci_ROS_control_worker_main(Datum main_arg) +{ + Oid targetIndexOid = InvalidOid; + int32 targetExtentId = 01; + char targetExecCommandFlag = 0x00; + + Oid dboid; + vci_wosros_conv_worker_arg_t *vciinfo; + int ret; + int num_converted_data_wos = INT_MAX; + int num_converted_whiteout_wos = INT_MAX; + + pg_bindtextdomain(TEXTDOMAIN); + + pqsignal(SIGHUP, vci_ROSControlDaemonSighup); + pqsignal(SIGTERM, vci_ROSControlDaemonSigterm); + pqsignal(SIGQUIT, vci_ROSControlDaemonSigterm); + pqsignal(SIGINT, vci_ROSControlDaemonSigterm); + /* pqsignal(SIGUSR1, vci_ROSNotify); */ + + /* Check full_page_writers=off */ + if (!fullPageWrites) + return; + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + INIT_MESSAGE_ON_WORKER_EXIT(); + + /* + * Checkout the Postmaster was rebooted. if + * (MyBgworkerEntry->bgw_notify_pid == 0) return; + */ + + /* Connect to DB corresponding to dbid */ + + vciinfo = (vci_wosros_conv_worker_arg_t *) DatumGetPointer(main_arg); + targetIndexOid = vciinfo->oid; + dboid = vciinfo->dbid; + + SET_MESSAGE_ON_WORKER_EXIT(DEBUG1, "worker: Failed to connect '%d'.", dboid); + BackgroundWorkerInitializeConnectionByOid1(dboid, NULL); + RESET_MESSAGE_ON_WORKER_EXIT(); + + elog(DEBUG1, "worker: connect to %d is OK. 
do wos->ros conversion on vci %d", dboid, targetIndexOid); + +#if 0 + /** + * TODO -- Put thi call back again if/when Iwata-San's separate bgworker patch is accepted. + * See https://www.postgresql.org/message-id/OS7PR01MB11964335F36BE41021B62EAE8EAE4A%40OS7PR01MB11964.jpnprd01.prod.outlook.com + */ + + /* Accept cancel by admin commands. */ + AcceptBackgroundWorkerCancel(MyDatabaseId, BGWORKER_CANCEL_ADMIN_COMMANDS); +#endif + + ret = determine_ExecCommand_and_Extent(targetIndexOid, &targetExecCommandFlag, + &targetExtentId, vciinfo->force_next_wosros_conv); + + if (ret == 0) + vci_executeROScommand(targetIndexOid, targetExecCommandFlag, targetExtentId, + &num_converted_data_wos, &num_converted_whiteout_wos); + + if (vciinfo->force_next_wosros_conv && + num_converted_data_wos == 0 && + num_converted_whiteout_wos == 0) + { + vci_id_t vciid; + + vciid.oid = targetIndexOid; + vciid.dbid = dboid; + + vci_SetForceNextWosRosConvFlag(&vciid, false); + } + +} + +/** + * Try to open the heap relation & the index relation. + * open the heap relation to detect AccessExclusiveLock of the heap + * relation, before opening the index relation. + */ +static bool +TryToOpenVCIRelations(Oid indexOid, LOCKMODE heapLock, LOCKMODE indexLock, + Relation *heapRel, Relation *indexRel) +{ + Oid heapOid; + + heapOid = IndexGetRelation(indexOid, true); + if (OidIsValid(heapOid)) + { + *heapRel = try_relation_open(heapOid, heapLock); + if (*heapRel != NULL) + { + *indexRel = try_relation_open(indexOid, indexLock); + if (*indexRel != NULL) + { + if (isVciIndexRelation(*indexRel)) + return true; + + relation_close(*indexRel, indexLock); + } + + relation_close(*heapRel, heapLock); + } + } + + elog(DEBUG1, "worker: The relation the OID=%d indicates was deleted.", indexOid); + + return false; +} + +/** + * Check request for ros control worker cancel. 
+ *
+ * If a termination signal has set gotSigterm, report it at DEBUG1 and end
+ * the worker process through proc_exit(), so that the registered exit
+ * callbacks run with the real exit code.
+ */
+static void
+CheckRosControlWorkerCancel(void)
+{
+#ifdef WIN32
+	if (UNBLOCKED_SIGNAL_QUEUE())
+		pgwin32_dispatch_queued_signals();
+#endif							/* WIN32 */
+
+	if (gotSigterm)
+	{
+		ereport(DEBUG1,
+				(errcode(ERRCODE_ADMIN_SHUTDOWN),
+				 errmsg_internal("terminating VCI worker process due to administrator command")));
+
+		/*
+		 * Terminate the process.  Use proc_exit() rather than a bare exit():
+		 * it is the exit path the rest of this file uses and it passes the
+		 * exit code on to the exit callbacks.
+		 */
+		proc_exit(1);
+	}
+}
+
+/**
+ * Exit callback of a ROS control worker.
+ *
+ * Restores log_min_messages from the saved value, emits the pending
+ * on-exit message (if any) at its recorded level, and logs the exit code.
+ *
+ * @param[in] code exit code of the process
+ * @param[in] arg  not used
+ */
+static void
+callback_on_exit_worker(int code, Datum arg)
+{
+	log_min_messages = message_on_worker_exit.log_min_messages;
+
+	if (message_on_worker_exit.message[0])
+	{
+		elog(message_on_worker_exit.message_level,
+			 "%s", message_on_worker_exit.message);
+		message_on_worker_exit.message[0] = '\0';
+	}
+	elog(DEBUG1, "worker: ROS control worker exit code=%d.", code);
+}
-- 
1.8.3.1