From 6d726b83d493d2999ae6ff455af5316b005467a6 Mon Sep 17 00:00:00 2001 From: David Christensen Date: Fri, 5 Jan 2024 15:10:42 -0500 Subject: [PATCH v3 15/28] chore: Split nbtree.h structure defs into an internal file These definitions have been separated out so we can calculate block size constants from front-end code as well, which we were prevented from doing due to s_lock.h complaints when compiled in front-end mode. Since we only need these structure sizes to calculate the various cluster constants, we can define just the parts that routine cares about in the new header and pull that into blocksize.c instead of all of nbtree.h. --- src/include/access/nbtree.h | 163 +-------------------------- src/include/access/nbtree_int.h | 192 ++++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+), 161 deletions(-) create mode 100644 src/include/access/nbtree_int.h diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9575ca17ce..476a08def7 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -25,168 +25,9 @@ #include "storage/bufmgr.h" #include "storage/shm_toc.h" -/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ -typedef uint16 BTCycleId; +/* data structures are defined in nbtree_int.h */ +#include "access/nbtree_int.h" -/* - * BTPageOpaqueData -- At the end of every page, we store a pointer - * to both siblings in the tree. This is used to do forward/backward - * index scans. The next-page link is also critical for recovery when - * a search has navigated to the wrong page due to concurrent page splits - * or deletions; see src/backend/access/nbtree/README for more info. - * - * In addition, we store the page's btree level (counting upwards from - * zero at a leaf page) as well as some flag bits indicating the page type - * and status.
If the page is deleted, a BTDeletedPageData struct is stored - * in the page's tuple area, while a standard BTPageOpaqueData struct is - * stored in the page special area. - * - * We also store a "vacuum cycle ID". When a page is split while VACUUM is - * processing the index, a nonzero value associated with the VACUUM run is - * stored into both halves of the split page. (If VACUUM is not running, - * both pages receive zero cycleids.) This allows VACUUM to detect whether - * a page was split since it started, with a small probability of false match - * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs - * ago. Also, during a split, the BTP_SPLIT_END flag is cleared in the left - * (original) page, and set in the right page, but only if the next page - * to its right has a different cycleid. - * - * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested - * instead. - * - * NOTE: the btpo_level field used to be a union type in order to allow - * deleted pages to store a 32-bit safexid in the same field. We now store - * 64-bit/full safexid values using BTDeletedPageData instead. - */ - -typedef struct BTPageOpaqueData -{ - BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ - BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ - uint32 btpo_level; /* tree level --- zero for leaf pages */ - uint16 btpo_flags; /* flag bits, see below */ - BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ -} BTPageOpaqueData; - -typedef BTPageOpaqueData *BTPageOpaque; - -#define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page)) - -/* Bits defined in btpo_flags */ -#define BTP_LEAF (1 << 0) /* leaf page, i.e. 
not internal page */ -#define BTP_ROOT (1 << 1) /* root page (has no parent) */ -#define BTP_DELETED (1 << 2) /* page has been deleted from tree */ -#define BTP_META (1 << 3) /* meta-page */ -#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ -#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ -#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ -#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ -#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ - -/* - * The max allowed value of a cycle ID is a bit less than 64K. This is - * for convenience of pg_filedump and similar utilities: we want to use - * the last 2 bytes of special space as an index type indicator, and - * restricting cycle ID lets btree use that space for vacuum cycle IDs - * while still allowing index type to be identified. - */ -#define MAX_BT_CYCLE_ID 0xFF7F - - -/* - * The Meta page is always the first page in the btree index. - * Its primary purpose is to point to the location of the btree root page. - * We also point to the "fast" root, which is the current effective root; - * see README for discussion. - */ - -typedef struct BTMetaPageData -{ - uint32 btm_magic; /* should contain BTREE_MAGIC */ - uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */ - BlockNumber btm_root; /* current root location */ - uint32 btm_level; /* tree level of the root page */ - BlockNumber btm_fastroot; /* current "fast" root location */ - uint32 btm_fastlevel; /* tree level of the "fast" root page */ - /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ - - /* number of deleted, non-recyclable pages during last cleanup */ - uint32 btm_last_cleanup_num_delpages; - /* number of heap tuples during last cleanup (deprecated) */ - float8 btm_last_cleanup_num_heap_tuples; - - bool btm_allequalimage; /* are all columns "equalimage"? 
*/ -} BTMetaPageData; - -#define BTPageGetMeta(p) \ - ((BTMetaPageData *) PageGetContents(p)) - -/* - * The current Btree version is 4. That's what you'll get when you create - * a new index. - * - * Btree version 3 was used in PostgreSQL v11. It is mostly the same as - * version 4, but heap TIDs were not part of the keyspace. Index tuples - * with duplicate keys could be stored in any order. We continue to - * support reading and writing Btree versions 2 and 3, so that they don't - * need to be immediately re-indexed at pg_upgrade. In order to get the - * new heapkeyspace semantics, however, a REINDEX is needed. - * - * Deduplication is safe to use when the btm_allequalimage field is set to - * true. It's safe to read the btm_allequalimage field on version 3, but - * only version 4 indexes make use of deduplication. Even version 4 - * indexes created on PostgreSQL v12 will need a REINDEX to make use of - * deduplication, though, since there is no other way to set - * btm_allequalimage to true (pg_upgrade hasn't been taught to set the - * metapage field). - * - * Btree version 2 is mostly the same as version 3. There are two new - * fields in the metapage that were introduced in version 3. A version 2 - * metapage will be automatically upgraded to version 3 on the first - * insert to it. INCLUDE indexes cannot use version 2. - */ -#define BTREE_METAPAGE 0 /* first page is meta */ -#define BTREE_MAGIC 0x053162 /* magic number in metapage */ -#define BTREE_VERSION 4 /* current version number */ -#define BTREE_MIN_VERSION 2 /* minimum supported version */ -#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ - -/* - * Maximum size of a btree index entry, including its tuple header. - * - * We actually need to be able to fit three items on every page, - * so restrict any one item to 1/3 the per-page available space. 
- * - * There are rare cases where _bt_truncate() will need to enlarge - * a heap index tuple to make space for a tiebreaker heap TID - * attribute, which we account for here. - */ -#define BTMaxItemSize(page) \ - (MAXALIGN_DOWN((PageGetUsablePageSize(page) - \ - MAXALIGN(3*sizeof(ItemIdData)) - \ - MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \ - MAXALIGN(sizeof(ItemPointerData))) -#define BTMaxItemSizeNoHeapTid(page) \ - MAXALIGN_DOWN((PageGetUsablePageSize(page) - \ - MAXALIGN(3*sizeof(ItemIdData)) - \ - MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - -/* - * MaxTIDsPerBTreePageDynamic is an upper bound on the number of heap TIDs tuples - * that may be stored on a btree leaf page. It is used to size the - * per-page temporary buffers. - * - * Note: we don't bother considering per-tuple overheads here to keep - * things simple (value is based on how many elements a single array of - * heap TIDs must have to fill the space between the page header and - * special area). The value is slightly higher (i.e. more conservative) - * than necessary as a result, which is considered acceptable. - */ -#define CalcMaxTIDsPerBTreePage(usablespace) \ - (int) ((usablespace) - sizeof(BTPageOpaqueData) / \ - sizeof(ItemPointerData)) -#define MaxTIDsPerBTreePageLimit (CalcMaxTIDsPerBTreePage(PageUsableSpaceMax)) -#define MaxTIDsPerBTreePageDynamic (CalcMaxTIDsPerBTreePage(PageUsableSpace)) /* * The leaf-page fillfactor defaults to 90% but is user-adjustable. * For pages above the leaf level, we use a fixed 70% fillfactor. diff --git a/src/include/access/nbtree_int.h b/src/include/access/nbtree_int.h new file mode 100644 index 0000000000..f337912a9a --- /dev/null +++ b/src/include/access/nbtree_int.h @@ -0,0 +1,192 @@ +/*------------------------------------------------------------------------- + * + * nbtree_int.h + * data structures for btree access method implementation. 
+ * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/nbtree_int.h + * + *------------------------------------------------------------------------- + */ +#ifndef NBTREE_INT_H +#define NBTREE_INT_H + +#include "storage/block.h" + +/* + * These definitions have been separated out so we can calculate block size + * constants from front-end code as well, which we were prevented from doing + * due to s_lock.h complaints when compiled in front-end mode. + * + * Since we only need to calculate the various cluster constants using this + * data, we can just define the parts that that routine cares about here and + * pull in that instead of all of nbtree.h directly. + */ + +/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ +typedef uint16 BTCycleId; + +/* + * BTPageOpaqueData -- At the end of every page, we store a pointer + * to both siblings in the tree. This is used to do forward/backward + * index scans. The next-page link is also critical for recovery when + * a search has navigated to the wrong page due to concurrent page splits + * or deletions; see src/backend/access/nbtree/README for more info. + * + * In addition, we store the page's btree level (counting upwards from + * zero at a leaf page) as well as some flag bits indicating the page type + * and status. If the page is deleted, a BTDeletedPageData struct is stored + * in the page's tuple area, while a standard BTPageOpaqueData struct is + * stored in the page special area. + * + * We also store a "vacuum cycle ID". When a page is split while VACUUM is + * processing the index, a nonzero value associated with the VACUUM run is + * stored into both halves of the split page. (If VACUUM is not running, + * both pages receive zero cycleids.) 
This allows VACUUM to detect whether + * a page was split since it started, with a small probability of false match + * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs + * ago. Also, during a split, the BTP_SPLIT_END flag is cleared in the left + * (original) page, and set in the right page, but only if the next page + * to its right has a different cycleid. + * + * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested + * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid in the same field. We now store + * 64-bit/full safexid values using BTDeletedPageData instead. + */ + +typedef struct BTPageOpaqueData +{ + BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ + BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ + uint32 btpo_level; /* tree level --- zero for leaf pages */ + uint16 btpo_flags; /* flag bits, see below */ + BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ +} BTPageOpaqueData; + +typedef BTPageOpaqueData *BTPageOpaque; + +#define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page)) + +/* Bits defined in btpo_flags */ +#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */ +#define BTP_ROOT (1 << 1) /* root page (has no parent) */ +#define BTP_DELETED (1 << 2) /* page has been deleted from tree */ +#define BTP_META (1 << 3) /* meta-page */ +#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ +#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ +#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ +#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ + +/* + * The max allowed value of a cycle ID is a bit less than 64K. 
This is + * for convenience of pg_filedump and similar utilities: we want to use + * the last 2 bytes of special space as an index type indicator, and + * restricting cycle ID lets btree use that space for vacuum cycle IDs + * while still allowing index type to be identified. + */ +#define MAX_BT_CYCLE_ID 0xFF7F + + +/* + * The Meta page is always the first page in the btree index. + * Its primary purpose is to point to the location of the btree root page. + * We also point to the "fast" root, which is the current effective root; + * see README for discussion. + */ + +typedef struct BTMetaPageData +{ + uint32 btm_magic; /* should contain BTREE_MAGIC */ + uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */ + BlockNumber btm_root; /* current root location */ + uint32 btm_level; /* tree level of the root page */ + BlockNumber btm_fastroot; /* current "fast" root location */ + uint32 btm_fastlevel; /* tree level of the "fast" root page */ + /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ + + /* number of deleted, non-recyclable pages during last cleanup */ + uint32 btm_last_cleanup_num_delpages; + /* number of heap tuples during last cleanup (deprecated) */ + float8 btm_last_cleanup_num_heap_tuples; + + bool btm_allequalimage; /* are all columns "equalimage"? */ +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) PageGetContents(p)) + +/* + * The current Btree version is 4. That's what you'll get when you create + * a new index. + * + * Btree version 3 was used in PostgreSQL v11. It is mostly the same as + * version 4, but heap TIDs were not part of the keyspace. Index tuples + * with duplicate keys could be stored in any order. We continue to + * support reading and writing Btree versions 2 and 3, so that they don't + * need to be immediately re-indexed at pg_upgrade. In order to get the + * new heapkeyspace semantics, however, a REINDEX is needed. 
+ * + * Deduplication is safe to use when the btm_allequalimage field is set to + * true. It's safe to read the btm_allequalimage field on version 3, but + * only version 4 indexes make use of deduplication. Even version 4 + * indexes created on PostgreSQL v12 will need a REINDEX to make use of + * deduplication, though, since there is no other way to set + * btm_allequalimage to true (pg_upgrade hasn't been taught to set the + * metapage field). + * + * Btree version 2 is mostly the same as version 3. There are two new + * fields in the metapage that were introduced in version 3. A version 2 + * metapage will be automatically upgraded to version 3 on the first + * insert to it. INCLUDE indexes cannot use version 2. + */ +#define BTREE_METAPAGE 0 /* first page is meta */ +#define BTREE_MAGIC 0x053162 /* magic number in metapage */ +#define BTREE_VERSION 4 /* current version number */ +#define BTREE_MIN_VERSION 2 /* minimum supported version */ +#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ + +/* + * Maximum size of a btree index entry, including its tuple header. + * + * We actually need to be able to fit three items on every page, + * so restrict any one item to 1/3 the per-page available space. + * + * There are rare cases where _bt_truncate() will need to enlarge + * a heap index tuple to make space for a tiebreaker heap TID + * attribute, which we account for here. + */ +#define BTMaxItemSize(page) \ + (MAXALIGN_DOWN((PageGetUsablePageSize(page) - \ + MAXALIGN(3*sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \ + MAXALIGN(sizeof(ItemPointerData))) +#define BTMaxItemSizeNoHeapTid(page) \ + MAXALIGN_DOWN((PageGetUsablePageSize(page) - \ + MAXALIGN(3*sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) + +/* + * ClusterMaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples + * that may be stored on a btree leaf page. It is used to size the + * per-page temporary buffers. 
+ *
+ * Note: we don't bother considering per-tuple overheads here to keep
+ * things simple (value is based on how many elements a single array of
+ * heap TIDs must have to fill the space between the page header and
+ * special area). The value is slightly higher (i.e. more conservative)
+ * than necessary as a result, which is considered acceptable.
+ */
+#define CalcMaxTIDsPerBTreePage(usablespace) \
+	((int) (((usablespace) - sizeof(BTPageOpaqueData)) / \
+			sizeof(ItemPointerData)))
+#define MaxTIDsPerBTreePageLimit (CalcMaxTIDsPerBTreePage(PageUsableSpaceMax))
+
+#endif
+
-- 
2.40.1