Re: BTree on-disk page ordering - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: BTree on-disk page ordering |
Date | |
Msg-id | 2471.1147130032@sss.pgh.pa.us Whole thread Raw |
In response to | BTree on-disk page ordering ("Jim C. Nasby" <jnasby@pervasive.com>) |
List | pgsql-hackers |
"Jim C. Nasby" <jnasby@pervasive.com> writes: > The mention of the changes to the btree scan code in the latest weekly > news got me curious so I started looking at the 'executive summary' > (read as: README) of the patch changes for both the scan patch and the > btbulkdelete patch. If my understanding is correct, vacuum will only see > a speed improvement when an index's on-disk storage order has a low > correlation to index order. Is there any way to see what that > correlation is on a running system? I'm wondering if anyone has checked > to see what kind of performance impact a highly out-of-order index has > on index scans. There's nothing built in. If you feel like hacking something, I've attached a truly ugly tool that I've used once or twice in the past to debug broken indexes. It's got some smarts about detecting inconsistent index structure and it looks like this latest version was meant to dump out the index keys of a particular index. You could modify it to just scan the level-zero pages and compute some statistics about their ordering. regards, tom lane /* * Usage: checkindex filename * * Note: we read in the entire index file, hence this does not work well * for indexes bigger than available memory */ #include "postgres.h" #include <unistd.h> #include "access/nbtree.h" static char *buffers; static BlockNumber nblocks; #define GetPage(p) ((Page) (buffers + (p) * BLCKSZ)) static BlockNumber rootblk; static uint32 rootlevel; bool assert_enabled = true; static void check_metapage(void) { Page page = GetPage(0); BTMetaPageData *metad; BTPageOpaque metaopaque; metad = BTPageGetMeta(page); if (metad->btm_magic != BTREE_MAGIC) fprintf(stderr, "Bogus magic %lu\n", (long) metad->btm_magic); if (metad->btm_version != BTREE_VERSION) fprintf(stderr, "Bogus version %lu\n", (long) metad->btm_version); rootblk = metad->btm_root; rootlevel = metad->btm_level; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); if (metaopaque->btpo_flags != BTP_META) fprintf(stderr, "Bogus metapage flags 0x%x\n", metaopaque->btpo_flags); } static void check_rootpage(void) { Page page; BTPageOpaque rootopaque; if (rootblk <= 0 || rootblk >= nblocks) { fprintf(stderr, "Bogus root block # %lu\n", (long) rootblk); return; } page = GetPage(rootblk); rootopaque = (BTPageOpaque) PageGetSpecialPointer(page); if (rootopaque->btpo_flags != BTP_ROOT) fprintf(stderr, "Bogus rootpage flags 0x%x\n", rootopaque->btpo_flags); if (rootopaque->btpo.level != rootlevel) fprintf(stderr, "Bogus rootpage level %u, expected %u\n", rootopaque->btpo.level, rootlevel); if (rootopaque->btpo_prev != P_NONE) fprintf(stderr, "Bogus rootpage left-link 0x%x\n", rootopaque->btpo_prev); if (rootopaque->btpo_next != P_NONE) fprintf(stderr, "Bogus rootpage right-link 0x%x\n", rootopaque->btpo_next); } static int print_text_key(unsigned char *tdata, int tsize) { int orig_tsize = tsize; int dsize; Assert(tsize >= 4); dsize = *((int *) tdata); tdata += 4; tsize -= 4; dsize -= 4; Assert(dsize >= 0); Assert(tsize >= dsize); while (dsize-- > 0) { printf("%c", *tdata); tdata++; tsize--; } printf("\t"); // kluge alignment while ((long) tdata % MAXIMUM_ALIGNOF) { tdata++; tsize--; } return orig_tsize - tsize; } static int print_int_key(unsigned char *tdata, int tsize) { Assert(tsize >= 4); printf("%d\t", *((int *) tdata)); return 4; } static void print_item(Page page, BlockNumber blk, OffsetNumber off) { BTPageOpaque opaque; BTItem btitem; IndexTuple itup; ItemPointer ip; opaque = (BTPageOpaque) PageGetSpecialPointer(page); btitem = (BTItem) PageGetItem(page, PageGetItemId(page, off)); itup = &(btitem->bti_itup); ip = &itup->t_tid; /* no key in upper-level first data item */ if (P_ISLEAF(opaque) || off != P_FIRSTDATAKEY(opaque)) { int tsize; unsigned char *tdata; int dsize; tsize = IndexTupleSize(itup) - sizeof(IndexTupleData); tdata = (unsigned char *) itup + sizeof(IndexTupleData); dsize = print_int_key(tdata, tsize); tdata += dsize; tsize -= dsize; dsize = print_int_key(tdata, tsize); tdata += dsize; tsize -= dsize; } if (off >= P_FIRSTDATAKEY(opaque)) printf("\t%d\t%d", ItemPointerGetBlockNumber(ip), ItemPointerGetOffsetNumber(ip)); else printf("\thigh key on page %u", blk); printf("\n"); } static void check_level(uint32 level, BlockNumber leftpage, uint32 expected_pages) { uint32 npages = 0; BlockNumber prev = P_NONE; BlockNumber next; BlockNumber blk; Page page; BTPageOpaque opaque; for (blk = leftpage; blk != P_NONE; blk = next) { if (blk <= 0 || blk >= nblocks) { fprintf(stderr, "Bogus right-link %u from %u\n", blk, prev); break; } npages++; page = GetPage(blk); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_prev != prev) fprintf(stderr, "Bogus left-link %u on %u (expected %u)\n", opaque->btpo_prev, blk, prev); if (opaque->btpo.level != level) fprintf(stderr, "Bogus level %u on %u (expected %u)\n", opaque->btpo.level, blk, level); if (opaque->btpo_flags & (BTP_DELETED | BTP_META)) fprintf(stderr, "Bogus flags 0x%x on %u\n", opaque->btpo_flags, blk); #define SHOW_LEVEL 0 #define SHOW_HI_KEY 1 if (level == SHOW_LEVEL) { OffsetNumber low, high; low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); for (; low <= high; low++) print_item(page, blk, low); #if SHOW_HI_KEY if (!P_RIGHTMOST(opaque)) print_item(page, blk, P_HIKEY); #endif } next = opaque->btpo_next; prev = blk; } } static void check_levels(void) { BlockNumber *leftpages; uint32 *levelcount; uint32 ndeleted = 0; BlockNumber blk; Page page; BTPageOpaque opaque; uint32 lev; leftpages = calloc(sizeof(BlockNumber), rootlevel + 1); Assert(leftpages); levelcount = calloc(sizeof(uint32), rootlevel + 1); Assert(levelcount); for (blk = 1; blk < nblocks; blk++) { page = GetPage(blk); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_flags & BTP_DELETED) { ndeleted++; continue; } if ((opaque->btpo_flags & BTP_META) || ((opaque->btpo_flags & BTP_ROOT) && blk != rootblk) || ((opaque->btpo_flags & BTP_LEAF) && opaque->btpo.level != 0)) fprintf(stderr, "Bogus flags 0x%x on page %u\n", opaque->btpo_flags, blk); if (opaque->btpo.level > rootlevel) { fprintf(stderr, "Bogus level %u on page %u\n", opaque->btpo.level, blk); continue; } levelcount[opaque->btpo.level]++; if (opaque->btpo_prev == P_NONE) { if (leftpages[opaque->btpo.level] == P_NONE) leftpages[opaque->btpo.level] = blk; else fprintf(stderr, "Multiple leftmost pages for level %u: %u, %u\n", opaque->btpo.level, leftpages[opaque->btpo.level], blk); } } for (lev = 0; lev <= rootlevel; lev++) { if (leftpages[lev] != P_NONE) check_level(lev, leftpages[lev], levelcount[lev]); } } static void check_downlinks(void) { BlockNumber blk; for (blk = 1; blk < nblocks; blk++) { Page page; BTPageOpaque opaque; OffsetNumber low, high; page = GetPage(blk); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_flags & BTP_DELETED) continue; if (opaque->btpo.level == 0) continue; /* nothing to check */ low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * Internal page should always contain keys. */ if (high < low) fprintf(stderr, "empty internal page %u (level %u)\n", blk, opaque->btpo.level); for (; low <= high; low++) { BTItem btitem; IndexTuple itup; BlockNumber cblk; Page cpage; BTPageOpaque copaque; btitem = (BTItem) PageGetItem(page, PageGetItemId(page, low)); itup = &(btitem->bti_itup); cblk = ItemPointerGetBlockNumber(&itup->t_tid); if (cblk <= 0 || cblk >= nblocks) { fprintf(stderr, "Bogus downlink %u in %u/%u\n", cblk, blk, low); continue; } cpage = GetPage(cblk); copaque = (BTPageOpaque) PageGetSpecialPointer(cpage); if (copaque->btpo_flags & BTP_DELETED) { fprintf(stderr, "Downlink %u in %u/%u points to deleted page\n", cblk, blk, low); continue; } if (copaque->btpo.level != opaque->btpo.level-1) fprintf(stderr, "Downlink %u in %u/%u (level %u) points to page of level %u\n", cblk, blk, low, opaque->btpo.level, copaque->btpo.level); } } } int main(int argc, char **argv) { FILE *infile; unsigned long fsize; /* Read in the file */ if (argc != 2) { fprintf(stderr, "usage: checkindex filename\n"); return 1; } infile = fopen(argv[1], "rb"); if (!infile) { perror(argv[1]); return 1; } if (fseek(infile, 0, SEEK_END) < 0) { perror(argv[1]); return 1; } fsize = ftell(infile); if (fseek(infile, 0, SEEK_SET) < 0) { perror(argv[1]); return 1; } if (fsize % BLCKSZ) fprintf(stderr, "Size of input file is not a multiple of BLCKSZ\n"); nblocks = fsize / BLCKSZ; buffers = malloc(nblocks * BLCKSZ); if (!buffers) { fprintf(stderr, "out of memory\n"); return 1; } if (fread(buffers, 1, nblocks * BLCKSZ, infile) != nblocks * BLCKSZ) { perror("short read"); return 1; } fclose(infile); /* Start checking */ check_metapage(); check_rootpage(); check_levels(); check_downlinks(); return 0; } /* * ExceptionalCondition - Handles the failure of an Assert() */ int ExceptionalCondition(char *conditionName, char *errorType, char *fileName, int lineNumber) { if (!PointerIsValid(conditionName) || !PointerIsValid(fileName) || !PointerIsValid(errorType)) fprintf(stderr, "TRAP: ExceptionalCondition: bad arguments\n"); else { fprintf(stderr, "TRAP: %s(\"%s\", File: \"%s\", Line: %d)\n", errorType, conditionName, fileName, lineNumber); } abort(); return 0; }
pgsql-hackers by date: