Re: BTree on-disk page ordering - Mailing list pgsql-hackers

From Tom Lane
Subject Re: BTree on-disk page ordering
Date
Msg-id 2471.1147130032@sss.pgh.pa.us
Whole thread Raw
In response to BTree on-disk page ordering  ("Jim C. Nasby" <jnasby@pervasive.com>)
List pgsql-hackers
"Jim C. Nasby" <jnasby@pervasive.com> writes:
> The mention of the changes to the btree scan code in the latest weekly
> news got me curious so I started looking at the 'executive summary'
> (read as: README) of the patch changes for both the scan patch and the
> btbulkdelete patch. If my understanding is correct, vacuum will only see
> a speed improvement when an index's on-disk storage order has a low
> correlation to index order. Is there any way to see what that
> correlation is on a running system? I'm wondering if anyone has checked
> to see what kind of performance impact a highly out-of-order index has
> on index scans.

There's nothing built in.  If you feel like hacking something, I've
attached a truly ugly tool that I've used once or twice in the past to
debug broken indexes.  It's got some smarts about detecting inconsistent
index structure and it looks like this latest version was meant to dump
out the index keys of a particular index.  You could modify it to just
scan the level-zero pages and compute some statistics about their
ordering.

            regards, tom lane

/*
 * Usage: checkindex filename
 *
 * Note: we read in the entire index file, hence this does not work well
 * for indexes bigger than available memory
 */
#include "postgres.h"

#include <unistd.h>

#include "access/nbtree.h"

static char *buffers;
static BlockNumber nblocks;

#define GetPage(p)    ((Page) (buffers + (p) * BLCKSZ))

static BlockNumber rootblk;
static uint32 rootlevel;

bool assert_enabled = true;


static void
check_metapage(void)
{
    Page    page = GetPage(0);
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    metad = BTPageGetMeta(page);
    if (metad->btm_magic != BTREE_MAGIC)
        fprintf(stderr, "Bogus magic %lu\n", (long) metad->btm_magic);
    if (metad->btm_version != BTREE_VERSION)
        fprintf(stderr, "Bogus version %lu\n", (long) metad->btm_version);
    rootblk = metad->btm_root;
    rootlevel = metad->btm_level;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (metaopaque->btpo_flags != BTP_META)
        fprintf(stderr, "Bogus metapage flags 0x%x\n", metaopaque->btpo_flags);
}

static void
check_rootpage(void)
{
    Page    page;
    BTPageOpaque rootopaque;

    if (rootblk <= 0 || rootblk >= nblocks)
    {
        fprintf(stderr, "Bogus root block # %lu\n", (long) rootblk);
        return;
    }
    page = GetPage(rootblk);

    rootopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (rootopaque->btpo_flags != BTP_ROOT)
        fprintf(stderr, "Bogus rootpage flags 0x%x\n", rootopaque->btpo_flags);
    if (rootopaque->btpo.level != rootlevel)
        fprintf(stderr, "Bogus rootpage level %u, expected %u\n",
                rootopaque->btpo.level, rootlevel);
    if (rootopaque->btpo_prev != P_NONE)
        fprintf(stderr, "Bogus rootpage left-link 0x%x\n",
                rootopaque->btpo_prev);
    if (rootopaque->btpo_next != P_NONE)
        fprintf(stderr, "Bogus rootpage right-link 0x%x\n",
                rootopaque->btpo_next);
}

static int
print_text_key(unsigned char *tdata, int tsize)
{
    int        orig_tsize = tsize;
    int        dsize;

    Assert(tsize >= 4);
    dsize = *((int *) tdata);
    tdata += 4;
    tsize -= 4;
    dsize -= 4;
    Assert(dsize >= 0);
    Assert(tsize >= dsize);

    while (dsize-- > 0)
    {
        printf("%c", *tdata);
        tdata++;
        tsize--;
    }
    printf("\t");

    // kluge alignment
    while ((long) tdata % MAXIMUM_ALIGNOF)
    {
        tdata++;
        tsize--;
    }
    return orig_tsize - tsize;
}

static int
print_int_key(unsigned char *tdata, int tsize)
{
    Assert(tsize >= 4);
    printf("%d\t", *((int *) tdata));
    return 4;
}

static void
print_item(Page page, BlockNumber blk, OffsetNumber off)
{
    BTPageOpaque opaque;
    BTItem        btitem;
    IndexTuple    itup;
    ItemPointer ip;

    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    btitem = (BTItem) PageGetItem(page, PageGetItemId(page, off));
    itup = &(btitem->bti_itup);
    ip = &itup->t_tid;

    /* no key in upper-level first data item */
    if (P_ISLEAF(opaque) || off != P_FIRSTDATAKEY(opaque))
    {
        int            tsize;
        unsigned char *tdata;
        int            dsize;

        tsize = IndexTupleSize(itup) - sizeof(IndexTupleData);
        tdata = (unsigned char *) itup + sizeof(IndexTupleData);

        dsize = print_int_key(tdata, tsize);
        tdata += dsize; tsize -= dsize;

        dsize = print_int_key(tdata, tsize);
        tdata += dsize; tsize -= dsize;
    }

    if (off >= P_FIRSTDATAKEY(opaque))
        printf("\t%d\t%d",
               ItemPointerGetBlockNumber(ip),
               ItemPointerGetOffsetNumber(ip));
    else
        printf("\thigh key on page %u", blk);

    printf("\n");
}

static void
check_level(uint32 level, BlockNumber leftpage, uint32 expected_pages)
{
    uint32        npages = 0;
    BlockNumber    prev = P_NONE;
    BlockNumber    next;
    BlockNumber    blk;
    Page    page;
    BTPageOpaque opaque;

    for (blk = leftpage; blk != P_NONE; blk = next)
    {
        if (blk <= 0 || blk >= nblocks)
        {
            fprintf(stderr, "Bogus right-link %u from %u\n", blk, prev);
            break;
        }
        npages++;
        page = GetPage(blk);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (opaque->btpo_prev != prev)
            fprintf(stderr, "Bogus left-link %u on %u (expected %u)\n",
                    opaque->btpo_prev, blk, prev);
        if (opaque->btpo.level != level)
            fprintf(stderr, "Bogus level %u on %u (expected %u)\n",
                    opaque->btpo.level, blk, level);
        if (opaque->btpo_flags & (BTP_DELETED | BTP_META))
            fprintf(stderr, "Bogus flags 0x%x on %u\n",
                    opaque->btpo_flags, blk);

#define SHOW_LEVEL    0
#define SHOW_HI_KEY    1

        if (level == SHOW_LEVEL)
        {
            OffsetNumber low,
                high;

            low = P_FIRSTDATAKEY(opaque);
            high = PageGetMaxOffsetNumber(page);
            for (; low <= high; low++)
                print_item(page, blk, low);

#if SHOW_HI_KEY
            if (!P_RIGHTMOST(opaque))
                print_item(page, blk, P_HIKEY);
#endif
        }

        next = opaque->btpo_next;
        prev = blk;
    }
}

static void
check_levels(void)
{
    BlockNumber *leftpages;
    uint32 *levelcount;
    uint32    ndeleted = 0;
    BlockNumber blk;
    Page    page;
    BTPageOpaque opaque;
    uint32    lev;

    leftpages = calloc(sizeof(BlockNumber), rootlevel + 1);
    Assert(leftpages);
    levelcount = calloc(sizeof(uint32), rootlevel + 1);
    Assert(levelcount);

    for (blk = 1; blk < nblocks; blk++)
    {
        page = GetPage(blk);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (opaque->btpo_flags & BTP_DELETED)
        {
            ndeleted++;
            continue;
        }
        if ((opaque->btpo_flags & BTP_META) ||
            ((opaque->btpo_flags & BTP_ROOT) && blk != rootblk) ||
            ((opaque->btpo_flags & BTP_LEAF) && opaque->btpo.level != 0))
            fprintf(stderr, "Bogus flags 0x%x on page %u\n",
                    opaque->btpo_flags, blk);
        if (opaque->btpo.level > rootlevel)
        {
            fprintf(stderr, "Bogus level %u on page %u\n",
                    opaque->btpo.level, blk);
            continue;
        }
        levelcount[opaque->btpo.level]++;
        if (opaque->btpo_prev == P_NONE)
        {
            if (leftpages[opaque->btpo.level] == P_NONE)
                leftpages[opaque->btpo.level] = blk;
            else
                fprintf(stderr, "Multiple leftmost pages for level %u: %u, %u\n",
                        opaque->btpo.level,
                        leftpages[opaque->btpo.level],
                        blk);
        }
    }

    for (lev = 0; lev <= rootlevel; lev++)
    {
        if (leftpages[lev] != P_NONE)
            check_level(lev, leftpages[lev], levelcount[lev]);
    }
}

static void
check_downlinks(void)
{
    BlockNumber blk;

    for (blk = 1; blk < nblocks; blk++)
    {
        Page    page;
        BTPageOpaque opaque;
        OffsetNumber low,
                    high;

        page = GetPage(blk);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (opaque->btpo_flags & BTP_DELETED)
            continue;
        if (opaque->btpo.level == 0)
            continue;            /* nothing to check */
        low = P_FIRSTDATAKEY(opaque);
        high = PageGetMaxOffsetNumber(page);
        /*
         * Internal page should always contain keys.
         */
        if (high < low)
            fprintf(stderr, "empty internal page %u (level %u)\n",
                    blk, opaque->btpo.level);
        for (; low <= high; low++)
        {
            BTItem        btitem;
            IndexTuple    itup;
            BlockNumber cblk;
            Page    cpage;
            BTPageOpaque copaque;

            btitem = (BTItem) PageGetItem(page, PageGetItemId(page, low));
            itup = &(btitem->bti_itup);
            cblk = ItemPointerGetBlockNumber(&itup->t_tid);
            if (cblk <= 0 || cblk >= nblocks)
            {
                fprintf(stderr, "Bogus downlink %u in %u/%u\n",
                        cblk, blk, low);
                continue;
            }
            cpage = GetPage(cblk);
            copaque = (BTPageOpaque) PageGetSpecialPointer(cpage);
            if (copaque->btpo_flags & BTP_DELETED)
            {
                fprintf(stderr, "Downlink %u in %u/%u points to deleted page\n",
                        cblk, blk, low);
                continue;
            }
            if (copaque->btpo.level != opaque->btpo.level-1)
                fprintf(stderr, "Downlink %u in %u/%u (level %u) points to page of level %u\n",
                        cblk, blk, low, opaque->btpo.level, copaque->btpo.level);
        }
    }
}

int
main(int argc, char **argv)
{
    FILE   *infile;
    unsigned long fsize;

    /* Read in the file */
    if (argc != 2)
    {
        fprintf(stderr, "usage: checkindex filename\n");
        return 1;
    }
    infile = fopen(argv[1], "rb");
    if (!infile)
    {
        perror(argv[1]);
        return 1;
    }
    if (fseek(infile, 0, SEEK_END) < 0)
    {
        perror(argv[1]);
        return 1;
    }
    fsize = ftell(infile);
    if (fseek(infile, 0, SEEK_SET) < 0)
    {
        perror(argv[1]);
        return 1;
    }
    if (fsize % BLCKSZ)
        fprintf(stderr, "Size of input file is not a multiple of BLCKSZ\n");
    nblocks = fsize / BLCKSZ;
    buffers = malloc(nblocks * BLCKSZ);
    if (!buffers)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    if (fread(buffers, 1, nblocks * BLCKSZ, infile) != nblocks * BLCKSZ)
    {
        perror("short read");
        return 1;
    }
    fclose(infile);

    /* Start checking */
    check_metapage();
    check_rootpage();
    check_levels();
    check_downlinks();

    return 0;
}


/*
 * ExceptionalCondition - Handles the failure of an Assert()
 */
int
ExceptionalCondition(char *conditionName,
                     char *errorType,
                     char *fileName,
                     int lineNumber)
{
    if (!PointerIsValid(conditionName)
        || !PointerIsValid(fileName)
        || !PointerIsValid(errorType))
        fprintf(stderr, "TRAP: ExceptionalCondition: bad arguments\n");
    else
    {
        fprintf(stderr, "TRAP: %s(\"%s\", File: \"%s\", Line: %d)\n",
                errorType, conditionName,
                fileName, lineNumber);
    }

    abort();

    return 0;
}

pgsql-hackers by date:

Previous
From: "Jim C. Nasby"
Date:
Subject: BTree on-disk page ordering
Next
From: Albert Cervera Areny
Date:
Subject: Inheritance, Primary Keys and Foreign Keys