Re: heapgettup refactoring - Mailing list pgsql-hackers

From Andres Freund
Subject Re: heapgettup refactoring
Date
Msg-id 20221101010948.hsf33emgnwzvil4a@awork3.anarazel.de
Whole thread Raw
In response to heapgettup refactoring  (Melanie Plageman <melanieplageman@gmail.com>)
Responses Re: heapgettup refactoring
List pgsql-hackers
Hi,

On 2022-10-31 14:37:39 -0400, Melanie Plageman wrote:
> and heapgetpage(). heapgettup() and heapgettup_pagemode() have a lot of
> duplicated code, confusingly nested if statements, and unnecessary local
> variables. While working on a feature for the AIO/DIO patchset, I
> noticed that it was difficult to add new code to heapgettup() and
> heapgettup_pagemode() because of how the functions are written.

Thanks for working on this - the current state is quite painful.


> From cde2d6720f4f5ab2531c22ad4a5f0d9e6ec1039d Mon Sep 17 00:00:00 2001
> From: Melanie Plageman <melanieplageman@gmail.com>
> Date: Wed, 26 Oct 2022 20:00:34 -0400
> Subject: [PATCH v1 1/3] Remove breaks in HeapTupleSatisfiesVisibility
>
> breaks in HeapTupleSatisfiesVisibility were superfluous
> ---
>  src/backend/access/heap/heapam_visibility.c | 7 -------
>  1 file changed, 7 deletions(-)
>
> diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c
> index 6e33d1c881..dd5d5da190 100644
> --- a/src/backend/access/heap/heapam_visibility.c
> +++ b/src/backend/access/heap/heapam_visibility.c
> @@ -1769,25 +1769,18 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
>      {
>          case SNAPSHOT_MVCC:
>              return HeapTupleSatisfiesMVCC(htup, snapshot, buffer);
> -            break;
>          case SNAPSHOT_SELF:
>              return HeapTupleSatisfiesSelf(htup, snapshot, buffer);
> -            break;
>          case SNAPSHOT_ANY:
>              return HeapTupleSatisfiesAny(htup, snapshot, buffer);
> -            break;
>          case SNAPSHOT_TOAST:
>              return HeapTupleSatisfiesToast(htup, snapshot, buffer);
> -            break;
>          case SNAPSHOT_DIRTY:
>              return HeapTupleSatisfiesDirty(htup, snapshot, buffer);
> -            break;
>          case SNAPSHOT_HISTORIC_MVCC:
>              return HeapTupleSatisfiesHistoricMVCC(htup, snapshot, buffer);
> -            break;
>          case SNAPSHOT_NON_VACUUMABLE:
>              return HeapTupleSatisfiesNonVacuumable(htup, snapshot, buffer);
> -            break;
>      }

Not sure what the author of this code, a certain Mr Freund, was thinking when
he added those returns...


> From 9d8b01960463dc64ff5b111d523ff80fce3017af Mon Sep 17 00:00:00 2001
> From: Melanie Plageman <melanieplageman@gmail.com>
> Date: Mon, 31 Oct 2022 13:40:06 -0400
> Subject: [PATCH v1 2/3] Turn HeapKeyTest macro into function
>
> This should always be inlined appropriately now. It is easier to read as
> a function. Also, remove unused include in catcache.c.
> ---
>  src/backend/access/heap/heapam.c   | 10 ++--
>  src/backend/utils/cache/catcache.c |  1 -
>  src/include/access/valid.h         | 76 ++++++++++++------------------
>  3 files changed, 36 insertions(+), 51 deletions(-)
>
> diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
> index 12be87efed..1c995faa12 100644
> --- a/src/backend/access/heap/heapam.c
> +++ b/src/backend/access/heap/heapam.c
> @@ -716,8 +716,10 @@ heapgettup(HeapScanDesc scan,
>                                                      snapshot);
>
>                  if (valid && key != NULL)
> -                    HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
> -                                nkeys, key, valid);
> +                {
> +                    valid = HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
> +                                nkeys, key);
> +                }
>
>                  if (valid)
>                  {

superfluous parens.



> --- a/src/include/access/valid.h
> +++ b/src/include/access/valid.h
> @@ -19,51 +19,35 @@
>   *
>   *        Test a heap tuple to see if it satisfies a scan key.
>   */
> -#define HeapKeyTest(tuple, \
> -                    tupdesc, \
> -                    nkeys, \
> -                    keys, \
> -                    result) \
> -do \
> -{ \
> -    /* Use underscores to protect the variables passed in as parameters */ \
> -    int            __cur_nkeys = (nkeys); \
> -    ScanKey        __cur_keys = (keys); \
> - \
> -    (result) = true; /* may change */ \
> -    for (; __cur_nkeys--; __cur_keys++) \
> -    { \
> -        Datum    __atp; \
> -        bool    __isnull; \
> -        Datum    __test; \
> - \
> -        if (__cur_keys->sk_flags & SK_ISNULL) \
> -        { \
> -            (result) = false; \
> -            break; \
> -        } \
> - \
> -        __atp = heap_getattr((tuple), \
> -                             __cur_keys->sk_attno, \
> -                             (tupdesc), \
> -                             &__isnull); \
> - \
> -        if (__isnull) \
> -        { \
> -            (result) = false; \
> -            break; \
> -        } \
> - \
> -        __test = FunctionCall2Coll(&__cur_keys->sk_func, \
> -                                   __cur_keys->sk_collation, \
> -                                   __atp, __cur_keys->sk_argument); \
> - \
> -        if (!DatumGetBool(__test)) \
> -        { \
> -            (result) = false; \
> -            break; \
> -        } \
> -    } \
> -} while (0)
> +static inline bool
> +HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys)
> +{
> +    int cur_nkeys = nkeys;
> +    ScanKey cur_key = keys;
> +
> +    for (; cur_nkeys--; cur_key++)
> +    {
> +        Datum atp;
> +        bool isnull;
> +        Datum test;
> +
> +        if (cur_key->sk_flags & SK_ISNULL)
> +            return false;
> +
> +        atp = heap_getattr(tuple, cur_key->sk_attno, tupdesc, &isnull);
> +
> +        if (isnull)
> +            return false;
> +
> +        test = FunctionCall2Coll(&cur_key->sk_func,
> +                                cur_key->sk_collation,
> +                                atp, cur_key->sk_argument);
> +
> +        if (!DatumGetBool(test))
> +            return false;
> +    }
> +
> +    return true;
> +}

Seems like a simple and nice win in readability.

I recall looking at this in the past and thinking that there was some
additional subtlety here, but I can't see what that'd be.



> From a894ce38c488df6546392b9f3bd894b67edf951e Mon Sep 17 00:00:00 2001
> From: Melanie Plageman <melanieplageman@gmail.com>
> Date: Mon, 31 Oct 2022 13:40:29 -0400
> Subject: [PATCH v1 3/3] Refactor heapgettup* and heapgetpage
>
> Simplify heapgettup(), heapgettup_pagemode(), and heapgetpage(). All
> three contained several unnecessary local variables, duplicate code, and
> nested if statements. Streamlining these improves readability and
> extensibility.

It'd be nice to break this into slightly smaller chunks.


> +
> +static inline void heapgettup_no_movement(HeapScanDesc scan)
> +{

FWIW, for function definitions we keep the return type (and with that also the
the "static inline") on a separate line.


> +    ItemId        lpp;
> +    OffsetNumber lineoff;
> +    BlockNumber page;
> +    Page dp;
> +    HeapTuple    tuple = &(scan->rs_ctup);
> +    /*
> +    * ``no movement'' scan direction: refetch prior tuple
> +    */
> +
> +    /* Since the tuple was previously fetched, needn't lock page here */
> +    if (!scan->rs_inited)
> +    {
> +        Assert(!BufferIsValid(scan->rs_cbuf));
> +        tuple->t_data = NULL;
> +        return;

Is it possible to have a no-movement scan with an uninitialized scan? That
doesn't really seem to make sense. At least that's how I understand the
explanation for NoMovement nearby:
 * dir == NoMovementScanDirection means "re-fetch the tuple indicated
 * by scan->rs_ctup".

We can't have a rs_ctup without an already started scan, no?

Looks like this is pre-existing code that you just moved, but it still seems
wrong.


> +    }
> +    page = ItemPointerGetBlockNumber(&(tuple->t_self));
> +    if (page != scan->rs_cblock)
> +        heapgetpage((TableScanDesc) scan, page);


We have a
    BlockNumber page;
and
    Page        dp;
in this code which seems almost intentionally confusing. This again is a
pre-existing issue but perhaps we could clean it up first?



> +static inline Page heapgettup_continue_page(HeapScanDesc scan, BlockNumber page, ScanDirection dir,
> +        int *linesleft, OffsetNumber *lineoff)
> +{
> +    HeapTuple    tuple = &(scan->rs_ctup);

Hm. Finding the next offset via rs_ctup doesn't seem quite right. For one,
it's not actually that cheap to extract the offset from an ItemPointer because
of the the way we pack it into ItemPointerData.


> +    Page dp = BufferGetPage(scan->rs_cbuf);
> +    TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);

Newlines between definitions and code :)

Perhaps worth asserting that the scan is initialized and that rs_cbuf is valid?


> +    if (ScanDirectionIsForward(dir))
> +    {
> +        *lineoff = OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
> +        *linesleft = PageGetMaxOffsetNumber(dp) - (*lineoff) + 1;

We can't access PageGetMaxOffsetNumber etc without holding a lock on the
page. It's not immediately obvious that that is held in all paths.


> +static inline BlockNumber heapgettup_initial_page(HeapScanDesc scan, ScanDirection dir)
> +{
> +    Assert(!ScanDirectionIsNoMovement(dir));
> +    Assert(!scan->rs_inited);

Is there a reason we couldn't set rs_inited in here, rather than reapeating
that in all callers?


ISTM that this function should deal with the
            /*
             * return null immediately if relation is empty
             */

logic, I think you now are repeating that check on every call to heapgettup().


> @@ -511,182 +711,55 @@ heapgettup(HeapScanDesc scan,
>             ScanKey key)
>  {
>      HeapTuple    tuple = &(scan->rs_ctup);
> -    Snapshot    snapshot = scan->rs_base.rs_snapshot;
> -    bool        backward = ScanDirectionIsBackward(dir);
>      BlockNumber page;
> -    bool        finished;
>      Page        dp;
> -    int            lines;
>      OffsetNumber lineoff;
>      int            linesleft;
> -    ItemId        lpp;
> +
> +    if (ScanDirectionIsNoMovement(dir))
> +        return heapgettup_no_movement(scan);

Maybe add an unlikely() - this path is barely ever used...


>      /*
> -     * calculate next starting lineoff, given scan direction
> +     * return null immediately if relation is empty
>       */
> -    if (ScanDirectionIsForward(dir))
> +    if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
>      {

As mentioned above, I don't think we should repeat the nblocks check on every
call.


> +        page = scan->rs_cblock;
> +        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
> +        dp = heapgettup_continue_page(scan, page, dir, &linesleft, &lineoff);
> +        goto continue_page;
>      }
>
>      /*
>       * advance the scan until we find a qualifying tuple or run out of stuff
>       * to scan
>       */
> -    lpp = PageGetItemId(dp, lineoff);
> -    for (;;)
> +    while (page != InvalidBlockNumber)
>      {
> +        heapgetpage((TableScanDesc) scan, page);
> +        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
> +        dp = heapgettup_start_page(scan, page, dir, &linesleft, &lineoff);
> +    continue_page:


I don't like the goto continue_page at all.  Seems that the paths leading here
should call LockBuffer(), heapgettup_start_page() etc?  Possibly a do {} while
() loop could do the trick as well.



Greetings,

Andres Freund



pgsql-hackers by date:

Previous
From: Ian Lawrence Barwick
Date:
Subject: Re: Commit fest 2022-11
Next
From: Tom Lane
Date:
Subject: Re: Reusing return value from planner_rt_fetch