diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index fe3aa3c..28ad480 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -114,6 +114,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; scan->xs_continue_hot = false; + scan->want_index_tuple = false; + scan->xs_itup = NULL; return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 13e68d6..0f544f4 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -20,6 +20,8 @@ * index_insert - insert an index tuple into a relation * index_markpos - mark a scan position * index_restrpos - restore a scan position + * index_getnexttid - get the next TID from a scan + * index_fetch - get the tuple of specified TID * index_getnext - get the next tuple from a scan * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples @@ -425,6 +427,133 @@ index_restrpos(IndexScanDesc scan) } /* ---------------- + * index_getnexttid - get the next TID from a scan + * + * The result is the next TID satisfying the scan keys, + * or NULL if no more matching tuples exist. + * ---------------- + */ +ItemPointer +index_getnexttid(IndexScanDesc scan, ScanDirection direction) +{ + FmgrInfo *procedure; + bool found; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(amgettuple); + + Assert(TransactionIdIsValid(RecentGlobalXmin)); + + /* New index tuple, so not still following a HOT chain... */ + scan->xs_continue_hot = false; + + /* + * The AM's gettuple proc finds the next index entry matching the + * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It + * should also set scan->xs_recheck, though we pay no attention to + * that here. 
+ */ + found = DatumGetBool(FunctionCall2(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction))); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + + /* If we're out of index entries, release any held pin on a heap page */ + if (!found) + { + if (BufferIsValid(scan->xs_cbuf)) + { + ReleaseBuffer(scan->xs_cbuf); + scan->xs_cbuf = InvalidBuffer; + } + return NULL; + } + + /* Return the TID of the tuple we found. */ + pgstat_count_index_tuples(scan->indexRelation, 1); + return &scan->xs_ctup.t_self; +} + +/* ---------------- + * index_fetch - get the heap tuple of specified TID from a scan + * + * The result is the visible heap tuple for the current TID, or NULL if the + * TID has no visible tuple. On success, the buffer containing the heap tuple + * is pinned (the pin will be dropped at the next index_getnext, + * index_getnexttid or index_endscan). + * + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * Call this at most once after each index_getnexttid call; that is only + * sufficient when the snapshot is an MVCC snapshot. + * ---------------- + */ +HeapTuple +index_fetch(IndexScanDesc scan) +{ + bool all_dead = false; + bool got_heap_tuple; + Buffer prev_buf; + ItemPointer tid = &scan->xs_ctup.t_self; + + /* We can skip the buffer-switching logic if we're in mid-HOT chain. 
*/ + if (!scan->xs_continue_hot) + { + + /* Switch to correct buffer if we don't have it already */ + prev_buf = scan->xs_cbuf; + scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, + scan->heapRelation, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != scan->xs_cbuf) + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, + RecentGlobalXmin); + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, + scan->xs_cbuf, + scan->xs_snapshot, + &scan->xs_ctup, + &all_dead, + !scan->xs_continue_hot); + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + if (got_heap_tuple) + { + /* + * Only in a non-MVCC snapshot can more than one member of the + * HOT chain be visible. + */ + scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); + pgstat_count_heap_fetch(scan->indexRelation); + return &scan->xs_ctup; + } + + /* We've reached the end of the HOT chain. */ + scan->xs_continue_hot = false; + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell + * index AM to kill its entry for that TID. We do not do this when in + * recovery because it may violate MVCC to do so. see comments in + * RelationGetIndexScan(). 
+ */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + + return NULL; +} + +/* ---------------- * index_getnext - get the next heap tuple from a scan * * The result is the next heap tuple satisfying the scan keys and the @@ -440,20 +569,11 @@ index_restrpos(IndexScanDesc scan) HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { - HeapTuple heapTuple = &scan->xs_ctup; - ItemPointer tid = &heapTuple->t_self; - FmgrInfo *procedure; - bool all_dead = false; - - SCAN_CHECKS; - GET_SCAN_PROCEDURE(amgettuple); - - Assert(TransactionIdIsValid(RecentGlobalXmin)); + HeapTuple heapTuple; + ItemPointer tid; for (;;) { - bool got_heap_tuple; - if (scan->xs_continue_hot) { /* @@ -461,86 +581,26 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) * earlier member. Must still hold pin on current heap page. */ Assert(BufferIsValid(scan->xs_cbuf)); - Assert(ItemPointerGetBlockNumber(tid) == + Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) == BufferGetBlockNumber(scan->xs_cbuf)); } else { - bool found; - Buffer prev_buf; - - /* - * If we scanned a whole HOT chain and found only dead tuples, - * tell index AM to kill its entry for that TID. We do not do this - * when in recovery because it may violate MVCC to do so. see - * comments in RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; - - /* - * The AM's gettuple proc finds the next index entry matching the - * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It - * should also set scan->xs_recheck, though we pay no attention to - * that here. 
- */ - found = DatumGetBool(FunctionCall2(procedure, - PointerGetDatum(scan), - Int32GetDatum(direction))); - - /* Reset kill flag immediately for safety */ - scan->kill_prior_tuple = false; + tid = index_getnexttid(scan, direction); /* If we're out of index entries, break out of outer loop */ - if (!found) + if (!tid) break; - - pgstat_count_index_tuples(scan->indexRelation, 1); - - /* Switch to correct buffer if we don't have it already */ - prev_buf = scan->xs_cbuf; - scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, - scan->heapRelation, - ItemPointerGetBlockNumber(tid)); - - /* - * Prune page, but only if we weren't already on this page - */ - if (prev_buf != scan->xs_cbuf) - heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, - RecentGlobalXmin); } - /* Obtain share-lock on the buffer so we can examine visibility */ - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); - got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, - scan->xs_cbuf, - scan->xs_snapshot, - &scan->xs_ctup, - &all_dead, - !scan->xs_continue_hot); - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); - - if (got_heap_tuple) - { - /* - * Only in a non-MVCC snapshot can more than one member of the - * HOT chain be visible. - */ - scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); - pgstat_count_heap_fetch(scan->indexRelation); + /* + * Fetch the next (or only) heap tuple for this index entry. If + * we don't find anything, loop around and grab the next tid from + * the index. 
+ */ + heapTuple = index_fetch(scan); + if (heapTuple != NULL) return heapTuple; - } - - /* Loop around to ask index AM for another TID */ - scan->xs_continue_hot = false; - } - - /* Release any held pin on a heap page */ - if (BufferIsValid(scan->xs_cbuf)) - { - ReleaseBuffer(scan->xs_cbuf); - scan->xs_cbuf = InvalidBuffer; } return NULL; /* failure exit */ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index ac86eb4..7d39948 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -77,7 +77,7 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); - +static IndexTuple bt_getindextuple(IndexScanDesc scan); /* * btbuild() -- build a new btree index. @@ -314,9 +314,83 @@ btgettuple(PG_FUNCTION_ARGS) else res = _bt_first(scan, dir); + if (scan->want_index_tuple) + { + if (scan->xs_itup != NULL) + { + pfree(scan->xs_itup); + scan->xs_itup = NULL; + } + if (res) + scan->xs_itup = bt_getindextuple(scan); + } + PG_RETURN_BOOL(res); } + +/* + * bt_getindextuple - fetch index tuple at current position. + * + * The caller must have pin on so->currPos.buf. + * + * The tuple returned is a palloc'd copy. This can fail to find the tuple if + * new tuples have been inserted to the page since we stepped on this index + * page. NULL is returned in that case. 
+ */ +static IndexTuple +bt_getindextuple(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + IndexTuple ituple, result; + int itemIndex; + BTScanPosItem *positem; + OffsetNumber offnum; + + Assert(BufferIsValid(so->currPos.buf)); + + LockBuffer(so->currPos.buf, BT_READ); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + itemIndex = so->currPos.itemIndex; + positem = &so->currPos.items[itemIndex]; + offnum = positem->indexOffset; + + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + if (offnum < minoff || offnum > maxoff) + { + /* offset is no longer on this page (e.g. after a page split) */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + return NULL; + } + + ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self)) + { + /* found the item */ + Size itupsz = IndexTupleSize(ituple); + + result = palloc(itupsz); + memcpy(result, ituple, itupsz); + } + else + result = NULL; + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + return result; +} + /* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index fee829f..bcfbc70 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -661,7 +661,11 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = sname = "Seq Scan"; break; case T_IndexScan: - pname = sname = "Index Scan"; + sname = "Index Scan"; + if (((IndexScan *) plan)->try_index_only_scan) + pname = "Index Only Scan"; + else + pname = "Index Scan"; break; case T_BitmapIndexScan: pname = sname = "Bitmap Index Scan"; @@ -828,6 +832,9 @@ ExplainNode(PlanState *planstate, List *ancestors, } ExplainPropertyText("Scan Direction", scandir, es); 
ExplainPropertyText("Index Name", indexname, es); + ExplainPropertyText("Index Only Scan", + indexscan->try_index_only_scan? "True" : "False", + es); } } /* FALL THRU */ diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 263f3b9..b9e23b7 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -27,6 +27,8 @@ #include "access/genam.h" #include "access/nbtree.h" #include "access/relscan.h" +#include "access/sysattr.h" +#include "access/visibilitymap.h" #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "optimizer/clauses.h" @@ -37,6 +39,8 @@ static TupleTableSlot *IndexNext(IndexScanState *node); +static void IndexStoreHeapTuple(TupleTableSlot *slot, IndexScan *plan, + IndexScanDesc scandesc); /* ---------------------------------------------------------------- @@ -52,17 +56,20 @@ IndexNext(IndexScanState *node) EState *estate; ExprContext *econtext; ScanDirection direction; + IndexScan *plan; IndexScanDesc scandesc; HeapTuple tuple; TupleTableSlot *slot; + ItemPointer tid; /* * extract necessary information from index scan node */ + plan = (IndexScan *) node->ss.ps.plan; estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(plan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -74,18 +81,46 @@ IndexNext(IndexScanState *node) slot = node->ss.ss_ScanTupleSlot; /* - * ok, now that we have what we need, fetch the next tuple. + * ok, now that we have what we need, fetch the next tid. */ - while ((tuple = index_getnext(scandesc, direction)) != NULL) + while ((tid = index_getnexttid(scandesc, direction)) != NULL) { + /* Attempt index-only scan, if possible. 
*/ + if (scandesc->want_index_tuple && scandesc->xs_itup != NULL && + visibilitymap_test(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->iss_VMBuffer)) + { + /* Lossy scan shouldn't return tuple. */ + Assert(!scandesc->xs_recheck); + + /* Store tuple in slot. */ + IndexStoreHeapTuple(slot, plan, scandesc); + return slot; + } + + /* Index-only approach failed for some reason, so fetch heap tuple. */ + tuple = index_fetch(scandesc); + if (tuple == NULL) + continue; + /* - * Store the scanned tuple in the scan tuple slot of the scan state. + * Only MVCC snapshots are supported here, so there should be no + * need to keep following the HOT chain once a visible entry has + * been found. + */ + Assert(!scandesc->xs_continue_hot); + + /* + * Store the scanned tuple in the scan tuple slot of the scan + * state. + * * Note: we pass 'false' because tuples returned by amgetnext are * pointers onto disk pages and must not be pfree()'d. */ ExecStoreTuple(tuple, /* tuple to store */ slot, /* slot to store in */ - scandesc->xs_cbuf, /* buffer containing tuple */ + scandesc->xs_cbuf, /* buffer containing tuple */ false); /* don't pfree */ /* @@ -389,6 +424,13 @@ ExecEndIndexScan(IndexScanState *node) IndexScanDesc indexScanDesc; Relation relation; + /* Release VM buffer pin, if any. 
*/ + if (node->iss_VMBuffer != InvalidBuffer) + { + ReleaseBuffer(node->iss_VMBuffer); + node->iss_VMBuffer = InvalidBuffer; + } + /* * extract information from the node */ @@ -469,6 +511,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate = makeNode(IndexScanState); indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; + indexstate->iss_VMBuffer = InvalidBuffer; /* * Miscellaneous initialization @@ -617,6 +660,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_ScanKeys, indexstate->iss_NumScanKeys, indexstate->iss_OrderByKeys, indexstate->iss_NumOrderByKeys); + indexstate->iss_ScanDesc->want_index_tuple = node->try_index_only_scan; /* * all done. */ @@ -1125,3 +1169,71 @@ ExecIndexBuildScanKeys(PlanState *planstate, Relation index, Index scanrelid, else if (n_array_keys != 0) elog(ERROR, "ScalarArrayOpExpr index qual found where not allowed"); } + +/* + * IndexStoreHeapTuple + * When performing an index-only scan, we build a faux heap tuple + * from the index tuple. The missing columns are set to NULL, which + * is OK because we know they're never referenced anyway. + */ +static void +IndexStoreHeapTuple(TupleTableSlot *slot, IndexScan *plan, + IndexScanDesc scandesc) +{ + int natts; + Datum *values; + bool *isnull; + HeapTuple tuple; + int i; + + natts = scandesc->heapRelation->rd_att->natts; + Assert(natts + 1 == plan->nheapatt); + values = palloc(sizeof(Datum) * natts); + isnull = palloc(sizeof(bool) * natts); + + /* Transpose index tuple into heap tuple. */ + for (i = 0; i < natts; ++i) + { + int indexatt = plan->heapatt[i + 1]; + + if (indexatt == 0) + isnull[i] = true; + else + { + values[i] = index_getattr(scandesc->xs_itup, indexatt, + scandesc->indexRelation->rd_att, + &isnull[i]); + } + } + + tuple = heap_form_tuple(scandesc->heapRelation->rd_att, values, isnull); + + /* Done with isnull and values arrays. 
*/ + pfree(values); + pfree(isnull); + + /* If an OID is required, fill it in. */ + if (scandesc->heapRelation->rd_att->tdhasoid) + { + if (plan->heapatt[0] == 0) + HeapTupleSetOid(tuple, InvalidOid); + else + { + /* We use plan->heapatt[0] to indicate that an OID is needed. */ + Datum oid_datum; + bool oid_isnull; + + oid_datum = index_getattr(scandesc->xs_itup, plan->heapatt[0], + scandesc->indexRelation->rd_att, + &oid_isnull); + Assert(!oid_isnull); + HeapTupleSetOid(tuple, DatumGetObjectId(oid_datum)); + } + } + + /* Store tuple. */ + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + InvalidBuffer, /* buffer containing tuple */ + true); /* must pfree */ +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index d0704ed..2663f52 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -372,6 +372,9 @@ _copyIndexScan(IndexScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indexorderbyorig); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(try_index_only_scan); + COPY_SCALAR_FIELD(nheapatt); + COPY_POINTER_FIELD(heapatt, from->nheapatt * sizeof(int)); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 417aeb8..0d2976c 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -440,6 +440,8 @@ _outSeqScan(StringInfo str, SeqScan *node) static void _outIndexScan(StringInfo str, IndexScan *node) { + int i; + WRITE_NODE_TYPE("INDEXSCAN"); _outScanInfo(str, (Scan *) node); @@ -450,6 +452,12 @@ _outIndexScan(StringInfo str, IndexScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indexorderbyorig); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_BOOL_FIELD(try_index_only_scan); + WRITE_INT_FIELD(nheapatt); + + appendStringInfo(str, " :heapatt"); + for (i = 0; i < node->nheapatt; i++) + appendStringInfo(str, " %d", node->heapatt[i]); } static void @@ -1510,6 +1518,7 @@ _outIndexPath(StringInfo str, 
IndexPath *node) WRITE_FLOAT_FIELD(indextotalcost, "%.2f"); WRITE_FLOAT_FIELD(indexselectivity, "%.4f"); WRITE_FLOAT_FIELD(rows, "%.0f"); + WRITE_BOOL_FIELD(try_index_only_scan); } static void diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a3a82ec..7189a99 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -20,6 +20,7 @@ #include #include "access/skey.h" +#include "access/sysattr.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -1189,6 +1190,27 @@ create_indexscan_plan(PlannerInfo *root, /* use the indexscan-specific rows estimate, not the parent rel's */ scan_plan->scan.plan.plan_rows = best_path->rows; + /* additional details for index only scans */ + if (best_path->try_index_only_scan) + { + int i; + + scan_plan->try_index_only_scan = true; + scan_plan->nheapatt = best_path->path.parent->max_attr + 1; + scan_plan->heapatt = palloc0(sizeof(int) * scan_plan->nheapatt); + for (i = 0; i < best_path->indexinfo->ncolumns; ++i) + { + int indexatt = best_path->indexinfo->indexkeys[i]; + if (indexatt == ObjectIdAttributeNumber) + scan_plan->heapatt[0] = i + 1; + else + { + Assert(indexatt > 0 && indexatt < scan_plan->nheapatt); + scan_plan->heapatt[indexatt] = i + 1; + } + } + } + return scan_plan; } @@ -2860,6 +2882,7 @@ make_indexscan(List *qptlist, node->indexorderby = indexorderby; node->indexorderbyorig = indexorderbyorig; node->indexorderdir = indexscandir; + node->try_index_only_scan = false; return node; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 161d5ab..f9447e0 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -19,6 +19,7 @@ #include "catalog/pg_operator.h" #include "foreign/fdwapi.h" #include "miscadmin.h" +#include "access/sysattr.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include 
"optimizer/cost.h" @@ -473,6 +474,50 @@ create_index_path(PlannerInfo *root, pathnode->isjoininner = (outer_rel != NULL); pathnode->indexscandir = indexscandir; + /* + * If the AM is capable of returning index tuples, check + * whether an index-only scan is possible. We attempt that only when all + * the needed attributes and quals are available from the index tuple. + */ + if (index->amcanreturn && !index->predOK && !index->indexprs) + { + Bitmapset *index_attrs = NULL; + int i; + + /* If we haven't cached base_attrs_used yet, do so now. */ + if (rel->base_attrs_used == NULL) + { + ListCell *lc; + + /* Add all the attributes needed by joins or final output. */ + for (i = rel->min_attr; i <= rel->max_attr; i++) + if (!bms_is_empty(rel->attr_needed[i - rel->min_attr])) + rel->base_attrs_used = bms_add_member(rel->base_attrs_used, + i - FirstLowInvalidHeapAttributeNumber); + + /* Add all the attributes used by restrictinfos. */ + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + pull_varattnos((Node *) rinfo->clause, rel->relid, &rel->base_attrs_used); + } + } + + /* Do we have all the necessary attributes? 
*/ + for (i = 0; i < index->ncolumns; i++) + { + int attno = index->indexkeys[i]; + + Assert(attno != 0); + index_attrs = + bms_add_member(index_attrs, + attno - FirstLowInvalidHeapAttributeNumber); + } + pathnode->try_index_only_scan = + !bms_nonempty_difference(rel->base_attrs_used, index_attrs); + bms_free(index_attrs); + } + if (outer_rel != NULL) { /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 6259170..42bc4cf 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -215,6 +215,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); + info->amcanreturn = indexRelation->rd_am->amcanreturn; /* * Fetch the ordering information for the index, if any. diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index 8ce7ee4..2cec150 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -36,6 +36,12 @@ typedef struct typedef struct { + Bitmapset *varattnos; + Index varno; +} pull_varattnos_context; + +typedef struct +{ int var_location; int sublevels_up; } locate_var_of_level_context; @@ -70,7 +76,7 @@ typedef struct static bool pull_varnos_walker(Node *node, pull_varnos_context *context); -static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos); +static bool pull_varattnos_walker(Node *node, pull_varattnos_context *context); static bool contain_var_clause_walker(Node *node, void *context); static bool contain_vars_of_level_walker(Node *node, int *sublevels_up); static bool locate_var_of_level_walker(Node *node, @@ -177,41 +183,48 @@ pull_varnos_walker(Node *node, pull_varnos_context *context) * pull_varattnos * Find all the distinct attribute numbers present in an expression tree, * and add 
them to the initial contents of *varattnos. - * Only Vars that reference RTE 1 of rtable level zero are considered. + * Only the given varno at rtable level zero is considered. * * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that * we can include system attributes (e.g., OID) in the bitmap representation. * - * Currently, this does not support subqueries nor expressions containing - * references to multiple tables; not needed since it's only applied to - * index expressions and predicates. + * Currently, this does not support subqueries; this is not needed for current uses. */ void -pull_varattnos(Node *node, Bitmapset **varattnos) +pull_varattnos(Node *node, Index varno, Bitmapset **varattnos) { - (void) pull_varattnos_walker(node, varattnos); + pull_varattnos_context context; + + context.varattnos = *varattnos; + context.varno = varno; + + (void) pull_varattnos_walker(node, &context); + + *varattnos = context.varattnos; } static bool -pull_varattnos_walker(Node *node, Bitmapset **varattnos) +pull_varattnos_walker(Node *node, pull_varattnos_context *context) { + if (node == NULL) return false; if (IsA(node, Var)) { Var *var = (Var *) node; - Assert(var->varno == 1); - *varattnos = bms_add_member(*varattnos, - var->varattno - FirstLowInvalidHeapAttributeNumber); + if (var->varno == context->varno && var->varlevelsup == 0) + context->varattnos = + bms_add_member(context->varattnos, + var->varattno - FirstLowInvalidHeapAttributeNumber); return false; } - /* Should not find a subquery or subplan */ + + /* Should not find a subquery */ Assert(!IsA(node, Query)); - Assert(!IsA(node, SubPlan)); return expression_tree_walker(node, pull_varattnos_walker, - (void *) varattnos); + (void *) context); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 809222b..e4ec953 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3676,10 +3676,10 @@ 
RelationGetIndexAttrBitmap(Relation relation) } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); index_close(indexDesc, AccessShareLock); } diff --git a/src/include/access/genam.h b/src/include/access/genam.h index a95b3d7..2250485 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -145,6 +145,9 @@ extern void index_rescan(IndexScanDesc scan, extern void index_endscan(IndexScanDesc scan); extern void index_markpos(IndexScanDesc scan); extern void index_restrpos(IndexScanDesc scan); +extern ItemPointer index_getnexttid(IndexScanDesc scan, + ScanDirection direction); +extern HeapTuple index_fetch(IndexScanDesc scan); extern HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 57d08b9..d3eb29d 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/itup.h" typedef struct HeapScanDescData @@ -84,6 +85,10 @@ typedef struct IndexScanDescData /* state data for traversing HOT chains in index_getnext */ bool xs_continue_hot; /* T if must keep walking HOT chain */ + + /* state for index-only scans */ + bool want_index_tuple; /* caller requests index tuple from AM */ + IndexTuple xs_itup; /* index tuple returned by AM */ } IndexScanDescData; /* Struct for heap-or-index scans of system tables */ diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index feab420..31b2a31 100644 --- a/src/include/catalog/pg_am.h +++ 
b/src/include/catalog/pg_am.h @@ -65,6 +65,7 @@ CATALOG(pg_am,2601) regproc amvacuumcleanup; /* post-VACUUM cleanup function */ regproc amcostestimate; /* estimate cost of an indexscan */ regproc amoptions; /* parse AM-specific parameters */ + bool amcanreturn; /* can AM return index tuples to the caller? */ } FormData_pg_am; /* ---------------- @@ -78,7 +79,7 @@ typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ -#define Natts_pg_am 28 +#define Natts_pg_am 29 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 @@ -107,22 +108,23 @@ typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amvacuumcleanup 26 #define Anum_pg_am_amcostestimate 27 #define Anum_pg_am_amoptions 28 +#define Anum_pg_am_amcanreturn 29 /* ---------------- * initial contents of pg_am * ---------------- */ -DATA(insert OID = 403 ( btree 5 1 t f t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 1 t f t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions t)); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions f)); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 8 f t f f t t t t t f 0 gistinsert gistbeginscan 
gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 8 f t f f t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions f)); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f f f t t f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f f f t t f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions f)); DESCR("GIN index access method"); #define GIN_AM_OID 2742 diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index a3a9310..4369453 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1217,6 +1217,7 @@ typedef struct * RuntimeContext expr context for evaling runtime Skeys * RelationDesc index relation descriptor * ScanDesc index scan descriptor + * VMBuffer buffer pinned for visibility map testing * ---------------- */ typedef struct IndexScanState @@ -1233,6 +1234,7 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; IndexScanDesc iss_ScanDesc; + Buffer iss_VMBuffer; } IndexScanState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 7c085b3..feb9d8e 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -312,6 +312,9 @@ typedef struct IndexScan List *indexorderby; /* list of index ORDER BY exprs */ List *indexorderbyorig; /* the same in original form */ ScanDirection indexorderdir; /* 
forward or backward or don't care */ + bool try_index_only_scan; /* attempt to skip heap fetches? */ + int nheapatt; /* number of heap attributes */ + int *heapatt; /* map from heap attrs to index attrs */ } IndexScan; /* ---------------- diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index f659269..eee9797 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -415,6 +415,8 @@ typedef struct RelOptInfo /* used by various scans and joins: */ List *baserestrictinfo; /* RestrictInfo structures (if base * rel) */ + Bitmapset *base_attrs_used; /* all attributes used in any way, + * including in baserestrictinfo */ QualCost baserestrictcost; /* cost of evaluating the above */ List *joininfo; /* RestrictInfo structures for join clauses * involving this rel */ @@ -491,6 +493,7 @@ typedef struct IndexOptInfo bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amhasgettuple; /* does AM have amgettuple interface? */ bool amhasgetbitmap; /* does AM have amgetbitmap interface? */ + bool amcanreturn; /* does AM know how to return tuple? 
*/ } IndexOptInfo; @@ -701,6 +704,7 @@ typedef struct IndexPath Cost indextotalcost; Selectivity indexselectivity; double rows; /* estimated number of result tuples */ + bool try_index_only_scan; } IndexPath; /* diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 5d7e2d9..4fd0052 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -31,7 +31,7 @@ typedef enum } PVCPlaceHolderBehavior; extern Relids pull_varnos(Node *node); -extern void pull_varattnos(Node *node, Bitmapset **varattnos); +extern void pull_varattnos(Node *node, Index varno, Bitmapset **varattnos); extern bool contain_var_clause(Node *node); extern bool contain_vars_of_level(Node *node, int levelsup); extern int locate_var_of_level(Node *node, int levelsup); diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 4861006..43b5e38 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -449,12 +449,12 @@ analyze tenk1; -- ensure we get consistent plans here -- Basic cases explain (costs off) select min(unique1) from tenk1; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Result InitPlan 1 (returns $0) -> Limit - -> Index Scan using tenk1_unique1 on tenk1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: (unique1 IS NOT NULL) (5 rows) @@ -466,12 +466,12 @@ select min(unique1) from tenk1; explain (costs off) select max(unique1) from tenk1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: (unique1 IS NOT NULL) (5 rows) @@ -488,7 +488,7 @@ explain (costs off) Result InitPlan 1 (returns 
$0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 < 42)) (5 rows) @@ -505,7 +505,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 > 42)) (5 rows) @@ -522,7 +522,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 > 42000)) (5 rows) @@ -535,12 +535,12 @@ select max(unique1) from tenk1 where unique1 > 42000; -- multi-column index (uses tenk1_thous_tenthous) explain (costs off) select max(tenthous) from tenk1 where thousand = 33; - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_thous_tenthous on tenk1 + -> Index Only Scan Backward using tenk1_thous_tenthous on tenk1 Index Cond: ((thousand = 33) AND (tenthous IS NOT NULL)) (5 rows) @@ -557,7 +557,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan using tenk1_thous_tenthous on tenk1 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 Index Cond: ((thousand = 33) AND (tenthous IS NOT NULL)) (5 rows) @@ -578,7 +578,7 @@ explain (costs off) -> Result InitPlan 1 (returns $1) -> Limit - -> Index Scan using tenk1_unique1 on tenk1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 > int4_tbl.f1)) (7 rows) @@ -596,12 +596,12 @@ select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt -- check some cases that were handled incorrectly in 8.3.0 explain (costs off) 
select distinct max(unique2) from tenk1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- HashAggregate InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (6 rows) @@ -614,13 +614,13 @@ select distinct max(unique2) from tenk1; explain (costs off) select max(unique2) from tenk1 order by 1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: ($0) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -633,13 +633,13 @@ select max(unique2) from tenk1 order by 1; explain (costs off) select max(unique2) from tenk1 order by max(unique2); - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: ($0) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -652,13 +652,13 @@ select max(unique2) from tenk1 order by max(unique2); explain (costs off) select max(unique2) from tenk1 order by max(unique2)+1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: (($0 + 1)) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -671,13 
+671,13 @@ select max(unique2) from tenk1 order by max(unique2)+1; explain (costs off) select max(unique2), generate_series(1,3) as g from tenk1 order by g desc; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: (generate_series(1, 3)) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -705,18 +705,18 @@ insert into minmaxtest2 values(15), (16); insert into minmaxtest3 values(17), (18); explain (costs off) select min(f1), max(f1) from minmaxtest; - QUERY PLAN --------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit -> Merge Append Sort Key: public.minmaxtest.f1 - -> Index Scan using minmaxtesti on minmaxtest + -> Index Only Scan using minmaxtesti on minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan using minmaxtest1i on minmaxtest1 minmaxtest + -> Index Only Scan using minmaxtest1i on minmaxtest1 minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan Backward using minmaxtest2i on minmaxtest2 minmaxtest + -> Index Only Scan Backward using minmaxtest2i on minmaxtest2 minmaxtest Index Cond: (f1 IS NOT NULL) -> Index Scan using minmaxtest3i on minmaxtest3 minmaxtest Index Cond: (f1 IS NOT NULL) @@ -724,11 +724,11 @@ explain (costs off) -> Limit -> Merge Append Sort Key: public.minmaxtest.f1 - -> Index Scan Backward using minmaxtesti on minmaxtest + -> Index Only Scan Backward using minmaxtesti on minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest + -> Index Only Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest Index Cond: (f1 IS NOT NULL) - -> 
Index Scan using minmaxtest2i on minmaxtest2 minmaxtest + -> Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest Index Cond: (f1 IS NOT NULL) -> Index Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest Index Cond: (f1 IS NOT NULL)