I wrote:
> Basically I think we have to fix this by ensuring that an error escape
> can't occur while a relcache entry is in a partially rebuilt state.
Attached is a draft patch for this. In addition to fixing the stated
problem, it also takes care of a thinko that I found along the way:
RelationClearRelation assumes that any rd_indexprs or rd_indpred trees
can be freed by deleting the rd_indexcxt context, as the comments in
rel.h imply. But actually the code that loads those fields was putting
the trees directly into CacheMemoryContext, meaning that a cache flush
on an index that has expressions or a predicate would result in a
session-lifespan memory leak.
I think this is not too complicated to back-patch --- if anything the
logic is simpler than before. Comments?
regards, tom lane
Index: src/backend/utils/cache/relcache.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v
retrieving revision 1.297
diff -c -r1.297 relcache.c
*** src/backend/utils/cache/relcache.c 7 Jan 2010 20:39:45 -0000 1.297
--- src/backend/utils/cache/relcache.c 10 Jan 2010 23:45:58 -0000
***************
*** 198,203 ****
--- 198,204 ----
/* non-export function prototypes */
+ static void RelationDestroyRelation(Relation relation);
static void RelationClearRelation(Relation relation, bool rebuild);
static void RelationReloadIndexInfo(Relation relation);
***************
*** 211,220 ****
int natts, const FormData_pg_attribute *attrs);
static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK);
! static Relation AllocateRelationDesc(Relation relation, Form_pg_class relp);
static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
static void RelationBuildTupleDesc(Relation relation);
! static Relation RelationBuildDesc(Oid targetRelId, Relation oldrelation);
static void RelationInitPhysicalAddr(Relation relation);
static void load_critical_index(Oid indexoid);
static TupleDesc GetPgClassDescriptor(void);
--- 212,221 ----
int natts, const FormData_pg_attribute *attrs);
static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK);
! static Relation AllocateRelationDesc(Form_pg_class relp);
static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
static void RelationBuildTupleDesc(Relation relation);
! static Relation RelationBuildDesc(Oid targetRelId, bool insertIt);
static void RelationInitPhysicalAddr(Relation relation);
static void load_critical_index(Oid indexoid);
static TupleDesc GetPgClassDescriptor(void);
***************
*** 305,319 ****
* AllocateRelationDesc
*
* This is used to allocate memory for a new relation descriptor
! * and initialize the rd_rel field.
! *
! * If 'relation' is NULL, allocate a new RelationData object.
! * If not, reuse the given object (that path is taken only when
! * we have to rebuild a relcache entry during RelationClearRelation).
*/
static Relation
! AllocateRelationDesc(Relation relation, Form_pg_class relp)
{
MemoryContext oldcxt;
Form_pg_class relationForm;
--- 306,317 ----
* AllocateRelationDesc
*
* This is used to allocate memory for a new relation descriptor
! * and initialize the rd_rel field from the given pg_class tuple.
*/
static Relation
! AllocateRelationDesc(Form_pg_class relp)
{
+ Relation relation;
MemoryContext oldcxt;
Form_pg_class relationForm;
***************
*** 321,335 ****
oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
/*
! * allocate space for new relation descriptor, if needed
*/
! if (relation == NULL)
! relation = (Relation) palloc(sizeof(RelationData));
/*
! * clear all fields of reldesc
*/
- MemSet(relation, 0, sizeof(RelationData));
relation->rd_targblock = InvalidBlockNumber;
relation->rd_fsm_nblocks = InvalidBlockNumber;
relation->rd_vm_nblocks = InvalidBlockNumber;
--- 319,331 ----
oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
/*
! * allocate and zero space for new relation descriptor
*/
! relation = (Relation) palloc0(sizeof(RelationData));
/*
! * clear fields of reldesc that should initialize to something non-zero
*/
relation->rd_targblock = InvalidBlockNumber;
relation->rd_fsm_nblocks = InvalidBlockNumber;
relation->rd_vm_nblocks = InvalidBlockNumber;
***************
*** 387,393 ****
{
case RELKIND_RELATION:
case RELKIND_TOASTVALUE:
- case RELKIND_UNCATALOGED:
case RELKIND_INDEX:
break;
default:
--- 383,388 ----
***************
*** 415,420 ****
--- 410,416 ----
relation->rd_options = MemoryContextAlloc(CacheMemoryContext,
VARSIZE(options));
memcpy(relation->rd_options, options, VARSIZE(options));
+ pfree(options);
}
}
***************
*** 805,828 ****
/*
* RelationBuildDesc
*
! * Build a relation descriptor --- either a new one, or by
! * recycling the given old relation object. The latter case
! * supports rebuilding a relcache entry without invalidating
! * pointers to it. The caller must hold at least
* AccessShareLock on the target relid.
*
* Returns NULL if no pg_class row could be found for the given relid
* (suggesting we are trying to access a just-deleted relation).
* Any other error is reported via elog.
*/
static Relation
! RelationBuildDesc(Oid targetRelId, Relation oldrelation)
{
Relation relation;
Oid relid;
HeapTuple pg_class_tuple;
Form_pg_class relp;
- MemoryContext oldcxt;
/*
* find the tuple in pg_class corresponding to the given relation id
--- 801,822 ----
/*
* RelationBuildDesc
*
! * Build a relation descriptor. The caller must hold at least
* AccessShareLock on the target relid.
*
+ * The new descriptor is inserted into the hash table if insertIt is true.
+ *
* Returns NULL if no pg_class row could be found for the given relid
* (suggesting we are trying to access a just-deleted relation).
* Any other error is reported via elog.
*/
static Relation
! RelationBuildDesc(Oid targetRelId, bool insertIt)
{
Relation relation;
Oid relid;
HeapTuple pg_class_tuple;
Form_pg_class relp;
/*
* find the tuple in pg_class corresponding to the given relation id
***************
*** 845,851 ****
* allocate storage for the relation descriptor, and copy pg_class_tuple
* to relation->rd_rel.
*/
! relation = AllocateRelationDesc(oldrelation, relp);
/*
* initialize the relation's relation id (relation->rd_id)
--- 839,845 ----
* allocate storage for the relation descriptor, and copy pg_class_tuple
* to relation->rd_rel.
*/
! relation = AllocateRelationDesc(relp);
/*
* initialize the relation's relation id (relation->rd_id)
***************
*** 916,926 ****
heap_freetuple(pg_class_tuple);
/*
! * Insert newly created relation into relcache hash tables.
*/
! oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
! RelationCacheInsert(relation);
! MemoryContextSwitchTo(oldcxt);
/* It's fully valid */
relation->rd_isvalid = true;
--- 910,919 ----
heap_freetuple(pg_class_tuple);
/*
! * Insert newly created relation into relcache hash table, if requested.
*/
! if (insertIt)
! RelationCacheInsert(relation);
/* It's fully valid */
relation->rd_isvalid = true;
***************
*** 1569,1577 ****
if (RelationIsValid(rd))
{
RelationIncrementReferenceCount(rd);
! /* revalidate nailed index if necessary */
if (!rd->rd_isvalid)
! RelationReloadIndexInfo(rd);
return rd;
}
--- 1562,1580 ----
if (RelationIsValid(rd))
{
RelationIncrementReferenceCount(rd);
! /* revalidate cache entry if necessary */
if (!rd->rd_isvalid)
! {
! /*
! * Indexes only have a limited number of possible schema changes,
! * and we don't want to use the full-blown procedure because it's
! * a headache for indexes that reload itself depends on.
! */
! if (rd->rd_rel->relkind == RELKIND_INDEX)
! RelationReloadIndexInfo(rd);
! else
! RelationClearRelation(rd, true);
! }
return rd;
}
***************
*** 1579,1585 ****
* no reldesc in the cache, so have RelationBuildDesc() build one and add
* it.
*/
! rd = RelationBuildDesc(relationId, NULL);
if (RelationIsValid(rd))
RelationIncrementReferenceCount(rd);
return rd;
--- 1582,1588 ----
* no reldesc in the cache, so have RelationBuildDesc() build one and add
* it.
*/
! rd = RelationBuildDesc(relationId, true);
if (RelationIsValid(rd))
RelationIncrementReferenceCount(rd);
return rd;
***************
*** 1761,1766 ****
--- 1764,1813 ----
}
/*
+ * RelationDestroyRelation
+ *
+ * Physically delete a relation cache entry and all subsidiary data.
+ * Caller must already have unhooked the entry from the hash table.
+ */
+ static void
+ RelationDestroyRelation(Relation relation)
+ {
+ Assert(RelationHasReferenceCountZero(relation));
+
+ /*
+ * Make sure smgr and lower levels close the relation's files, if they
+ * weren't closed already. (This was probably done by caller, but let's
+ * just be real sure.)
+ */
+ RelationCloseSmgr(relation);
+
+ /*
+ * Free all the subsidiary data structures of the relcache entry,
+ * then the entry itself.
+ */
+ if (relation->rd_rel)
+ pfree(relation->rd_rel);
+ /* can't use DecrTupleDescRefCount here */
+ Assert(relation->rd_att->tdrefcount > 0);
+ if (--relation->rd_att->tdrefcount == 0)
+ FreeTupleDesc(relation->rd_att);
+ list_free(relation->rd_indexlist);
+ bms_free(relation->rd_indexattr);
+ FreeTriggerDesc(relation->trigdesc);
+ if (relation->rd_options)
+ pfree(relation->rd_options);
+ if (relation->rd_indextuple)
+ pfree(relation->rd_indextuple);
+ if (relation->rd_am)
+ pfree(relation->rd_am);
+ if (relation->rd_indexcxt)
+ MemoryContextDelete(relation->rd_indexcxt);
+ if (relation->rd_rulescxt)
+ MemoryContextDelete(relation->rd_rulescxt);
+ pfree(relation);
+ }
+
+ /*
* RelationClearRelation
*
* Physically blow away a relation cache entry, or reset it and rebuild
***************
*** 1776,1782 ****
RelationClearRelation(Relation relation, bool rebuild)
{
Oid old_reltype = relation->rd_rel->reltype;
- MemoryContext oldcxt;
/*
* Make sure smgr and lower levels close the relation's files, if they
--- 1823,1828 ----
***************
*** 1791,1797 ****
* Never, never ever blow away a nailed-in system relation, because we'd
* be unable to recover. However, we must reset rd_targblock, in case we
* got called because of a relation cache flush that was triggered by
! * VACUUM.
*
* If it's a nailed index, then we need to re-read the pg_class row to see
* if its relfilenode changed. We can't necessarily do that here, because
--- 1837,1843 ----
* Never, never ever blow away a nailed-in system relation, because we'd
* be unable to recover. However, we must reset rd_targblock, in case we
* got called because of a relation cache flush that was triggered by
! * VACUUM. Likewise reset the fsm and vm size info.
*
* If it's a nailed index, then we need to re-read the pg_class row to see
* if its relfilenode changed. We can't necessarily do that here, because
***************
*** 1830,1869 ****
return;
}
! /*
! * Remove relation from hash tables
! *
! * Note: we might be reinserting it momentarily, but we must not have it
! * visible in the hash tables until it's valid again, so don't try to
! * optimize this away...
! */
! oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
! RelationCacheDelete(relation);
! MemoryContextSwitchTo(oldcxt);
!
! /* Clear out catcache's entries for this relation */
! CatalogCacheFlushRelation(RelationGetRelid(relation));
/*
! * Free all the subsidiary data structures of the relcache entry. We
! * cannot free rd_att if we are trying to rebuild the entry, however,
! * because pointers to it may be cached in various places. The rule
! * manager might also have pointers into the rewrite rules. So to begin
! * with, we can only get rid of these fields:
*/
! FreeTriggerDesc(relation->trigdesc);
! if (relation->rd_indextuple)
! pfree(relation->rd_indextuple);
! if (relation->rd_am)
! pfree(relation->rd_am);
! if (relation->rd_rel)
! pfree(relation->rd_rel);
! if (relation->rd_options)
! pfree(relation->rd_options);
! list_free(relation->rd_indexlist);
! bms_free(relation->rd_indexattr);
! if (relation->rd_indexcxt)
! MemoryContextDelete(relation->rd_indexcxt);
/*
* If we're really done with the relcache entry, blow it away. But if
--- 1876,1889 ----
return;
}
! /* Mark it invalid until we've finished rebuild */
! relation->rd_isvalid = false;
/*
! * Clear out catcache's entries for this relation. This is a bit of
! * a hack, but it's a convenient place to do it.
*/
! CatalogCacheFlushRelation(RelationGetRelid(relation));
/*
* If we're really done with the relcache entry, blow it away. But if
***************
*** 1873,1956 ****
*/
if (!rebuild)
{
! /* ok to zap remaining substructure */
flush_rowtype_cache(old_reltype);
! /* can't use DecrTupleDescRefCount here */
! Assert(relation->rd_att->tdrefcount > 0);
! if (--relation->rd_att->tdrefcount == 0)
! FreeTupleDesc(relation->rd_att);
! if (relation->rd_rulescxt)
! MemoryContextDelete(relation->rd_rulescxt);
! pfree(relation);
}
else
{
/*
! * When rebuilding an open relcache entry, must preserve ref count and
! * rd_createSubid/rd_newRelfilenodeSubid state. Also attempt to
! * preserve the tupledesc and rewrite-rule substructures in place.
! * (Note: the refcount mechanism for tupledescs may eventually ensure
! * that we don't really need to preserve the tupledesc in-place, but
! * for now there are still a lot of places that assume an open rel's
! * tupledesc won't move.)
*
* Note that this process does not touch CurrentResourceOwner; which
* is good because whatever ref counts the entry may have do not
* necessarily belong to that resource owner.
*/
Oid save_relid = RelationGetRelid(relation);
! int old_refcnt = relation->rd_refcnt;
! SubTransactionId old_createSubid = relation->rd_createSubid;
! SubTransactionId old_newRelfilenodeSubid = relation->rd_newRelfilenodeSubid;
! struct PgStat_TableStatus *old_pgstat_info = relation->pgstat_info;
! TupleDesc old_att = relation->rd_att;
! RuleLock *old_rules = relation->rd_rules;
! MemoryContext old_rulescxt = relation->rd_rulescxt;
! if (RelationBuildDesc(save_relid, relation) != relation)
{
/* Should only get here if relation was deleted */
flush_rowtype_cache(old_reltype);
! Assert(old_att->tdrefcount > 0);
! if (--old_att->tdrefcount == 0)
! FreeTupleDesc(old_att);
! if (old_rulescxt)
! MemoryContextDelete(old_rulescxt);
! pfree(relation);
elog(ERROR, "relation %u deleted while still in use", save_relid);
}
- relation->rd_refcnt = old_refcnt;
- relation->rd_createSubid = old_createSubid;
- relation->rd_newRelfilenodeSubid = old_newRelfilenodeSubid;
- relation->pgstat_info = old_pgstat_info;
! if (equalTupleDescs(old_att, relation->rd_att))
! {
! /* needn't flush typcache here */
! Assert(relation->rd_att->tdrefcount == 1);
! if (--relation->rd_att->tdrefcount == 0)
! FreeTupleDesc(relation->rd_att);
! relation->rd_att = old_att;
! }
! else
! {
flush_rowtype_cache(old_reltype);
! Assert(old_att->tdrefcount > 0);
! if (--old_att->tdrefcount == 0)
! FreeTupleDesc(old_att);
! }
! if (equalRuleLocks(old_rules, relation->rd_rules))
{
! if (relation->rd_rulescxt)
! MemoryContextDelete(relation->rd_rulescxt);
! relation->rd_rules = old_rules;
! relation->rd_rulescxt = old_rulescxt;
}
! else
{
! if (old_rulescxt)
! MemoryContextDelete(old_rulescxt);
}
}
}
--- 1893,2008 ----
*/
if (!rebuild)
{
! /* Flush any rowtype cache entry */
flush_rowtype_cache(old_reltype);
!
! /* Remove it from the hash table */
! RelationCacheDelete(relation);
!
! /* And release storage */
! RelationDestroyRelation(relation);
}
else
{
/*
! * Our strategy for rebuilding an open relcache entry is to build
! * a new entry from scratch, swap its contents with the old entry,
! * and finally delete the new entry (along with any infrastructure
! * swapped over from the old entry). This is to avoid trouble in case
! * an error causes us to lose control partway through. The old entry
! * will still be marked !rd_isvalid, so we'll try to rebuild it again
! * on next access. Meanwhile it's not any less valid than it was
! * before, so any code that might expect to continue accessing it
! * isn't hurt by the rebuild failure. (Consider for example a
! * subtransaction that ALTERs a table and then gets cancelled partway
! * through the cache entry rebuild. The outer transaction should
! * still see the not-modified cache entry as valid.) The worst
! * consequence of an error is leaking the necessarily-unreferenced
! * new entry, and this shouldn't happen often enough for that to be
! * a big problem.
! *
! * When rebuilding an open relcache entry, we must preserve ref count
! * and rd_createSubid/rd_newRelfilenodeSubid state. Also attempt to
! * preserve the tupledesc and rewrite-rule substructures in place,
! * because various places assume that these structures won't move
! * while they are working with an open relcache entry. (Note: the
! * refcount mechanism for tupledescs may eventually allow us to remove
! * this hack for the tupledesc.)
*
* Note that this process does not touch CurrentResourceOwner; which
* is good because whatever ref counts the entry may have do not
* necessarily belong to that resource owner.
*/
+ Relation newrel;
Oid save_relid = RelationGetRelid(relation);
! bool keep_tupdesc;
! bool keep_rules;
! /* Build temporary entry, but don't link it into hashtable */
! newrel = RelationBuildDesc(save_relid, false);
! if (newrel == NULL)
{
/* Should only get here if relation was deleted */
flush_rowtype_cache(old_reltype);
! RelationCacheDelete(relation);
! RelationDestroyRelation(relation);
elog(ERROR, "relation %u deleted while still in use", save_relid);
}
! keep_tupdesc = equalTupleDescs(relation->rd_att, newrel->rd_att);
! keep_rules = equalRuleLocks(relation->rd_rules, newrel->rd_rules);
! if (!keep_tupdesc)
flush_rowtype_cache(old_reltype);
!
! /*
! * Perform swapping of the relcache entry contents. Within this
! * process the old entry is momentarily invalid, so there *must*
! * be no possibility of CHECK_FOR_INTERRUPTS within this sequence.
! * Do it in all-in-line code for safety.
! *
! * Since the vast majority of fields should be swapped, our method
! * is to swap the whole structures and then re-swap those few fields
! * we didn't want swapped.
! */
{
! RelationData tmpstruct;
!
! memcpy(&tmpstruct, newrel, sizeof(RelationData));
! memcpy(newrel, relation, sizeof(RelationData));
! memcpy(relation, &tmpstruct, sizeof(RelationData));
}
!
! #define SWAPFIELD(fldtype, fldname) \
! do { \
! fldtype _tmp = newrel->fldname; \
! newrel->fldname = relation->fldname; \
! relation->fldname = _tmp; \
! } while (0)
!
! /* rd_smgr must not be swapped, due to back-links from smgr level */
! SWAPFIELD(SMgrRelation, rd_smgr);
! /* rd_refcnt must be preserved */
! SWAPFIELD(int, rd_refcnt);
! /* isnailed shouldn't change */
! Assert(newrel->rd_isnailed == relation->rd_isnailed);
! /* creation sub-XIDs must be preserved */
! SWAPFIELD(SubTransactionId, rd_createSubid);
! SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid);
! /* preserve old tupledesc and rules if no logical change */
! if (keep_tupdesc)
! SWAPFIELD(TupleDesc, rd_att);
! if (keep_rules)
{
! SWAPFIELD(RuleLock *, rd_rules);
! SWAPFIELD(MemoryContext, rd_rulescxt);
}
+ /* pgstat_info must be preserved */
+ SWAPFIELD(struct PgStat_TableStatus *, pgstat_info);
+
+ #undef SWAPFIELD
+
+ /* And now we can throw away the temporary entry */
+ RelationDestroyRelation(newrel);
}
}
***************
*** 2836,2842 ****
Relation ird;
LockRelationOid(indexoid, AccessShareLock);
! ird = RelationBuildDesc(indexoid, NULL);
if (ird == NULL)
elog(PANIC, "could not open critical system index %u", indexoid);
ird->rd_isnailed = true;
--- 2888,2894 ----
Relation ird;
LockRelationOid(indexoid, AccessShareLock);
! ird = RelationBuildDesc(indexoid, true);
if (ird == NULL)
elog(PANIC, "could not open critical system index %u", indexoid);
ird->rd_isnailed = true;
***************
*** 3293,3299 ****
fix_opfuncids((Node *) result);
/* Now save a copy of the completed tree in the relcache entry. */
! oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
relation->rd_indexprs = (List *) copyObject(result);
MemoryContextSwitchTo(oldcxt);
--- 3345,3351 ----
fix_opfuncids((Node *) result);
/* Now save a copy of the completed tree in the relcache entry. */
! oldcxt = MemoryContextSwitchTo(relation->rd_indexcxt);
relation->rd_indexprs = (List *) copyObject(result);
MemoryContextSwitchTo(oldcxt);
***************
*** 3368,3374 ****
fix_opfuncids((Node *) result);
/* Now save a copy of the completed tree in the relcache entry. */
! oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
relation->rd_indpred = (List *) copyObject(result);
MemoryContextSwitchTo(oldcxt);
--- 3420,3426 ----
fix_opfuncids((Node *) result);
/* Now save a copy of the completed tree in the relcache entry. */
! oldcxt = MemoryContextSwitchTo(relation->rd_indexcxt);
relation->rd_indpred = (List *) copyObject(result);
MemoryContextSwitchTo(oldcxt);