Thread: Reduce heap tuple header size
This patch, which is built upon the "HeapTupleHeader accessor macros" patch from 2002-06-10, is supposed to reduce the heap tuple header size by four bytes on most architectures. Of course it changes the on-disk tuple format and therefore requires initdb. As I have (once more) opened my mouth too wide, I'll have to provide a heap file conversion utility, if this patch gets accepted... More on this later. ====================== All 81 tests passed. ====================== It's late now, I'll do more tests tomorrow. Good night Manfred diff -ru ../orig/src/backend/access/heap/heapam.c src/backend/access/heap/heapam.c --- ../orig/src/backend/access/heap/heapam.c 2002-06-13 19:34:48.000000000 +0200 +++ src/backend/access/heap/heapam.c 2002-06-13 22:31:42.000000000 +0200 @@ -2204,7 +2204,7 @@ htup->t_infomask = HEAP_XMAX_INVALID | xlhdr.mask; HeapTupleHeaderSetXmin(htup, record->xl_xid); HeapTupleHeaderSetCmin(htup, FirstCommandId); - HeapTupleHeaderSetXmax(htup, InvalidTransactionId); + HeapTupleHeaderSetXmaxInvalid(htup); HeapTupleHeaderSetCmax(htup, FirstCommandId); offnum = PageAddItem(page, (Item) htup, newlen, offnum, diff -ru ../orig/src/include/access/htup.h src/include/access/htup.h --- ../orig/src/include/access/htup.h 2002-06-13 19:34:49.000000000 +0200 +++ src/include/access/htup.h 2002-06-14 01:12:47.000000000 +0200 @@ -57,15 +57,24 @@ * Also note that we omit the nulls bitmap if t_infomask shows that there * are no nulls in the tuple. */ +/* +** We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac +** in three physical fields t_xmin, t_cid, t_xmax: +** CommandId Cmin; insert CID stamp +** CommandId Cmax; delete CommandId stamp +** TransactionId Xmin; insert XID stamp +** TransactionId Xmax; delete XID stamp +** TransactionId Xvac; used by VACCUUM +** +** This assumes, that a CommandId can be stored in a TransactionId. 
+*/ typedef struct HeapTupleHeaderData { Oid t_oid; /* OID of this tuple -- 4 bytes */ - CommandId t_cmin; /* insert CID stamp -- 4 bytes each */ - CommandId t_cmax; /* delete CommandId stamp */ - - TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */ - TransactionId t_xmax; /* delete XID stamp */ + TransactionId t_xmin; /* Xmin -- 4 bytes each */ + TransactionId t_cid; /* Cmin, Cmax, Xvac */ + TransactionId t_xmax; /* Xmax, Cmax */ ItemPointerData t_ctid; /* current TID of this or newer tuple */ @@ -75,7 +84,7 @@ uint8 t_hoff; /* sizeof header incl. bitmap, padding */ - /* ^ - 31 bytes - ^ */ + /* ^ - 27 bytes - ^ */ bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */ @@ -96,6 +105,8 @@ * attribute(s) */ #define HEAP_HASEXTENDED 0x000C /* the two above combined */ +#define HEAP_XMIN_IS_XMAX 0x0040 /* created and deleted in the */ + /* same transaction */ #define HEAP_XMAX_UNLOGGED 0x0080 /* to lock tuple for update */ /* without logging */ #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ @@ -108,6 +119,7 @@ * vacuum */ #define HEAP_MOVED_IN 0x8000 /* moved from another place by * vacuum */ +#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) #define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ @@ -116,53 +128,100 @@ /* HeapTupleHeader accessor macros */ #define HeapTupleHeaderGetXmin(tup) \ - ((tup)->t_xmin) +( \ + (tup)->t_xmin \ +) #define HeapTupleHeaderGetXmax(tup) \ - ((tup)->t_xmax) +( \ + ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) ? \ + (tup)->t_xmin \ + : \ + (tup)->t_xmax \ +) -/* no AssertMacro, because this is read as a system-defined attribute also */ +/* no AssertMacro, because this is read as a system-defined attribute */ #define HeapTupleHeaderGetCmin(tup) \ ( \ - (tup)->t_cmin \ + ((tup)->t_infomask & HEAP_MOVED) ? \ + FirstCommandId \ + : \ + ( \ + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? 
\ + (CommandId) (tup)->t_cid \ + : \ + FirstCommandId \ + ) \ ) #define HeapTupleHeaderGetCmax(tup) \ - ((tup)->t_cmax) +( \ + ((tup)->t_infomask & HEAP_MOVED) ? \ + FirstCommandId \ + : \ + ( \ + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ + (CommandId) (tup)->t_xmax \ + : \ + (CommandId) (tup)->t_cid \ + ) \ +) #define HeapTupleHeaderGetXvac(tup) \ ( \ - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ - (TransactionId) (tup)->t_cmin \ + AssertMacro((tup)->t_infomask & HEAP_MOVED), \ + (tup)->t_cid \ ) #define HeapTupleHeaderSetXmin(tup, xid) \ - (TransactionIdStore((xid), &(tup)->t_xmin)) +( \ + TransactionIdStore((xid), &(tup)->t_xmin) \ +) #define HeapTupleHeaderSetXminInvalid(tup) \ - (StoreInvalidTransactionId(&(tup)->t_xmin)) +do { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + StoreInvalidTransactionId(&(tup)->t_xmin); \ +} while (0) #define HeapTupleHeaderSetXmax(tup, xid) \ - (TransactionIdStore((xid), &(tup)->t_xmax)) +do { \ + if (TransactionIdEquals((tup)->t_xmin, (xid))) \ + (tup)->t_infomask |= HEAP_XMIN_IS_XMAX; \ + else \ + { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + TransactionIdStore((xid), &(tup)->t_xmax); \ + } \ +} while (0) #define HeapTupleHeaderSetXmaxInvalid(tup) \ - (StoreInvalidTransactionId(&(tup)->t_xmax)) +do { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + StoreInvalidTransactionId(&(tup)->t_xmax); \ +} while (0) #define HeapTupleHeaderSetCmin(tup, cid) \ -( \ - AssertMacro(!((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF))), \ - (tup)->t_cmin = (cid) \ -) +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ +} while (0) #define HeapTupleHeaderSetCmax(tup, cid) \ - ((tup)->t_cmax = (cid)) +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + if ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_xmax); \ + else \ + TransactionIdStore((TransactionId) (cid), 
&(tup)->t_cid); \ +} while (0) #define HeapTupleHeaderSetXvac(tup, xid) \ -( \ - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ - TransactionIdStore((xid), (TransactionId *) &((tup)->t_cmin)) \ -) +do { \ + Assert((tup)->t_infomask & HEAP_MOVED); \ + TransactionIdStore((xid), &(tup)->t_cid); \ +} while (0) /*
Manfred Koizar <mkoi-pg@aon.at> writes: > This patch, which is built upon the "HeapTupleHeader accessor macros" > patch from 2002-06-10, is supposed to reduce the heap tuple header size > by four bytes on most architectures. Of course it changes the on-disk > tuple format and therefore requires initdb. As I commented before, I am not in favor of this. I don't think that a four-byte savings justifies a forced initdb with no chance of pg_upgrade, plus loss of redundancy (= reduced chance of detecting or recovering from corruption), plus significantly slower access to several critical header fields. The tqual.c routines are already hotspots in many scenarios. I believe this will make them noticeably slower. regards, tom lane
On Fri, 14 Jun 2002 10:16:22 -0400, Tom Lane <tgl@sss.pgh.pa.us> wrote: >As I commented before, I am not in favor of this. I don't think that a >four-byte savings justifies a forced initdb with no chance of >pg_upgrade, As I don't know other users' preferences, I cannot comment on this. I just think that four bytes per tuple can amount to a sum that justifies this effort. Disk space is not my argument here, but reduced disk IO is. >plus significantly slower access to >several critical header fields. The tqual.c routines are already >hotspots in many scenarios. I believe this will make them noticeably >slower. Significantly slower? I tried to analyze HeapTupleSatisfiesUpdate(), as I think it is the most complicated of these Satisfies functions (look for the !! comments): /* !! HeapTupleHeaderGetXmin no slowdown !! HeapTupleHeaderGetXmax one infomask compare !! HeapTupleHeaderGetCmin two infomask compares !! HeapTupleHeaderGetCMax two infomask compares !! HeapTupleHeaderGetXvac no slowdown */ int HeapTupleSatisfiesUpdate(HeapTuple htuple, CommandId curcid) { HeapTupleHeader tuple = htuple->t_data; if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED)) { if (tuple->t_infomask & HEAP_XMIN_INVALID) /* !! no slowdown */ return HeapTupleInvisible; if (tuple->t_infomask & HEAP_MOVED_OFF) { if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXvac(tuple))) /* !! no slowdown */ return HeapTupleInvisible; if (!TransactionIdIsInProgress(HeapTupleHeaderGetXvac(tuple))) { if (TransactionIdDidCommit(HeapTupleHeaderGetXvac(tuple))) { tuple->t_infomask |= HEAP_XMIN_INVALID; /* !! no slowdown */ return HeapTupleInvisible; } tuple->t_infomask |= HEAP_XMIN_COMMITTED; } } else if (tuple->t_infomask & HEAP_MOVED_IN) { if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXvac(tuple))) { if (TransactionIdIsInProgress(HeapTupleHeaderGetXvac(tuple))) /* !! 
no slowdown */ return HeapTupleInvisible; if (TransactionIdDidCommit(HeapTupleHeaderGetXvac(tuple))) tuple->t_infomask |= HEAP_XMIN_COMMITTED; else { tuple->t_infomask |= HEAP_XMIN_INVALID; /* !! no slowdown */ return HeapTupleInvisible; } } } /* !! no slowdown up to here */ else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple))) { /* !! GetCmin does 2 infomask compares */ if (HeapTupleHeaderGetCmin(tuple) >= curcid) /* !! returning with 2 additional infomask compares */ return HeapTupleInvisible; /* inserted after scan * started */ if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ /* !! returning with 2 additional infomask compares */ return HeapTupleMayBeUpdated; /* !! assertions turned off in production: no slowdown */ Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))); if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE) /* !! returning with 2 additional infomask compares */ return HeapTupleMayBeUpdated; /* !! GetCmax does 2 infomask compares */ if (HeapTupleHeaderGetCmax(tuple) >= curcid) /* !! returning with 4 additional infomask compares */ return HeapTupleSelfUpdated; /* updated after scan * started */ else /* !! returning with 4 additional infomask compares */ return HeapTupleInvisible; /* updated before scan * started */ } /* !! no slowdown up to here */ else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple))) { if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple))) tuple->t_infomask |= HEAP_XMIN_INVALID; /* aborted */ /* !! no slowdown */ return HeapTupleInvisible; } else tuple->t_infomask |= HEAP_XMIN_COMMITTED; } /* !! no slowdown */ /* by here, the inserting transaction has committed */ if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ /* !! no slowdown */ return HeapTupleMayBeUpdated; if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE) /* !! no slowdown ?? BTW, shouldn't we set HEAP_MAX_INVALID here? 
*/ return HeapTupleMayBeUpdated; /* !! no slowdown */ return HeapTupleUpdated; /* updated by other */ } /* !! no slowdown up to here, !! one infomask compare to follow in GetXmax(), !! additional waste of precious CPU cycles could be avoided by: !! TransactionId xmax = HeapTupleHeaderGetXmax(tuple); */ if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE) /* !! returning with 1 additional infomask compare */ return HeapTupleMayBeUpdated; /* !! GetCmax does 2 infomask compares */ if (HeapTupleHeaderGetCmax(tuple) >= curcid) /* !! returning with 3 additional infomask compares */ return HeapTupleSelfUpdated; /* updated after scan * started */ else /* !! returning with 3 additional infomask compares */ return HeapTupleInvisible; /* updated before scan started */ } /* !! 1 infomask compare up to here, !! another infomask compare ... !! or use xmax */ if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple))) { /* !! and a third infomask compare */ if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple))) { tuple->t_infomask |= HEAP_XMAX_INVALID; /* aborted */ /* !! returning with 1 or 3 additional infomask compares */ return HeapTupleMayBeUpdated; } /* running xact */ /* !! returning with 1 or 3 additional infomask compares */ return HeapTupleBeingUpdated; /* in updation by other */ } /* !! 2 (or 1 with a local xmax) infomask compares up to here */ /* xmax transaction committed */ tuple->t_infomask |= HEAP_XMAX_COMMITTED; if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE) /* ?? BTW, shouldn't we set HEAP_MAX_INVALID here? */ return HeapTupleMayBeUpdated; return HeapTupleUpdated; /* updated by other */ } So in the worst case we return after having done four more compares than without the patch. Note that in the most common cases there is no additional cost at all. 
If you still think we have a performance problem here, we could replace GetCmin and GetCmax by cheaper macros: #define HeapTupleHeaderGetCminKnowingThatNotMoved(tup) \ ( \ AssertMacro(!((tup)->t_infomask & HEAP_MOVED)), ( \ ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ (CommandId) (tup)->t_cid \ : \ FirstCommandId \ ) \ ) thus reducing the additional cost to one t_infomask compare, because the Satisfies functions only access Cmin and Cmax, when HEAP_MOVED is known to be not set. OTOH experimenting with a moderately sized "out of production" database I got the following results: relkind | count | tuples | pages before | pages after | savings --------+-------+--------+--------------+-------------+-------- i | 31 | 808146 | 8330 | 8330 | 0.00% r | 32 | 612968 | 13572 | 13184 | 2.86% all | 63 | | 21902 | 21514 | 1.77% 2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages. Considering that index pages tend to benefit more from caching we conclude that heap pages contribute more to the overall IO load, so the total savings in the number of disk IOs should be better than the 1.77% shown in the table above. I think this outweighs a few CPU cycles now and then. Servus Manfred
Your patch has been added to the PostgreSQL unapplied patches list at: http://candle.pha.pa.us/cgi-bin/pgpatches I will try to apply it within the next 48 hours. --------------------------------------------------------------------------- Manfred Koizar wrote: > This patch, which is built upon the "HeapTupleHeader accessor macros" > patch from 2002-06-10, is supposed to reduce the heap tuple header size > by four bytes on most architectures. Of course it changes the on-disk > tuple format and therefore requires initdb. As I have (once more) > opened my mouth too wide, I'll have to provide a heap file conversion > utility, if this patch gets accepted... More on this later. > > ====================== > All 81 tests passed. > ====================== > > It's late now, I'll do more tests tomorrow. > > Good night > Manfred > > diff -ru ../orig/src/backend/access/heap/heapam.c src/backend/access/heap/heapam.c > --- ../orig/src/backend/access/heap/heapam.c 2002-06-13 19:34:48.000000000 +0200 > +++ src/backend/access/heap/heapam.c 2002-06-13 22:31:42.000000000 +0200 > @@ -2204,7 +2204,7 @@ > htup->t_infomask = HEAP_XMAX_INVALID | xlhdr.mask; > HeapTupleHeaderSetXmin(htup, record->xl_xid); > HeapTupleHeaderSetCmin(htup, FirstCommandId); > - HeapTupleHeaderSetXmax(htup, InvalidTransactionId); > + HeapTupleHeaderSetXmaxInvalid(htup); > HeapTupleHeaderSetCmax(htup, FirstCommandId); > > offnum = PageAddItem(page, (Item) htup, newlen, offnum, > diff -ru ../orig/src/include/access/htup.h src/include/access/htup.h > --- ../orig/src/include/access/htup.h 2002-06-13 19:34:49.000000000 +0200 > +++ src/include/access/htup.h 2002-06-14 01:12:47.000000000 +0200 > @@ -57,15 +57,24 @@ > * Also note that we omit the nulls bitmap if t_infomask shows that there > * are no nulls in the tuple. 
> */ > +/* > +** We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac > +** in three physical fields t_xmin, t_cid, t_xmax: > +** CommandId Cmin; insert CID stamp > +** CommandId Cmax; delete CommandId stamp > +** TransactionId Xmin; insert XID stamp > +** TransactionId Xmax; delete XID stamp > +** TransactionId Xvac; used by VACCUUM > +** > +** This assumes, that a CommandId can be stored in a TransactionId. > +*/ > typedef struct HeapTupleHeaderData > { > Oid t_oid; /* OID of this tuple -- 4 bytes */ > > - CommandId t_cmin; /* insert CID stamp -- 4 bytes each */ > - CommandId t_cmax; /* delete CommandId stamp */ > - > - TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */ > - TransactionId t_xmax; /* delete XID stamp */ > + TransactionId t_xmin; /* Xmin -- 4 bytes each */ > + TransactionId t_cid; /* Cmin, Cmax, Xvac */ > + TransactionId t_xmax; /* Xmax, Cmax */ > > ItemPointerData t_ctid; /* current TID of this or newer tuple */ > > @@ -75,7 +84,7 @@ > > uint8 t_hoff; /* sizeof header incl. 
bitmap, padding */ > > - /* ^ - 31 bytes - ^ */ > + /* ^ - 27 bytes - ^ */ > > bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */ > > @@ -96,6 +105,8 @@ > * attribute(s) */ > #define HEAP_HASEXTENDED 0x000C /* the two above combined */ > > +#define HEAP_XMIN_IS_XMAX 0x0040 /* created and deleted in the */ > + /* same transaction > */ > #define HEAP_XMAX_UNLOGGED 0x0080 /* to lock tuple for update */ > /* without logging > */ > #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ > @@ -108,6 +119,7 @@ > * vacuum */ > #define HEAP_MOVED_IN 0x8000 /* moved from another place by > * vacuum */ > +#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) > > #define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ > > @@ -116,53 +128,100 @@ > /* HeapTupleHeader accessor macros */ > > #define HeapTupleHeaderGetXmin(tup) \ > - ((tup)->t_xmin) > +( \ > + (tup)->t_xmin \ > +) > > #define HeapTupleHeaderGetXmax(tup) \ > - ((tup)->t_xmax) > +( \ > + ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) ? \ > + (tup)->t_xmin \ > + : \ > + (tup)->t_xmax \ > +) > > -/* no AssertMacro, because this is read as a system-defined attribute also */ > +/* no AssertMacro, because this is read as a system-defined attribute */ > #define HeapTupleHeaderGetCmin(tup) \ > ( \ > - (tup)->t_cmin \ > + ((tup)->t_infomask & HEAP_MOVED) ? \ > + FirstCommandId \ > + : \ > + ( \ > + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ > + (CommandId) (tup)->t_cid \ > + : \ > + FirstCommandId \ > + ) \ > ) > > #define HeapTupleHeaderGetCmax(tup) \ > - ((tup)->t_cmax) > +( \ > + ((tup)->t_infomask & HEAP_MOVED) ? \ > + FirstCommandId \ > + : \ > + ( \ > + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? 
\ > + (CommandId) (tup)->t_xmax \ > + : \ > + (CommandId) (tup)->t_cid \ > + ) \ > +) > > #define HeapTupleHeaderGetXvac(tup) \ > ( \ > - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ > - (TransactionId) (tup)->t_cmin \ > + AssertMacro((tup)->t_infomask & HEAP_MOVED), \ > + (tup)->t_cid \ > ) > > > #define HeapTupleHeaderSetXmin(tup, xid) \ > - (TransactionIdStore((xid), &(tup)->t_xmin)) > +( \ > + TransactionIdStore((xid), &(tup)->t_xmin) \ > +) > > #define HeapTupleHeaderSetXminInvalid(tup) \ > - (StoreInvalidTransactionId(&(tup)->t_xmin)) > +do { \ > + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ > + StoreInvalidTransactionId(&(tup)->t_xmin); \ > +} while (0) > > #define HeapTupleHeaderSetXmax(tup, xid) \ > - (TransactionIdStore((xid), &(tup)->t_xmax)) > +do { \ > + if (TransactionIdEquals((tup)->t_xmin, (xid))) \ > + (tup)->t_infomask |= HEAP_XMIN_IS_XMAX; \ > + else \ > + { \ > + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ > + TransactionIdStore((xid), &(tup)->t_xmax); \ > + } \ > +} while (0) > > #define HeapTupleHeaderSetXmaxInvalid(tup) \ > - (StoreInvalidTransactionId(&(tup)->t_xmax)) > +do { \ > + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ > + StoreInvalidTransactionId(&(tup)->t_xmax); \ > +} while (0) > > #define HeapTupleHeaderSetCmin(tup, cid) \ > -( \ > - AssertMacro(!((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF))), \ > - (tup)->t_cmin = (cid) \ > -) > +do { \ > + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ > + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ > +} while (0) > > #define HeapTupleHeaderSetCmax(tup, cid) \ > - ((tup)->t_cmax = (cid)) > +do { \ > + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ > + if ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) \ > + TransactionIdStore((TransactionId) (cid), &(tup)->t_xmax); \ > + else \ > + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ > +} while (0) > > #define HeapTupleHeaderSetXvac(tup, xid) \ > -( \ > - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | 
HEAP_MOVED_OFF)), \ > - TransactionIdStore((xid), (TransactionId *) &((tup)->t_cmin)) \ > -) > +do { \ > + Assert((tup)->t_infomask & HEAP_MOVED); \ > + TransactionIdStore((xid), &(tup)->t_cid); \ > +} while (0) > > > /* > > > ---------------------------(end of broadcast)--------------------------- > TIP 3: if posting/reading through Usenet, please send an appropriate > subscribe-nomail command to majordomo@postgresql.org so that your > message can get through to the mailing list cleanly > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes: > Your patch has been added to the PostgreSQL unapplied patches list at: > http://candle.pha.pa.us/cgi-bin/pgpatches > I will try to apply it within the next 48 hours. Are you planning to ignore my objections to it? regards, tom lane
Tom Lane wrote: > Bruce Momjian <pgman@candle.pha.pa.us> writes: > > Your patch has been added to the PostgreSQL unapplied patches list at: > > http://candle.pha.pa.us/cgi-bin/pgpatches > > I will try to apply it within the next 48 hours. > > Are you planning to ignore my objections to it? The author replied addressing your objections and I saw no reply from you on that. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Tom Lane wrote: > Bruce Momjian <pgman@candle.pha.pa.us> writes: > > Tom Lane wrote: > >> Are you planning to ignore my objections to it? > > > The author replied addressing your objections and I saw no reply from on > > on that. > > He replied stating his opinion; my opinion didn't change. I was waiting > for some other people to weigh in with their opinions. As far as I've > seen, no one else has commented at all. > > If I get voted down on the point after suitable discussion, so be it. > But I will strongly object to you applying the patch just because it's > next in your inbox. Well, we have three votes, yours, the author, and mine. That's a vote. If you want to make a pitch for more votes, go ahead. I can wait. The thread went on for quite a while and no one else gave an opinion, as I remember, though there may have been a few positive ones for the patch that I forgot about. I don't care if you object, strongly object, or jump up and down; you have one vote. Please stop the posturing. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes: > Tom Lane wrote: >> Are you planning to ignore my objections to it? > The author replied addressing your objections and I saw no reply from on > on that. He replied stating his opinion; my opinion didn't change. I was waiting for some other people to weigh in with their opinions. As far as I've seen, no one else has commented at all. If I get voted down on the point after suitable discussion, so be it. But I will strongly object to you applying the patch just because it's next in your inbox. regards, tom lane
Tom Lane wrote: > Bruce Momjian <pgman@candle.pha.pa.us> writes: > > Tom Lane wrote: > >> Are you planning to ignore my objections to it? > > > The author replied addressing your objections and I saw no reply from on > > on that. > > He replied stating his opinion; my opinion didn't change. I was waiting > for some other people to weigh in with their opinions. As far as I've > seen, no one else has commented at all. > > If I get voted down on the point after suitable discussion, so be it. > But I will strongly object to you applying the patch just because it's > next in your inbox. Tom, I have reviewed your objections: > As I commented before, I am not in favor of this. I don't think that a > four-byte savings justifies a forced initdb with no chance of > pg_upgrade, plus loss of redundancy (= reduced chance of detecting or > recovering from corruption), plus significantly slower access to > several critical header fields. The tqual.c routines are already > hotspots in many scenarios. I believe this will make them noticeably > slower. I don't think enough people use pg_upgrade to make it a reason to keep an extra four bytes of tuple overhead. I realize 8-byte aligned systems don't benefit, but most of our platforms are 4-byte aligned. I don't consider redundancy a valid reason either. We just don't have many table corruption complaints, and the odds that having an extra 4 bytes is going to make detection or correction better is unlikely. The author addressed the slowness complaint and seemed to refute the idea it would be slower. Is there something I am missing? -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian wrote: > > Tom Lane wrote: > > Bruce Momjian <pgman@candle.pha.pa.us> writes: > > > Tom Lane wrote: > > >> Are you planning to ignore my objections to it? > > > > > The author replied addressing your objections and I saw no reply from on > > > on that. > > > > He replied stating his opinion; my opinion didn't change. I was waiting > > for some other people to weigh in with their opinions. As far as I've > > seen, no one else has commented at all. > > > > If I get voted down on the point after suitable discussion, so be it. > > But I will strongly object to you applying the patch just because it's > > next in your inbox. > > Tom, I have reviewed your objections: > > > As I commented before, I am not in favor of this. I don't think that a > > four-byte savings justifies a forced initdb with no chance of > > pg_upgrade, plus loss of redundancy (= reduced chance of detecting or > > recovering from corruption), plus significantly slower access to > > several critical header fields. The tqual.c routines are already > > hotspots in many scenarios. I believe this will make them noticeably > > slower. > > I don't think enough people use pg_upgrade to make it a reason to keep > an extra four bytes of tuple overhead. I realize 8-byte aligned systems > don't benefit, but most of our platforms are 4-byte aligned. I don't > consider redundency a valid reason either. We just don't have many > table corruption complaints, and the odds that having an extra 4 bytes > is going to make detection or correction better is unlikely. The non-overwriting storage management (which is one reason why we need all these header fields) causes over 30 bytes of row overhead anyway. I am with Tom here, 4 bytes per row isn't worth making the tuple header variable length size. > The author addressed the slowness complaint and seemed to refute the > idea it would be slower. Do we have any hard numbers on that? 
Is it just access to the header fields, or do we lose the offset cacheability of all fixed size fields at the beginning of a row? In the latter case count me into the slowness-believer camp. Jan -- #======================================================================# # It's easier to get forgiveness for being wrong than for being right. # # Let's break this rule - forgive me. # #================================================== JanWieck@Yahoo.com #
Jan Wieck wrote: > > I don't think enough people use pg_upgrade to make it a reason to keep > > an extra four bytes of tuple overhead. I realize 8-byte aligned systems > > don't benefit, but most of our platforms are 4-byte aligned. I don't > > consider redundency a valid reason either. We just don't have many > > table corruption complaints, and the odds that having an extra 4 bytes > > is going to make detection or correction better is unlikely. > > The non-overwriting storage management (which is one reason why whe need > all these header fields) causes over 30 bytes of row overhead anyway. I > am with Tom here, 4 bytes per row isn't worth making the tuple header > variable length size. Woh, I didn't see anything about making the header variable size. The issue was that on 8-byte machines, structure alignment will not allow any savings. However, on 4-byte machines, it will be a savings of ~11% in the tuple header. > > The author addressed the slowness complaint and seemed to refute the > > idea it would be slower. > > Do we have any hard numbers on that? Is it just access to the header > fields, or do we loose the offset cacheability of all fixed size fields > at the beginning of a row? In the latter case count me into the > slowness-believer camp. No other slowdown except access to the tuple header requires a little more smarts. As the author mentions, the increased number of tuples per page more than offset that. In fact, the patch is fairly small, so you can review it yourself: http://candle.pha.pa.us/cgi-bin/pgpatches -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Jan Wieck wrote: > > I don't think enough people use pg_upgrade to make it a reason to keep > > an extra four bytes of tuple overhead. I realize 8-byte aligned systems > > don't benefit, but most of our platforms are 4-byte aligned. I don't > > consider redundency a valid reason either. We just don't have many > > table corruption complaints, and the odds that having an extra 4 bytes > > is going to make detection or correction better is unlikely. > > The non-overwriting storage management (which is one reason why whe need > all these header fields) causes over 30 bytes of row overhead anyway. I > am with Tom here, 4 bytes per row isn't worth making the tuple header > variable length size. > > > The author addressed the slowness complaint and seemed to refute the > > idea it would be slower. > > Do we have any hard numbers on that? Is it just access to the header > fields, or do we loose the offset cacheability of all fixed size fields > at the beginning of a row? In the latter case count me into the > slowness-believer camp. Here is a summary of the concepts used in the patch: http://groups.google.com/groups?hl=en&lr=&ie=UTF-8&threadm=ejf4du853mblm44f0u78f02g166r69lng7%404ax.com&rnum=28&prev=/groups%3Fq%3Dmanfred%2Bkoizar%2Bgroup:comp.databases.postgresql.*%26start%3D20%26hl%3Den%26lr%3D%26ie%3DUTF-8%26scoring%3Dd%26selm%3Dejf4du853mblm44f0u78f02g166r69lng7%25404ax.com%26rnum%3D28 -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes: > Tom, I have reviewed your objections: Thanks. > Is there something I am missing? My principal objection to this is that I do not like piecemeal breakage of pg_upgrade, disk page examination tools, etc. There are other things that have been proposed that would require changes of page header or tuple header format, and I would prefer to see them bundled up and done as one big change (in some future revision; it's probably too late for 7.3). "Disk format of the week" is not an idea that appeals to me. I know as well as you do that pg_upgrade compatibility is only interesting if there *is* a pg_upgrade, which very possibly won't happen for 7.3 anyway --- but I don't like foreclosing the possibility for a marginal win. And this is definitely a marginal win. Let's try to batch up enough changes to make it worth the pain. In case you are wondering what I am talking about, here is an off-the-cuff list of things that I'd prefer not to do one at a time: * Version identifier words in page headers * CRCs in page headers * Replication and/or point-in-time recovery might need additional header fields similar to those used by WAL * Restructuring of line pointers * Making OIDs optional (no wasted space if no OID) * Adding a tuple version identifier to tuples (for DROP COLUMN etc) * Restructuring index tuple headers (remove length field) * Fixing array storage to support NULLs inside arrays * Restoring time travel on some optional basis Some of these may never happen (I'm not even in favor of all of them) but it's certain that we will want to do some of them. I don't want to take the same hit over and over when some intelligent project management would let us take it just once or twice. regards, tom lane
Jan Wieck <JanWieck@Yahoo.com> writes: > Do we have any hard numbers on that? Is it just access to the header > fields, or do we loose the offset cacheability of all fixed size fields > at the beginning of a row? In the latter case count me into the > slowness-believer camp. I don't believe the patch would have made the header variable size, only changed what the fixed size is. The slowdown I was worried about was just a matter of a couple extra tests and branches in the tqual.c routines; which would be negligible if they weren't such hotspots. regards, tom lane
Bruce Momjian wrote: > > Jan Wieck wrote: > > > I don't think enough people use pg_upgrade to make it a reason to keep > > > an extra four bytes of tuple overhead. I realize 8-byte aligned systems > > > don't benefit, but most of our platforms are 4-byte aligned. I don't > > > consider redundency a valid reason either. We just don't have many > > > table corruption complaints, and the odds that having an extra 4 bytes > > > is going to make detection or correction better is unlikely. > > > > The non-overwriting storage management (which is one reason why whe need > > all these header fields) causes over 30 bytes of row overhead anyway. I > > am with Tom here, 4 bytes per row isn't worth making the tuple header > > variable length size. > > Woh, I didn't see anything about making the header variable size. The > issue was that on 8-byte machines, structure alignment will not allow > any savings. However, on 4-byte machines, it will be a savings of ~11% > in the tuple header. You're right. Dunno where I got that idea from. Looking at the patch I find it quite confusing using Xmin as Xmax, sometimes. If we use 3 physical variables for 5 virtual ones in that way, I would rather use generic names. Jan -- #======================================================================# # It's easier to get forgiveness for being wrong than for being right. # # Let's break this rule - forgive me. # #================================================== JanWieck@Yahoo.com #
Tom Lane wrote: > > Jan Wieck <JanWieck@Yahoo.com> writes: > > Do we have any hard numbers on that? Is it just access to the header > > fields, or do we loose the offset cacheability of all fixed size fields > > at the beginning of a row? In the latter case count me into the > > slowness-believer camp. > > I don't believe the patch would have made the header variable size, > only changed what the fixed size is. The slowdown I was worried about > was just a matter of a couple extra tests and branches in the tqual.c > routines; which would be negligible if they weren't such hotspots. Did someone run at least pgbench with/without that patch applied? Jan -- #======================================================================# # It's easier to get forgiveness for being wrong than for being right. # # Let's break this rule - forgive me. # #================================================== JanWieck@Yahoo.com #
Tom Lane wrote: > Bruce Momjian <pgman@candle.pha.pa.us> writes: > > Tom, I have reviewed your objections: > > Thanks. > > > Is there something I am missing? > > My principal objection to this is that I do not like piecemeal breakage > of pg_upgrade, disk page examination tools, etc. There are other things What do we have, pg_upgrade and Red Hat's disk dump tool, right? (I forgot the name.) > that have been proposed that would require changes of page header or > tuple header format, and I would prefer to see them bundled up and done > as one big change (in some future revision; it's probably too late for > 7.3). "Disk format of the week" is not an idea that appeals to me. This is part of the "do it perfect or don't do it" logic that I usually disagree with. We all agree we have a tuple header that is too large. Here we have a chance to reduce it by 11% on most platforms, and as a downside, we will not have a working pg_upgrade. What do you think the average user will choose? > I know as well as you do that pg_upgrade compatibility is only > interesting if there *is* a pg_upgrade, which very possibly won't > happen for 7.3 anyway --- but I don't like foreclosing the possibility I am not sure what needs to be done for pg_upgrade for 7.3 (does anyone else?), but if you suspect it will not work anyway, why are you trying to save it for 7.3? (I think the problem with pg_upgrade in 7.2 was the inability to create pg_clog files to match the new transaction counter.) > for a marginal win. And this is definitely a marginal win. Let's > try to batch up enough changes to make it worth the pain. Marginal? Seems like a pretty concrete win to me in disk space. Also, he has offered to write a conversion tool. If he does that, maybe he can improve pg_upgrade if needed. 
> In case you are wondering what I am talking about, here is an > off-the-cuff list of things that I'd prefer not to do one at a time: > > * Version identifier words in page headers Yes, we need that, and in fact should do that in 7.3 if we accept this patch. This may make future upgrades easier. In fact, I wonder if we could place the page version number in such a way that old pre-7.3 pages could be easily identified, i.e. place version number 0xAE in an offset that used to hold a values that were always less than 0x80. > * CRCs in page headers > * Replication and/or point-in-time recovery might need additional > header fields similar to those used by WAL > * Restructuring of line pointers > * Making OIDs optional (no wasted space if no OID) > * Adding a tuple version identifier to tuples (for DROP COLUMN etc) > * Restructuring index tuple headers (remove length field) > * Fixing array storage to support NULLs inside arrays > * Restoring time travel on some optional basis > > Some of these may never happen (I'm not even in favor of all of them) > but it's certain that we will want to do some of them. I don't want to > take the same hit over and over when some intelligent project management > would let us take it just once or twice. Yes, there are some good ones there, but the idea that somehow they are all going to hit in the same release seems unlikely. I say let's do some now, some later, and move ahead. Ultimately, I think we need to add smarts to the backend to read old formats. I know it is a pain, but I see no other options. pg_upgrade always will have trouble with certain changes. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Jan Wieck wrote: > Tom Lane wrote: > > > > Jan Wieck <JanWieck@Yahoo.com> writes: > > > Do we have any hard numbers on that? Is it just access to the header > > > fields, or do we loose the offset cacheability of all fixed size fields > > > at the beginning of a row? In the latter case count me into the > > > slowness-believer camp. > > > > I don't believe the patch would have made the header variable size, > > only changed what the fixed size is. The slowdown I was worried about > > was just a matter of a couple extra tests and branches in the tqual.c > > routines; which would be negligible if they weren't such hotspots. > > Did someone run at least pgbench with/without that patch applied? No, but he did perform this analysis: > thus reducing the additional cost to one t_infomask compare, > because the Satisfies functions only access Cmin and Cmax, > when HEAP_MOVED is known to be not set. > > OTOH experimenting with a moderatly sized "out of production" > database I got the following results: > | pages | pages | > relkind | count | tuples | before| after | savings > --------+-------+--------+-------+-------+-------- > i | 31 | 808146 | 8330 | 8330 | 0.00% > r | 32 | 612968 | 13572 | 13184 | 2.86% > all | 63 | | 21902 | 21514 | 1.77% > > 2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages. > Considering that index pages tend to benefit more from caching > we conclude that heap pages contribute more to the overall > IO load, so the total savings in the number of disk IOs should > be better than the 1.77% shown in the table above. I think > this outweighs a few CPU cycles now and then. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes: > Tom Lane wrote: >> Some of these may never happen (I'm not even in favor of all of them) >> but it's certain that we will want to do some of them. I don't want to >> take the same hit over and over when some intelligent project management >> would let us take it just once or twice. > Yes, there are some good ones there, but the idea that somehow they are > all going to hit in the same release seems unlikely. I say let's do > some now, some later, and move ahead. I think it's very likely that P-I-T recovery and replication will hit in the 7.4 release cycle. I would prefer to see us plan ahead to do these other changes for 7.4 as well, and get as many of them done in that cycle as we can. Basically my point is that there are costs and benefits to this sort of change, and many of the costs are quantized --- it doesn't cost more to make three incompatible disk-format changes than one, *as long as they're in the same release*. So we should try to do some actual management of such changes, not accept them willy-nilly whenever someone feels like doing one. This patch is unlikely to suffer significant bit-rot if we hold it for 7.4, and that's what I think we should do with it. regards, tom lane
Tom Lane wrote: > Bruce Momjian <pgman@candle.pha.pa.us> writes: > > Tom Lane wrote: > >> Some of these may never happen (I'm not even in favor of all of them) > >> but it's certain that we will want to do some of them. I don't want to > >> take the same hit over and over when some intelligent project management > >> would let us take it just once or twice. > > > Yes, there are some good ones there, but the idea that somehow they are > > all going to hit in the same release seems unlikely. I say let's do > > some now, some later, and move ahead. > > I think it's very likely that P-I-T recovery and replication will hit > in the 7.4 release cycle. I would prefer to see us plan ahead to do > these other changes for 7.4 as well, and get as many of them done in > that cycle as we can. > > Basically my point is that there are costs and benefits to this sort > of change, and many of the costs are quantized --- it doesn't cost > more to make three incompatible disk-format changes than one, *as long > as they're in the same release*. So we should try to do some actual > management of such changes, not accept them willy-nilly whenever someone > feels like doing one. > > This patch is unlikely to suffer significant bit-rot if we hold it for > 7.4, and that's what I think we should do with it. Well, that is a different argument than initially. So, it is a valid patch, but we have to decide when to apply it. We can easily hold it until we near release of 7.3. If pg_upgrade is in good shape _and_ no other format changes are required, we can hold it for 7.4. What happens if 7.4 doesn't have any format changes? However, if we have other changes or pg_upgrade isn't going to work, we can apply it in August. (Manfred, you can be sure I will not lose this patch.) However, we have been very bad at predicting what features/changes are coming in future releases. Also, I don't think replication will require tuple format changes. 
I will check with the group but I haven't heard anything about that. So, we have to decide if we apply it now or delay it for later in 7.3, or for >=7.4. My personal vote is that we apply it now, and perhaps try some of the other format changes we were going to make. Tom's vote is to hold it, I assume. Let's see what others say. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Jan Wieck <JanWieck@Yahoo.com> writes: > Here are some numbers: > Current CVS tip: tps 34.1, 38.7, 36.6 > avg(tps) 36.4 > With patch: tps 37.0, 41.1, 41.1 > avg(tps) 39.7 > So it saves less than 3% disk space at the cost of about 9% performance > loss. Uh ... isn't more TPS better? regards, tom lane
Bruce Momjian wrote: > > Jan Wieck wrote: > > > > Did someone run at least pgbench with/without that patch applied? > > No, but he did perform this analysis: > > > thus reducing the additional cost to one t_infomask compare, > > because the Satisfies functions only access Cmin and Cmax, > > when HEAP_MOVED is known to be not set. > > > > OTOH experimenting with a moderatly sized "out of production" > > database I got the following results: > > | pages | pages | > > relkind | count | tuples | before| after | savings > > --------+-------+--------+-------+-------+-------- > > i | 31 | 808146 | 8330 | 8330 | 0.00% > > r | 32 | 612968 | 13572 | 13184 | 2.86% > > all | 63 | | 21902 | 21514 | 1.77% > > > > 2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages. > > Considering that index pages tend to benefit more from caching > > we conclude that heap pages contribute more to the overall > > IO load, so the total savings in the number of disk IOs should > > be better than the 1.77% shown in the table above. I think > > this outweighs a few CPU cycles now and then. This anawhat? This is a proof that this patch is able to save not even 3% of disk space in a production environment plus an assumption that the saved IO outweights the extra effort in the tuple visibility checks. Here are some numbers: P3 850MHz 256MB RAM IDE postmaster -N256 -B8192 pgbench -i -s 10 db pgbench -c 20 -t 500 db Current CVS tip: tps 34.1, 38.7, 36.6 avg(tps) 36.4 With patch: tps 37.0, 41.1, 41.1 avg(tps) 39.7 So it saves less than 3% disk space at the cost of about 9% performance loss. If we can do the same the other way around I'd go for wasting some more disk space. Jan -- #======================================================================# # It's easier to get forgiveness for being wrong than for being right. # # Let's break this rule - forgive me. # #================================================== JanWieck@Yahoo.com #
Tom Lane dijo: > Jan Wieck <JanWieck@Yahoo.com> writes: > > Here are some numbers: > > > Current CVS tip: tps 34.1, 38.7, 36.6 > > avg(tps) 36.4 > > > With patch: tps 37.0, 41.1, 41.1 > > avg(tps) 39.7 > > > So it saves less than 3% disk space at the cost of about 9% performance > > loss. > > Uh ... isn't more TPS better? Also, is that 3% in disk space savings the actual number, or just copied from the "anawhat"? -- Alvaro Herrera (<alvherre[a]atentus.com>) Y dijo Dios: "Que haya Satanás, para que la gente no me culpe de todo a mí." "Y que hayan abogados, para que la gente no culpe de todo a Satanás"
Alvaro Herrera wrote: > Tom Lane dijo: > > > Jan Wieck <JanWieck@Yahoo.com> writes: > > > Here are some numbers: > > > > > Current CVS tip: tps 34.1, 38.7, 36.6 > > > avg(tps) 36.4 > > > > > With patch: tps 37.0, 41.1, 41.1 > > > avg(tps) 39.7 > > > > > So it saves less than 3% disk space at the cost of about 9% performance > > > loss. > > > > Uh ... isn't more TPS better? 9%, that is a dramatic difference. Is it caused by the reduced disk space (Jan's numbers are correct) or by the extra overhead in the merged fields (Jan's numbers are backwards)? Jan will tell us soon. > Also, is that 3% in disk space savings the actual number, or just copied > from the "anawhat"? The 3% is savings from a sample database. Header size is 11% reduced. -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes: > So it saves less than 3% disk space at the cost of about 9% performance > loss. > > Uh ... isn't more TPS better? > 9%, that is a dramatic difference. Yup. I'm suspicious of it --- if the database is 3% smaller then I'd believe a 3% performance gain from reduction of I/O, but I don't see where 9% comes from. regards, tom lane
Jan, any update on this? Are the numbers correct? --------------------------------------------------------------------------- Jan Wieck wrote: > Bruce Momjian wrote: > > > > Jan Wieck wrote: > > > > > > Did someone run at least pgbench with/without that patch applied? > > > > No, but he did perform this analysis: > > > > > thus reducing the additional cost to one t_infomask compare, > > > because the Satisfies functions only access Cmin and Cmax, > > > when HEAP_MOVED is known to be not set. > > > > > > OTOH experimenting with a moderatly sized "out of production" > > > database I got the following results: > > > | pages | pages | > > > relkind | count | tuples | before| after | savings > > > --------+-------+--------+-------+-------+-------- > > > i | 31 | 808146 | 8330 | 8330 | 0.00% > > > r | 32 | 612968 | 13572 | 13184 | 2.86% > > > all | 63 | | 21902 | 21514 | 1.77% > > > > > > 2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages. > > > Considering that index pages tend to benefit more from caching > > > we conclude that heap pages contribute more to the overall > > > IO load, so the total savings in the number of disk IOs should > > > be better than the 1.77% shown in the table above. I think > > > this outweighs a few CPU cycles now and then. > > This anawhat? This is a proof that this patch is able to save not even > 3% of disk space in a production environment plus an assumption that the > saved IO outweights the extra effort in the tuple visibility checks. > > Here are some numbers: > > P3 850MHz 256MB RAM IDE > postmaster -N256 -B8192 > pgbench -i -s 10 db > pgbench -c 20 -t 500 db > > > Current CVS tip: tps 34.1, 38.7, 36.6 > avg(tps) 36.4 > > With patch: tps 37.0, 41.1, 41.1 > avg(tps) 39.7 > > So it saves less than 3% disk space at the cost of about 9% performance > loss. If we can do the same the other way around I'd go for wasting some > more disk space. 
> > > Jan > > -- > > #======================================================================# > # It's easier to get forgiveness for being wrong than for being right. # > # Let's break this rule - forgive me. # > #================================================== JanWieck@Yahoo.com # > > ---------------------------(end of broadcast)--------------------------- > TIP 1: subscribe and unsubscribe commands go to majordomo@postgresql.org > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian wrote: > > Jan, any update on this? Are the numbers correct? Sorry, but took some time. Well, it turned out that pgbench does a terrible job with runtimes below 30 minutes. Seems that one checkpoint more or less can have a significant impact on the numbers reported by such run. Also starting off with a populated cache (rampup) seems to be a very good idea. So my advice for running pgbench is to do an initdb before running (to whipe out logfile creation/reuse issues). To populate a fresh database with a reasonable scaling. Run pgbench with a high enough -c and -t so it runs for at least 5 minutes. Then do the actual measurement with a pgbench run with settings keeping the system busy for 30 minutes or more. Needless to say, keep your fingers (and everyone elses too) off the system during that time. Shut down not needed services, and especially cron! Using the above, the discussed change to the tuple header shows less than 1% difference. Sorry for all the confusion. Jan > > --------------------------------------------------------------------------- > > Jan Wieck wrote: > > Bruce Momjian wrote: > > > > > > Jan Wieck wrote: > > > > > > > > Did someone run at least pgbench with/without that patch applied? > > > > > > No, but he did perform this analysis: > > > > > > > thus reducing the additional cost to one t_infomask compare, > > > > because the Satisfies functions only access Cmin and Cmax, > > > > when HEAP_MOVED is known to be not set. > > > > > > > > OTOH experimenting with a moderatly sized "out of production" > > > > database I got the following results: > > > > | pages | pages | > > > > relkind | count | tuples | before| after | savings > > > > --------+-------+--------+-------+-------+-------- > > > > i | 31 | 808146 | 8330 | 8330 | 0.00% > > > > r | 32 | 612968 | 13572 | 13184 | 2.86% > > > > all | 63 | | 21902 | 21514 | 1.77% > > > > > > > > 2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages. 
> > > > Considering that index pages tend to benefit more from caching > > > > we conclude that heap pages contribute more to the overall > > > > IO load, so the total savings in the number of disk IOs should > > > > be better than the 1.77% shown in the table above. I think > > > > this outweighs a few CPU cycles now and then. > > > > This anawhat? This is a proof that this patch is able to save not even > > 3% of disk space in a production environment plus an assumption that the > > saved IO outweights the extra effort in the tuple visibility checks. > > > > Here are some numbers: > > > > P3 850MHz 256MB RAM IDE > > postmaster -N256 -B8192 > > pgbench -i -s 10 db > > pgbench -c 20 -t 500 db > > > > > > Current CVS tip: tps 34.1, 38.7, 36.6 > > avg(tps) 36.4 > > > > With patch: tps 37.0, 41.1, 41.1 > > avg(tps) 39.7 > > > > So it saves less than 3% disk space at the cost of about 9% performance > > loss. If we can do the same the other way around I'd go for wasting some > > more disk space. > > > > > > Jan > > > > -- > > > > #======================================================================# > > # It's easier to get forgiveness for being wrong than for being right. # > > # Let's break this rule - forgive me. # > > #================================================== JanWieck@Yahoo.com # > > > > ---------------------------(end of broadcast)--------------------------- > > TIP 1: subscribe and unsubscribe commands go to majordomo@postgresql.org > > > > -- > Bruce Momjian | http://candle.pha.pa.us > pgman@candle.pha.pa.us | (610) 853-3000 > + If your life is a hard drive, | 830 Blythe Avenue > + Christ can be your backup. | Drexel Hill, Pennsylvania 19026 > > ---------------------------(end of broadcast)--------------------------- > TIP 4: Don't 'kill -9' the postmaster -- #======================================================================# # It's easier to get forgiveness for being wrong than for being right. # # Let's break this rule - forgive me. 
# #================================================== JanWieck@Yahoo.com #
Jan Wieck <JanWieck@Yahoo.com> writes: > Well, it turned out that pgbench does a terrible job with runtimes below > 30 minutes. Seems that one checkpoint more or less can have a > significant impact on the numbers reported by such run. Yeah, it is *very* painful to get reproducible numbers out of pgbench. > Using the above, the discussed change to the tuple header shows less > than 1% difference. So the bottom line is that there is probably no measurable performance difference, but a 3% space savings, at least for average row lengths comparable to those used in pgbench. (Obviously the space savings is going to depend on average row length...) regards, tom lane
Tom Lane wrote: > > Jan Wieck <JanWieck@Yahoo.com> writes: > > Well, it turned out that pgbench does a terrible job with runtimes below > > 30 minutes. Seems that one checkpoint more or less can have a > > significant impact on the numbers reported by such run. > > Yeah, it is *very* painful to get reproducible numbers out of pgbench. And since it seems to be extremely checkpoint dependent, imagine what someone would have to do to measure the impact of changing config options about checkpoint segments and intervals. But I guess that this would be an issue for every benchmark. The TPC must know about it as the specs for the TPC-C require at least one checkpoint occurring during the measurement time and 90% of all transactions fitting the timing constraints. Currently PostgreSQL creates an IO storm on a checkpoint that simply brings the system down to response times 100x the average, slowly recovering from there since all simulated users are active at once then (they all woke up from their thinking or keying times and pressed SEND). Jan -- #======================================================================# # It's easier to get forgiveness for being wrong than for being right. # # Let's break this rule - forgive me. # #================================================== JanWieck@Yahoo.com #
On Fri, 21 Jun 2002 12:45:48 -0400 (EDT), Bruce Momjian <pgman@candle.pha.pa.us> wrote: >> > Yes, there are some good ones there, but the idea that somehow they are >> > all going to hit in the same release seems unlikely. I say let's do >> > some now, some later, and move ahead. I (strongly :-)) agree. >Well, that is a different argument than initially. So, it is a valid >patch, but we have to decide when to apply it. > >We can easily hold it until we near release of 7.3. If pg_upgrade is in >good shape _and_ no other format changes are required, we can hold it >for 7.4. What happens if 7.4 doesn't have any format changes? > >However, if we have other changes or pg_upgrade isn't going to work, we >can apply it in August. But what, if the patch causes or reveals a well hidden bug, possibly somewhere else in the code? I'd vote for applying it now, so that in August it already has undergone some testing. You can always patch -R before going beta... >(Manfred, you can be sure I will not lose this patch.) Thanks. Anyway, lose it! Here is a new version. >So, we have to decide if we apply it now or delay it for later in 7.3, >or for >=7.4. Why not let the users decide? With this new version of the patch they can configure --enable-pg72format if this patch is the only thing that stops pg_upgrade from working and if they want to use pg_upgrade. >My personal vote is that we apply it now, and perhaps try some of the >other format changes we were going to make. And with #ifdef PG72FORMAT we can introduce them without very much risk. One thing that's still missing is we'd need two entries in catversion.h. Servus Manfred diff -u ../base/INSTALL ./INSTALL --- ../base/INSTALL 2002-01-31 01:46:26.000000000 +0100 +++ ./INSTALL 2002-06-25 15:59:03.000000000 +0200 @@ -283,6 +283,13 @@ Enables single-byte character set recode support. See the Administrator's Guide about this feature. 
+ --enable-pg72format + + Enables version 7.2 page format, giving you a better chance of + converting existing databases by pg_upgrade. Do not use this + feature, if you are just starting to use PostgreSQL or if you + plan to upgrade via pg_dump/restore. + --enable-multibyte Allows the use of multibyte character encodings (including Common subdirectories: ../base/config and ./config diff -u ../base/configure ./configure --- ../base/configure 2002-05-29 14:42:20.000000000 +0200 +++ ./configure 2002-06-25 15:51:18.000000000 +0200 @@ -1652,6 +1652,44 @@ # +# Version 7.2 page format (--enable-pg72format) +# +echo "$as_me:$LINENO: checking whether to build with version 7.2 page format" >&5 +echo $ECHO_N "checking whether to build with version 7.2 page format... $ECHO_C" >&6 + + +# Check whether --enable-pg72format or --disable-pg72format was given. +if test "${enable_pg72format+set}" = set; then + enableval="$enable_pg72format" + + case $enableval in + yes) + +cat >>confdefs.h <<\_ACEOF +#define PG72FORMAT 1 +_ACEOF + + ;; + no) + : + ;; + *) + { { echo "$as_me:$LINENO: error: no argument expected for --enable-pg72format option" >&5 +echo "$as_me: error: no argument expected for --enable-pg72format option" >&2;} + { (exit 1); exit 1; }; } + ;; + esac + +else + enable_pg72format=no + +fi; + +echo "$as_me:$LINENO: result: $enable_pg72format" >&5 +echo "${ECHO_T}$enable_pg72format" >&6 + + +# # Multibyte support # MULTIBYTE=SQL_ASCII diff -u ../base/configure.in ./configure.in --- ../base/configure.in 2002-05-29 14:42:20.000000000 +0200 +++ ./configure.in 2002-06-25 15:51:15.000000000 +0200 @@ -162,6 +162,16 @@ # +# Version 7.2 page format (--enable-pg72format) +# +AC_MSG_CHECKING([whether to build with version 7.2 page format]) +PGAC_ARG_BOOL(enable, pg72format, no, [ --enable-pg72format enable version 7.2 page format], + [AC_DEFINE([PG72FORMAT], 1, + [Set to 1 if you want version 7.2 page format (--enable-pg72format)])]) +AC_MSG_RESULT([$enable_pg72format]) + + +# # 
Multibyte support # MULTIBYTE=SQL_ASCII diff -ru ../base/src/backend/access/heap/heapam.c src/backend/access/heap/heapam.c --- ../base/src/backend/access/heap/heapam.c 2002-06-17 10:11:31.000000000 +0200 +++ src/backend/access/heap/heapam.c 2002-06-17 22:35:32.000000000 +0200 @@ -2204,7 +2204,7 @@ htup->t_infomask = HEAP_XMAX_INVALID | xlhdr.mask; HeapTupleHeaderSetXmin(htup, record->xl_xid); HeapTupleHeaderSetCmin(htup, FirstCommandId); - HeapTupleHeaderSetXmax(htup, InvalidTransactionId); + HeapTupleHeaderSetXmaxInvalid(htup); HeapTupleHeaderSetCmax(htup, FirstCommandId); offnum = PageAddItem(page, (Item) htup, newlen, offnum, diff -ru ../base/src/include/access/htup.h src/include/access/htup.h --- ../base/src/include/access/htup.h 2002-06-17 10:11:32.000000000 +0200 +++ src/include/access/htup.h 2002-06-25 16:14:37.000000000 +0200 @@ -57,15 +57,33 @@ * Also note that we omit the nulls bitmap if t_infomask shows that there * are no nulls in the tuple. */ +/* +** We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac +** in three physical fields t_xmin, t_cid, t_xmax: +** CommandId Cmin; insert CID stamp +** CommandId Cmax; delete CommandId stamp +** TransactionId Xmin; insert XID stamp +** TransactionId Xmax; delete XID stamp +** TransactionId Xvac; used by VACCUUM +** +** This assumes, that a CommandId can be stored in a TransactionId. +*/ typedef struct HeapTupleHeaderData { Oid t_oid; /* OID of this tuple -- 4 bytes */ +#ifdef PG72FORMAT + /* v7.2: Xvac is stored in t_cmin */ CommandId t_cmin; /* insert CID stamp -- 4 bytes each */ CommandId t_cmax; /* delete CommandId stamp */ TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */ TransactionId t_xmax; /* delete XID stamp */ +#else + TransactionId t_xmin; /* Xmin -- 4 bytes each */ + TransactionId t_cid; /* Cmin, Cmax, Xvac */ + TransactionId t_xmax; /* Xmax, Cmax */ +#endif ItemPointerData t_ctid; /* current TID of this or newer tuple */ @@ -75,7 +93,7 @@ uint8 t_hoff; /* sizeof header incl. 
bitmap, padding */ - /* ^ - 31 bytes - ^ */ + /* ^ - 27 (v7.3) or 31 (v7.2) bytes - ^ */ bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */ @@ -96,6 +114,8 @@ * attribute(s) */ #define HEAP_HASEXTENDED 0x000C /* the two above combined */ +#define HEAP_XMIN_IS_XMAX 0x0040 /* created and deleted in the */ + /* same transaction */ #define HEAP_XMAX_UNLOGGED 0x0080 /* to lock tuple for update */ /* without logging */ #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ @@ -108,6 +128,7 @@ * vacuum */ #define HEAP_MOVED_IN 0x8000 /* moved from another place by * vacuum */ +#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) #define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ @@ -116,8 +137,11 @@ /* HeapTupleHeader accessor macros */ #define HeapTupleHeaderGetXmin(tup) \ - ((tup)->t_xmin) +( \ + (tup)->t_xmin \ +) +#ifdef PG72FORMAT #define HeapTupleHeaderGetXmax(tup) \ ((tup)->t_xmax) @@ -163,6 +187,98 @@ AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ TransactionIdStore((xid), (TransactionId *) &((tup)->t_cmin)) \ ) +#else +#define HeapTupleHeaderGetXmax(tup) \ +( \ + ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) ? \ + (tup)->t_xmin \ + : \ + (tup)->t_xmax \ +) + +/* no AssertMacro, because this is read as a system-defined attribute */ +#define HeapTupleHeaderGetCmin(tup) \ +( \ + ((tup)->t_infomask & HEAP_MOVED) ? \ + FirstCommandId \ + : \ + ( \ + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ + (CommandId) (tup)->t_cid \ + : \ + FirstCommandId \ + ) \ +) + +#define HeapTupleHeaderGetCmax(tup) \ +( \ + ((tup)->t_infomask & HEAP_MOVED) ? \ + FirstCommandId \ + : \ + ( \ + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? 
\ + (CommandId) (tup)->t_xmax \ + : \ + (CommandId) (tup)->t_cid \ + ) \ +) + +#define HeapTupleHeaderGetXvac(tup) \ +( \ + AssertMacro((tup)->t_infomask & HEAP_MOVED), \ + (tup)->t_cid \ +) + + +#define HeapTupleHeaderSetXmin(tup, xid) \ +( \ + TransactionIdStore((xid), &(tup)->t_xmin) \ +) + +#define HeapTupleHeaderSetXminInvalid(tup) \ +do { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + StoreInvalidTransactionId(&(tup)->t_xmin); \ +} while (0) + +#define HeapTupleHeaderSetXmax(tup, xid) \ +do { \ + if (TransactionIdEquals((tup)->t_xmin, (xid))) \ + (tup)->t_infomask |= HEAP_XMIN_IS_XMAX; \ + else \ + { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + TransactionIdStore((xid), &(tup)->t_xmax); \ + } \ +} while (0) + +#define HeapTupleHeaderSetXmaxInvalid(tup) \ +do { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + StoreInvalidTransactionId(&(tup)->t_xmax); \ +} while (0) + +#define HeapTupleHeaderSetCmin(tup, cid) \ +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ +} while (0) + +#define HeapTupleHeaderSetCmax(tup, cid) \ +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + if ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_xmax); \ + else \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ +} while (0) + +#define HeapTupleHeaderSetXvac(tup, xid) \ +do { \ + Assert((tup)->t_infomask & HEAP_MOVED); \ + TransactionIdStore((xid), &(tup)->t_cid); \ +} while (0) +#endif /* diff -ru ../base/src/include/pg_config.h.in src/include/pg_config.h.in --- ../base/src/include/pg_config.h.in 2002-06-17 21:04:03.000000000 +0200 +++ src/include/pg_config.h.in 2002-06-25 16:02:40.000000000 +0200 @@ -39,6 +39,9 @@ /* Set to 1 if you want cyrillic recode (--enable-recode) */ #undef CYR_RECODE +/* Set to 1 if you want version 7.2 page format (--enable-pg72format) */ +#undef PG72FORMAT + /* Set to 1 if you want to use multibyte characters 
(--enable-multibyte) */ #undef MULTIBYTE
Manfred Koizar <mkoi-pg@aon.at> writes: > Why not let the users decide? With this new version of the patch they > can > configure --enable-pg72format I think that is a really, really bad idea. If we're gonna do it we should just do it. We don't need multiple incompatible file formats running around all calling themselves PG 7.3. Configure options are generally not a solution to anything anyway, since more and more people use RPM distributions and don't have the option to make their own configure choices. regards, tom lane
Patch applied. Thanks. Catalog version updated. initdb everyone. --------------------------------------------------------------------------- Manfred Koizar wrote: > This patch, which is built upon the "HeapTupleHeader accessor macros" > patch from 2002-06-10, is supposed to reduce the heap tuple header size > by four bytes on most architectures. Of course it changes the on-disk > tuple format and therefore requires initdb. As I have (once more) > opened my mouth too wide, I'll have to provide a heap file conversion > utility, if this patch gets accepted... More on this later. > > ====================== > All 81 tests passed. > ====================== > > It's late now, I'll do more tests tomorrow. > > Good night > Manfred > > diff -ru ../orig/src/backend/access/heap/heapam.c src/backend/access/heap/heapam.c > --- ../orig/src/backend/access/heap/heapam.c 2002-06-13 19:34:48.000000000 +0200 > +++ src/backend/access/heap/heapam.c 2002-06-13 22:31:42.000000000 +0200 > @@ -2204,7 +2204,7 @@ > htup->t_infomask = HEAP_XMAX_INVALID | xlhdr.mask; > HeapTupleHeaderSetXmin(htup, record->xl_xid); > HeapTupleHeaderSetCmin(htup, FirstCommandId); > - HeapTupleHeaderSetXmax(htup, InvalidTransactionId); > + HeapTupleHeaderSetXmaxInvalid(htup); > HeapTupleHeaderSetCmax(htup, FirstCommandId); > > offnum = PageAddItem(page, (Item) htup, newlen, offnum, > diff -ru ../orig/src/include/access/htup.h src/include/access/htup.h > --- ../orig/src/include/access/htup.h 2002-06-13 19:34:49.000000000 +0200 > +++ src/include/access/htup.h 2002-06-14 01:12:47.000000000 +0200 > @@ -57,15 +57,24 @@ > * Also note that we omit the nulls bitmap if t_infomask shows that there > * are no nulls in the tuple. 
> */ > +/* > +** We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac > +** in three physical fields t_xmin, t_cid, t_xmax: > +** CommandId Cmin; insert CID stamp > +** CommandId Cmax; delete CommandId stamp > +** TransactionId Xmin; insert XID stamp > +** TransactionId Xmax; delete XID stamp > +** TransactionId Xvac; used by VACCUUM > +** > +** This assumes, that a CommandId can be stored in a TransactionId. > +*/ > typedef struct HeapTupleHeaderData > { > Oid t_oid; /* OID of this tuple -- 4 bytes */ > > - CommandId t_cmin; /* insert CID stamp -- 4 bytes each */ > - CommandId t_cmax; /* delete CommandId stamp */ > - > - TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */ > - TransactionId t_xmax; /* delete XID stamp */ > + TransactionId t_xmin; /* Xmin -- 4 bytes each */ > + TransactionId t_cid; /* Cmin, Cmax, Xvac */ > + TransactionId t_xmax; /* Xmax, Cmax */ > > ItemPointerData t_ctid; /* current TID of this or newer tuple */ > > @@ -75,7 +84,7 @@ > > uint8 t_hoff; /* sizeof header incl. 
bitmap, padding */ > > - /* ^ - 31 bytes - ^ */ > + /* ^ - 27 bytes - ^ */ > > bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */ > > @@ -96,6 +105,8 @@ > * attribute(s) */ > #define HEAP_HASEXTENDED 0x000C /* the two above combined */ > > +#define HEAP_XMIN_IS_XMAX 0x0040 /* created and deleted in the */ > + /* same transaction > */ > #define HEAP_XMAX_UNLOGGED 0x0080 /* to lock tuple for update */ > /* without logging > */ > #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ > @@ -108,6 +119,7 @@ > * vacuum */ > #define HEAP_MOVED_IN 0x8000 /* moved from another place by > * vacuum */ > +#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN) > > #define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */ > > @@ -116,53 +128,100 @@ > /* HeapTupleHeader accessor macros */ > > #define HeapTupleHeaderGetXmin(tup) \ > - ((tup)->t_xmin) > +( \ > + (tup)->t_xmin \ > +) > > #define HeapTupleHeaderGetXmax(tup) \ > - ((tup)->t_xmax) > +( \ > + ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) ? \ > + (tup)->t_xmin \ > + : \ > + (tup)->t_xmax \ > +) > > -/* no AssertMacro, because this is read as a system-defined attribute also */ > +/* no AssertMacro, because this is read as a system-defined attribute */ > #define HeapTupleHeaderGetCmin(tup) \ > ( \ > - (tup)->t_cmin \ > + ((tup)->t_infomask & HEAP_MOVED) ? \ > + FirstCommandId \ > + : \ > + ( \ > + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ > + (CommandId) (tup)->t_cid \ > + : \ > + FirstCommandId \ > + ) \ > ) > > #define HeapTupleHeaderGetCmax(tup) \ > - ((tup)->t_cmax) > +( \ > + ((tup)->t_infomask & HEAP_MOVED) ? \ > + FirstCommandId \ > + : \ > + ( \ > + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? 
\ > + (CommandId) (tup)->t_xmax \ > + : \ > + (CommandId) (tup)->t_cid \ > + ) \ > +) > > #define HeapTupleHeaderGetXvac(tup) \ > ( \ > - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ > - (TransactionId) (tup)->t_cmin \ > + AssertMacro((tup)->t_infomask & HEAP_MOVED), \ > + (tup)->t_cid \ > ) > > > #define HeapTupleHeaderSetXmin(tup, xid) \ > - (TransactionIdStore((xid), &(tup)->t_xmin)) > +( \ > + TransactionIdStore((xid), &(tup)->t_xmin) \ > +) > > #define HeapTupleHeaderSetXminInvalid(tup) \ > - (StoreInvalidTransactionId(&(tup)->t_xmin)) > +do { \ > + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ > + StoreInvalidTransactionId(&(tup)->t_xmin); \ > +} while (0) > > #define HeapTupleHeaderSetXmax(tup, xid) \ > - (TransactionIdStore((xid), &(tup)->t_xmax)) > +do { \ > + if (TransactionIdEquals((tup)->t_xmin, (xid))) \ > + (tup)->t_infomask |= HEAP_XMIN_IS_XMAX; \ > + else \ > + { \ > + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ > + TransactionIdStore((xid), &(tup)->t_xmax); \ > + } \ > +} while (0) > > #define HeapTupleHeaderSetXmaxInvalid(tup) \ > - (StoreInvalidTransactionId(&(tup)->t_xmax)) > +do { \ > + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ > + StoreInvalidTransactionId(&(tup)->t_xmax); \ > +} while (0) > > #define HeapTupleHeaderSetCmin(tup, cid) \ > -( \ > - AssertMacro(!((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF))), \ > - (tup)->t_cmin = (cid) \ > -) > +do { \ > + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ > + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ > +} while (0) > > #define HeapTupleHeaderSetCmax(tup, cid) \ > - ((tup)->t_cmax = (cid)) > +do { \ > + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ > + if ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) \ > + TransactionIdStore((TransactionId) (cid), &(tup)->t_xmax); \ > + else \ > + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ > +} while (0) > > #define HeapTupleHeaderSetXvac(tup, xid) \ > -( \ > - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | 
HEAP_MOVED_OFF)), \ > - TransactionIdStore((xid), (TransactionId *) &((tup)->t_cmin)) \ > -) > +do { \ > + Assert((tup)->t_infomask & HEAP_MOVED); \ > + TransactionIdStore((xid), &(tup)->t_cid); \ > +} while (0) > > > /* > > > ---------------------------(end of broadcast)--------------------------- > TIP 3: if posting/reading through Usenet, please send an appropriate > subscribe-nomail command to majordomo@postgresql.org so that your > message can get through to the mailing list cleanly > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026