From e0111ed0b75e824f46c3b0bf5322a9bd355d100c Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 26 Dec 2019 18:28:50 -0500 Subject: [PATCH v3] Use the CLZ instruction in AllocSetFreeIndex() In commit ab5b4e2f9ed, we optimized AllocSetFreeIndex() using a lookup table. At the time, using CLZ was rejected because compiler/platform support was not widespread enough to justify it. Since 02a6a54ecd6, we test for availability of __builtin_clz(), so use that instead. This is about twice as fast on Intel platforms, but perhaps more importantly eliminates cache pollution caused by the lookup table approach. In addition, for the open-coded case, use the general-purpose lookup table added by 02a6a54ecd6, rather than a single-purpose one. --- src/backend/utils/mmgr/aset.c | 50 ++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index f729d9b6de..a89ef7084b 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -46,6 +46,7 @@ #include "postgres.h" +#include "port/pg_bitutils.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -297,18 +298,6 @@ static const MemoryContextMethods AllocSetMethods = { #endif }; -/* - * Table for AllocSetFreeIndex - */ -#define LT16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -static const unsigned char LogTable256[256] = -{ - 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, - LT16(5), LT16(6), LT16(6), LT16(7), LT16(7), LT16(7), LT16(7), - LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8) -}; - /* ---------- * Debug macros * ---------- @@ -337,24 +326,37 @@ static inline int AllocSetFreeIndex(Size size) { int idx; - unsigned int t, - tsize; if (size > (1 << ALLOC_MINBITS)) { - tsize = (size - 1) >> ALLOC_MINBITS; - /* - * At this point we need to obtain log2(tsize)+1, ie, the number of - * not-all-zero bits at the right. 
We used to do this with a - * shift-and-count loop, but this function is enough of a hotspot to - * justify micro-optimization effort. The best approach seems to be - * to use a lookup table. Note that this code assumes that - * ALLOCSET_NUM_FREELISTS <= 17, since we only cope with two bytes of - * the tsize value. + * At this point we compute ceil(log2(size >> ALLOC_MINBITS)). + * We can do this quickly via the number of not-all-zero bits at + * the right. We could simply use + * + * pg_leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1 + * + * for this, but duplicating the logic here affords us additional + * optimizations: + * + * 1. The compiler can fold ALLOC_MINBITS into other constants, + * rather than right-shifting as a separate step. + * 2. In the open-coded case, we only need to cope with two + * bytes of the size value. */ +#ifdef HAVE__BUILTIN_CLZ + idx = 31 - __builtin_clz((uint32) size - 1) - ALLOC_MINBITS + 1; +#else + unsigned int t, + tsize; + + StaticAssertStmt(ALLOCSET_NUM_FREELISTS <= 17, ""); + + tsize = (size - 1) >> ALLOC_MINBITS; t = tsize >> 8; - idx = t ? LogTable256[t] + 8 : LogTable256[tsize]; + idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize]; + idx += 1; +#endif Assert(idx < ALLOCSET_NUM_FREELISTS); } -- 2.22.0