From 109991cea7d33436fa46c8e9c0fac26bb0a88ebb Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 12 Jul 2015 13:14:01 -0700 Subject: [PATCH 3/3] Perform memory prefetching when writing memtuples This patch is based on, but quite distinct from, a separately submitted, more general version which performs prefetching in several places [1]. This version now only performs prefetching of each "tuple proper" during the writing of batches of tuples (an entire run, written following a quicksort). The case for prefetching each "tuple proper" at several sites now seems weak due to differences in CPU microarchitecture. However, it might still be that there is a consistent improvement observable when writing out tuples, because that involves a particularly tight inner loop, with relatively predictable processing to hide memory latency behind. A helpful generic prefetch hint may be possible for this case, even if it proves impossible elsewhere. This has been shown to appreciably help on both a POWER7 server processor [2], and an Intel Mobile processor. [1] https://commitfest.postgresql.org/6/305/ [2] CAM3SWZR5rv3+F3FOKf35=dti7oTmmcdFoe2voGuR0pddg3Jb+Q@mail.gmail.com --- config/c-compiler.m4 | 17 +++++++++++++++++ configure | 31 +++++++++++++++++++++++++++++++ configure.in | 1 + src/backend/utils/sort/tuplesort.c | 14 ++++++++++++++ src/include/c.h | 14 ++++++++++++++ src/include/pg_config.h.in | 3 +++ src/include/pg_config.h.win32 | 3 +++ src/include/pg_config_manual.h | 10 ++++++++++ 8 files changed, 93 insertions(+) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 550d034..8be2122 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -271,6 +271,23 @@ fi])# PGAC_C_BUILTIN_UNREACHABLE +# PGAC_C_BUILTIN_PREFETCH +# ------------------------- +# Check if the C compiler understands __builtin_prefetch(), +# and define HAVE__BUILTIN_PREFETCH if so. 
+AC_DEFUN([PGAC_C_BUILTIN_PREFETCH], +[AC_CACHE_CHECK(for __builtin_prefetch, pgac_cv__builtin_prefetch, +[AC_LINK_IFELSE([AC_LANG_PROGRAM([], +[int i = 0;__builtin_prefetch(&i, 0, 3);])], +[pgac_cv__builtin_prefetch=yes], +[pgac_cv__builtin_prefetch=no])]) +if test x"$pgac_cv__builtin_prefetch" = xyes ; then +AC_DEFINE(HAVE__BUILTIN_PREFETCH, 1, + [Define to 1 if your compiler understands __builtin_prefetch.]) +fi])# PGAC_C_BUILTIN_PREFETCH + + + # PGAC_C_VA_ARGS # -------------- # Check if the C compiler understands C99-style variadic macros, diff --git a/configure b/configure index 5772d0e..0a4c305 100755 --- a/configure +++ b/configure @@ -11338,6 +11338,37 @@ if test x"$pgac_cv__builtin_unreachable" = xyes ; then $as_echo "#define HAVE__BUILTIN_UNREACHABLE 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_prefetch" >&5 +$as_echo_n "checking for __builtin_prefetch... " >&6; } +if ${pgac_cv__builtin_prefetch+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +int i = 0;__builtin_prefetch(&i, 0, 3); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__builtin_prefetch=yes +else + pgac_cv__builtin_prefetch=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_prefetch" >&5 +$as_echo "$pgac_cv__builtin_prefetch" >&6; } +if test x"$pgac_cv__builtin_prefetch" = xyes ; then + +$as_echo "#define HAVE__BUILTIN_PREFETCH 1" >>confdefs.h + +fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __VA_ARGS__" >&5 $as_echo_n "checking for __VA_ARGS__... 
" >&6; } if ${pgac_cv__va_args+:} false; then : diff --git a/configure.in b/configure.in index 44f832f1..339c3e6 100644 --- a/configure.in +++ b/configure.in @@ -1320,6 +1320,7 @@ PGAC_C_BUILTIN_BSWAP32 PGAC_C_BUILTIN_BSWAP64 PGAC_C_BUILTIN_CONSTANT_P PGAC_C_BUILTIN_UNREACHABLE +PGAC_C_BUILTIN_PREFETCH PGAC_C_VA_ARGS PGAC_STRUCT_TIMEZONE PGAC_UNION_SEMUN diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 15253ab..85268c2 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -3531,6 +3531,20 @@ dumpbatch(Tuplesortstate *state, bool alltuples) WRITETUP(state, state->tp_tapenum[state->destTape], &state->memtuples[i]); state->memtupcount--; + + /* + * Perform memory prefetch of the tuple pointed to by the SortTuple + * that's two places ahead of tuple just written. Testing shows that + * this significantly boosts performance. + * + * Don't do this for pass-by-value datum sorts, where it will never + * help. Note that hinting a NULL address does not affect correctness, + * so NULL values are not a concern here. + */ +#ifdef USE_MEM_PREFETCH + if (state->tuples && i + 2 < memtupwrite) + pg_read_prefetch(state->memtuples[i + 2].tuple); +#endif } markrunend(state, state->tp_tapenum[state->destTape]); state->tp_runs[state->destTape]++; diff --git a/src/include/c.h b/src/include/c.h index 8163b00..3d05ff8 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -927,6 +927,20 @@ typedef NameData *Name; #define pg_unreachable() abort() #endif +/* + * Prefetch support -- Support memory prefetching hints on some platforms. + * + * pg_read_prefetch() is specialized for the case where an array is accessed + * sequentially, and we can prefetch a pointer within the next element (or an + * even later element) in order to hide memory latency. This case involves + * prefetching addresses with low temporal locality. 
Note that it's rather + * difficult to get any kind of speedup with this; any use of the intrinsic + * should be carefully tested. It's okay to pass it an invalid or NULL + * address, although it's best avoided. + */ +#if defined(USE_MEM_PREFETCH) +#define pg_read_prefetch(addr) __builtin_prefetch((addr), 0, 0) +#endif /* ---------------------------------------------------------------- * Section 8: random stuff diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 16a272e..0e393ba 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -669,6 +669,9 @@ /* Define to 1 if your compiler understands __builtin_constant_p. */ #undef HAVE__BUILTIN_CONSTANT_P +/* Define to 1 if your compiler understands __builtin_prefetch. */ +#undef HAVE__BUILTIN_PREFETCH + /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 8566065..990c3c3 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -514,6 +514,9 @@ /* Define to 1 if your compiler understands __builtin_constant_p. */ /* #undef HAVE__BUILTIN_CONSTANT_P */ +/* Define to 1 if your compiler understands __builtin_prefetch. */ +#undef HAVE__BUILTIN_PREFETCH + /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ /* #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index e278fa0..4c7b1d5 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -153,6 +153,16 @@ #endif /* + * USE_MEM_PREFETCH controls whether Postgres will attempt to use memory + * prefetching. Usually the automatic configure tests are sufficient, but + * it's conceivable that using prefetching is counter-productive on some + * platforms. If necessary you can remove the #define here. 
+ */ +#ifdef HAVE__BUILTIN_PREFETCH +#define USE_MEM_PREFETCH +#endif + +/* * USE_SSL code should be compiled only when compiling with an SSL * implementation. (Currently, only OpenSSL is supported, but we might add * more implementations in the future.) -- 1.9.1