From 41732ad9a44dcd12e52d823fb59cb23cce4fe217 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Sun, 20 Feb 2022 12:45:01 +0000 Subject: [PATCH 1/3] Apply auto-vectorization to the inner loop of div_var_fast(). This loop is basically the same as the inner loop of mul_var(), which was auto-vectorized in commit 8870917623, but the compiler will only consider auto-vectorizing the div_var_fast() loop if the assignment target div[qi + i] is replaced by div_qi[i], where div_qi = &div[qi]. Additionally, since the compiler doesn't know that qdigit is guaranteed to fit in a 16-bit NumericDigit, cast it to NumericDigit before multiplying to make the resulting auto-vectorized code more efficient. --- src/backend/utils/adt/numeric.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 3208789f75..fc43d2a456 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -8908,13 +8908,22 @@ div_var_fast(const NumericVar *var1, const NumericVar *var2, * which would make the new value simply div[qi] mod vardigits[0]. * The lower-order terms in qdigit can change this result by not * more than about twice INT_MAX/NBASE, so overflow is impossible. + * + * This inner loop is the performance bottleneck for division, so + * code it in the same way as the inner loop of mul_var() so that + * it can be auto-vectorized. We cast qdigit to NumericDigit + * before multiplying to allow the compiler to generate more + * efficient code (using 16-bit multiplication), which is safe + * since we know that the quotient digit is off by at most one, so + * there is no overflow risk. */ if (qdigit != 0) { int istop = Min(var2ndigits, div_ndigits - qi + 1); + int *div_qi = &div[qi]; for (i = 0; i < istop; i++) - div[qi + i] -= qdigit * var2digits[i]; + div_qi[i] -= ((NumericDigit) qdigit) * var2digits[i]; } } -- 2.26.2