From c87da5330c636e939983b1ba8eaee581b4c953dd Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Sun, 29 Nov 2015 12:51:36 -0800
Subject: [PATCH] Abort C collation text abbreviation less frequently

Discriminate against the C collation by creating a much lower bar for the
amount of entropy that abbreviated keys must capture.  This is consistent
with existing cases that have cheaper conversion processes, like UUID.

Backpatch to 9.5, where abbreviated keys for text were added.
---
 src/backend/utils/adt/varlena.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index a89f586..0bcdd96 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1869,7 +1869,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
 		 */
 		if (abbreviate)
 		{
-			tss->prop_card = 0.20;
+			tss->prop_card = collate_c ? 0.01 : 0.20;
 			initHyperLogLog(&tss->abbr_card, 10);
 			initHyperLogLog(&tss->full_card, 10);
 			ssup->abbrev_full_comparator = ssup->comparator;
@@ -2261,7 +2261,11 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
 	 * cardinality against the overall size of the set in order to more
 	 * accurately model costs.  Assume that an abbreviated comparison, and an
 	 * abbreviated comparison with a cheap memcmp()-based authoritative
-	 * resolution are equivalent.
+	 * resolution are equivalent.  (With the C collation, authoritative
+	 * cardinality is used in the same way, even though the cost of an
+	 * authoritative tie-breaker is no cheaper when values are equal.  The
+	 * theory is that the early appearance of low entropy abbreviated keys
+	 * predicts the same prefix for all or most values.)
 	 */
 	if (abbrev_distinct > key_distinct * tss->prop_card)
 	{
-- 
1.9.1