Add planner support function for starts_with() - Mailing list pgsql-hackers

From Tom Lane
Subject Add planner support function for starts_with()
Date
Msg-id 232599.1633800229@sss.pgh.pa.us
Whole thread Raw
Responses Re: Add planner support function for starts_with()  ("Bossart, Nathan" <bossartn@amazon.com>)
List pgsql-hackers
When starts_with() and the equivalent ^@ operator were added, they
were plugged into the planner in only a rather half-baked way.
Selectivity estimation got taught about the operator, but the
other infrastructure associated with LIKE/regex matching wasn't
updated.  This causes these operators to be planned more stupidly
than a functionally-equivalent LIKE/regex pattern [1].

With the (admittedly later) introduction of planner support functions,
it's really quite easy to do better.  The attached patch adds a planner
support function for starts_with(), with these benefits:

* A condition such as "textcol ^@ constant" can now use a regular
btree index, not only an SP-GiST index, so long as the index's
collation is C.  (This works just like "textcol LIKE 'foo%'".)

* "starts_with(textcol, constant)" can be optimized the same as
"textcol ^@ constant".

I also rejiggered match_pattern_prefix() a bit, with the effect
that fixed-prefix LIKE and regex patterns are now more like
starts_with() in another way: if you apply one to an SPGiST-indexed
column, you'll get an index condition using ^@ rather than two
index conditions with >= and <.  That should be more efficient
at runtime, though I didn't try to do any performance testing.

            regards, tom lane

[1] https://www.postgresql.org/message-id/CADT4RqB13KQHOJqqQ%2BWXmYtJrukS2UiFdtfTvT-XA3qYLyB6Cw%40mail.gmail.com
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 241e6f0f59..988568825e 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -143,6 +143,14 @@ texticregexeq_support(PG_FUNCTION_ARGS)
     PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
 }

+Datum
+text_starts_with_support(PG_FUNCTION_ARGS)
+{
+    Node       *rawreq = (Node *) PG_GETARG_POINTER(0);
+
+    PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Prefix));
+}
+
 /* Common code for the above */
 static Node *
 like_regex_support(Node *rawreq, Pattern_Type ptype)
@@ -246,6 +254,7 @@ match_pattern_prefix(Node *leftop,
     Oid            eqopr;
     Oid            ltopr;
     Oid            geopr;
+    Oid            preopr = InvalidOid;
     bool        collation_aware;
     Expr       *expr;
     FmgrInfo    ltproc;
@@ -302,12 +311,20 @@ match_pattern_prefix(Node *leftop,
     switch (ldatatype)
     {
         case TEXTOID:
-            if (opfamily == TEXT_PATTERN_BTREE_FAM_OID ||
-                opfamily == TEXT_SPGIST_FAM_OID)
+            if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
+            {
+                eqopr = TextEqualOperator;
+                ltopr = TextPatternLessOperator;
+                geopr = TextPatternGreaterEqualOperator;
+                collation_aware = false;
+            }
+            else if (opfamily == TEXT_SPGIST_FAM_OID)
             {
                 eqopr = TextEqualOperator;
                 ltopr = TextPatternLessOperator;
                 geopr = TextPatternGreaterEqualOperator;
+                /* This opfamily has direct support for prefixing */
+                preopr = TextPrefixOperator;
                 collation_aware = false;
             }
             else
@@ -360,20 +377,6 @@ match_pattern_prefix(Node *leftop,
             return NIL;
     }

-    /*
-     * If necessary, verify that the index's collation behavior is compatible.
-     * For an exact-match case, we don't have to be picky.  Otherwise, insist
-     * that the index collation be "C".  Note that here we are looking at the
-     * index's collation, not the expression's collation -- this test is *not*
-     * dependent on the LIKE/regex operator's collation.
-     */
-    if (collation_aware)
-    {
-        if (!(pstatus == Pattern_Prefix_Exact ||
-              lc_collate_is_c(indexcollation)))
-            return NIL;
-    }
-
     /*
      * If necessary, coerce the prefix constant to the right type.  The given
      * prefix constant is either text or bytea type, therefore the only case
@@ -409,8 +412,31 @@ match_pattern_prefix(Node *leftop,
     }

     /*
-     * Otherwise, we have a nonempty required prefix of the values.
-     *
+     * Otherwise, we have a nonempty required prefix of the values.  Some
+     * opclasses support prefix checks directly, otherwise we'll try to
+     * generate a range constraint.
+     */
+    if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
+    {
+        expr = make_opclause(preopr, BOOLOID, false,
+                             (Expr *) leftop, (Expr *) prefix,
+                             InvalidOid, indexcollation);
+        result = list_make1(expr);
+        return result;
+    }
+
+    /*
+     * Since we need a range constraint, it's only going to work reliably if
+     * the index is collation-insensitive or has "C" collation.  Note that
+     * here we are looking at the index's collation, not the expression's
+     * collation -- this test is *not* dependent on the LIKE/regex operator's
+     * collation.
+     */
+    if (collation_aware &&
+        !lc_collate_is_c(indexcollation))
+        return NIL;
+
+    /*
      * We can always say "x >= prefix".
      */
     if (!op_in_opfamily(geopr, opfamily))
@@ -1165,7 +1191,6 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
         case Pattern_Type_Prefix:
             /* Prefix type work is trivial.  */
             result = Pattern_Prefix_Partial;
-            *rest_selec = 1.0;    /* all */
             *prefix = makeConst(patt->consttype,
                                 patt->consttypmod,
                                 patt->constcollid,
@@ -1175,6 +1200,8 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
                                           patt->constlen),
                                 patt->constisnull,
                                 patt->constbyval);
+            if (rest_selec != NULL)
+                *rest_selec = 1.0;    /* all */
             break;
         default:
             elog(ERROR, "unrecognized ptype: %d", (int) ptype);
diff --git a/src/include/catalog/pg_operator.dat b/src/include/catalog/pg_operator.dat
index 89c73acd68..0075a02f32 100644
--- a/src/include/catalog/pg_operator.dat
+++ b/src/include/catalog/pg_operator.dat
@@ -102,7 +102,7 @@
   oprright => 'text', oprresult => 'bool', oprcom => '=(text,text)',
   oprnegate => '<>(text,text)', oprcode => 'texteq', oprrest => 'eqsel',
   oprjoin => 'eqjoinsel' },
-{ oid => '3877', descr => 'starts with',
+{ oid => '3877', oid_symbol => 'TextPrefixOperator', descr => 'starts with',
   oprname => '^@', oprleft => 'text', oprright => 'text', oprresult => 'bool',
   oprcode => 'starts_with', oprrest => 'prefixsel',
   oprjoin => 'prefixjoinsel' },
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d068d6532e..6412f369f1 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -167,8 +167,12 @@
   proname => 'texteq', proleakproof => 't', prorettype => 'bool',
   proargtypes => 'text text', prosrc => 'texteq' },
 { oid => '3696',
-  proname => 'starts_with', proleakproof => 't', prorettype => 'bool',
-  proargtypes => 'text text', prosrc => 'text_starts_with' },
+  proname => 'starts_with', prosupport => 'text_starts_with_support',
+  proleakproof => 't', prorettype => 'bool', proargtypes => 'text text',
+  prosrc => 'text_starts_with' },
+{ oid => '8923', descr => 'planner support for text_starts_with',
+  proname => 'text_starts_with_support', prorettype => 'internal',
+  proargtypes => 'internal', prosrc => 'text_starts_with_support' },
 { oid => '68',
   proname => 'xideq', proleakproof => 't', prorettype => 'bool',
   proargtypes => 'xid xid', prosrc => 'xideq' },
diff --git a/src/test/regress/expected/create_index_spgist.out b/src/test/regress/expected/create_index_spgist.out
index f4fb08a289..5c04df9c01 100644
--- a/src/test/regress/expected/create_index_spgist.out
+++ b/src/test/regress/expected/create_index_spgist.out
@@ -804,6 +804,22 @@ SELECT count(*) FROM radix_text_tbl WHERE t ^@     'Worth';
      2
 (1 row)

+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+                         QUERY PLAN
+------------------------------------------------------------
+ Aggregate
+   ->  Index Only Scan using sp_radix_ind on radix_text_tbl
+         Index Cond: (t ^@ 'Worth'::text)
+         Filter: starts_with(t, 'Worth'::text)
+(4 rows)
+
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+ count
+-------
+     2
+(1 row)
+
 -- Now check the results from bitmap indexscan
 SET enable_seqscan = OFF;
 SET enable_indexscan = OFF;
@@ -1333,6 +1349,23 @@ SELECT count(*) FROM radix_text_tbl WHERE t ^@     'Worth';
      2
 (1 row)

+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+                   QUERY PLAN
+------------------------------------------------
+ Aggregate
+   ->  Bitmap Heap Scan on radix_text_tbl
+         Filter: starts_with(t, 'Worth'::text)
+         ->  Bitmap Index Scan on sp_radix_ind
+               Index Cond: (t ^@ 'Worth'::text)
+(5 rows)
+
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+ count
+-------
+     2
+(1 row)
+
 RESET enable_seqscan;
 RESET enable_indexscan;
 RESET enable_bitmapscan;
diff --git a/src/test/regress/sql/create_index_spgist.sql b/src/test/regress/sql/create_index_spgist.sql
index b126dae629..660bfc6193 100644
--- a/src/test/regress/sql/create_index_spgist.sql
+++ b/src/test/regress/sql/create_index_spgist.sql
@@ -295,6 +295,10 @@ EXPLAIN (COSTS OFF)
 SELECT count(*) FROM radix_text_tbl WHERE t ^@     'Worth';
 SELECT count(*) FROM radix_text_tbl WHERE t ^@     'Worth';

+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+
 -- Now check the results from bitmap indexscan
 SET enable_seqscan = OFF;
 SET enable_indexscan = OFF;
@@ -424,6 +428,10 @@ EXPLAIN (COSTS OFF)
 SELECT count(*) FROM radix_text_tbl WHERE t ^@     'Worth';
 SELECT count(*) FROM radix_text_tbl WHERE t ^@     'Worth';

+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth');
+
 RESET enable_seqscan;
 RESET enable_indexscan;
 RESET enable_bitmapscan;

pgsql-hackers by date:

Previous
From: Bharath Rupireddy
Date:
Subject: Re: Reword docs of feature "Remove temporary files after backend crash"
Next
From: Tom Lane
Date:
Subject: Re: starts_with, ^@ and index usage