gsoc, oprrest function for text search take 2 - Mailing list pgsql-hackers

From Jan Urbański
Subject gsoc, oprrest function for text search take 2
Date
Msg-id 488DAEB8.3000402@students.mimuw.edu.pl
Whole thread Raw
Responses Re: gsoc, oprrest function for text search take 2  ("Heikki Linnakangas" <heikki@enterprisedb.com>)
List pgsql-hackers
Hi,

I know Commit Fest is in progress, as well as the holiday season. But
the Summer of Code ends in about three weeks, so I'd like to request a
bit of out-of-order processing :)

My previous mail sent to -hackers is here:
http://archives.postgresql.org/message-id/4881CCCF.4020905@students.mimuw.edu.pl

I had problems applying the patch from that mail (it got mangled
somehow, could by my mail agent...), so I'm attaching it again.

There are two things that I'm not particularly proud of in the patch.
First is palloc()ing and filling in a table simply to user qsort() on
it. I guess I could either hack up a version of get_attstatsslot() that
returns an array of (element, frequency) pairs or sort the elements by
hand instead of using qsort() and keep the order of frequencies in sync
while doing the sort.

Another thing are cstring_to_text_with_len calls. I'm doing them so I
can use bttextcmp in bsearch(). I think I could come up with a dedicated
function to return text Datums and WordEntries (read: non-NULL
terminated strings with a given length).

Are these unsignificant? Or should I do these optimizations? Or, sadly,
signs that using binary search is not a good decision?

Thanks,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin

diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
 OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
     dict_simple.o dict_synonym.o dict_thesaurus.o \
     dict_ispell.o regis.o spell.o \
-    to_tsany.o ts_typanalyze.o ts_utils.o
+    to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o

 include $(top_srcdir)/src/backend/common.mk

diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..4b86072
--- /dev/null
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -0,0 +1,299 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_selfuncs.c
+ *      Selectivity functions for text search types.
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *      $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h" /* for check_stack_depth() */
+#include "utils/memutils.h"
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_statistic.h"
+#include "nodes/nodes.h"
+#include "tsearch/ts_type.h"
+
+/* lookup table type for binary searching through MCELEMs */
+typedef struct
+{
+    Datum    element;
+    float4    frequency;
+} TextFreq;
+
+static int
+compare_textfreq(const void *e1, const void *e2);
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                  int length, float4 minfreq);
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                           float4 *numbers, int nnumbers);
+static double
+tsquerysel(VariableStatData *vardata, Datum constval);
+
+
+/* TSQuery traversal function */
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                  int length, float4 minfreq)
+{
+    TextFreq    key,
+                *searchres;
+    Selectivity    s1, s2;
+
+    /* since this function recurses, it could be driven to stack overflow */
+    check_stack_depth();
+
+    if (item->type == QI_VAL)
+    {
+        QueryOperand *oper = (QueryOperand *) item;
+
+        /*
+         * Prepare the key for bsearch(). No need to initialize key.frequency,
+         * because sorting is only on key.element.
+         */
+        key.element = PointerGetDatum(
+            cstring_to_text_with_len(operand + oper->distance, oper->length));
+
+        searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                         sizeof(TextFreq), compare_textfreq);
+        if (searchres)
+        {
+            /*
+             * The element is in MCELEM. Return precise selectivity (or at
+             * least as precise, as ANALYZE could find out).
+             */
+            return (Selectivity) searchres->frequency;
+        }
+        else
+        {
+            /*
+             * The element is not in MCELEM. Punt, but  assure that the
+             * selectivity cannot be more than minfreq / 2.
+             */
+            return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+        }
+    }
+
+    /* Current TSQuery node is an operator */
+    switch (item->operator.oper)
+    {
+        case OP_NOT:
+            return 1.0 - tsquery_opr_selec(item + 1, operand, lookup,
+                                           length, minfreq);
+
+        case OP_AND:
+            return
+                tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) *
+                tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                  length, minfreq);
+
+        case OP_OR:
+            s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq);
+            s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                   length, minfreq);
+            return s1 + s2 - s1 * s2;
+
+        default:
+            elog(ERROR, "unrecognized operator: %d", item->operator.oper);
+    }
+
+    /* never reached, keep compiler happy */
+    return (Selectivity) DEFAULT_TS_SEL;
+}
+
+/*
+ *  Traverse the tsquery preorder, calculating selectivity as:
+ *
+ *   selec(left_oper) * selec(right_oper) in AND nodes,
+ *
+ *   selec(left_oper) + selec(right_oper) -
+ *      selec(left_oper) * selec(right_oper) in OR nodes,
+ *
+ *   1 - select(oper) in NOT nodes
+ *
+ *   freq[val] in VAL nodes, if the value is in MCELEM
+ *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+ *
+ *
+ * Implementation-wise, we sort the MCELEM array to use binary
+ * search on it.
+ */
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                           float4 *numbers, int nnumbers)
+{
+    float4            minfreq;
+    TextFreq        *lookup;
+    Selectivity        selec;
+    MemoryContext    opr_context,
+                    old_context;
+    int                i;
+
+    /* Grab the lowest frequency */
+    minfreq = numbers[nnumbers - 1];
+
+    /*
+     * Compute selectivity in child context, because we will possibly do a lot
+     * of QueryOperand to text transformations, which palloc() memory
+     */
+    opr_context = AllocSetContextCreate(CurrentMemoryContext,
+                                        "Text search restriction",
+                                        ALLOCSET_DEFAULT_MINSIZE,
+                                        ALLOCSET_DEFAULT_INITSIZE,
+                                        ALLOCSET_DEFAULT_MAXSIZE);
+    old_context = MemoryContextSwitchTo(opr_context);
+
+
+    /* Construct the array to be qsort()'ed */
+    Assert(nmcelem == nnumbers);
+    lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+    for (i = 0; i < nmcelem; i++)
+    {
+        lookup[i].element = mcelem[i];
+        lookup[i].frequency = numbers[i];
+    }
+
+    qsort(lookup, nmcelem, sizeof(TextFreq), compare_textfreq);
+
+    selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                              nmcelem, minfreq);
+
+    MemoryContextSwitchTo(old_context);
+    MemoryContextDelete(opr_context);
+
+    return selec;
+}
+
+static int
+compare_textfreq(const void *e1, const void *e2)
+{
+    const TextFreq *t1 = (const TextFreq *) e1;
+    const TextFreq *t2 = (const TextFreq *) e2;
+
+    return DatumGetInt32(DirectFunctionCall2(bttextcmp,
+                                             t1->element, t2->element));
+}
+
+/*
+ *    tssel -- Selectivity of "@@"
+ */
+Datum
+tssel(PG_FUNCTION_ARGS)
+{
+    PlannerInfo        *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+    /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+    List            *args = (List *) PG_GETARG_POINTER(2);
+    int                varRelid = PG_GETARG_INT32(3);
+    VariableStatData vardata;
+    Node            *other;
+    bool            varonleft;
+    Selectivity        selec;
+
+    /*
+     * If expression is not variable op something or something op variable,
+     * then punt and return a default estimate.
+     */
+    if (!get_restriction_variable(root, args, varRelid,
+                                  &vardata, &other, &varonleft))
+    {
+        PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+    }
+
+    /*
+     * Can't do anything useful if the something is not a constant, either.
+     */
+    if (!IsA(other, Const))
+    {
+        ReleaseVariableStats(vardata);
+        PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+    }
+
+    /* The "@@" operator is strict, so might cope with NULL right away */
+    if (((Const *) other)->constisnull) {
+        ReleaseVariableStats(vardata);
+        PG_RETURN_FLOAT8((float8) 0.0);
+    }
+
+    /*
+     * OK, there's a Var and a Const we're dealing with here. We need the Var
+     * to be a TSVector (or else we don't have any useful statistic for it).
+     */
+
+    if (vardata.vartype == TSVECTOROID)
+    {
+        /* tsvector @@ tsquery or the other way around */
+        Assert(((Const *) other)->consttype == TSQUERYOID);
+
+        selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+    }
+    else
+    {
+        /* The Var is something we don't have useful statistic for */
+        selec = (Selectivity) DEFAULT_TS_SEL;
+    }
+
+    ReleaseVariableStats(vardata);
+
+    PG_RETURN_FLOAT8((float8) selec);
+}
+
+static Selectivity
+tsquerysel(VariableStatData *vardata, Datum constval)
+{
+    Selectivity            selec;
+
+    if (HeapTupleIsValid(vardata->statsTuple))
+    {
+        TSQuery                query;
+        Form_pg_statistic    stats;
+        Datum                *values;
+        int                    nvalues;
+        float4                *numbers;
+        int                    nnumbers;
+
+        /* The caller made sure the const is a TSQuery, so get it now */
+        query = DatumGetTSQuery(constval);
+
+        stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+        /* MCELEM will be an array of Text elements for a tsvector column */
+        if (get_attstatsslot(vardata->statsTuple,
+                             TEXTOID, -1,
+                             STATISTIC_KIND_MCELEM, InvalidOid,
+                             &values, &nvalues,
+                             &numbers, &nnumbers))
+        {
+            /*
+             * There is a most-common-elements slot for the tsvector Var, so
+             * use that.
+             */
+
+            selec = mcelem_tsquery_selec(query, values, nvalues,
+                                         numbers, nnumbers);
+            free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
+        }
+        else
+        {
+            /* No most-common-elements slot */
+            selec = (Selectivity) DEFAULT_TS_SEL;
+        }
+    }
+    else
+    {
+        selec = (Selectivity) DEFAULT_TS_SEL;
+    }
+
+    return selec;
+}
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -915,10 +915,10 @@ DATA(insert OID = 3630 (  "<>"       PGNSP PGUID b f f 3614     3614     16 3630 3629     ts
 DATA(insert OID = 3631 (  ">="       PGNSP PGUID b f f 3614     3614     16 3628 3627     tsvector_ge scalargtsel
scalargtjoinsel)); 
 DATA(insert OID = 3632 (  ">"       PGNSP PGUID b f f 3614     3614     16 3627 3628     tsvector_gt scalargtsel
scalargtjoinsel)); 
 DATA(insert OID = 3633 (  "||"       PGNSP PGUID b f f 3614     3614     3614  0    0     tsvector_concat   -    -
)); 
-DATA(insert OID = 3636 (  "@@"       PGNSP PGUID b f f 3614     3615     16 3637    0     ts_match_vq   contsel
contjoinsel    )); 
-DATA(insert OID = 3637 (  "@@"       PGNSP PGUID b f f 3615     3614     16 3636    0     ts_match_qv   contsel
contjoinsel    )); 
-DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614     3615     16 3661    0     ts_match_vq   contsel
contjoinsel    )); 
-DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615     3614     16 3660    0     ts_match_qv   contsel
contjoinsel    )); 
+DATA(insert OID = 3636 (  "@@"       PGNSP PGUID b f f 3614     3615     16 3637    0     ts_match_vq   tssel
contjoinsel    )); 
+DATA(insert OID = 3637 (  "@@"       PGNSP PGUID b f f 3615     3614     16 3636    0     ts_match_qv   tssel
contjoinsel    )); 
+DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614     3615     16 3661    0     ts_match_vq   tssel
contjoinsel    )); 
+DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615     3614     16 3660    0     ts_match_qv   tssel
contjoinsel    )); 
 DATA(insert OID = 3674 (  "<"       PGNSP PGUID b f f 3615     3615     16 3679 3678     tsquery_lt scalarltsel
scalarltjoinsel)); 
 DATA(insert OID = 3675 (  "<="       PGNSP PGUID b f f 3615     3615     16 3678 3679     tsquery_le scalarltsel
scalarltjoinsel)); 
 DATA(insert OID = 3676 (  "="       PGNSP PGUID b t f 3615     3615     16 3676 3677     tsquery_eq eqsel eqjoinsel
));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4321,6 +4321,8 @@ DESCR("GiST tsquery support");
 DATA(insert OID = 3701 (  gtsquery_consistent            PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281"
_null__null_ _null_ gtsquery_consistent _null_ _null_ _null_ )); 
 DESCR("GiST tsquery support");

+DATA(insert OID = 3687 (  tssel    PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel
_null__null_ _null_ )); 
+DESCR("restriction selectivity of tsvector @@ tsquery");
 DATA(insert OID = 3688 (  ts_typanalyze    PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_
ts_typanalyze_null_ _null_ _null_ )); 
 DESCR("tsvector typanalyze");

diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..6208432 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -291,4 +291,17 @@ extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS);
 extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
 extern Datum tsq_mcontained(PG_FUNCTION_ARGS);

+/*
+ * The default text search selectivity is chosen to be smll enough to
+ * encourage indexscans for typical table densities. See selfuncs.h and
+ * DEFAULT_EQ_SEL for details.
+ */
+#define DEFAULT_TS_SEL 0.005
+
+/*
+ * operator restriction function for tsvector @@ tsquery and
+ * tsquery @@ tsvector
+ */
+extern Datum tssel(PG_FUNCTION_ARGS);
+
 #endif   /* _PG_TSTYPE_H_ */

pgsql-hackers by date:

Previous
From: Zdenek Kotala
Date:
Subject: Re: Review: DTrace probes (merged version) ver_03
Next
From: Magnus Hagander
Date:
Subject: Re: issues/experience with building postgres on Windows