Re: gsoc, oprrest function for text search take 2 - Mailing list pgsql-hackers

From Jan Urbański
Subject Re: gsoc, oprrest function for text search take 2
Date
Msg-id 48A48B4E.10803@students.mimuw.edu.pl
Whole thread Raw
In response to Re: gsoc, oprrest function for text search take 2  ("Heikki Linnakangas" <heikki@enterprisedb.com>)
Responses Re: gsoc, oprrest function for text search take 2  (Jan Urbański <j.urbanski@students.mimuw.edu.pl>)
Re: gsoc, oprrest function for text search take 2  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
Heikki Linnakangas wrote:
> Jan Urbański wrote:
>> So right now the idea is to:
>>  (1) pre-sort STATISTIC_KIND_MCELEM values
>>  (2) build an array of pointers to detoasted values in tssel()
>>  (3) use binary search when looking for MCELEMs during tsquery analysis
>
> Sounds like a plan. In (2), it's even better to detoast the values
> lazily. For a typical one-word tsquery, the binary search will only look
> at a small portion of the elements.

Here's another version.

Most common lexemes get sorted before storing in pg_statistic. The
ordering is on length first and value second. That way we can avoid
strncmp() calls when the lexemes have different lengths (and lexemes
know their lengths, so the data is readily available). Also, in the
binary search routine during selectivity estimation we can sometimes
avoid detoasting (I think) Datums from the pg_statistic MCELEM array.
See comments in code.

Pre-sorting introduced one problem (see XXX in code): it's not easy
anymore to get the minimal frequency of MCELEM values. I was using it to
assert that the selectivity of a tsquery node containing a lexeme not in
MCELEM is no more that min(MCELEM freqs) / 2. That's only significant
when the minimum frequency is less than DEFAULT_TS_SEL * 2, so I'm kind
of inclined to ignore it and maybe drop a comment in the code that this
may be a potential problem.

If nothing is fundamentally broken with this, I'll repeat my profiling
tests to see if anything has been gained.

Cheers,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
 OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
     dict_simple.o dict_synonym.o dict_thesaurus.o \
     dict_ispell.o regis.o spell.o \
-    to_tsany.o ts_typanalyze.o ts_utils.o
+    to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o

 include $(top_srcdir)/src/backend/common.mk

diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..0fadc60
--- /dev/null
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -0,0 +1,320 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_selfuncs.c
+ *      Selectivity functions for text search types.
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *      $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h" /* for check_stack_depth() */
+#include "utils/memutils.h"
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_statistic.h"
+#include "nodes/nodes.h"
+#include "tsearch/ts_type.h"
+
+/* lookup table type for binary searching through MCELEMs */
+typedef struct
+{
+    Datum    element;
+    float4    frequency;
+} TextFreq;
+
+/* type of keys for bsearch()ing through an array of TextFreqs  */
+typedef struct
+{
+    char    *lexeme;
+    int        length;
+} LexemeKey;
+
+static int
+compare_lexeme_textfreq(const void *e1, const void *e2);
+
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                  int length, float4 minfreq);
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                           float4 *numbers, int nnumbers);
+static double
+tsquerysel(VariableStatData *vardata, Datum constval);
+
+
+/* TSQuery traversal function */
+static Selectivity
+tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                  int length, float4 minfreq)
+{
+    LexemeKey    key;
+    TextFreq    *searchres;
+    Selectivity    s1, s2;
+
+    /* since this function recurses, it could be driven to stack overflow */
+    check_stack_depth();
+
+    if (item->type == QI_VAL)
+    {
+        QueryOperand *oper = (QueryOperand *) item;
+
+        /*
+         * Prepare the key for bsearch().
+         */
+        key.lexeme = operand + oper->distance;
+        key.length = oper->length;
+
+        searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                         sizeof(TextFreq), compare_lexeme_textfreq);
+
+        if (searchres)
+        {
+            /*
+             * The element is in MCELEM. Return precise selectivity (or at
+             * least as precise, as ANALYZE could find out).
+             */
+            return (Selectivity) searchres->frequency;
+        }
+        else
+        {
+            /*
+             * The element is not in MCELEM. Punt, but assert that the
+             * selectivity cannot be more than minfreq / 2.
+             */
+            return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+        }
+    }
+
+    /* Current TSQuery node is an operator */
+    switch (item->operator.oper)
+    {
+        case OP_NOT:
+            return 1.0 - tsquery_opr_selec(item + 1, operand, lookup,
+                                           length, minfreq);
+
+        case OP_AND:
+            return
+                tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) *
+                tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                  length, minfreq);
+
+        case OP_OR:
+            s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq);
+            s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                   length, minfreq);
+            return s1 + s2 - s1 * s2;
+
+        default:
+            elog(ERROR, "unrecognized operator: %d", item->operator.oper);
+    }
+
+    /* never reached, keep compiler happy */
+    return (Selectivity) DEFAULT_TS_SEL;
+}
+
+/*
+ * Traverse the tsquery preorder, calculating selectivity as:
+ *
+ *   selec(left_oper) * selec(right_oper) in AND nodes,
+ *
+ *   selec(left_oper) + selec(right_oper) -
+ *      selec(left_oper) * selec(right_oper) in OR nodes,
+ *
+ *   1 - select(oper) in NOT nodes
+ *
+ *   freq[val] in VAL nodes, if the value is in MCELEM
+ *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+ *
+ *
+ * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
+ * binary search for determining freq[MCELEM].
+ */
+static Selectivity
+mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                           float4 *numbers, int nnumbers)
+{
+    float4            minfreq;
+    TextFreq        *lookup;
+    Selectivity        selec;
+    int                i;
+
+    /* Grab the lowest frequency */
+
+    /*
+      XXX This is no longer easy, need to go through the whole list. Plug it
+      with 2.0 for the moment (so that minfreq / 2 is 1.0)
+
+      minfreq = numbers[nnumbers - 1];
+    */
+    minfreq = 2.0;
+
+    /* Construct the array for binary search */
+    Assert(nmcelem == nnumbers);
+    lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+    for (i = 0; i < nmcelem; i++)
+    {
+        lookup[i].element = mcelem[i];
+        lookup[i].frequency = numbers[i];
+    }
+
+    selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                              nmcelem, minfreq);
+
+    pfree(lookup);
+
+    return selec;
+}
+
+/*
+ * bsearch() comparator for a lexeme (non-NULL terminated string with length)
+ * and a TextFreq. Use length, then byte-for-byte comparision, because that's
+ * how ANALYZE code sorted data before storing it in a statistic tuple.
+ * See ts_typanalyze.c for details.
+ */
+static int
+compare_lexeme_textfreq(const void *e1, const void *e2)
+{
+    const LexemeKey    *key;
+    const TextFreq    *t;
+    text            *element;
+    int                len1,
+                    len2;
+
+    key = (const LexemeKey *) e1;
+    t = (const TextFreq *) e2;
+
+    len1 = key->length;
+    len2 = VARSIZE_ANY_EXHDR(t->element);
+
+    /* Compare lengths first, possibly avoiding pointless detoasting */
+    if (len1 > len2)
+        return 1;
+    else if (len1 < len2)
+        return -1;
+
+    /* Fall back on byte-for-byte comparision, detoasting the Datum first */
+    element = DatumGetTextP(t->element);
+
+    return strncmp(key->lexeme, VARDATA_ANY(element), len1);
+}
+
+/*
+ *    tssel -- Selectivity of "@@"
+ */
+Datum
+tssel(PG_FUNCTION_ARGS)
+{
+    PlannerInfo        *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+    /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+    List            *args = (List *) PG_GETARG_POINTER(2);
+    int                varRelid = PG_GETARG_INT32(3);
+    VariableStatData vardata;
+    Node            *other;
+    bool            varonleft;
+    Selectivity        selec;
+
+    /*
+     * If expression is not variable op something or something op variable,
+     * then punt and return a default estimate.
+     */
+    if (!get_restriction_variable(root, args, varRelid,
+                                  &vardata, &other, &varonleft))
+    {
+        PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+    }
+
+    /*
+     * Can't do anything useful if the something is not a constant, either.
+     */
+    if (!IsA(other, Const))
+    {
+        ReleaseVariableStats(vardata);
+        PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+    }
+
+    /* The "@@" operator is strict, so might cope with NULL right away */
+    if (((Const *) other)->constisnull) {
+        ReleaseVariableStats(vardata);
+        PG_RETURN_FLOAT8((float8) 0.0);
+    }
+
+    /*
+     * OK, there's a Var and a Const we're dealing with here. We need the Var
+     * to be a TSVector (or else we don't have any useful statistic for it).
+     */
+
+    if (vardata.vartype == TSVECTOROID)
+    {
+        /* tsvector @@ tsquery or the other way around */
+        Assert(((Const *) other)->consttype == TSQUERYOID);
+
+        selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+    }
+    else
+    {
+        /* The Var is something we don't have useful statistic for */
+        selec = DEFAULT_TS_SEL;
+    }
+
+    ReleaseVariableStats(vardata);
+
+    PG_RETURN_FLOAT8((float8) selec);
+}
+
+static Selectivity
+tsquerysel(VariableStatData *vardata, Datum constval)
+{
+    Selectivity            selec;
+
+    if (HeapTupleIsValid(vardata->statsTuple))
+    {
+        TSQuery                query;
+        Form_pg_statistic    stats;
+        Datum                *values;
+        int                    nvalues;
+        float4                *numbers;
+        int                    nnumbers;
+
+        /* The caller made sure the const is a TSQuery, so get it now */
+        query = DatumGetTSQuery(constval);
+
+        stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+        /* MCELEM will be an array of Text elements for a tsvector column */
+        if (get_attstatsslot(vardata->statsTuple,
+                             TEXTOID, -1,
+                             STATISTIC_KIND_MCELEM, InvalidOid,
+                             &values, &nvalues,
+                             &numbers, &nnumbers))
+        {
+            /*
+             * There is a most-common-elements slot for the tsvector Var, so
+             * use that.
+             */
+
+            selec = mcelem_tsquery_selec(query, values, nvalues,
+                                         numbers, nnumbers);
+            free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
+        }
+        else
+        {
+            /* No most-common-elements slot */
+            selec = (Selectivity) DEFAULT_TS_SEL;
+        }
+    }
+    else
+    {
+        selec = (Selectivity) DEFAULT_TS_SEL;
+    }
+
+    return selec;
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 965e758..201c4ab 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -43,8 +43,9 @@ static void compute_tsvector_stats(VacAttrStats *stats,
 static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
 static uint32 lexeme_hash(const void *key, Size keysize);
 static int lexeme_match(const void *key1, const void *key2, Size keysize);
-static int trackitem_compare_desc(const void *e1, const void *e2);
-
+static int lexeme_compare(const void *key1, const void *key2);
+static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
+static int trackitem_compare_lexemes(const void *e1, const void *e2);

 /*
  *    ts_typanalyze -- a custom typanalyze function for tsvector columns
@@ -273,7 +274,7 @@ compute_tsvector_stats(VacAttrStats *stats,
         Assert(i == track_len);

         qsort(sort_table, track_len, sizeof(TrackItem *),
-              trackitem_compare_desc);
+              trackitem_compare_frequencies_desc);

         /* Suppress any single-occurrence items */
         while (track_len > 0)
@@ -287,6 +288,22 @@ compute_tsvector_stats(VacAttrStats *stats,
         if (num_mcelem > track_len)
             num_mcelem = track_len;

+        /*
+         * We want to store statistics sorted on the lexeme value using first
+         * length, then byte-for-byte comparision. The reason for doing length
+         * comparision first is that we don't care about the ordering as long
+         * as it's consistent and comparing lengths first gives us a chance to
+         * avoid a strncmp() call.
+         *
+         * This is different from what we do with scalar statistics -- they get
+         * sorted on frequencies. The rationale is that we usually search
+         * through most common elements looking for a specific value, so we can
+         * grab its frequency. When values are presorted we can employ binary
+         * search for that. See ts_selfuncs.c for a real usage scenario.
+         */
+        qsort(sort_table, num_mcelem, sizeof(TrackItem *),
+              trackitem_compare_lexemes);
+
         /* Generate MCELEM slot entry */
         if (num_mcelem > 0)
         {
@@ -379,25 +396,49 @@ lexeme_hash(const void *key, Size keysize)
 static int
 lexeme_match(const void *key1, const void *key2, Size keysize)
 {
-    const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
-    const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
+    /* The keysize parameter is superfluous, the keys store their lengths */
+    return lexeme_compare(key1, key2);
+}

-    /* The lexemes need to have the same length, and be memcmp-equal */
-    if (d1->length == d2->length &&
-        memcmp(d1->lexeme, d2->lexeme, d1->length) == 0)
-        return 0;
-    else
+/*
+ *    Comparision function for lexemes.
+ */
+static int
+lexeme_compare(const void *key1, const void *key2)
+{
+    const LexemeHashKey    *d1 = (const LexemeHashKey *) key1;
+    const LexemeHashKey    *d2 = (const LexemeHashKey *) key2;
+
+    /* First, compare by length */
+    if (d1->length > d2->length)
         return 1;
+    else if (d1->length < d2->length)
+        return -1;
+    else
+        /* Lengths are equal, do a byte-for-byte comparision */
+        return strncmp(d1->lexeme, d2->lexeme, d1->length);
 }

 /*
- *    qsort() comparator for TrackItems - LC style (descending sort)
+ *    qsort() comparator for sorting TrackItems on frequencies (descending sort)
  */
 static int
-trackitem_compare_desc(const void *e1, const void *e2)
+trackitem_compare_frequencies_desc(const void *e1, const void *e2)
 {
     const TrackItem * const *t1 = (const TrackItem * const *) e1;
     const TrackItem * const *t2 = (const TrackItem * const *) e2;

     return (*t2)->frequency - (*t1)->frequency;
 }
+
+/*
+ *    qsort() comparator for sorting TrackItem on lexemes
+ */
+static int
+trackitem_compare_lexemes(const void *e1, const void *e2)
+{
+    const TrackItem * const *t1 = (const TrackItem * const *) e1;
+    const TrackItem * const *t2 = (const TrackItem * const *) e2;
+
+    return lexeme_compare(&(*t1)->key, &(*t2)->key);
+}
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -915,10 +915,10 @@ DATA(insert OID = 3630 (  "<>"       PGNSP PGUID b f f 3614     3614     16 3630 3629     ts
 DATA(insert OID = 3631 (  ">="       PGNSP PGUID b f f 3614     3614     16 3628 3627     tsvector_ge scalargtsel
scalargtjoinsel)); 
 DATA(insert OID = 3632 (  ">"       PGNSP PGUID b f f 3614     3614     16 3627 3628     tsvector_gt scalargtsel
scalargtjoinsel)); 
 DATA(insert OID = 3633 (  "||"       PGNSP PGUID b f f 3614     3614     3614  0    0     tsvector_concat   -    -
)); 
-DATA(insert OID = 3636 (  "@@"       PGNSP PGUID b f f 3614     3615     16 3637    0     ts_match_vq   contsel
contjoinsel    )); 
-DATA(insert OID = 3637 (  "@@"       PGNSP PGUID b f f 3615     3614     16 3636    0     ts_match_qv   contsel
contjoinsel    )); 
-DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614     3615     16 3661    0     ts_match_vq   contsel
contjoinsel    )); 
-DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615     3614     16 3660    0     ts_match_qv   contsel
contjoinsel    )); 
+DATA(insert OID = 3636 (  "@@"       PGNSP PGUID b f f 3614     3615     16 3637    0     ts_match_vq   tssel
contjoinsel    )); 
+DATA(insert OID = 3637 (  "@@"       PGNSP PGUID b f f 3615     3614     16 3636    0     ts_match_qv   tssel
contjoinsel    )); 
+DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614     3615     16 3661    0     ts_match_vq   tssel
contjoinsel    )); 
+DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615     3614     16 3660    0     ts_match_qv   tssel
contjoinsel    )); 
 DATA(insert OID = 3674 (  "<"       PGNSP PGUID b f f 3615     3615     16 3679 3678     tsquery_lt scalarltsel
scalarltjoinsel)); 
 DATA(insert OID = 3675 (  "<="       PGNSP PGUID b f f 3615     3615     16 3678 3679     tsquery_le scalarltsel
scalarltjoinsel)); 
 DATA(insert OID = 3676 (  "="       PGNSP PGUID b t f 3615     3615     16 3676 3677     tsquery_eq eqsel eqjoinsel
));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4321,6 +4321,8 @@ DESCR("GiST tsquery support");
 DATA(insert OID = 3701 (  gtsquery_consistent            PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281"
_null__null_ _null_ gtsquery_consistent _null_ _null_ _null_ )); 
 DESCR("GiST tsquery support");

+DATA(insert OID = 3687 (  tssel    PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel
_null__null_ _null_ )); 
+DESCR("restriction selectivity of tsvector @@ tsquery");
 DATA(insert OID = 3688 (  ts_typanalyze    PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_
ts_typanalyze_null_ _null_ _null_ )); 
 DESCR("tsvector typanalyze");

diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..b253327 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -155,6 +155,8 @@ extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);

 extern Datum ts_typanalyze(PG_FUNCTION_ARGS);

+extern Datum tssel(PG_FUNCTION_ARGS);
+

 /*
  * TSQuery
@@ -291,4 +293,17 @@ extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS);
 extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
 extern Datum tsq_mcontained(PG_FUNCTION_ARGS);

+/*
+ * The default text search selectivity is chosen to be smll enough to
+ * encourage indexscans for typical table densities. See selfuncs.h and
+ * DEFAULT_EQ_SEL for details.
+ */
+#define DEFAULT_TS_SEL 0.005
+
+/*
+ * operator restriction function for tsvector @@ tsquery and
+ * tsquery @@ tsvector
+ */
+extern Datum tssel(PG_FUNCTION_ARGS);
+
 #endif   /* _PG_TSTYPE_H_ */

pgsql-hackers by date:

Previous
From: Tom Lane
Date:
Subject: Re: WIP: patch to create explicit support for semi and anti joins
Next
From: Jan Urbański
Date:
Subject: Re: gsoc, oprrest function for text search take 2