gsoc, oprrest function for text search - Mailing list pgsql-hackers

From Jan Urbański
Subject gsoc, oprrest function for text search
Date
Msg-id 4881CCCF.4020905@students.mimuw.edu.pl
Whole thread Raw
Responses Re: gsoc, oprrest function for text search  (Jan Urbański <j.urbanski@students.mimuw.edu.pl>)
Re: gsoc, oprrest function for text search  ("Heikki Linnakangas" <heikki@enterprisedb.com>)
List pgsql-hackers
Here's a WIP patch implementing an oprrest function for tsvector @@
tsquery and tsquery @@ tsvector.

The idea is (quoting a comment)
/*
  *  Traverse the tsquery preorder, calculating selectivity as:
  *
  *   selec(left_oper) * selec(right_oper) in AND nodes,
  *
  *   selec(left_oper) + selec(right_oper) -
  *      selec(left_oper) * selec(right_oper) in OR nodes,
  *
  *   1 - select(oper) in NOT nodes
  *
  *   freq[val] in VAL nodes, if the value is in MCELEM
  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
  *
  *
  * Implementation-wise, we sort the MCELEM array to use binary
  * search on it.
  */

The patch still has many rough edges, but it applies to HEAD and passes
tests. I'm posting it mostly to get feedback about whether I'm going in
the right direction.

Cheers,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
*** a/src/backend/tsearch/Makefile
--- b/src/backend/tsearch/Makefile
***************
*** 19,25 ****
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
      dict_simple.o dict_synonym.o dict_thesaurus.o \
      dict_ispell.o regis.o spell.o \
!     to_tsany.o ts_typanalyze.o ts_utils.o

  include $(top_srcdir)/src/backend/common.mk

--- 19,25 ----
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
      dict_simple.o dict_synonym.o dict_thesaurus.o \
      dict_ispell.o regis.o spell.o \
!     to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o

  include $(top_srcdir)/src/backend/common.mk

diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..4b86072
*** /dev/null
--- b/src/backend/tsearch/ts_selfuncs.c
***************
*** 0,-1 ****
--- 1,299 ----
+ /*-------------------------------------------------------------------------
+  *
+  * ts_selfuncs.c
+  *      Selectivity functions for text search types.
+  *
+  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+  *
+  *
+  * IDENTIFICATION
+  *      $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+
+ #include "miscadmin.h" /* for check_stack_depth() */
+ #include "utils/memutils.h"
+ #include "utils/builtins.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "utils/selfuncs.h"
+ #include "catalog/pg_type.h"
+ #include "catalog/pg_statistic.h"
+ #include "nodes/nodes.h"
+ #include "tsearch/ts_type.h"
+
+ /* lookup table type for binary searching through MCELEMs */
+ typedef struct
+ {
+     Datum    element;
+     float4    frequency;
+ } TextFreq;
+
+ static int
+ compare_textfreq(const void *e1, const void *e2);
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                   int length, float4 minfreq);
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                            float4 *numbers, int nnumbers);
+ static double
+ tsquerysel(VariableStatData *vardata, Datum constval);
+
+
+ /* TSQuery traversal function */
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                   int length, float4 minfreq)
+ {
+     TextFreq    key,
+                 *searchres;
+     Selectivity    s1, s2;
+
+     /* since this function recurses, it could be driven to stack overflow */
+     check_stack_depth();
+
+     if (item->type == QI_VAL)
+     {
+         QueryOperand *oper = (QueryOperand *) item;
+
+         /*
+          * Prepare the key for bsearch(). No need to initialize key.frequency,
+          * because sorting is only on key.element.
+          */
+         key.element = PointerGetDatum(
+             cstring_to_text_with_len(operand + oper->distance, oper->length));
+
+         searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                          sizeof(TextFreq), compare_textfreq);
+         if (searchres)
+         {
+             /*
+              * The element is in MCELEM. Return precise selectivity (or at
+              * least as precise, as ANALYZE could find out).
+              */
+             return (Selectivity) searchres->frequency;
+         }
+         else
+         {
+             /*
+              * The element is not in MCELEM. Punt, but  assure that the
+              * selectivity cannot be more than minfreq / 2.
+              */
+             return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+         }
+     }
+
+     /* Current TSQuery node is an operator */
+     switch (item->operator.oper)
+     {
+         case OP_NOT:
+             return 1.0 - tsquery_opr_selec(item + 1, operand, lookup,
+                                            length, minfreq);
+
+         case OP_AND:
+             return
+                 tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) *
+                 tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                   length, minfreq);
+
+         case OP_OR:
+             s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq);
+             s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup,
+                                    length, minfreq);
+             return s1 + s2 - s1 * s2;
+
+         default:
+             elog(ERROR, "unrecognized operator: %d", item->operator.oper);
+     }
+
+     /* never reached, keep compiler happy */
+     return (Selectivity) DEFAULT_TS_SEL;
+ }
+
+ /*
+  *  Traverse the tsquery preorder, calculating selectivity as:
+  *
+  *   selec(left_oper) * selec(right_oper) in AND nodes,
+  *
+  *   selec(left_oper) + selec(right_oper) -
+  *      selec(left_oper) * selec(right_oper) in OR nodes,
+  *
+  *   1 - select(oper) in NOT nodes
+  *
+  *   freq[val] in VAL nodes, if the value is in MCELEM
+  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+  *
+  *
+  * Implementation-wise, we sort the MCELEM array to use binary
+  * search on it.
+  */
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                            float4 *numbers, int nnumbers)
+ {
+     float4            minfreq;
+     TextFreq        *lookup;
+     Selectivity        selec;
+     MemoryContext    opr_context,
+                     old_context;
+     int                i;
+
+     /* Grab the lowest frequency */
+     minfreq = numbers[nnumbers - 1];
+
+     /*
+      * Compute selectivity in child context, because we will possibly do a lot
+      * of QueryOperand to text transformations, which palloc() memory
+      */
+     opr_context = AllocSetContextCreate(CurrentMemoryContext,
+                                         "Text search restriction",
+                                         ALLOCSET_DEFAULT_MINSIZE,
+                                         ALLOCSET_DEFAULT_INITSIZE,
+                                         ALLOCSET_DEFAULT_MAXSIZE);
+     old_context = MemoryContextSwitchTo(opr_context);
+
+
+     /* Construct the array to be qsort()'ed */
+     Assert(nmcelem == nnumbers);
+     lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+     for (i = 0; i < nmcelem; i++)
+     {
+         lookup[i].element = mcelem[i];
+         lookup[i].frequency = numbers[i];
+     }
+
+     qsort(lookup, nmcelem, sizeof(TextFreq), compare_textfreq);
+
+     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                               nmcelem, minfreq);
+
+     MemoryContextSwitchTo(old_context);
+     MemoryContextDelete(opr_context);
+
+     return selec;
+ }
+
+ static int
+ compare_textfreq(const void *e1, const void *e2)
+ {
+     const TextFreq *t1 = (const TextFreq *) e1;
+     const TextFreq *t2 = (const TextFreq *) e2;
+
+     return DatumGetInt32(DirectFunctionCall2(bttextcmp,
+                                              t1->element, t2->element));
+ }
+
+ /*
+  *    tssel -- Selectivity of "@@"
+  */
+ Datum
+ tssel(PG_FUNCTION_ARGS)
+ {
+     PlannerInfo        *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+     /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+     List            *args = (List *) PG_GETARG_POINTER(2);
+     int                varRelid = PG_GETARG_INT32(3);
+     VariableStatData vardata;
+     Node            *other;
+     bool            varonleft;
+     Selectivity        selec;
+
+     /*
+      * If expression is not variable op something or something op variable,
+      * then punt and return a default estimate.
+      */
+     if (!get_restriction_variable(root, args, varRelid,
+                                   &vardata, &other, &varonleft))
+     {
+         PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+     }
+
+     /*
+      * Can't do anything useful if the something is not a constant, either.
+      */
+     if (!IsA(other, Const))
+     {
+         ReleaseVariableStats(vardata);
+         PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+     }
+
+     /* The "@@" operator is strict, so might cope with NULL right away */
+     if (((Const *) other)->constisnull) {
+         ReleaseVariableStats(vardata);
+         PG_RETURN_FLOAT8((float8) 0.0);
+     }
+
+     /*
+      * OK, there's a Var and a Const we're dealing with here. We need the Var
+      * to be a TSVector (or else we don't have any useful statistic for it).
+      */
+
+     if (vardata.vartype == TSVECTOROID)
+     {
+         /* tsvector @@ tsquery or the other way around */
+         Assert(((Const *) other)->consttype == TSQUERYOID);
+
+         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+     }
+     else
+     {
+         /* The Var is something we don't have useful statistic for */
+         selec = (Selectivity) DEFAULT_TS_SEL;
+     }
+
+     ReleaseVariableStats(vardata);
+
+     PG_RETURN_FLOAT8((float8) selec);
+ }
+
+ static Selectivity
+ tsquerysel(VariableStatData *vardata, Datum constval)
+ {
+     Selectivity            selec;
+
+     if (HeapTupleIsValid(vardata->statsTuple))
+     {
+         TSQuery                query;
+         Form_pg_statistic    stats;
+         Datum                *values;
+         int                    nvalues;
+         float4                *numbers;
+         int                    nnumbers;
+
+         /* The caller made sure the const is a TSQuery, so get it now */
+         query = DatumGetTSQuery(constval);
+
+         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+         /* MCELEM will be an array of Text elements for a tsvector column */
+         if (get_attstatsslot(vardata->statsTuple,
+                              TEXTOID, -1,
+                              STATISTIC_KIND_MCELEM, InvalidOid,
+                              &values, &nvalues,
+                              &numbers, &nnumbers))
+         {
+             /*
+              * There is a most-common-elements slot for the tsvector Var, so
+              * use that.
+              */
+
+             selec = mcelem_tsquery_selec(query, values, nvalues,
+                                          numbers, nnumbers);
+             free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
+         }
+         else
+         {
+             /* No most-common-elements slot */
+             selec = (Selectivity) DEFAULT_TS_SEL;
+         }
+     }
+     else
+     {
+         selec = (Selectivity) DEFAULT_TS_SEL;
+     }
+
+     return selec;
+ }
diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 915,924 ****
  DATA(insert OID = 3631 (  ">="       PGNSP PGUID b f f 3614     3614     16 3628 3627     tsvector_ge scalargtsel
scalargtjoinsel)); 
  DATA(insert OID = 3632 (  ">"       PGNSP PGUID b f f 3614     3614     16 3627 3628     tsvector_gt scalargtsel
scalargtjoinsel)); 
  DATA(insert OID = 3633 (  "||"       PGNSP PGUID b f f 3614     3614     3614  0    0     tsvector_concat   -    -
 )); 
! DATA(insert OID = 3636 (  "@@"       PGNSP PGUID b f f 3614     3615     16 3637    0     ts_match_vq   contsel
contjoinsel    )); 
! DATA(insert OID = 3637 (  "@@"       PGNSP PGUID b f f 3615     3614     16 3636    0     ts_match_qv   contsel
contjoinsel    )); 
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614     3615     16 3661    0     ts_match_vq   contsel
contjoinsel    )); 
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615     3614     16 3660    0     ts_match_qv   contsel
contjoinsel    )); 
  DATA(insert OID = 3674 (  "<"       PGNSP PGUID b f f 3615     3615     16 3679 3678     tsquery_lt scalarltsel
scalarltjoinsel)); 
  DATA(insert OID = 3675 (  "<="       PGNSP PGUID b f f 3615     3615     16 3678 3679     tsquery_le scalarltsel
scalarltjoinsel)); 
  DATA(insert OID = 3676 (  "="       PGNSP PGUID b t f 3615     3615     16 3676 3677     tsquery_eq eqsel eqjoinsel
));
--- 915,924 ----
  DATA(insert OID = 3631 (  ">="       PGNSP PGUID b f f 3614     3614     16 3628 3627     tsvector_ge scalargtsel
scalargtjoinsel)); 
  DATA(insert OID = 3632 (  ">"       PGNSP PGUID b f f 3614     3614     16 3627 3628     tsvector_gt scalargtsel
scalargtjoinsel)); 
  DATA(insert OID = 3633 (  "||"       PGNSP PGUID b f f 3614     3614     3614  0    0     tsvector_concat   -    -
 )); 
! DATA(insert OID = 3636 (  "@@"       PGNSP PGUID b f f 3614     3615     16 3637    0     ts_match_vq   tssel
contjoinsel    )); 
! DATA(insert OID = 3637 (  "@@"       PGNSP PGUID b f f 3615     3614     16 3636    0     ts_match_qv   tssel
contjoinsel    )); 
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614     3615     16 3661    0     ts_match_vq   tssel
contjoinsel    )); 
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615     3614     16 3660    0     ts_match_qv   tssel
contjoinsel    )); 
  DATA(insert OID = 3674 (  "<"       PGNSP PGUID b f f 3615     3615     16 3679 3678     tsquery_lt scalarltsel
scalarltjoinsel)); 
  DATA(insert OID = 3675 (  "<="       PGNSP PGUID b f f 3615     3615     16 3678 3679     tsquery_le scalarltsel
scalarltjoinsel)); 
  DATA(insert OID = 3676 (  "="       PGNSP PGUID b t f 3615     3615     16 3676 3677     tsquery_eq eqsel eqjoinsel
));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4321,4326 ****
--- 4321,4328 ----
  DATA(insert OID = 3701 (  gtsquery_consistent            PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281"
_null__null_ _null_ gtsquery_consistent _null_ _null_ _null_ )); 
  DESCR("GiST tsquery support");

+ DATA(insert OID = 3687 (  tssel    PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel
_null__null_ _null_ )); 
+ DESCR("restriction selectivity of tsvector @@ tsquery");
  DATA(insert OID = 3688 (  ts_typanalyze    PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_
ts_typanalyze_null_ _null_ _null_ )); 
  DESCR("tsvector typanalyze");

diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..6208432 100644
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
***************
*** 291,294 ****
--- 291,307 ----
  extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
  extern Datum tsq_mcontained(PG_FUNCTION_ARGS);

+ /*
+  * The default text search selectivity is chosen to be smll enough to
+  * encourage indexscans for typical table densities. See selfuncs.h and
+  * DEFAULT_EQ_SEL for details.
+  */
+ #define DEFAULT_TS_SEL 0.005
+
+ /*
+  * operator restriction function for tsvector @@ tsquery and
+  * tsquery @@ tsvector
+  */
+ extern Datum tssel(PG_FUNCTION_ARGS);
+
  #endif   /* _PG_TSTYPE_H_ */

pgsql-hackers by date:

Previous
From: chris
Date:
Subject: Re: Postgres-R: primary key patches
Next
From: Adriaan van Os
Date:
Subject: Re: Getting to universal binaries for Darwin