Thread: [PATCH] RFC: Add length parameterised dmetaphone functions

[PATCH] RFC: Add length parameterised dmetaphone functions

From
Christian Marie
Date:
A developer I work with was trying to use dmetaphone to group people names into
equivalence classes. He found that many long names would be grouped together
when they shouldn't be, this turned out to be because dmetaphone has an
undocumented upper bound on its output length, of four. This is obviously
impractical for many use cases.

This patch addresses this by adding and documenting an optional argument to
dmetaphone and dmetaphone_alt that specifies the maximum output length. This
makes it possible to use dmetaphone on much longer inputs.

Backwards compatibility is catered for by making the new argument optional,
defaulting to the old, hard-coded value of four. We now have:
dmetaphone(text source) returns textdmetaphone(text source, int max_output_length) returns textdmetaphone_alt(text
source)returns textdmetaphone_alt(text source, int max_output_length) returns text
 

---contrib/fuzzystrmatch/Makefile                    |  2 +-contrib/fuzzystrmatch/dmetaphone.c                | 46
++++++++++++++------contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql| 12
++++++contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql     | 44
-------------------contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql     | 52
+++++++++++++++++++++++contrib/fuzzystrmatch/fuzzystrmatch.control      |  2 +-doc/src/sgml/fuzzystrmatch.sgml
        | 18 ++++++--7 files changed, 115 insertions(+), 61 deletions(-)create mode 100644
contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sqldeletemode 100644 contrib/fuzzystrmatch/fuzzystrmatch--1.0.sqlcreate
mode100644 contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql
 

diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile
index 0327d9510a50..fee85ad1ec36 100644
--- a/contrib/fuzzystrmatch/Makefile
+++ b/contrib/fuzzystrmatch/Makefile
@@ -4,7 +4,7 @@ MODULE_big = fuzzystrmatchOBJS = fuzzystrmatch.o dmetaphone.o $(WIN32RES)EXTENSION = fuzzystrmatch
-DATA = fuzzystrmatch--1.0.sql fuzzystrmatch--unpackaged--1.0.sql
+DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--unpackaged--1.0.sql fuzzystrmatch--1.0--1.1.sqlPGFILEDESC =
"fuzzystrmatch- similarities and distance between strings"ifdef USE_PGXS
 
diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 147c8501ee89..7afcf0aee841 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -115,10 +115,28 @@ The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and#include <assert.h>/*
prototypefor the main function we got from the perl module */
 
-static void DoubleMetaphone(char *, char **);
+static void DoubleMetaphone(char *, char **, int);#ifndef DMETAPHONE_MAIN
+/* Fetch and validate the optional second function argument, which should be a
+ * maximum length for output
+ *
+ * Defaults to 4.
+ *
+ * Introduced in 1.1
+ */
+inline int
+fetch_max_len_arg(PG_FUNCTION_ARGS)
+{
+    int max_len = (PG_NARGS() > 1) ? PG_GETARG_INT32(1) : 4;
+    if(max_len < 1)
+        ereport(ERROR,
+            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+            errmsg("invalid max output length, should be >= 1")));
+    return max_len;
+}
+/* * The PostgreSQL visible dmetaphone function. */
@@ -128,6 +146,7 @@ PG_FUNCTION_INFO_V1(dmetaphone);Datumdmetaphone(PG_FUNCTION_ARGS){
+    int         max_len;    text       *arg;    char       *aptr,               *codes[2],
@@ -139,8 +158,9 @@ dmetaphone(PG_FUNCTION_ARGS)#endif    arg = PG_GETARG_TEXT_P(0);    aptr = text_to_cstring(arg);
+    max_len = fetch_max_len_arg(fcinfo);
-    DoubleMetaphone(aptr, codes);
+    DoubleMetaphone(aptr, codes, max_len);    code = codes[0];    if (!code)        code = "";
@@ -157,6 +177,7 @@ PG_FUNCTION_INFO_V1(dmetaphone_alt);Datumdmetaphone_alt(PG_FUNCTION_ARGS){
+    int         max_len;    text       *arg;    char       *aptr,               *codes[2],
@@ -168,8 +189,9 @@ dmetaphone_alt(PG_FUNCTION_ARGS)#endif    arg = PG_GETARG_TEXT_P(0);    aptr =
text_to_cstring(arg);
+    max_len = fetch_max_len_arg(fcinfo);
-    DoubleMetaphone(aptr, codes);
+    DoubleMetaphone(aptr, codes, max_len);    code = codes[1];    if (!code)        code = "";
@@ -390,7 +412,7 @@ MetaphAdd(metastring *s, char *new_str)static void
-DoubleMetaphone(char *str, char **codes)
+DoubleMetaphone(char *str, char **codes, int max_len){    int            length;    metastring *original;
@@ -427,7 +449,7 @@ DoubleMetaphone(char *str, char **codes)    }    /* main loop */
-    while ((primary->length < 4) || (secondary->length < 4))
+    while ((primary->length < max_len) || (secondary->length < max_len))    {        if (current >= length)
break;
@@ -1410,11 +1432,11 @@ DoubleMetaphone(char *str, char **codes)    }
-    if (primary->length > 4)
-        SetAt(primary, 4, '\0');
+    if (primary->length > max_len)
+        SetAt(primary, max_len, '\0');
-    if (secondary->length > 4)
-        SetAt(secondary, 4, '\0');
+    if (secondary->length > max_len)
+        SetAt(secondary, max_len, '\0');    *codes = primary->str;    *++codes = secondary->str;
@@ -1428,13 +1450,13 @@ DoubleMetaphone(char *str, char **codes)/* just for testing - not part of the perl code */
-main(int argc, char **argv)
+void main(int argc, char **argv){    char       *codes[2];
-    if (argc > 1)
+    if (argc > 2)    {
-        DoubleMetaphone(argv[1], codes);
+        DoubleMetaphone(argv[1], codes, atoi(argv[2]));        printf("%s|%s\n", codes[0], codes[1]);    }}
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql
new file mode 100644
index 000000000000..b00112c6bc9d
--- /dev/null
+++ b/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql
@@ -0,0 +1,12 @@
+/* contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.1'" to load this file. \quit
+
+CREATE FUNCTION dmetaphone (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone_alt (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+LANGUAGE C IMMUTABLE STRICT;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql
deleted file mode 100644
index 1cf9b6188cc0..000000000000
--- a/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql
+++ /dev/null
@@ -1,44 +0,0 @@
-/* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */
-
--- complain if script is sourced in psql, rather than via CREATE EXTENSION
-\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
-
-CREATE FUNCTION levenshtein (text,text) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein_with_costs'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein_less_equal'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION metaphone (text,int) RETURNS text
-AS 'MODULE_PATHNAME','metaphone'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION soundex(text) RETURNS text
-AS 'MODULE_PATHNAME', 'soundex'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION text_soundex(text) RETURNS text
-AS 'MODULE_PATHNAME', 'soundex'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION difference(text,text) RETURNS int
-AS 'MODULE_PATHNAME', 'difference'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION dmetaphone (text) RETURNS text
-AS 'MODULE_PATHNAME', 'dmetaphone'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION dmetaphone_alt (text) RETURNS text
-AS 'MODULE_PATHNAME', 'dmetaphone_alt'
-LANGUAGE C IMMUTABLE STRICT;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql
new file mode 100644
index 000000000000..e0eba457892b
--- /dev/null
+++ b/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql
@@ -0,0 +1,52 @@
+/* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
+
+CREATE FUNCTION levenshtein (text,text) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein_with_costs'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein_less_equal'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION metaphone (text,int) RETURNS text
+AS 'MODULE_PATHNAME','metaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION soundex(text) RETURNS text
+AS 'MODULE_PATHNAME', 'soundex'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION text_soundex(text) RETURNS text
+AS 'MODULE_PATHNAME', 'soundex'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION difference(text,text) RETURNS int
+AS 'MODULE_PATHNAME', 'difference'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone (text) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone_alt (text) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone_alt (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+LANGUAGE C IMMUTABLE STRICT;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.control b/contrib/fuzzystrmatch/fuzzystrmatch.control
index e257f0961171..6b2832ae8d95 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.control
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.control
@@ -1,5 +1,5 @@# fuzzystrmatch extensioncomment = 'determine similarities and distance between strings'
-default_version = '1.0'
+default_version = '1.1'module_pathname = '$libdir/fuzzystrmatch'relocatable = true
diff --git a/doc/src/sgml/fuzzystrmatch.sgml b/doc/src/sgml/fuzzystrmatch.sgml
index f26bd90dfc5f..3898d16af22c 100644
--- a/doc/src/sgml/fuzzystrmatch.sgml
+++ b/doc/src/sgml/fuzzystrmatch.sgml
@@ -210,7 +210,9 @@ test=# SELECT metaphone('GUMBO', 4);<synopsis>dmetaphone(text source) returns text
+dmetaphone(text source, int max_output_length) returns textdmetaphone_alt(text source) returns text
+dmetaphone_alt(text source, int max_output_length) returns text</synopsis>  <para>
@@ -218,14 +220,24 @@ dmetaphone_alt(text source) returns text  </para>  <para>
+   max_output_length defaults to 4 and must be greater than 1
+  </para>
+
+  <para>   Example:  </para><screen>
-test=# select dmetaphone('gumbo');
- dmetaphone
+postgres=# select dmetaphone('unicorns');
+ dmetaphone 
+------------
+ ANKR
+(1 row)
+
+postgres=# select dmetaphone('unicorns', 8);
+ dmetaphone ------------
- KMP
+ ANKRNS(1 row)</screen> </sect2>
-- 
2.6.2




Re: [PATCH] RFC: Add length parameterised dmetaphone functions

From
Michael Paquier
Date:
On Fri, Nov 6, 2015 at 2:00 PM, Christian Marie <christian@ponies.io> wrote:
> This patch addresses this by adding and documenting an optional argument to
> dmetaphone and dmetaphone_alt that specifies the maximum output length. This
> makes it possible to use dmetaphone on much longer inputs.

It would be good if you could register your patch here so as other
folks can track it for review, and this way we are sure that it won't
fall into the void:
https://commitfest.postgresql.org/8/
It would be good as well if you could attach a version of this patch
directly to an email sent to this thread.

> Backwards compatibility is catered for by making the new argument optional,
> defaulting to the old, hard-coded value of four. We now have:

Preserving backward-compatibility is an essential piece. Thanks.
--
Michael



Re: [PATCH] RFC: Add length parameterised dmetaphone functions

From
Albe Laurenz
Date:
Christian Marie wrote:
> A developer I work with was trying to use dmetaphone to group people names into
> equivalence classes. He found that many long names would be grouped together
> when they shouldn't be, this turned out to be because dmetaphone has an
> undocumented upper bound on its output length, of four. This is obviously
> impractical for many use cases.
> 
> This patch addresses this by adding and documenting an optional argument to
> dmetaphone and dmetaphone_alt that specifies the maximum output length. This
> makes it possible to use dmetaphone on much longer inputs.
> 
> Backwards compatibility is catered for by making the new argument optional,
> defaulting to the old, hard-coded value of four. We now have:
> 
>     dmetaphone(text source) returns text
>     dmetaphone(text source, int max_output_length) returns text
>     dmetaphone_alt(text source) returns text
>     dmetaphone_alt(text source, int max_output_length) returns text

I like the idea.

How about:   dmetaphone(text source, int max_output_length DEFAULT 4) returns text   dmetaphone_alt(text source, int
max_output_lengthDEFAULT 4) returns text
 

Saves two functions and is self-documenting.

> +postgres=# select dmetaphone('unicorns');
> + dmetaphone
> +------------
> + ANKR
> +(1 row)
> +
> +postgres=# select dmetaphone('unicorns', 8);
> + dmetaphone
>  ------------
> - KMP
> + ANKRNS
>  (1 row)
>  </screen>
>   </sect2>

Yeah, "ponies" would have been too short...

Yours,
Laurenz Albe