[PATCH] RFC: Add length parameterised dmetaphone functions - Mailing list pgsql-hackers
From | Christian Marie |
---|---|
Subject | [PATCH] RFC: Add length parameterised dmetaphone functions |
Date | |
Msg-id | 1446786049-6677-1-git-send-email-christian@ponies.io Whole thread Raw |
Responses |
Re: [PATCH] RFC: Add length parameterised dmetaphone functions
Re: [PATCH] RFC: Add length parameterised dmetaphone functions |
List | pgsql-hackers |
A developer I work with was trying to use dmetaphone to group people names into equivalence classes. He found that many long names would be grouped together when they shouldn't be, this turned out to be because dmetaphone has an undocumented upper bound on its output length, of four. This is obviously impractical for many use cases. This patch addresses this by adding and documenting an optional argument to dmetaphone and dmetaphone_alt that specifies the maximum output length. This makes it possible to use dmetaphone on much longer inputs. Backwards compatibility is catered for by making the new argument optional, defaulting to the old, hard-coded value of four. We now have: dmetaphone(text source) returns textdmetaphone(text source, int max_output_length) returns textdmetaphone_alt(text source)returns textdmetaphone_alt(text source, int max_output_length) returns text ---contrib/fuzzystrmatch/Makefile | 2 +-contrib/fuzzystrmatch/dmetaphone.c | 46 ++++++++++++++------contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql| 12 ++++++contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql | 44 -------------------contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql | 52 +++++++++++++++++++++++contrib/fuzzystrmatch/fuzzystrmatch.control | 2 +-doc/src/sgml/fuzzystrmatch.sgml | 18 ++++++--7 files changed, 115 insertions(+), 61 deletions(-)create mode 100644 contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sqldeletemode 100644 contrib/fuzzystrmatch/fuzzystrmatch--1.0.sqlcreate mode100644 contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile index 0327d9510a50..fee85ad1ec36 100644 --- a/contrib/fuzzystrmatch/Makefile +++ b/contrib/fuzzystrmatch/Makefile @@ -4,7 +4,7 @@ MODULE_big = fuzzystrmatchOBJS = fuzzystrmatch.o dmetaphone.o $(WIN32RES)EXTENSION = fuzzystrmatch -DATA = fuzzystrmatch--1.0.sql fuzzystrmatch--unpackaged--1.0.sql +DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--unpackaged--1.0.sql fuzzystrmatch--1.0--1.1.sqlPGFILEDESC = "fuzzystrmatch- similarities and distance between strings"ifdef USE_PGXS diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c index 147c8501ee89..7afcf0aee841 100644 --- a/contrib/fuzzystrmatch/dmetaphone.c +++ b/contrib/fuzzystrmatch/dmetaphone.c @@ -115,10 +115,28 @@ The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and#include <assert.h>/* prototypefor the main function we got from the perl module */ -static void DoubleMetaphone(char *, char **); +static void DoubleMetaphone(char *, char **, int);#ifndef DMETAPHONE_MAIN +/* Fetch and validate the optional second function argument, which should be a + * maximum length for output + * + * Defaults to 4. + * + * Introduced in 1.1 + */ +inline int +fetch_max_len_arg(PG_FUNCTION_ARGS) +{ + int max_len = (PG_NARGS() > 1) ? PG_GETARG_INT32(1) : 4; + if(max_len < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid max output length, should be >= 1"))); + return max_len; +} +/* * The PostgreSQL visible dmetaphone function. */ @@ -128,6 +146,7 @@ PG_FUNCTION_INFO_V1(dmetaphone);Datumdmetaphone(PG_FUNCTION_ARGS){ + int max_len; text *arg; char *aptr, *codes[2], @@ -139,8 +158,9 @@ dmetaphone(PG_FUNCTION_ARGS)#endif arg = PG_GETARG_TEXT_P(0); aptr = text_to_cstring(arg); + max_len = fetch_max_len_arg(fcinfo); - DoubleMetaphone(aptr, codes); + DoubleMetaphone(aptr, codes, max_len); code = codes[0]; if (!code) code = ""; @@ -157,6 +177,7 @@ PG_FUNCTION_INFO_V1(dmetaphone_alt);Datumdmetaphone_alt(PG_FUNCTION_ARGS){ + int max_len; text *arg; char *aptr, *codes[2], @@ -168,8 +189,9 @@ dmetaphone_alt(PG_FUNCTION_ARGS)#endif arg = PG_GETARG_TEXT_P(0); aptr = text_to_cstring(arg); + max_len = fetch_max_len_arg(fcinfo); - DoubleMetaphone(aptr, codes); + DoubleMetaphone(aptr, codes, max_len); code = codes[1]; if (!code) code = ""; @@ -390,7 +412,7 @@ MetaphAdd(metastring *s, char *new_str)static void -DoubleMetaphone(char *str, char **codes) +DoubleMetaphone(char *str, char **codes, int max_len){ int length; metastring *original; @@ -427,7 +449,7 @@ DoubleMetaphone(char *str, char **codes) } /* main loop */ - while ((primary->length < 4) || (secondary->length < 4)) + while ((primary->length < max_len) || (secondary->length < max_len)) { if (current >= length) break; @@ -1410,11 +1432,11 @@ DoubleMetaphone(char *str, char **codes) } - if (primary->length > 4) - SetAt(primary, 4, '\0'); + if (primary->length > max_len) + SetAt(primary, max_len, '\0'); - if (secondary->length > 4) - SetAt(secondary, 4, '\0'); + if (secondary->length > max_len) + SetAt(secondary, max_len, '\0'); *codes = primary->str; *++codes = secondary->str; @@ -1428,13 +1450,13 @@ DoubleMetaphone(char *str, char **codes)/* just for testing - not part of the perl code */ -main(int argc, char **argv) +void main(int argc, char **argv){ char *codes[2]; - if (argc > 1) + if (argc > 2) { - DoubleMetaphone(argv[1], codes); + DoubleMetaphone(argv[1], codes, atoi(argv[2])); printf("%s|%s\n", codes[0], codes[1]); }} diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql new file mode 100644 index 000000000000..b00112c6bc9d --- /dev/null +++ b/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql @@ -0,0 +1,12 @@ +/* contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.1'" to load this file. \quit + +CREATE FUNCTION dmetaphone (text, int) RETURNS text +AS 'MODULE_PATHNAME', 'dmetaphone' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION dmetaphone_alt (text, int) RETURNS text +AS 'MODULE_PATHNAME', 'dmetaphone_alt' +LANGUAGE C IMMUTABLE STRICT; diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql deleted file mode 100644 index 1cf9b6188cc0..000000000000 --- a/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql +++ /dev/null @@ -1,44 +0,0 @@ -/* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */ - --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit - -CREATE FUNCTION levenshtein (text,text) RETURNS int -AS 'MODULE_PATHNAME','levenshtein' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int -AS 'MODULE_PATHNAME','levenshtein_with_costs' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int -AS 'MODULE_PATHNAME','levenshtein_less_equal' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int -AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION metaphone (text,int) RETURNS text -AS 'MODULE_PATHNAME','metaphone' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION soundex(text) RETURNS text -AS 'MODULE_PATHNAME', 'soundex' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION text_soundex(text) RETURNS text -AS 'MODULE_PATHNAME', 'soundex' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION difference(text,text) RETURNS int -AS 'MODULE_PATHNAME', 'difference' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION dmetaphone (text) RETURNS text -AS 'MODULE_PATHNAME', 'dmetaphone' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION dmetaphone_alt (text) RETURNS text -AS 'MODULE_PATHNAME', 'dmetaphone_alt' -LANGUAGE C IMMUTABLE STRICT; diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql new file mode 100644 index 000000000000..e0eba457892b --- /dev/null +++ b/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql @@ -0,0 +1,52 @@ +/* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit + +CREATE FUNCTION levenshtein (text,text) RETURNS int +AS 'MODULE_PATHNAME','levenshtein' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int +AS 'MODULE_PATHNAME','levenshtein_with_costs' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int +AS 'MODULE_PATHNAME','levenshtein_less_equal' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int +AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION metaphone (text,int) RETURNS text +AS 'MODULE_PATHNAME','metaphone' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION soundex(text) RETURNS text +AS 'MODULE_PATHNAME', 'soundex' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION text_soundex(text) RETURNS text +AS 'MODULE_PATHNAME', 'soundex' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION difference(text,text) RETURNS int +AS 'MODULE_PATHNAME', 'difference' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION dmetaphone (text) RETURNS text +AS 'MODULE_PATHNAME', 'dmetaphone' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION dmetaphone_alt (text) RETURNS text +AS 'MODULE_PATHNAME', 'dmetaphone_alt' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION dmetaphone (text, int) RETURNS text +AS 'MODULE_PATHNAME', 'dmetaphone' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION dmetaphone_alt (text, int) RETURNS text +AS 'MODULE_PATHNAME', 'dmetaphone_alt' +LANGUAGE C IMMUTABLE STRICT; diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.control b/contrib/fuzzystrmatch/fuzzystrmatch.control index e257f0961171..6b2832ae8d95 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.control +++ b/contrib/fuzzystrmatch/fuzzystrmatch.control @@ -1,5 +1,5 @@# fuzzystrmatch extensioncomment = 'determine similarities and distance between strings' -default_version = '1.0' +default_version = '1.1'module_pathname = '$libdir/fuzzystrmatch'relocatable = true diff --git a/doc/src/sgml/fuzzystrmatch.sgml b/doc/src/sgml/fuzzystrmatch.sgml index f26bd90dfc5f..3898d16af22c 100644 --- a/doc/src/sgml/fuzzystrmatch.sgml +++ b/doc/src/sgml/fuzzystrmatch.sgml @@ -210,7 +210,9 @@ test=# SELECT metaphone('GUMBO', 4);<synopsis>dmetaphone(text source) returns text +dmetaphone(text source, int max_output_length) returns textdmetaphone_alt(text source) returns text +dmetaphone_alt(text source, int max_output_length) returns text</synopsis> <para> @@ -218,14 +220,24 @@ dmetaphone_alt(text source) returns text </para> <para> + max_output_length defaults to 4 and must be greater than 1 + </para> + + <para> Example: </para><screen> -test=# select dmetaphone('gumbo'); - dmetaphone +postgres=# select dmetaphone('unicorns'); + dmetaphone +------------ + ANKR +(1 row) + +postgres=# select dmetaphone('unicorns', 8); + dmetaphone ------------ - KMP + ANKRNS(1 row)</screen> </sect2> -- 2.6.2
pgsql-hackers by date: