LIKE optimization in UTF-8 and locale-C - Mailing list pgsql-hackers

From ITAGAKI Takahiro
Subject LIKE optimization in UTF-8 and locale-C
Date
Msg-id 20070322143734.62FA.ITAGAKI.TAKAHIRO@oss.ntt.co.jp
Whole thread Raw
Responses Re: LIKE optimization in UTF-8 and locale-C  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
Hello,

I found LIKE operators are slower on multi-byte encoding databases
than single-byte encoding ones. It comes from difference between
MatchText() and MBMatchText().

We've had an optimization for single-byte encodings using
pg_database_encoding_max_length() == 1 test. I'll propose to extend it
in UTF-8 with locale-C case. All of trailing bytes are different from first
bytes in UTF-8 multi-byte characters, so we can use functions for single-bytes
and byte-wise comparison in the case. With the attached patch, the performance
of UTF-8 LIKE operators are pushed up to near other single-bytes encodings.

Databases initialized with locale-C are widely used in Japan, because
Japanese locale are broken in almost of platforms. Japanese user can
choose EUC-jp or UTF-8 as a server encoding, but I think UTF-8 will be
more and more used in the future.


---- test ----

initdb --no-locale --encoding=<encoding>

[HEAD]
SQL_ASCII :  7171 ms /  7203 ms /  7187 ms
LATIN1    :  7172 ms /  7156 ms /  7141 ms
UTF8      : 16235 ms / 16281 ms / 16281 ms
EUC_JP    : 17454 ms / 17453 ms / 17438 ms

[with patch]
SQL_ASCII :  7062 ms /  7125 ms /  7125 ms
LATIN1    :  7047 ms /  7063 ms /  7047 ms
UTF8      :  7188 ms /  7234 ms /  7235 ms
EUC_JP    : 17468 ms / 17453 ms / 17453 ms


CREATE OR REPLACE FUNCTION test() RETURNS INTEGER AS $$
DECLARE
    cnt integer;
BEGIN
    FOR i IN 1..1000 LOOP
        SELECT count(*) INTO cnt FROM item WHERE i_title LIKE '%BABABABABARIBA%' LIMIT 50;
    END LOOP;
    RETURN cnt;
END;
$$ LANGUAGE plpgsql;

SELECT count(*) FROM item; -- borrowed from DBT-1 (TPC-W)
 count
-------
 10000
(1 row)

---- patch ----

Index: src/backend/utils/adt/like.c
===================================================================
--- src/backend/utils/adt/like.c    (head)
+++ src/backend/utils/adt/like.c    (working copy)
@@ -21,6 +21,7 @@

 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
+#include "utils/pg_locale.h"


 #define LIKE_TRUE                        1
@@ -119,6 +120,13 @@


 /*
+ * true iff match functions for single-byte characters are available.
+ */
+#define sb_match_available() \
+    (pg_database_encoding_max_length() == 1 || \
+     (lc_collate_is_c() && GetDatabaseEncoding() == PG_UTF8))
+
+/*
  *    interface routines called by the function manager
  */

@@ -138,7 +146,7 @@
     p = VARDATA(pat);
     plen = (VARSIZE(pat) - VARHDRSZ);

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
         result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
     else
         result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
@@ -162,7 +170,7 @@
     p = VARDATA(pat);
     plen = (VARSIZE(pat) - VARHDRSZ);

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
         result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
     else
         result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
@@ -186,7 +194,7 @@
     p = VARDATA(pat);
     plen = (VARSIZE(pat) - VARHDRSZ);

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
         result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
     else
         result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
@@ -210,7 +218,7 @@
     p = VARDATA(pat);
     plen = (VARSIZE(pat) - VARHDRSZ);

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
         result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
     else
         result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
@@ -275,7 +283,7 @@
     int            slen,
                 plen;

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
     {
         s = NameStr(*str);
         slen = strlen(s);
@@ -316,7 +324,7 @@
     int            slen,
                 plen;

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
     {
         s = NameStr(*str);
         slen = strlen(s);
@@ -357,7 +365,7 @@
     int            slen,
                 plen;

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
     {
         s = VARDATA(str);
         slen = (VARSIZE(str) - VARHDRSZ);
@@ -393,7 +401,7 @@
     int            slen,
                 plen;

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
     {
         s = VARDATA(str);
         slen = (VARSIZE(str) - VARHDRSZ);
@@ -429,7 +437,7 @@
     text       *esc = PG_GETARG_TEXT_P(1);
     text       *result;

-    if (pg_database_encoding_max_length() == 1)
+    if (sb_match_available())
         result = do_like_escape(pat, esc);
     else
         result = MB_do_like_escape(pat, esc);


Regards,
---
ITAGAKI Takahiro
NTT Open Source Software Center


pgsql-hackers by date:

Previous
From: Tatsuo Ishii
Date:
Subject: Re: Proposal: Adding JIS X 0213 support
Next
From: "Pavan Deolasee"
Date:
Subject: Re: [PATCHES] Bitmapscan changes