Re: PostgreSQL 9.3.5 substring(text from pattern for escape) bug - Mailing list pgsql-bugs

From Tom Lane
Subject Re: PostgreSQL 9.3.5 substring(text from pattern for escape) bug
Date
Msg-id 20583.1557680147@sss.pgh.pa.us
Whole thread Raw
In response to Re: PostgreSQL 9.3.5 substring(text from pattern for escape) bug  (Andrew Gierth <andrew@tao11.riddles.org.uk>)
Responses Re: PostgreSQL 9.3.5 substring(text from pattern for escape) bug  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-bugs
Andrew Gierth <andrew@tao11.riddles.org.uk> writes:
> "Tom" == Tom Lane <tgl@sss.pgh.pa.us> writes:
>  Tom> Huh, interesting. So we should be translating the initial
>  Tom> substring to a non-greedy pattern. I believe Spencer's engine can
>  Tom> handle that by sticking (?:...){1,1}? around it.

> Your suggested fix doesn't seem to work. If the leading/trailing
> substrings do not contain | or parens, then it seems to work to wrap
> them in (?:(?:)??...), thanks to the rule that the first quantified
> atom in a subexpression sets the whole subexpression's greediness, but
> handling | or parens correctly seems harder.

[ pokes at that... ]  Huh.  That's a bug, which as far as I can see has
been present since Henry Spencer's original code: it optimizes away a
{1,1} quantifier without regard to whether the quantifier is attempting
to impose a different greediness preference than its argument would have
naturally.  The attached patch seems to fix it.

            regards, tom lane

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index eb1f3d5..8cd7d56 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1155,7 +1155,10 @@ parseqatom(struct vars *v,
         /* rest of branch can be strung starting from atom->end */
         s2 = atom->end;
     }
-    else if (m == 1 && n == 1)
+    else if (m == 1 && n == 1 &&
+             (qprefer == 0 ||
+              (atom->flags & (LONGER | SHORTER | MIXED)) == 0 ||
+              qprefer == (atom->flags & (LONGER | SHORTER | MIXED))))
     {
         /* no/vacuous quantifier:  done */
         EMPTYARC(s, atom->begin);    /* empty prefix */
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out
index c0bfa8a..f372003 100644
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -492,6 +492,55 @@ select regexp_matches('foo/bar/baz',
  {foo,bar,baz}
 (1 row)

+-- Test that greediness can be overridden by outer quantifier
+select regexp_matches('llmmmfff', '^(l*)(.*)(f*)$');
+ regexp_matches
+----------------
+ {ll,mmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*){1,1}(.*)(f*)$');
+ regexp_matches
+----------------
+ {ll,mmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*)(f*)$');
+  regexp_matches
+------------------
+ {"",llmmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*){1,1}?(f*)$');
+ regexp_matches
+----------------
+ {"",llmmm,fff}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?)(.*)(f*)$');
+  regexp_matches
+------------------
+ {"",llmmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?){1,1}(.*)(f*)$');
+ regexp_matches
+----------------
+ {ll,mmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*)(f*)$');
+  regexp_matches
+------------------
+ {"",llmmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*){1,1}?(f*)$');
+ regexp_matches
+----------------
+ {"",llmmm,fff}
+(1 row)
+
 -- Test for infinite loop in cfindloop with zero-length possible match
 -- but no actual match (can only happen in the presence of backrefs)
 select 'a' ~ '$()|^\1';
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql
index 1361b62..a174224 100644
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -118,6 +118,16 @@ select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');
 select regexp_matches('foo/bar/baz',
                       '^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');

+-- Test that greediness can be overridden by outer quantifier
+select regexp_matches('llmmmfff', '^(l*)(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*){1,1}(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*){1,1}?(f*)$');
+select regexp_matches('llmmmfff', '^(l*?)(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*?){1,1}(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*){1,1}?(f*)$');
+
 -- Test for infinite loop in cfindloop with zero-length possible match
 -- but no actual match (can only happen in the presence of backrefs)
 select 'a' ~ '$()|^\1';

pgsql-bugs by date:

Previous
From: Andrew Gierth
Date:
Subject: Re: PostgreSQL 9.3.5 substring(text from pattern for escape) bug
Next
From: Amit Langote
Date:
Subject: Re: inconsistent results querying table partitioned by date