[HACKERS] Tightening isspace() tests where behavior should match SQL parser - Mailing list pgsql-hackers

From Tom Lane
Subject [HACKERS] Tightening isspace() tests where behavior should match SQL parser
Date
Msg-id 10129.1495302480@sss.pgh.pa.us
Whole thread Raw
Responses Re: [HACKERS] Tightening isspace() tests where behavior should matchSQL parser
List pgsql-hackers
The proximate cause of bug #14662,
https://www.postgresql.org/message-id/flat/20170519162316.29945.5021%40wrigleys.postgresql.org
appears to be that SplitIdentifierString's isspace() checks are
identifying some bytes of a multibyte character as spaces.  It's not
quite clear to me whether there's an encoding configuration problem
involved in this specific report, but in any case feeding individual
bytes of a multibyte character to <ctype.h> functions is highly dubious.
The best you can say about that is that the behavior will be
platform-dependent.

I think that the easiest way to resolve this is to replace the isspace()
calls with scanner_isspace(), on the grounds that we are trying to parse
identifiers the same way that the core SQL lexer would, and therefore we
should use its definition of whitespace.

I went looking through all our other calls of isspace() to see if we
have the same problem anywhere else, and identified these functions
as being at risk:

parse_ident()
regproc.c's parseNameAndArgTypes (for regprocedurein and siblings)
SplitIdentifierString()
SplitDirectoriesString()

In all four cases, we must allow for non-ASCII input and it's
arguable that correct behavior is to match the core lexer,
so I propose replacing isspace() with scanner_isspace() in
these places.

There are quite a lot more isspace() calls than that of course,
but the rest of them seem probably all right to me.  As an
example, I don't see a need to make float8in() stricter about
what it will allow as trailing whitespace.  We're not expecting
any non-ASCII input really, and if we do see multibyte characters
and all the bytes manage to pass isspace(), not much harm is done.

Attached is a proposed patch.  I'm vacillating on whether to
back-patch this --- it will fix a reported bug, but it seems
at least somewhat conceivable that it will also break cases
that were working acceptably before.  Thoughts?

            regards, tom lane

diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 7dcecb2..9cc0b08 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -770,7 +770,7 @@ parse_ident(PG_FUNCTION_ARGS)
     nextp = qualname_str;

     /* skip leading whitespace */
-    while (isspace((unsigned char) *nextp))
+    while (scanner_isspace(*nextp))
         nextp++;

     for (;;)
@@ -858,14 +858,14 @@ parse_ident(PG_FUNCTION_ARGS)
                                 text_to_cstring(qualname))));
         }

-        while (isspace((unsigned char) *nextp))
+        while (scanner_isspace(*nextp))
             nextp++;

         if (*nextp == '.')
         {
             after_dot = true;
             nextp++;
-            while (isspace((unsigned char) *nextp))
+            while (scanner_isspace(*nextp))
                 nextp++;
         }
         else if (*nextp == '\0')
diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c
index a90452c..ba879e8 100644
--- a/src/backend/utils/adt/regproc.c
+++ b/src/backend/utils/adt/regproc.c
@@ -32,6 +32,7 @@
 #include "lib/stringinfo.h"
 #include "miscadmin.h"
 #include "parser/parse_type.h"
+#include "parser/scansup.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
@@ -1769,7 +1770,7 @@ parseNameAndArgTypes(const char *string, bool allowNone, List **names,
     ptr2 = ptr + strlen(ptr);
     while (--ptr2 > ptr)
     {
-        if (!isspace((unsigned char) *ptr2))
+        if (!scanner_isspace(*ptr2))
             break;
     }
     if (*ptr2 != ')')
@@ -1786,7 +1787,7 @@ parseNameAndArgTypes(const char *string, bool allowNone, List **names,
     for (;;)
     {
         /* allow leading whitespace */
-        while (isspace((unsigned char) *ptr))
+        while (scanner_isspace(*ptr))
             ptr++;
         if (*ptr == '\0')
         {
@@ -1842,7 +1843,7 @@ parseNameAndArgTypes(const char *string, bool allowNone, List **names,
         /* Lop off trailing whitespace */
         while (--ptr2 >= typename)
         {
-            if (!isspace((unsigned char) *ptr2))
+            if (!scanner_isspace(*ptr2))
                 break;
             *ptr2 = '\0';
         }
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index be399f4..a0dd391 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -3252,7 +3252,7 @@ SplitIdentifierString(char *rawstring, char separator,

     *namelist = NIL;

-    while (isspace((unsigned char) *nextp))
+    while (scanner_isspace(*nextp))
         nextp++;                /* skip leading whitespace */

     if (*nextp == '\0')
@@ -3290,7 +3290,7 @@ SplitIdentifierString(char *rawstring, char separator,

             curname = nextp;
             while (*nextp && *nextp != separator &&
-                   !isspace((unsigned char) *nextp))
+                   !scanner_isspace(*nextp))
                 nextp++;
             endp = nextp;
             if (curname == nextp)
@@ -3312,13 +3312,13 @@ SplitIdentifierString(char *rawstring, char separator,
             pfree(downname);
         }

-        while (isspace((unsigned char) *nextp))
+        while (scanner_isspace(*nextp))
             nextp++;            /* skip trailing whitespace */

         if (*nextp == separator)
         {
             nextp++;
-            while (isspace((unsigned char) *nextp))
+            while (scanner_isspace(*nextp))
                 nextp++;        /* skip leading whitespace for next */
             /* we expect another name, so done remains false */
         }
@@ -3377,7 +3377,7 @@ SplitDirectoriesString(char *rawstring, char separator,

     *namelist = NIL;

-    while (isspace((unsigned char) *nextp))
+    while (scanner_isspace(*nextp))
         nextp++;                /* skip leading whitespace */

     if (*nextp == '\0')
@@ -3414,7 +3414,7 @@ SplitDirectoriesString(char *rawstring, char separator,
             while (*nextp && *nextp != separator)
             {
                 /* trailing whitespace should not be included in name */
-                if (!isspace((unsigned char) *nextp))
+                if (!scanner_isspace(*nextp))
                     endp = nextp + 1;
                 nextp++;
             }
@@ -3422,13 +3422,13 @@ SplitDirectoriesString(char *rawstring, char separator,
                 return false;    /* empty unquoted name not allowed */
         }

-        while (isspace((unsigned char) *nextp))
+        while (scanner_isspace(*nextp))
             nextp++;            /* skip trailing whitespace */

         if (*nextp == separator)
         {
             nextp++;
-            while (isspace((unsigned char) *nextp))
+            while (scanner_isspace(*nextp))
                 nextp++;        /* skip leading whitespace for next */
             /* we expect another name, so done remains false */
         }

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

pgsql-hackers by date:

Previous
From: Andres Freund
Date:
Subject: Re: [HACKERS] Making replication commands case-insensitive
Next
From: Pavel Stehule
Date:
Subject: Re: [HACKERS] proposal psql \gdesc