Re: [PATCHES] dollar quoting - Mailing list pgsql-hackers

From Andrew Dunstan
Subject Re: [PATCHES] dollar quoting
Date
Msg-id 402F840A.9020208@dunslane.net
Whole thread Raw
In response to Re: [PATCHES] dollar quoting  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: [PATCHES] dollar quoting
List pgsql-hackers

Tom Lane wrote:

>Andrew Dunstan <andrew@dunslane.net> writes:
>
>
>>Tom Lane wrote:
>>
>>
>>>... But how about
>>>42$foo$
>>>This is a syntax error in 7.4, and we propose to redefine it as an
>>>integer literal '42' followed by a dollar-quote start symbol.
>>>
>>>
>
>
>
>>The test should not succeed anywhere in the string '42$foo$'.
>>
>>
>
>No, it won't.  The problem is that it should, because the backend will
>see that as '42' followed by a $foo$ quote start.
>

Ok, I see what you are saying. This mismatch would only happen on
invalid input, though. I believe that what I did will work on all legal
input.

I think that this might be cured by having psql recognise a legal
identifier or keyword and eat it as a word, rather than treating it
as just another set of bytes in the stream. That would enable us to
avoid the lookback in the dollar-quote recognition test altogether. The
attached patch does it that way - the keyword/identifier test needs to
come right at the end of the loop to avoid clashing with backslash
commands, btw.

I *think* that this way psql will recognise the start of a dollar quote
iff the backend lexer would.

>
>
>
>>Interacting with lexer states would probably be ... unpleasant. Matching
>>a stream oriented lexer with a line oriented CLI would be messy I suspect.
>>
>>
>
>I think it would not be that bad.  We'd have to run the lexer on the
>command input buffer and see what state it terminates in.
>
>
>

Yeah. I am not enough of a flex wizard to undertake the task, though. It
would take me lots of time. If we make a decision that we really need
this in order to do dollar quoting in psql I would need some substantial
help, at least.

cheers

andrew
Index: src/bin/psql/mainloop.c
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/bin/psql/mainloop.c,v
retrieving revision 1.61
diff -c -r1.61 mainloop.c
*** src/bin/psql/mainloop.c    25 Jan 2004 03:07:22 -0000    1.61
--- src/bin/psql/mainloop.c    15 Feb 2004 14:28:02 -0000
***************
*** 21,26 ****
--- 21,61 ----
  sigjmp_buf    main_loop_jmp;
  #endif

+ /*
+  * Return true if dquote begins with a dollar-quote delimiter ($tag$ or $$).
+  * NOTE(review): isalpha()/isalnum() are passed plain char -- confirm safe.
+  */
+
+ static bool valid_dolquote(char * dquote)
+ {
+     int i;
+
+     /* a delimiter must start with a $ */
+     if (dquote[0] != '$')
+         return false;
+
+     /* empty tag: "$$" is a valid delimiter */
+     if (dquote[1] == '$')
+         return true;
+
+     /* first tag char must be a letter or have high bit set */
+     if (!isalpha(dquote[1]) && (dquote[1] & 0x80) == 0)
+         return false;
+
+     /* rest of tag, up to the closing $: alphanumeric, _ or high bit set */
+     for (i = 2; dquote[i] != '$'; i++)
+     {
+         if ((dquote[i] & 0x80) == 0 && ! isalnum(dquote[i]) &&
+             dquote[i] != '_')
+         {
+             /* invalid character; a NUL (no closing $) also lands here */
+             return false;
+         }
+     }
+
+     return true;
+ }
+

  /*
   * Main processing loop for reading lines of input
***************
*** 49,54 ****
--- 84,92 ----
      unsigned int query_start;
      volatile int count_eof = 0;
      volatile unsigned int bslash_count = 0;
+     volatile bool free_dolquote = false;
+     char *dol_quote = NULL;
+

      int            i,
                  prevlen,
***************
*** 120,125 ****
--- 158,164 ----
                  in_quote = 0;
                  paren_level = 0;
                  count_eof = 0;
+                 free_dolquote = true;
                  slashCmdStatus = CMD_UNKNOWN;
              }
              else
***************
*** 136,141 ****
--- 175,190 ----
          pqsignal(SIGINT, handle_sigint);        /* control-C => cancel */
  #endif   /* not WIN32 */

+         if (free_dolquote)
+         {
+             if(dol_quote)
+             {
+                 free(dol_quote);
+                 dol_quote = NULL;
+             }
+             free_dolquote = false;
+         }
+
          fflush(stdout);

          if (slashCmdStatus == CMD_NEWEDIT)
***************
*** 150,155 ****
--- 199,209 ----
              in_xcomment = 0;
              in_quote = 0;
              paren_level = 0;
+             if(dol_quote)
+             {
+                 free(dol_quote);
+                 dol_quote = NULL;
+             }
              slashCmdStatus = CMD_UNKNOWN;
          }

***************
*** 161,167 ****
          {
              int            prompt_status;

!             if (in_quote && in_quote == '\'')
                  prompt_status = PROMPT_SINGLEQUOTE;
              else if (in_quote && in_quote == '"')
                  prompt_status = PROMPT_DOUBLEQUOTE;
--- 215,223 ----
          {
              int            prompt_status;

!             if (dol_quote)
!                 prompt_status = PROMPT_DOLLARQUOTE;
!             else if (in_quote && in_quote == '\'')
                  prompt_status = PROMPT_SINGLEQUOTE;
              else if (in_quote && in_quote == '"')
                  prompt_status = PROMPT_DOUBLEQUOTE;
***************
*** 268,273 ****
--- 324,343 ----
                      in_quote = 0;
              }

+             /* in or end of $foo$ type quote? */
+
+             else if (dol_quote)
+             {
+                 if (strncmp(line+i,dol_quote,strlen(dol_quote)) == 0)
+                 {
+                     ADVANCE_1;
+                     while(line[i] != '$')
+                         ADVANCE_1;
+                     free(dol_quote);
+                     dol_quote = NULL;
+                 }
+             }
+
              /* start of extended comment? */
              else if (line[i] == '/' && line[i + thislen] == '*')
              {
***************
*** 288,293 ****
--- 358,383 ----
              else if (line[i] == '\'' || line[i] == '"')
                  in_quote = line[i];

+             /*
+              * start of $foo$ type quote?
+              */
+             else if (!dol_quote && valid_dolquote(line+i))
+             {
+                 char * dol_end;
+                 char eos;
+
+                 dol_end = strchr(line+i+1,'$');
+                 dol_end ++;
+                 eos = *dol_end;
+                 *dol_end = '\0';
+                 dol_quote = pg_strdup(line+i);
+                 *dol_end = eos;
+                 ADVANCE_1;
+                 while(line[i] != '$')
+                     ADVANCE_1;
+
+             }
+
              /* single-line comment? truncate line */
              else if (line[i] == '-' && line[i + thislen] == '-')
              {
***************
*** 447,452 ****
--- 537,566 ----
                  i = end_of_cmd - line;
                  query_start = i;
              }
+
+             /*
+              * keyword or identifier?
+              * We grab the whole string so that we don't
+              * mistakenly see $foo$ inside an identifier as the start
+              * of a dollar quote.
+              */
+
+             else if ( (line[i] & 0x80) != 0 ||
+                       isalpha(line[i]) ||
+                       line[i] == '_')
+             {
+                 while ((line[i+thislen] & 0x80) != 0 ||
+                        isalnum(line[i+thislen]) ||
+                        line[i+thislen] == '_' ||
+                        line[i+thislen] == '$' )
+                 {
+                     /* keep going while we still have identifier chars */
+                     ADVANCE_1;
+                 }
+
+
+             }
+
          }                        /* for (line) */


***************
*** 458,464 ****


          /* Put the rest of the line in the query buffer. */
!         if (in_quote || line[query_start + strspn(line + query_start, " \t\n\r")] != '\0')
          {
              if (query_buf->len > 0)
                  appendPQExpBufferChar(query_buf, '\n');
--- 572,579 ----


          /* Put the rest of the line in the query buffer. */
!         if (in_quote || dol_quote ||
!             line[query_start + strspn(line + query_start, " \t\n\r")] != '\0')
          {
              if (query_buf->len > 0)
                  appendPQExpBufferChar(query_buf, '\n');
Index: src/bin/psql/prompt.c
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/bin/psql/prompt.c,v
retrieving revision 1.34
diff -c -r1.34 prompt.c
*** src/bin/psql/prompt.c    25 Jan 2004 03:07:22 -0000    1.34
--- src/bin/psql/prompt.c    15 Feb 2004 14:28:02 -0000
***************
*** 85,90 ****
--- 85,91 ----
          case PROMPT_CONTINUE:
          case PROMPT_SINGLEQUOTE:
          case PROMPT_DOUBLEQUOTE:
+         case PROMPT_DOLLARQUOTE:
          case PROMPT_COMMENT:
          case PROMPT_PAREN:
              prompt_name = "PROMPT2";
***************
*** 198,203 ****
--- 199,207 ----
                              break;
                          case PROMPT_DOUBLEQUOTE:
                              buf[0] = '"';
+                             break;
+                         case PROMPT_DOLLARQUOTE:
+                             buf[0] = '$';
                              break;
                          case PROMPT_COMMENT:
                              buf[0] = '*';
Index: src/bin/psql/prompt.h
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/bin/psql/prompt.h,v
retrieving revision 1.13
diff -c -r1.13 prompt.h
*** src/bin/psql/prompt.h    29 Nov 2003 19:52:07 -0000    1.13
--- src/bin/psql/prompt.h    15 Feb 2004 14:28:02 -0000
***************
*** 15,20 ****
--- 15,21 ----
      PROMPT_COMMENT,
      PROMPT_SINGLEQUOTE,
      PROMPT_DOUBLEQUOTE,
+     PROMPT_DOLLARQUOTE,
      PROMPT_PAREN,
      PROMPT_COPY
  } promptStatus_t;
Index: src/backend/parser/scan.l
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/backend/parser/scan.l,v
retrieving revision 1.112
diff -c -r1.112 scan.l
*** src/backend/parser/scan.l    29 Nov 2003 19:51:52 -0000    1.112
--- src/backend/parser/scan.l    15 Feb 2004 14:28:16 -0000
***************
*** 39,44 ****
--- 39,46 ----

  static int        xcdepth = 0;    /* depth of nesting in slash-star comments */

+ static char    *dolqstart;  /* current $foo$ quote start string */
+
  /*
   * literalbuf is used to accumulate literal values when multiple rules
   * are needed to parse a single literal.  Call startlit to reset buffer
***************
*** 95,100 ****
--- 97,103 ----
   *  <xd> delimited identifiers (double-quoted identifiers)
   *  <xh> hexadecimal numeric string
   *  <xq> quoted strings
+  *  <dolq> $foo$-style quoted strings
   */

  %x xb
***************
*** 102,107 ****
--- 105,111 ----
  %x xd
  %x xh
  %x xq
+ %x dolq

  /* Bit string
   * It is tempting to scan the string for only those characters
***************
*** 141,146 ****
--- 145,159 ----
  xqoctesc        [\\][0-7]{1,3}
  xqcat            {quote}{whitespace_with_newline}{quote}

+ /* $foo$ style quotes ("dollar quoting")
+  * The quoted string starts with $foo$, where "foo" is an optional string
+  * in the form of an identifier, except that it may not contain "$".
+  * The quoted text extends to the first subsequent occurrence of an
+  * identical $foo$ delimiter.  There is *no* processing of the quoted text.
+  */
+ dolqdelim   \$([A-Za-z\200-\377][A-Za-z\200-\377_0-9]*)?\$
+ dolqinside  [^$]+
+
  /* Double quote
   * Allows embedded spaces and other special characters into identifiers.
   */
***************
*** 387,392 ****
--- 400,434 ----
                  }
  <xq><<EOF>>        { yyerror("unterminated quoted string"); }

+ {dolqdelim}  {
+      token_start = yytext;
+      dolqstart = pstrdup(yytext);
+      BEGIN(dolq);
+      startlit();
+     }
+ <dolq>{dolqdelim} {
+      if (strcmp(yytext, dolqstart) == 0)
+      {
+       pfree(dolqstart);
+       BEGIN(INITIAL);
+       yylval.str = litbufdup();
+       return SCONST;
+      }
+      /*
+       * When we fail to match $...$ to dolqstart, transfer
+       * the $... part to the output, but put back the final
+       * $ for rescanning.  Consider $delim$...$junk$delim$
+       */
+      addlit(yytext, yyleng-1);
+      yyless(yyleng-1);
+     }
+ <dolq>{dolqinside}  {
+      addlit(yytext, yyleng);
+     }
+ <dolq>.           {
+      addlitchar(yytext[0]);
+     }
+ <dolq><<EOF>>  { yyerror("unterminated special-quoted string"); }

  {xdstart}        {
                      token_start = yytext;

pgsql-hackers by date:

Previous
From: Florian Weimer
Date:
Subject: Re: [pgsql-hackers-win32] Sync vs. fsync during checkpoint
Next
From: Tom Lane
Date:
Subject: Re: [pgsql-hackers-win32] Sync vs. fsync during checkpoint