Thread: dollar quoting with flex

dollar quoting with flex

From
Andrew Dunstan
Date:
(Fourth try ;-)

Attached is a patch for dollar quoting in the backend and in psql (with
the new flex scanner). I'm fairly confident about the backend (because
this is mainly Tom's work adapted :-) ) but rather less so about psql -
I don't entirely understand all the odd states in psql's scanner. I'm
not sure that I have freed up memory in all the necessary cases. Nor am
I sure what the state is or should be if we end an included file in a
dollar-quoting state, nor how to handle such a situation. So, some extra
eyeballs would be appreciated.

However - it does seem to work in my simple testing.

If this is all OK, the remaining tasks would include pg_dump, docs (Jon
Jensen says he will attack these two) and some regression tests (any
volunteers?)

cheers

andrew
Index: src/backend/parser/scan.l
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/backend/parser/scan.l,v
retrieving revision 1.114
diff -c -r1.114 scan.l
*** src/backend/parser/scan.l    21 Feb 2004 00:34:52 -0000    1.114
--- src/backend/parser/scan.l    24 Feb 2004 17:33:01 -0000
***************
*** 37,42 ****
--- 37,43 ----
  extern YYSTYPE yylval;

  static int        xcdepth = 0;    /* depth of nesting in slash-star comments */
+ static char    *dolqstart;      /* current $foo$ quote start string */

  /*
   * literalbuf is used to accumulate literal values when multiple rules
***************
*** 94,99 ****
--- 95,101 ----
   *  <xd> delimited identifiers (double-quoted identifiers)
   *  <xh> hexadecimal numeric string
   *  <xq> quoted strings
+  *  <dolq> $foo$ quoted strings
   */

  %x xb
***************
*** 101,106 ****
--- 103,109 ----
  %x xd
  %x xh
  %x xq
+ %x dolq

  /*
   * In order to make the world safe for Windows and Mac clients as well as
***************
*** 175,180 ****
--- 178,194 ----
  xqoctesc        [\\][0-7]{1,3}
  xqcat            {quote}{whitespace_with_newline}{quote}

+ /* $foo$ style quotes ("dollar quoting")
+  * The quoted string starts with $foo$ where "foo" is an optional string
+  * in the form of an identifier, except that it may not contain "$",
+  * and extends to the first occurrence of an identical string.
+  * There is *no* processing of the quoted text.
+  */
+ dolq_start        [A-Za-z\200-\377_]
+ dolq_cont        [A-Za-z\200-\377_0-9]
+ dolqdlm         \$({dolq_start}{dolq_cont}*)?\$
+ dolqins         [^$]+
+
  /* Double quote
   * Allows embedded spaces and other special characters into identifiers.
   */
***************
*** 242,248 ****
  other            .

  /*
!  * Quoted strings must allow some special characters such as single-quote
   *  and newline.
   * Embedded single-quotes are implemented both in the SQL standard
   *  style of two adjacent single quotes "''" and in the Postgres/Java style
--- 256,263 ----
  other            .

  /*
!  * Dollar quoted strings are totally opaque, and no escaping is done on them.
!  * Other quoted strings must allow some special characters such as single-quote
   *  and newline.
   * Embedded single-quotes are implemented both in the SQL standard
   *  style of two adjacent single quotes "''" and in the Postgres/Java style
***************
*** 390,395 ****
--- 405,439 ----
                  }
  <xq><<EOF>>        { yyerror("unterminated quoted string"); }

+ {dolqdlm}       {
+                     token_start = yytext;
+                     dolqstart = pstrdup(yytext);
+                     BEGIN(dolq);
+                     startlit();
+                 }
+ <dolq>{dolqdlm} {
+                     if (strcmp(yytext, dolqstart) == 0)
+                     {
+                         pfree(dolqstart);
+                         BEGIN(INITIAL);
+                         yylval.str = litbufdup();
+                         return SCONST;
+                     }
+                     /*
+                      * When we fail to match $...$ to dolqstart, transfer
+                      * the $... part to the output, but put back the final
+                      * $ for rescanning.  Consider $delim$...$junk$delim$
+                      */
+                     addlit(yytext, yyleng-1);
+                     yyless(yyleng-1);
+                 }
+ <dolq>{dolqins} {
+                     addlit(yytext, yyleng);
+                 }
+ <dolq>.         {
+                     addlitchar(yytext[0]);
+                 }
+ <dolq><<EOF>>   { yyerror("unterminated dollar-quoted string"); }
  {xdstart}        {
                      token_start = yytext;
                      BEGIN(xd);
***************
*** 407,413 ****
                      yylval.str = ident;
                      return IDENT;
                  }
! <xd>{xddouble} {
                      addlitchar('"');
                  }
  <xd>{xdinside}    {
--- 451,457 ----
                      yylval.str = ident;
                      return IDENT;
                  }
! <xd>{xddouble}  {
                      addlitchar('"');
                  }
  <xd>{xdinside}    {
Index: src/bin/psql/prompt.c
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/bin/psql/prompt.c,v
retrieving revision 1.34
diff -c -r1.34 prompt.c
*** src/bin/psql/prompt.c    25 Jan 2004 03:07:22 -0000    1.34
--- src/bin/psql/prompt.c    24 Feb 2004 17:33:19 -0000
***************
*** 85,90 ****
--- 85,91 ----
          case PROMPT_CONTINUE:
          case PROMPT_SINGLEQUOTE:
          case PROMPT_DOUBLEQUOTE:
+         case PROMPT_DOLLARQUOTE:
          case PROMPT_COMMENT:
          case PROMPT_PAREN:
              prompt_name = "PROMPT2";
***************
*** 198,203 ****
--- 199,207 ----
                              break;
                          case PROMPT_DOUBLEQUOTE:
                              buf[0] = '"';
+                             break;
+                         case PROMPT_DOLLARQUOTE:
+                             buf[0] = '$';
                              break;
                          case PROMPT_COMMENT:
                              buf[0] = '*';
Index: src/bin/psql/prompt.h
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/bin/psql/prompt.h,v
retrieving revision 1.13
diff -c -r1.13 prompt.h
*** src/bin/psql/prompt.h    29 Nov 2003 19:52:07 -0000    1.13
--- src/bin/psql/prompt.h    24 Feb 2004 17:33:19 -0000
***************
*** 15,20 ****
--- 15,21 ----
      PROMPT_COMMENT,
      PROMPT_SINGLEQUOTE,
      PROMPT_DOUBLEQUOTE,
+     PROMPT_DOLLARQUOTE,
      PROMPT_PAREN,
      PROMPT_COPY
  } promptStatus_t;
Index: src/bin/psql/psqlscan.l
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/bin/psql/psqlscan.l,v
retrieving revision 1.1
diff -c -r1.1 psqlscan.l
*** src/bin/psql/psqlscan.l    19 Feb 2004 19:40:09 -0000    1.1
--- src/bin/psql/psqlscan.l    24 Feb 2004 17:33:19 -0000
***************
*** 92,97 ****
--- 92,98 ----
      int            start_state;    /* saved YY_START */
      int            paren_depth;    /* depth of nesting in parentheses */
      int            xcdepth;        /* depth of nesting in slash-star comments */
+     char        *dolqstart;      /* current $foo$ quote start string */
  } PsqlScanStateData;

  static PsqlScanState cur_state;    /* current state while active */
***************
*** 151,156 ****
--- 152,158 ----
   *  <xd> delimited identifiers (double-quoted identifiers)
   *  <xh> hexadecimal numeric string
   *  <xq> quoted strings
+  *  <dolq> $foo$ quoted strings
   */

  %x xb
***************
*** 158,163 ****
--- 160,166 ----
  %x xd
  %x xh
  %x xq
+ %x dolq
  /* Additional exclusive states for psql only: lex backslash commands */
  %x xslashcmd
  %x xslasharg
***************
*** 241,246 ****
--- 244,260 ----
  xqoctesc        [\\][0-7]{1,3}
  xqcat            {quote}{whitespace_with_newline}{quote}

+ /* $foo$ style quotes ("dollar quoting")
+  * The quoted string starts with $foo$ where "foo" is an optional string
+  * in the form of an identifier, except that it may not contain "$",
+  * and extends to the first occurrence of an identical string.
+  * There is *no* processing of the quoted text.
+  */
+ dolq_start        [A-Za-z\200-\377_]
+ dolq_cont        [A-Za-z\200-\377_0-9]
+ dolqdlm         \$({dolq_start}{dolq_cont}*)?\$
+ dolqins         [^$]+
+
  /* Double quote
   * Allows embedded spaces and other special characters into identifiers.
   */
***************
*** 428,433 ****
--- 442,477 ----
                      ECHO;
                  }

+ {dolqdlm}       {
+                     cur_state->dolqstart = pg_strdup(yytext);
+                     BEGIN(dolq);
+                     ECHO;
+                 }
+ <dolq>{dolqdlm} {
+                     if (strcmp(yytext, cur_state->dolqstart) == 0)
+                     {
+                         free(cur_state->dolqstart);
+                         cur_state->dolqstart = NULL;
+                         BEGIN(INITIAL);
+                         ECHO;
+                     }
+                     else
+                     {
+                         /*
+                          * When we fail to match $...$ to dolqstart, transfer
+                          * the $... part to the output, but put back the final
+                          * $ for rescanning.  Consider $delim$...$junk$delim$
+                          */
+                         emit(yytext, yyleng-1);
+                         yyless(yyleng-1);
+                     }
+                 }
+ <dolq>{dolqins} {
+                     ECHO;
+                 }
+ <dolq>.         {
+                     ECHO;
+                 }
  {xdstart}        {
                      BEGIN(xd);
                      ECHO;
***************
*** 1007,1012 ****
--- 1051,1060 ----
                  case xq:
                      result = PSCAN_INCOMPLETE;
                      *prompt = PROMPT_SINGLEQUOTE;
+                     break;
+                 case dolq:
+                     result = PSCAN_INCOMPLETE;
+                     *prompt = PROMPT_DOLLARQUOTE;
                      break;
                  default:
                      /* can't get here */

Re: dollar quoting with flex

From
Tom Lane
Date:
Andrew Dunstan <andrew@dunslane.net> writes:
> Attached is a patch for dollar quoting in the backend and in psql (with
> the new flex scanner). I'm fairly confident about the backend (because
> this is mainly Tom's work adapted :-) ) but rather less so about psql -
> I don't entirely understand all the odd states in psql's scanner. I'm
> not sure that I have freed up memory in all the necessary cases. Nor am
> I sure what the state is or should be if we end an included file in a
> dollar-quoting state, nor how to handle such a situation. So, some extra
> eyeballs would be appreciated.

I'll take a look soon.  The psql behavior is that a new lexer is
instantiated for each include-file level, which means that quoting
states can't persist across file boundaries.  This emulates the behavior
of the old handmade lexing code, and seems fairly reasonable to me.
(By definition, you weren't in a quoting state when you recognized the
\i command, and so you shouldn't be when you come out of the include
file.)  We could argue about that if people want to reconsider it, but
it seems orthogonal to the dollar-quoting change to me.

            regards, tom lane

Re: dollar quoting with flex

From
Tom Lane
Date:
Andrew Dunstan <andrew@dunslane.net> writes:
> Attached is a patch for dollar quoting in the backend and in psql (with
> the new flex scanner).

Applied with minor fixes.

> If this is all OK, the remaining tasks would include pg_dump, docs (Jon
> Jensen says he will attack these two) and some regression tests (any
> volunteers?)

I think plpgsql's lexer also needs to be taught about dollar-quoting.

            regards, tom lane

dollar quoting for plpgsql

From
Andrew Dunstan
Date:
Tom Lane wrote:

>
>
>I think plpgsql's lexer also needs to be taught about dollar-quoting.
>
>
>
>

The attached patch appears to do the trick:


floobl=# create or replace function testme() returns text language
plpgsql as $$
floobl$# begin return $foo$a'\b$bar$foo$; end;
floobl$# $$;
CREATE FUNCTION
floobl=# select testme();
  testme
----------
 a'\b$bar
(1 row)

floobl=#



cheers

andrew
Index: src/pl/plpgsql/src/scan.l
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/pl/plpgsql/src/scan.l,v
retrieving revision 1.31
diff -c -r1.31 scan.l
*** src/pl/plpgsql/src/scan.l    24 Feb 2004 22:06:32 -0000    1.31
--- src/pl/plpgsql/src/scan.l    25 Feb 2004 16:32:28 -0000
***************
*** 57,62 ****
--- 57,63 ----
  static bool have_lookahead_token;
  static const char *cur_line_start;
  static int    cur_line_num;
+ static char    *dolqstart;      /* current $foo$ quote start string */

  int    plpgsql_SpaceScanned = 0;
  %}
***************
*** 70,76 ****
  %option case-insensitive


! %x    IN_STRING IN_COMMENT

  digit            [0-9]
  ident_start        [A-Za-z\200-\377_]
--- 71,77 ----
  %option case-insensitive


! %x    IN_STRING IN_COMMENT IN_DOLLARQUOTE

  digit            [0-9]
  ident_start        [A-Za-z\200-\377_]
***************
*** 84,89 ****
--- 85,98 ----

  space            [ \t\n\r\f]

+ /* $foo$ style quotes ("dollar quoting")
+  * copied straight from the backend SQL parser
+  */
+ dolq_start        [A-Za-z\200-\377_]
+ dolq_cont        [A-Za-z\200-\377_0-9]
+ dolqdelim        \$({dolq_start}{dolq_cont}*)?\$
+ dolqinside        [^$]+
+
  %%
      /* ----------
       * Local variables in scanner to remember where
***************
*** 288,293 ****
--- 297,336 ----
                          (errcode(ERRCODE_DATATYPE_MISMATCH),
                           errmsg("unterminated string")));
              }
+
+ {dolqdelim}        {
+               start_lineno = plpgsql_scanner_lineno();
+               start_charpos = yytext;
+               dolqstart = pstrdup(yytext);
+               BEGIN(IN_DOLLARQUOTE);
+                 }
+ <IN_DOLLARQUOTE>{dolqdelim} {
+                     if (strcmp(yytext, dolqstart) == 0)
+                     {
+                         pfree(dolqstart);
+                         yyleng -= (yytext - start_charpos);
+                         yytext = start_charpos;
+                         BEGIN INITIAL;
+                         return T_STRING;
+                     }
+                     else
+                     {
+                         /*
+                          * When we fail to match $...$ to dolqstart, skip over
+                          * the $... part (nothing is copied here), but put back
+                          * the final $ for rescanning, e.g. $delim$...$junk$delim$
+                          */
+                         yyless(yyleng-1);
+                     }
+                 }
+ <IN_DOLLARQUOTE>{dolqinside} { }
+ <IN_DOLLARQUOTE>.    { /* needed for $ inside the quoted text */ }
+ <IN_DOLLARQUOTE><<EOF>>    {
+                 plpgsql_error_lineno = start_lineno;
+                 ereport(ERROR,
+                         (errcode(ERRCODE_DATATYPE_MISMATCH),
+                          errmsg("unterminated dollar quoted string")));
+                   }

      /* ----------
       * Any unmatched character is returned as is

Re: dollar quoting for plpgsql

From
Tom Lane
Date:
Andrew Dunstan <andrew@dunslane.net> writes:
>> I think plpgsql's lexer also needs to be taught about dollar-quoting.

> The attached patch appears to do the trick:

Applied.  It needed a little more work to handle RAISE NOTICE
reasonably, but I took care of that.

            regards, tom lane