From f854b4c50cd93c2149199112923f1ecdd4c66c11 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 5 Jul 2019 14:04:13 +0700 Subject: [PATCH v4 1/3] Replace the Flex quotestop rules with a new exclusive state When Flex encounters a quote while inside any kind of quoted string, it saves the current state and enters a new state in order to detect possible string continuations. This brings the number of scanner states from 37045 to 30367, which is small enough to allow Flex to use 16-bit types in the yy_transition array. --- src/backend/parser/scan.l | 108 ++++++++++++++++++----------------- src/include/parser/scanner.h | 3 + 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index e1cae859e8..cbf3f6deca 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -168,6 +168,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings + * quote stop (detect continued strings) * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes @@ -185,6 +186,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui @@ -231,19 +233,9 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) -/* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. 
- * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. - */ quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinue {whitespace_with_newline}{quote} +quotecontinuefail {whitespace}*{other}? /* Bit string * It is tempting to scan the string for only those characters @@ -476,21 +468,10 @@ other . startlit(); addlitchar('b', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return BCONST; - } {xhinside} | {xbinside} { addlit(yytext, yyleng, yyscanner); } -{quotecontinue} | -{quotecontinue} { - /* ignore */ - } <<EOF>> { yyerror("unterminated bit string literal"); } {xhstart} { @@ -505,13 +486,6 @@ other . startlit(); addlitchar('x', yyscanner); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return XCONST; - } <<EOF>> { yyerror("unterminated hexadecimal string literal"); } {xnstart} { @@ -568,28 +542,63 @@ other . BEGIN(xus); startlit(); } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(INITIAL); + +{quote} { /* - * check that the data remains valid if it might have been - * made invalid by unescaping any chars. + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the last quote was in + * fact the end of the string. 
*/ - if (yyextra->saw_non_ascii) - pg_verifymbstr(yyextra->literalbuf, - yyextra->literallen, - false); - yylval->str = litbufdup(yyscanner); - return SCONST; + yyextra->state_before_quote_stop = YYSTATE; + BEGIN(xqs); } -{quotestop} | -{quotefail} { - /* throw back all but the quote */ - yyless(1); - /* xusend state looks for possible UESCAPE */ - BEGIN(xusend); +{quotecontinue} { + BEGIN(yyextra->state_before_quote_stop); + } +<<EOF>> | +{quotecontinuefail} { + /* + * throw back everything and handle the string + * we scanned previously + */ + yyless(0); + + switch (yyextra->state_before_quote_stop) + { + case xb: + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return BCONST; + case xh: + BEGIN(INITIAL); + yylval->str = litbufdup(yyscanner); + return XCONST; + case xe: + /* fallthrough */ + case xq: + BEGIN(INITIAL); + + /* + * Check that the data remains valid if it + * might have been made invalid by unescaping + * any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + return SCONST; + case xus: + /* xusend state looks for possible UESCAPE */ + BEGIN(xusend); + break; + default: + yyerror("unhandled previous state after endquote"); + } } + {whitespace} { /* stay in xusend state over whitespace */ } @@ -693,9 +702,6 @@ other . if (c == '\0' || IS_HIGHBIT_SET(c)) yyextra->saw_non_ascii = true; } -{quotecontinue} { - /* ignore */ - } . 
{ /* This is only needed for \ just before EOF */ addlitchar(yytext[0], yyscanner); diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h index 731a2bd264..9b5f5eaad1 100644 --- a/src/include/parser/scanner.h +++ b/src/include/parser/scanner.h @@ -99,6 +99,9 @@ typedef struct core_yy_extra_type int literallen; /* actual current string length */ int literalalloc; /* current allocated buffer size */ + /* start condition when end quote is detected */ + int state_before_quote_stop; + int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ -- 2.17.2 (Apple Git-113)