From 862165136653994f14db91bc9fc95891751a7805 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 15 Jan 2018 10:19:30 +0700 Subject: [PATCH v7] Create data conversion infrastructure convert_header2dat.pl turns DATA()/(SH)DESCR() statements into serialized Perl data structures in pg_*.dat files, preserving comments along the way. This is a one-off script, but it is committed to the repo in case third parties want to convert their own catalog data. The pg_tablespace.h changes allow the OID symbols to be captured correctly. Remove data parsing from the original Catalogs() function and rename it to ParseHeader() to reflect its new, limited role of extracting the schema info from a single header. The new data files are handled by the new function ParseData(). Having these functions work with only one file at a time requires their callers to do more work, but results in a cleaner design. rewrite_dat.pl reads in pg_*.dat files and rewrites them in a standard format. It writes attributes in order, preserves comments and folds consecutive blank lines. The meta-attributes oid, oid_symbol and (sh)descr are on their own line, if present. 
--- src/backend/catalog/Catalog.pm | 227 +++++++++-------- src/include/catalog/convert_header2dat.pl | 394 ++++++++++++++++++++++++++++++ src/include/catalog/pg_tablespace.h | 3 +- src/include/catalog/rewrite_dat.pl | 200 +++++++++++++++ 4 files changed, 706 insertions(+), 118 deletions(-) create mode 100644 src/include/catalog/convert_header2dat.pl create mode 100644 src/include/catalog/rewrite_dat.pl diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm index 9ced154..60e641e 100644 --- a/src/backend/catalog/Catalog.pm +++ b/src/backend/catalog/Catalog.pm @@ -1,7 +1,7 @@ #---------------------------------------------------------------------- # # Catalog.pm -# Perl module that extracts info from catalog headers into Perl +# Perl module that extracts info from catalog files into Perl # data structures # # Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group @@ -16,12 +16,11 @@ package Catalog; use strict; use warnings; -# Call this function with an array of names of header files to parse. -# Returns a nested data structure describing the data in the headers. -sub Catalogs +# Parses a catalog header file into a data structure describing the schema +# of the catalog. +sub ParseHeader { - my (%catalogs, $catname, $declaring_attributes, $most_recent); - $catalogs{names} = []; + my $input_file = shift; # There are a few types which are given one name in the C source, but a # different name at the SQL level. These are enumerated here. @@ -34,19 +33,16 @@ sub Catalogs 'TransactionId' => 'xid', 'XLogRecPtr' => 'pg_lsn'); - foreach my $input_file (@_) - { my %catalog; + my $declaring_attributes = 0; my $is_varlen = 0; $catalog{columns} = []; - $catalog{data} = []; + $catalog{toasting} = []; + $catalog{indexing} = []; open(my $ifh, '<', $input_file) || die "$input_file: $!"; - my ($filename) = ($input_file =~ m/(\w+)\.h$/); - my $natts_pat = "Natts_$filename"; - # Scan the input file. 
while (<$ifh>) { @@ -64,9 +60,6 @@ sub Catalogs redo; } - # Remember input line number for later. - my $input_line_number = $.; - # Strip useless whitespace and trailing semicolons. chomp; s/^\s+//; @@ -74,68 +67,17 @@ sub Catalogs s/\s+/ /g; # Push the data into the appropriate data structure. - if (/$natts_pat\s+(\d+)/) - { - $catalog{natts} = $1; - } - elsif ( - /^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/) - { - check_natts($filename, $catalog{natts}, $3, $input_file, - $input_line_number); - - push @{ $catalog{data} }, { oid => $2, bki_values => $3 }; - } - elsif (/^DESCR\(\"(.*)\"\)$/) - { - $most_recent = $catalog{data}->[-1]; - - # this tests if most recent line is not a DATA() statement - if (ref $most_recent ne 'HASH') - { - die "DESCR() does not apply to any catalog ($input_file)"; - } - if (!defined $most_recent->{oid}) - { - die "DESCR() does not apply to any oid ($input_file)"; - } - elsif ($1 ne '') - { - $most_recent->{descr} = $1; - } - } - elsif (/^SHDESCR\(\"(.*)\"\)$/) - { - $most_recent = $catalog{data}->[-1]; - - # this tests if most recent line is not a DATA() statement - if (ref $most_recent ne 'HASH') - { - die - "SHDESCR() does not apply to any catalog ($input_file)"; - } - if (!defined $most_recent->{oid}) - { - die "SHDESCR() does not apply to any oid ($input_file)"; - } - elsif ($1 ne '') - { - $most_recent->{shdescr} = $1; - } - } - elsif (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/) + if (/^DECLARE_TOAST\(\s*(\w+),\s*(\d+),\s*(\d+)\)/) { - $catname = 'toasting'; my ($toast_name, $toast_oid, $index_oid) = ($1, $2, $3); - push @{ $catalog{data} }, + push @{ $catalog{toasting} }, "declare toast $toast_oid $index_oid on $toast_name\n"; } elsif (/^DECLARE_(UNIQUE_)?INDEX\(\s*(\w+),\s*(\d+),\s*(.+)\)/) { - $catname = 'indexing'; my ($is_unique, $index_name, $index_oid, $using) = ($1, $2, $3, $4); - push @{ $catalog{data} }, + push @{ $catalog{indexing} }, sprintf( "declare %sindex %s %s %s\n", $is_unique ? 
'unique ' : '', @@ -143,16 +85,13 @@ sub Catalogs } elsif (/^BUILD_INDICES/) { - push @{ $catalog{data} }, "build indices\n"; + push @{ $catalog{indexing} }, "build indices\n"; } elsif (/^CATALOG\(([^,]*),(\d+)\)/) { - $catname = $1; + $catalog{catname} = $1; $catalog{relation_oid} = $2; - # Store pg_* catalog names in the same order we receive them - push @{ $catalogs{names} }, $catname; - $catalog{bootstrap} = /BKI_BOOTSTRAP/ ? ' bootstrap' : ''; $catalog{shared_relation} = /BKI_SHARED_RELATION/ ? ' shared_relation' : ''; @@ -173,7 +112,7 @@ sub Catalogs } if (/^}/) { - undef $declaring_attributes; + $declaring_attributes = 0; } else { @@ -227,32 +166,107 @@ sub Catalogs } } } - $catalogs{$catname} = \%catalog; close $ifh; - } - return \%catalogs; + return \%catalog; } -# Split a DATA line into fields. -# Call this on the bki_values element of a DATA item returned by Catalogs(); -# it returns a list of field values. We don't strip quoting from the fields. -# Note: it should be safe to assign the result to a list of length equal to -# the nominal number of catalog fields, because check_natts already checked -# the number of fields. -sub SplitDataLine +# Parses a file containing Perl data structure literals, returning live data. +# +# The parameter $preserve_formatting needs to be set for callers that want +# to work with non-data lines in the data files, such as comments and blank +# lines. If a caller just wants to consume the data, leave it unset. +sub ParseData { - my $bki_values = shift; - - # This handling of quoted strings might look too simplistic, but it - # matches what bootscanner.l does: that has no provision for quote marks - # inside quoted strings, either. If we don't have a quoted string, just - # snarf everything till next whitespace. That will accept some things - # that bootscanner.l will see as erroneous tokens; but it seems wiser - # to do that and let bootscanner.l complain than to silently drop - # non-whitespace characters. 
- my @result = $bki_values =~ /"[^"]*"|\S+/g; - - return @result; + my ($input_file, $schema, $preserve_formatting) = @_; + + $input_file =~ /\w+\.dat$/ + or die "Input file needs to be a .dat file.\n"; + my $data = []; + + # Read entire file into a string and eval it. + if (!$preserve_formatting) + { + my $file_string = do + { + local $/ = undef; + open my $ifd, "<", $input_file or die "$input_file: $!"; + <$ifd>; + }; + + eval '$data = ' . $file_string; + print "Error : $@\n" if $@; + return $data; + } + + # When preserving formatting, we scan the file one line at a time + # and decide how to handle each item. We don't check too closely + # for valid syntax, since we assume it will be checked otherwise. + my $prev_blank = 0; + open(my $ifd, '<', $input_file) or die "$input_file: $!"; + while (<$ifd>) + { + my $datum; + + # Capture non-consecutive blank lines. + if (/^\s*$/) + { + next if $prev_blank; + $prev_blank = 1; + + # Newline gets added by caller. + $datum = ''; + } + else + { + $prev_blank = 0; + } + + # Capture comments that are on their own line. + if (/^\s*(#.*?)\s*$/) + { + $datum = $1; + } + + # Capture brackets that are on their own line. + elsif (/^\s*(\[|\])\s*$/) + { + $datum = $1; + } + + # Capture hash references + # NB: Assumes that the next hash ref can't start on the + # same line where the present one ended. + # Not foolproof, but we shouldn't need a full parser, + # since we expect relatively well-behaved input. + elsif (/{/) + { + # Quick hack to detect when we have a full hash ref to + # parse. We can't just use a regex because of values in + # pg_aggregate and pg_proc like '{0,0}'. + my $lcnt = tr/{//; + my $rcnt = tr/}//; + + if ($lcnt == $rcnt) + { + eval '$datum = ' . 
$_; + if (!ref $datum) + { + die "Error parsing $_\n$!"; + } + } + else + { + my $next_line = <$ifd>; + die "$input_file: ends within Perl hash\n" + if !defined $next_line; + $_ .= $next_line; + redo; + } + } + push @$data, $datum if defined $datum; + } + close $ifd; + return $data; } # Fill in default values of a record using the given schema. It's the @@ -308,7 +322,6 @@ sub RenameTempFile rename($temp_name, $final_name) || die "rename: $temp_name: $!"; } - # Find a symbol defined in a particular header file and extract the value. # # The include path has to be passed as a reference to an array. @@ -340,22 +353,4 @@ sub FindDefinedSymbol die "$catalog_header: not found in any include directory\n"; } - -# verify the number of fields in the passed-in DATA line -sub check_natts -{ - my ($catname, $natts, $bki_val, $file, $line) = @_; - - die -"Could not find definition for Natts_${catname} before start of DATA() in $file\n" - unless defined $natts; - - my $nfields = scalar(SplitDataLine($bki_val)); - - die sprintf -"Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n", - $file, $line, $natts, $nfields - unless $natts == $nfields; -} - 1; diff --git a/src/include/catalog/convert_header2dat.pl b/src/include/catalog/convert_header2dat.pl new file mode 100644 index 0000000..518a7eb --- /dev/null +++ b/src/include/catalog/convert_header2dat.pl @@ -0,0 +1,394 @@ +#!/usr/bin/perl -w +#---------------------------------------------------------------------- +# +# convert_header2dat.pl +# Perl script that parses the catalog header files for BKI +# DATA() and (SH)DESCR() statements, as well as defined symbols +# referring to OIDs, and writes them out as native perl data +# structures. White space and header commments referring to DATA() +# lines are preserved. Some functions are loosely copied from +# src/backend/catalog/Catalog.pm, whose equivalents have been +# removed. 
+# +# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# /src/include/catalog/convert_header2dat.pl +# +#---------------------------------------------------------------------- + +use strict; +use warnings; + +use Data::Dumper; + +# No $VARs - we add our own later. +$Data::Dumper::Terse = 1; + +my @input_files; +my $output_path = ''; +my $major_version; + +# Process command line switches. +while (@ARGV) +{ + my $arg = shift @ARGV; + if ($arg !~ /^-/) + { + push @input_files, $arg; + } + elsif ($arg =~ /^-o/) + { + $output_path = length($arg) > 2 ? substr($arg, 2) : shift @ARGV; + } + else + { + usage(); + } +} + +# Sanity check arguments. +die "No input files.\n" if !@input_files; +foreach my $input_file (@input_files) +{ + if ($input_file !~ /\.h$/) + { + die "Input files need to be header files.\n"; + } +} + +# Make sure output_path ends in a slash. +if ($output_path ne '' && substr($output_path, -1) ne '/') +{ + $output_path .= '/'; +} + +# Read all the input header files into internal data structures +# XXX This script is not robust against non-catalog headers. It's best +# to pass it the same list found in backend/catalog/Makefile. +my $catalogs = catalogs(@input_files); + +# produce output, one catalog at a time +foreach my $catname (@{ $catalogs->{names} }) +{ + my $catalog = $catalogs->{$catname}; + my $schema = $catalog->{columns}; + + # First, see if the header has any data entries. This is necessary + # because headers with no DATA may still have comments that catalogs() + # thought was in a DATA section. 
+ my $found_one = 0; + foreach my $data (@{ $catalog->{data} }) + { + if (ref $data eq 'HASH') + { + $found_one = 1; + } + } + next if !$found_one; + + my @attnames; + foreach my $column (@$schema) + { + my $attname = $column->{name}; + my $atttype = $column->{type}; + push @attnames, $attname; + } + + my $datfile = "$output_path$catname.dat"; + open my $dat, '>', $datfile + or die "can't open $datfile: $!"; + + # Write out data file. + + print $dat <{data} }) + { + + # Either a blank line or comment - just write it out. + if (! ref $data) + { + print $dat "$data\n"; + } + # Hash ref representing a data entry. + elsif (ref $data eq 'HASH') + { + # Split line into tokens without interpreting their meaning. + my %bki_values; + @bki_values{@attnames} = split_data_line($data->{bki_values}); + + # Flatten data hierarchy. + delete $data->{bki_values}; + my %flat_data = (%$data, %bki_values); + + # Strip double quotes for readability. Most will be put + # back in when writing postgres.bki + foreach (values %flat_data) + { + s/"//g; + } + + print $dat Dumper(\%flat_data); + print $dat ",\n"; + } + } + + print $dat "\n]\n"; +} + + +# This function is a heavily modified version of its former namesake +# in Catalog.pm. There is possibly some dead code here. It's not worth +# removing. +sub catalogs +{ + my (%catalogs, $catname, $declaring_attributes, $most_recent); + $catalogs{names} = []; + + # There are a few types which are given one name in the C source, but a + # different name at the SQL level. These are enumerated here. 
+ my %RENAME_ATTTYPE = ( + 'int16' => 'int2', + 'int32' => 'int4', + 'int64' => 'int8', + 'Oid' => 'oid', + 'NameData' => 'name', + 'TransactionId' => 'xid', + 'XLogRecPtr' => 'pg_lsn'); + + foreach my $input_file (@_) + { + my %catalog; + $catalog{columns} = []; + $catalog{data} = []; + my $is_varlen = 0; + my $saving_comments = 0; + + open(my $ifh, '<', $input_file) || die "$input_file: $!"; + my ($filename) = ($input_file =~ m/(\w+)\.h$/); + + # Skip these to keep the code simple. + next if $filename eq 'toasting' + or $filename eq 'indexing'; + + # Scan the input file. + while (<$ifh>) + { + # Determine whether we're in the DATA section and should + # start saving header comments. + if (/(\/|\s)\*\s+initial contents of pg_/) + { + $saving_comments = 1; + } + + if ($saving_comments) + { + if ( m{^(/|\s+)\*\s+(.+?)(\*/)?$} ) + { + my $comment = $2; + + # Filter out comments we know we don't want. + if ($comment !~ /^-+$/ + and $comment !~ /initial contents of pg/ + and $comment !~ /PG_\w+_H/) + { + # Trim whitespace. + $comment =~ s/^\s+//; + $comment =~ s/\s+$//; + push @{ $catalog{data} }, "# $comment"; + } + } + elsif (/^\s*$/) + { + # Preserve blank lines. Newline gets added by caller. + push @{ $catalog{data} }, ''; + } + } + else + { + # Strip C-style comments. + s;/\*(.|\n)*\*/;;g; + if (m;/\*;) + { + # handle multi-line comments properly. + my $next_line = <$ifh>; + die "$input_file: ends within C-style comment\n" + if !defined $next_line; + $_ .= $next_line; + redo; + } + } + + # Strip useless whitespace and trailing semicolons. + chomp; + s/^\s+//; + s/;\s*$//; + s/\s+/ /g; + + # Push the data into the appropriate data structure. + if (/^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/) + { + if ($2) + { + push @{ $catalog{data} }, { oid => $2, bki_values => $3 }; + } + else + { + push @{ $catalog{data} }, { bki_values => $3 }; + } + } + # Save defined symbols referring to OIDs. 
+ elsif (/^#define\s+(\S+)\s+(\d+)$/) + { + $most_recent = $catalog{data}->[-1]; + my $oid_symbol = $1; + + # Print a warning if we find a defined symbol that is not + # associated with the most recent DATA() statement, and is + # not one of the symbols that we know to exclude. + if (ref $most_recent ne 'HASH' + and $oid_symbol !~ m/^Natts/ + and $oid_symbol !~ m/^Anum/ + and $oid_symbol !~ m/^STATISTIC_/ + and $oid_symbol !~ m/^TRIGGER_TYPE_/ + and $oid_symbol !~ m/RelationId$/ + and $oid_symbol !~ m/Relation_Rowtype_Id$/) + { + printf "Unhandled #define symbol: $filename: $_\n"; + next; + } + if (defined $most_recent->{oid} && $most_recent->{oid} ne $2) + { + print "#define does not apply to last seen oid \n$_\n"; + next; + } + $most_recent->{oid_symbol} = $oid_symbol; + } + elsif (/^DESCR\(\"(.*)\"\)$/) + { + $most_recent = $catalog{data}->[-1]; + + # Test if most recent line is not a DATA() statement. + if (ref $most_recent ne 'HASH') + { + die "DESCR() does not apply to any catalog ($input_file)"; + } + if (!defined $most_recent->{oid}) + { + die "DESCR() does not apply to any oid ($input_file)"; + } + elsif ($1 ne '') + { + $most_recent->{descr} = $1; + } + } + elsif (/^SHDESCR\(\"(.*)\"\)$/) + { + $most_recent = $catalog{data}->[-1]; + + # Test if most recent line is not a DATA() statement. 
+ if (ref $most_recent ne 'HASH') + { + die "SHDESCR() does not apply to any catalog ($input_file)"; + } + if (!defined $most_recent->{oid}) + { + die "SHDESCR() does not apply to any oid ($input_file)"; + } + elsif ($1 ne '') + { + $most_recent->{shdescr} = $1; + } + } + elsif (/^CATALOG\(([^,]*),(\d+)\)/) + { + $catname = $1; + $catalog{relation_oid} = $2; + + # Store pg_* catalog names in the same order we receive them + push @{ $catalogs{names} }, $catname; + + $declaring_attributes = 1; + } + elsif ($declaring_attributes) + { + next if (/^{|^$/); + next if (/^#/); + if (/^}/) + { + undef $declaring_attributes; + } + else + { + my %column; + my ($atttype, $attname, $attopt) = split /\s+/, $_; + die "parse error ($input_file)" unless $attname; + if (exists $RENAME_ATTTYPE{$atttype}) + { + $atttype = $RENAME_ATTTYPE{$atttype}; + } + if ($attname =~ /(.*)\[.*\]/) # array attribute + { + $attname = $1; + $atttype .= '[]'; + } + + $column{type} = $atttype; + $column{name} = $attname; + + push @{ $catalog{columns} }, \%column; + } + } + } + if (defined $catname) + { + $catalogs{$catname} = \%catalog; + } + close $ifh; + } + return \%catalogs; +} + +# Split a DATA line into fields. +# Call this on the bki_values element of a DATA item returned by catalogs(); +# it returns a list of field values. We don't strip quoting from the fields. +# Note: It should be safe to assign the result to a list of length equal to +# the nominal number of catalog fields, because the number of fields were +# checked in the original Catalog module. +sub split_data_line +{ + my $bki_values = shift; + + my @result = $bki_values =~ /"[^"]*"|\S+/g; + return @result; +} + +sub usage +{ + die < 2 ? substr($arg, 2) : shift @ARGV; + } + elsif ($arg eq '--revert') + { + revert(); + } + else + { + usage(); + } +} + +# Sanity check arguments. +die "No input files.\n" + if !@input_files; + +# Make sure output_path ends in a slash. 
+if ($output_path ne '' && substr($output_path, -1) ne '/') +{ + $output_path .= '/'; +} + +# Metadata of a catalog entry +my @METADATA = ('oid', 'oid_symbol', 'descr', 'shdescr'); + +# Read all the input files into internal data structures. +# We pass data file names as arguments and then look for matching +# headers to parse the schema from. +foreach my $datfile (@input_files) +{ + $datfile =~ /(.+)\.dat$/ + or die "Input files need to be data (.dat) files.\n"; + + my $header = "$1.h"; + die "There in no header file corresponding to $datfile" + if ! -e $header; + + my @attnames; + my $catalog = Catalog::ParseHeader($header); + my $catname = $catalog->{catname}; + my $schema = $catalog->{columns}; + + foreach my $column (@$schema) + { + my $attname = $column->{name}; + push @attnames, $attname; + } + + my $catalog_data = Catalog::ParseData($datfile, $schema, 1); + next if !defined $catalog_data; + + # Back up old data file rather than overwrite it. The input path and + # output path are normally the same, but we don't assume that. + my $newdatfile = "$output_path$catname.dat"; + if (-e $newdatfile) + { + rename($newdatfile, $newdatfile . '.bak') + or die "rename: $newdatfile: $!"; + } + open my $dat, '>', $newdatfile + or die "can't open $newdatfile: $!"; + + # Write the data. + foreach my $data (@$catalog_data) + { + # Either a newline, comment, or bracket - just write it out. + if (! ref $data) + { + print $dat "$data\n"; + } + # Hash ref representing a data entry. + elsif (ref $data eq 'HASH') + { + my %values = %$data; + print $dat "{ "; + + # Separate out metadata fields for readability. + my $metadata_line = format_line(\%values, @METADATA); + if ($metadata_line) + { + print $dat $metadata_line; + print $dat ",\n"; + } + my $data_line = format_line(\%values, @attnames); + + # Line up with metadata line, if there is one. 
+ if ($metadata_line) + { + print $dat ' '; + } + print $dat $data_line; + print $dat " },\n"; + } + else + { + die "Unexpected data type"; + } + } +} + +# Format the individual elements of a Perl hash into a valid string +# representation. We do this ourselves, rather than use native Perl +# facilities, so we can keep control over the exact formatting of the +# data files. +sub format_line +{ + my $data = shift; + my @atts = @_; + + my $first = 1; + my $value; + my $line = ''; + + foreach my $field (@atts) + { + next if !defined $data->{$field}; + $value = $data->{$field}; + + # Re-escape single quotes. + $value =~ s/'/\\'/g; + + if (!$first) + { + $line .= ', '; + } + $first = 0; + + $line .= "$field => '$value'"; + } + return $line; +} + +# Rename .bak files back to .dat +# This requires passing the .dat files as arguments to the script as normal. +# XXX This is of questionable utility, since the files are under version +# control, after all. +sub revert +{ + foreach my $datfile (@input_files) + { + my $bakfile = "$datfile.bak"; + if (-e $bakfile) + { + rename($bakfile, $datfile) or die "rename: $bakfile: $!"; + } + } + exit 0; +} + +sub usage +{ + die <