Re: [HACKERS] Cutting initdb's runtime (Perl question embedded) - Mailing list pgsql-hackers

From ilmari@ilmari.org (Dagfinn Ilmari Mannsåker)
Subject Re: [HACKERS] Cutting initdb's runtime (Perl question embedded)
Date
Msg-id d8jpogbq8r1.fsf@dalvik.ping.uio.no
Whole thread Raw
In response to Re: [HACKERS] Cutting initdb's runtime (Perl question embedded)  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: [HACKERS] Cutting initdb's runtime (Perl question embedded)  (Andres Freund <andres@anarazel.de>)
Re: [HACKERS] Cutting initdb's runtime (Perl question embedded)  (Andres Freund <andres@anarazel.de>)
List pgsql-hackers
Tom Lane <tgl@sss.pgh.pa.us> writes:

> There's certainly lots more that could be done in the genbki code,
> but I think all we can justify at this stage of the development
> cycle is to get the low-hanging fruit for testing speedups.

I threw Devel::NYTProf at it and picked some more low-hanging fruit.
Attached are separate patches for each change, and here are the runtimes
of genbki.pl and Gen_fmgrtab.pl, respectively, after each patch
(averages of 5 runs, in millseconds):

master (b6dd1271): 355, 182

1: Avoid unnecessary regex captures: 349, 183
2: Avoid repeated calls to SplitDataLine: 316, 158
3: Inline SplitDataLine: 291, 141
4: Inline check_natts: 287, 141

Together they shave 68ms or 19.2% off the runtime of genbki.pl and 41ms
or 22.5% off the runtime of Gen_fmgrtab.pl

Finally, one non-performance patch, which just removes the use of
Exporter in Catalog.pm, since none of the users actually import anything
from it.

- ilmari
-- 
"I use RMS as a guide in the same way that a boat captain would use
 a lighthouse.  It's good to know where it is, but you generally
 don't want to find yourself in the same spot." - Tollef Fog Heen

From 5d7f4b1e78243daa6939501b88b2644bfcf9bd77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= <ilmari@ilmari.org>
Date: Mon, 17 Apr 2017 13:26:35 +0100
Subject: [PATCH 1/8] Avoid unnecessary regex captures in Catalog.pm

---
 src/backend/catalog/Catalog.pm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index fa8de04..e7b647a 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -54,7 +54,7 @@ sub Catalogs
         {
 
             # Strip C-style comments.
-            s;/\*(.|\n)*\*/;;g;
+            s;/\*(?:.|\n)*\*/;;g;
             if (m;/\*;)
             {
 
@@ -80,12 +80,12 @@ sub Catalogs
             {
                 $catalog{natts} = $1;
             }
-            elsif (/^DATA\(insert(\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
+            elsif (/^DATA\(insert(?:\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
             {
-                check_natts($filename, $catalog{natts}, $3,
+                check_natts($filename, $catalog{natts}, $2,
                             $input_file, $input_line_number);
 
-                push @{ $catalog{data} }, { oid => $2, bki_values => $3 };
+                push @{ $catalog{data} }, { oid => $1, bki_values => $2 };
             }
             elsif (/^DESCR\(\"(.*)\"\)$/)
             {
-- 
2.7.4

From 92402e0306eda209b1a2811acc41325d75add0cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= <ilmari@ilmari.org>
Date: Mon, 17 Apr 2017 14:04:20 +0100
Subject: [PATCH 2/8] Avoid repeated calls to Catalog::SplitDataLine

Both check_natts and the callers of Catalog::Catalogs were calling
Catalog::SplitDataLines, the former discarding the result.

Instead, have Catalog::Catalogs store the split fields directly and pass
the count to check_natts
---
 src/backend/catalog/Catalog.pm   | 14 +++++++-------
 src/backend/catalog/genbki.pl    |  3 +--
 src/backend/utils/Gen_fmgrtab.pl |  3 +--
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index e7b647a..0c508b0 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -82,10 +82,12 @@ sub Catalogs
             }
             elsif (/^DATA\(insert(?:\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
             {
-                check_natts($filename, $catalog{natts}, $2,
+                my @bki_values = SplitDataLine($2);
+
+                check_natts($filename, $catalog{natts}, scalar(@bki_values),
                             $input_file, $input_line_number);
 
-                push @{ $catalog{data} }, { oid => $1, bki_values => $2 };
+                push @{ $catalog{data} }, { oid => $1, bki_values => \@bki_values };
             }
             elsif (/^DESCR\(\"(.*)\"\)$/)
             {
@@ -254,17 +256,15 @@ sub RenameTempFile
 # verify the number of fields in the passed-in DATA line
 sub check_natts
 {
-    my ($catname, $natts, $bki_val, $file, $line) = @_;
+    my ($catname, $natts, $bki_vals, $file, $line) = @_;
 
     die "Could not find definition for Natts_${catname} before start of DATA() in $file\n"
         unless defined $natts;
 
-    my $nfields = scalar(SplitDataLine($bki_val));
-
     die sprintf
         "Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n",
-        $file, $line, $natts, $nfields
-      unless $natts == $nfields;
+        $file, $line, $natts, $bki_vals
+    unless $natts == $bki_vals;
 }
 
 1;
diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl
index 198e072..8875f6c 100644
--- a/src/backend/catalog/genbki.pl
+++ b/src/backend/catalog/genbki.pl
@@ -161,9 +161,8 @@ foreach my $catname (@{ $catalogs->{names} })
         foreach my $row (@{ $catalog->{data} })
         {
 
-            # Split line into tokens without interpreting their meaning.
             my %bki_values;
-            @bki_values{@attnames} = Catalog::SplitDataLine($row->{bki_values});
+            @bki_values{@attnames} = @{$row->{bki_values}};
 
             # Perform required substitutions on fields
             foreach my $att (keys %bki_values)
diff --git a/src/backend/utils/Gen_fmgrtab.pl b/src/backend/utils/Gen_fmgrtab.pl
index 76bdf5c..d2c4617 100644
--- a/src/backend/utils/Gen_fmgrtab.pl
+++ b/src/backend/utils/Gen_fmgrtab.pl
@@ -58,9 +58,8 @@ foreach my $column (@{ $catalogs->{pg_proc}->{columns} })
 my $data = $catalogs->{pg_proc}->{data};
 foreach my $row (@$data)
 {
-    # Split line into tokens without interpreting their meaning.
     my %bki_values;
-    @bki_values{@attnames} = Catalog::SplitDataLine($row->{bki_values});
+    @bki_values{@attnames} = @{$row->{bki_values}};
 
     # Select out just the rows for internal-language procedures.
     # Note assumption here that INTERNALlanguageId is 12.
-- 
2.7.4

From c8e55bd9ff36029c3f4fe7053f54f9f862c79d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= <ilmari@ilmari.org>
Date: Mon, 17 Apr 2017 14:47:06 +0100
Subject: [PATCH 3/8] Inline SplitDataLines into its only caller

---
 src/backend/catalog/Catalog.pm | 38 ++++++++++++++------------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index 0c508b0..9bd6263 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -82,12 +82,24 @@ sub Catalogs
             }
             elsif (/^DATA\(insert(?:\s+OID\s+=\s+(\d+))?\s+\(\s*(.*)\s*\)\s*\)$/)
             {
-                my @bki_values = SplitDataLine($2);
+                # The field-splitting regex will clobber $1, save it
+                my $oid = $1;
+
+                # This handling of quoted strings might look too
+                # simplistic, but it matches what bootscanner.l does:
+                # that has no provision for quote marks inside quoted
+                # strings, either.  If we don't have a quoted string,
+                # just snarf everything till next whitespace.  That will
+                # accept some things that bootscanner.l will see as
+                # erroneous tokens; but it seems wiser to do that and
+                # let bootscanner.l complain than to silently drop
+                # non-whitespace characters.
+                my @bki_values = $2 =~ /"[^"]*"|\S+/g;
 
                 check_natts($filename, $catalog{natts}, scalar(@bki_values),
                             $input_file, $input_line_number);
 
-                push @{ $catalog{data} }, { oid => $1, bki_values => \@bki_values };
+                push @{ $catalog{data} }, { oid => $oid, bki_values => \@bki_values };
             }
             elsif (/^DESCR\(\"(.*)\"\)$/)
             {
@@ -218,28 +230,6 @@ sub Catalogs
     return \%catalogs;
 }
 
-# Split a DATA line into fields.
-# Call this on the bki_values element of a DATA item returned by Catalogs();
-# it returns a list of field values.  We don't strip quoting from the fields.
-# Note: it should be safe to assign the result to a list of length equal to
-# the nominal number of catalog fields, because check_natts already checked
-# the number of fields.
-sub SplitDataLine
-{
-    my $bki_values = shift;
-
-    # This handling of quoted strings might look too simplistic, but it
-    # matches what bootscanner.l does: that has no provision for quote marks
-    # inside quoted strings, either.  If we don't have a quoted string, just
-    # snarf everything till next whitespace.  That will accept some things
-    # that bootscanner.l will see as erroneous tokens; but it seems wiser
-    # to do that and let bootscanner.l complain than to silently drop
-    # non-whitespace characters.
-    my @result = $bki_values =~ /"[^"]*"|\S+/g;
-
-    return @result;
-}
-
 # Rename temporary files to final names.
 # Call this function with the final file name and the .tmp extension
 # Note: recommended extension is ".tmp$$", so that parallel make steps
-- 
2.7.4

From fc6c69a692002277378308c2f6ca019185ef9fbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= <ilmari@ilmari.org>
Date: Mon, 17 Apr 2017 14:38:22 +0100
Subject: [PATCH 4/8] Inline check_nattrs into its only caller

---
 src/backend/catalog/Catalog.pm | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index 9bd6263..0aa3e0c 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -96,8 +96,14 @@ sub Catalogs
                 # non-whitespace characters.
                 my @bki_values = $2 =~ /"[^"]*"|\S+/g;
 
-                check_natts($filename, $catalog{natts}, scalar(@bki_values),
-                            $input_file, $input_line_number);
+                # Check that the DATA line matches the declared number of attributes
+                die "Could not find definition for Natts_${catname} before start of DATA() in $filename\n"
+                    unless defined $catalog{natts};
+
+                die sprintf
+                    "Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n",
+                    $filename, $input_line_number, $catalog{natts}, scalar(@bki_values)
+                unless $catalog{natts} == @bki_values;
 
                 push @{ $catalog{data} }, { oid => $oid, bki_values => \@bki_values };
             }
@@ -243,18 +249,4 @@ sub RenameTempFile
     rename($temp_name, $final_name) || die "rename: $temp_name: $!";
 }
 
-# verify the number of fields in the passed-in DATA line
-sub check_natts
-{
-    my ($catname, $natts, $bki_vals, $file, $line) = @_;
-
-    die "Could not find definition for Natts_${catname} before start of DATA() in $file\n"
-        unless defined $natts;
-
-    die sprintf
-        "Wrong number of attributes in DATA() entry at %s:%d (expected %d but got %d)\n",
-        $file, $line, $natts, $bki_vals
-    unless $natts == $bki_vals;
-}
-
 1;
-- 
2.7.4

From 592dbc64d21066f5da43d535af2cb3780b95d006 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= <ilmari@ilmari.org>
Date: Mon, 17 Apr 2017 15:57:18 +0100
Subject: [PATCH 6/8] Remove pointless Exporter usage in Catalog.pm

None of the users actually import any of the subroutines, they just call
them by their fully-qualified names.
---
 src/backend/catalog/Catalog.pm | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/backend/catalog/Catalog.pm b/src/backend/catalog/Catalog.pm
index b326427..d4b4170 100644
--- a/src/backend/catalog/Catalog.pm
+++ b/src/backend/catalog/Catalog.pm
@@ -16,11 +16,6 @@ package Catalog;
 use strict;
 use warnings;
 
-require Exporter;
-our @ISA       = qw(Exporter);
-our @EXPORT    = ();
-our @EXPORT_OK = qw(Catalogs SplitDataLine RenameTempFile);
-
 # Call this function with an array of names of header files to parse.
 # Returns a nested data structure describing the data in the headers.
 sub Catalogs
-- 
2.7.4


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

pgsql-hackers by date:

Previous
From: Stephen Frost
Date:
Subject: Re: [HACKERS] pg_dump emits ALTER TABLE ONLY partitioned_table
Next
From: Andres Freund
Date:
Subject: Re: [HACKERS] Cutting initdb's runtime (Perl question embedded)