Re: reducing the footprint of ScanKeyword (was Re: Large writable variables) - Mailing list pgsql-hackers

From Tom Lane
Subject Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)
Date
Msg-id 15387.1546988005@sss.pgh.pa.us
Whole thread Raw
In response to Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)  (John Naylor <john.naylor@2ndquadrant.com>)
Responses Re: reducing the footprint of ScanKeyword (was Re: Large writablevariables)  (Joerg Sonnenberger <joerg@bec.de>)
Re: reducing the footprint of ScanKeyword (was Re: Large writable variables)  (John Naylor <john.naylor@2ndquadrant.com>)
List pgsql-hackers
John Naylor <john.naylor@2ndquadrant.com> writes:
> -As for the graph algorithm, I'd have to play with it to understand
> how it works.

I improved the comment about how come the hash table entry assignment
works.  One thing I'm not clear about myself is

    # A cycle-free graph is either empty or has some vertex of degree 1.

That sounds like a standard graph theory result, but I'm not familiar
with a proof for it.

            regards, tom lane

#----------------------------------------------------------------------
#
# PerfectHash.pm
#    Perl module that constructs minimal perfect hash functions
#
# This code constructs a minimal perfect hash function for the given
# set of keys, using an algorithm described in
# "An optimal algorithm for generating minimal perfect hash functions"
# by Czech, Havas and Majewski in Information Processing Letters,
# 43(5):256-264, October 1992.
# This implementation is loosely based on NetBSD's "nbperf",
# which was written by Joerg Sonnenberger.
#
# The resulting hash function is perfect in the sense that if the presented
# key is one of the original set, it will return the key's index in the set
# (in range 0..N-1).  However, the caller must still verify the match,
# as false positives are possible.  Also, the hash function may return
# values that are out of range (negative, or >= N).  This indicates that
# the presented key is definitely not in the set.
#
#
# Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/tools/PerfectHash.pm
#
#----------------------------------------------------------------------

package PerfectHash;

use strict;
use warnings;


# At runtime, we'll compute two simple hash functions of the input key,
# and use them to index into a mapping table.  The hash functions are just
# multiply-and-add in uint32 arithmetic, with different multipliers but
# the same initial seed.  All the complexity in this module is concerned
# with selecting hash parameters that will work and building the mapping
# table.

# We support making case-insensitive hash functions, though this only
# works for a strict-ASCII interpretation of case insensitivity,
# ie, A-Z maps onto a-z and nothing else.
my $case_insensitive = 0;


#
# Construct a C function implementing a perfect hash for the given keys.
# The C function definition is returned as a string.
#
# The keys can be any set of Perl strings; it is caller's responsibility
# that there not be any duplicates.  (Note that the "strings" can be
# binary data, but endianness is the caller's problem.)
#
# The name to use for the function is caller-specified, but its signature
# is always "int f(const void *key, size_t keylen)".  The caller may
# prepend "static " to the result string if it wants a static function.
#
# If $ci is true, the function is case-insensitive, for the limited idea
# of case-insensitivity explained above.
#
sub generate_hash_function
{
    my ($keys_ref, $funcname, $ci) = @_;

    # It's not worth passing this around as a parameter; just use a global.
    $case_insensitive = $ci;

    # Try different hash function parameters until we find a set that works
    # for these keys.  In principle we might need to change multipliers,
    # but these two multipliers are chosen to be primes that are cheap to
    # calculate via shift-and-add, so don't change them without care.
    my $hash_mult1 = 31;
    my $hash_mult2 = 2053;

    # We just try successive hash seed values until we find one that works.
    # (Commonly, random seeds are tried, but we want reproducible results
    # from this program so we don't do that.)
    my $hash_seed;
    my @subresult;
    for ($hash_seed = 0; $hash_seed < 1000; $hash_seed++)
    {
        @subresult =
          _construct_hash_table($keys_ref, $hash_mult1, $hash_mult2,
            $hash_seed);
        last if @subresult;
    }

    # Choke if we didn't succeed in a reasonable number of tries.
    die "failed to generate perfect hash" if !@subresult;

    # Extract info from the function result array.
    my $elemtype = $subresult[0];
    my @hashtab  = @{ $subresult[1] };
    my $nhash    = scalar(@hashtab);

    # OK, construct the hash function definition including the hash table.
    my $f = '';
    $f .= sprintf "int\n";
    $f .= sprintf "%s(const void *key, size_t keylen)\n{\n", $funcname;
    $f .= sprintf "\tstatic const %s h[%d] = {\n", $elemtype, $nhash;
    for (my $i = 0; $i < $nhash; $i++)
    {
        $f .= sprintf "%s%6d,%s",
          ($i % 8 == 0 ? "\t\t" : " "),
          $hashtab[$i],
          ($i % 8 == 7 ? "\n" : "");
    }
    $f .= sprintf "\n" if ($nhash % 8 != 0);
    $f .= sprintf "\t};\n\n";
    $f .= sprintf "\tconst unsigned char *k = key;\n";
    $f .= sprintf "\tuint32\t\ta = %d;\n",   $hash_seed;
    $f .= sprintf "\tuint32\t\tb = %d;\n\n", $hash_seed;
    $f .= sprintf "\twhile (keylen--)\n\t{\n";
    $f .= sprintf "\t\tunsigned char c = *k++";
    $f .= sprintf " | 0x20" if $case_insensitive;    # see comment below
    $f .= sprintf ";\n\n";
    $f .= sprintf "\t\ta = a * %d + c;\n", $hash_mult1;
    $f .= sprintf "\t\tb = b * %d + c;\n", $hash_mult2;
    $f .= sprintf "\t}\n";
    $f .= sprintf "\treturn h[a %% %d] + h[b %% %d];\n", $nhash, $nhash;
    $f .= sprintf "}\n";

    return $f;
}


# Calculate a hash function as the run-time code will do.
#
# If we are making a case-insensitive hash function, we implement that
# by OR'ing 0x20 into each byte of the key.  This correctly transforms
# upper-case ASCII into lower-case ASCII, while not changing digits or
# dollar signs.  (It does change '_', else we could just skip adjusting
# $cn here at all, for typical keyword strings.)
sub _calc_hash
{
    my ($key, $mult, $seed) = @_;

    my $result = $seed;
    for my $c (split //, $key)
    {
        my $cn = ord($c);
        $cn |= 0x20 if $case_insensitive;
        $result = ($result * $mult + $cn) % 4294967296;
    }
    return $result;
}


# Attempt to construct a mapping table for a minimal perfect hash function
# for the given keys, using the specified hash parameters.
#
# Returns an array containing the mapping table element type name as the
# first element, and a ref to an array of the table values as the second.
#
# Returns an empty array on failure; then caller should choose different
# hash parameter(s) and try again.
sub _construct_hash_table
{
    my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed) = @_;
    my @keys = @{$keys_ref};

    # This algorithm is based on a graph whose edges correspond to the
    # keys and whose vertices correspond to entries of the mapping table.
    # A key's edge links the two vertices whose indexes are the outputs of
    # the two hash functions for that key.  For K keys, the mapping
    # table must have at least 2*K+1 entries, guaranteeing that there's at
    # least one unused entry.  (In principle, larger mapping tables make it
    # easier to find a workable hash and increase the number of inputs that
    # can be rejected due to touching unused hashtable entries.  In practice,
    # neither effect seems strong enough to justify using a larger table.)
    my $nedges = scalar @keys;       # number of edges
    my $nverts = 2 * $nedges + 1;    # number of vertices

    # Initialize the array of edges.
    my @E = ();
    foreach my $kw (@keys)
    {
        # Calculate hashes for this key.
        # The hashes are immediately reduced modulo the mapping table size.
        my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed) % $nverts;
        my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed) % $nverts;

        # If the two hashes are the same for any key, we have to fail
        # since this edge would itself form a cycle in the graph.
        return () if $hash1 == $hash2;

        # Add the edge for this key.
        push @E, { left => $hash1, right => $hash2 };
    }

    # Initialize the array of vertices, giving them all empty lists
    # of associated edges.  (The lists will be hashes of edge numbers.)
    my @V = ();
    for (my $v = 0; $v < $nverts; $v++)
    {
        push @V, { edges => {} };
    }

    # Insert each edge in the lists of edges using its vertices.
    for (my $e = 0; $e < $nedges; $e++)
    {
        my $v = $E[$e]{left};
        $V[$v]{edges}->{$e} = 1;

        $v = $E[$e]{right};
        $V[$v]{edges}->{$e} = 1;
    }

    # Now we attempt to prove the graph acyclic.
    # A cycle-free graph is either empty or has some vertex of degree 1.
    # Removing the edge attached to that vertex doesn't change this property,
    # so doing that repeatedly will reduce the size of the graph.
    # If the graph is empty at the end of the process, it was acyclic.
    # We track the order of edge removal so that the next phase can process
    # them in reverse order of removal.
    my @output_order = ();

    # Consider each vertex as a possible starting point for edge-removal.
    for (my $startv = 0; $startv < $nverts; $startv++)
    {
        my $v = $startv;

        # If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
        # remove that edge, and then consider the edge's other vertex to see
        # if it is now of degree 1.  The inner loop repeats until reaching a
        # vertex not of degree 1.
        while (scalar(keys(%{ $V[$v]{edges} })) == 1)
        {
            # Unlink its only edge.
            my $e = (keys(%{ $V[$v]{edges} }))[0];
            delete($V[$v]{edges}->{$e});

            # Unlink the edge from its other vertex, too.
            my $v2 = $E[$e]{left};
            $v2 = $E[$e]{right} if ($v2 == $v);
            delete($V[$v2]{edges}->{$e});

            # Push e onto the front of the output-order list.
            unshift @output_order, $e;

            # Consider v2 on next iteration of inner loop.
            $v = $v2;
        }
    }

    # We succeeded only if all edges were removed from the graph.
    return () if (scalar(@output_order) != $nedges);

    # OK, build the hash table of size $nverts.
    my @hashtab = (0) x $nverts;
    # We need a "visited" flag array in this step, too.
    my @visited = (0) x $nverts;

    # The goal is that for any key, the sum of the hash table entries for
    # its first and second hash values is the desired output (i.e., the key
    # number).  By assigning hash table values in the selected edge order,
    # we can guarantee that that's true.  This works because the edge first
    # removed from the graph (and hence last to be visited here) must have
    # at least one vertex it shared with no other edge; hence it will have at
    # least one vertex (hashtable entry) still unvisited when we reach it here,
    # and we can assign that unvisited entry a value that makes the sum come
    # out as we wish.  By induction, the same holds for all the other edges.
    foreach my $e (@output_order)
    {
        my $l = $E[$e]{left};
        my $r = $E[$e]{right};
        if (!$visited[$l])
        {
            # $hashtab[$r] might be zero, or some previously assigned value.
            $hashtab[$l] = $e - $hashtab[$r];
        }
        else
        {
            die "oops, doubly used hashtab entry" if $visited[$r];
            # $hashtab[$l] might be zero, or some previously assigned value.
            $hashtab[$r] = $e - $hashtab[$l];
        }
        # Now freeze both of these hashtab entries.
        $visited[$l] = 1;
        $visited[$r] = 1;
    }

    # Detect range of values needed in hash table.
    my $hmin = $nedges;
    my $hmax = 0;
    for (my $v = 0; $v < $nverts; $v++)
    {
        $hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
        $hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
    }

    # Choose width of hashtable entries.  In addition to the actual values,
    # we need to be able to store a flag for unused entries, and we wish to
    # have the property that adding any other entry value to the flag gives
    # an out-of-range result (>= $nedges).
    my $elemtype;
    my $unused_flag;

    if (   $hmin >= -0x7F
        && $hmax <= 0x7F
        && $hmin + 0x7F >= $nedges)
    {
        # int8 will work
        $elemtype    = 'int8';
        $unused_flag = 0x7F;
    }
    elsif ($hmin >= -0x7FFF
        && $hmax <= 0x7FFF
        && $hmin + 0x7FFF >= $nedges)
    {
        # int16 will work
        $elemtype    = 'int16';
        $unused_flag = 0x7FFF;
    }
    elsif ($hmin >= -0x7FFFFFFF
        && $hmax <= 0x7FFFFFFF
        && $hmin + 0x3FFFFFFF >= $nedges)
    {
        # int32 will work
        $elemtype    = 'int32';
        $unused_flag = 0x3FFFFFFF;
    }
    else
    {
        die "hash table values too wide";
    }

    # Set any unvisited hashtable entries to $unused_flag.
    for (my $v = 0; $v < $nverts; $v++)
    {
        $hashtab[$v] = $unused_flag if !$visited[$v];
    }

    return ($elemtype, \@hashtab);
}

1;

pgsql-hackers by date:

Previous
From: Michael Paquier
Date:
Subject: Re: Displaying and dumping of table access methods
Next
From: Joerg Sonnenberger
Date:
Subject: Re: reducing the footprint of ScanKeyword (was Re: Large writablevariables)