nasm/perllib/phash.ph

# -*- perl -*-
#
# Perfect Minimal Hash Generator written in Perl, which produces
# C output.
#

require 'random_sv_vectors.ph';
require 'crc64.ph';

#
# Compute the prehash for a key
#
# prehash(key, sv, N)
#
sub prehash($$$) {
    my($key, $n, $sv) = @_;
    my @c = crc64($sv, $key);
    my $nmask = ($n << 1) - 2;

    # Create a bipartite graph...
    $k1 = ($c[1] & $nmask) + 0; # low word
    $k2 = ($c[0] & $nmask) + 1; # high word

    return ($k1, $k2);
}

#
# Walk the assignment graph, return true on success
#
sub walk_graph($$$$) {
    my($nodeval,$nodeneigh,$n,$v) = @_;
    my $nx;

    # print STDERR "Vertex $n value $v\n";
    $$nodeval[$n] = $v;

    foreach $nx (@{$$nodeneigh[$n]}) {
	# $nx -> [neigh, hash]
	my ($o, $e) = @$nx;

	# print STDERR "Edge $n,$o value $e: ";
	my $ov;
	if (defined($ov = $$nodeval[$o])) {
	    if ($v+$ov != $e) {
		# Cyclic graph with collision
		# print STDERR "error, should be ", $v+$ov, "\n";
		return 0;
	    } else {
		# print STDERR "ok\n";
	    }
	} else {
	    return 0 unless (walk_graph($nodeval, $nodeneigh, $o, $e-$v));
	}
    }
    return 1;
}

#
# Generate the function assuming a given N.
#
# gen_hash_n(N, sv, \%data, run)
#
sub gen_hash_n($$$$) {
    my($n, $sv, $href, $run) = @_;
    my @keys = keys(%{$href});
    my $i;
    my $gr;
    my ($k, $v);
    my $gsize = 2*$n;
    my @nodeval;
    my @nodeneigh;
    my %edges;

    for ($i = 0; $i < $gsize; $i++) {
	$nodeneigh[$i] = [];
    }

    %edges = ();
    foreach $k (@keys) {
	my ($pf1, $pf2) = prehash($k, $n, $sv);
	($pf1,$pf2) = ($pf2,$pf1) if ($pf1 > $pf2); # Canonicalize order

	my $pf = "$pf1,$pf2";
	my $e = ${$href}{$k};
	my $xkey;

	if (defined($xkey = $edges{$pf})) {
	    next if ($e == ${$href}{$xkey}); # Duplicate hash, safe to ignore
	    if (defined($run)) {
		print STDERR "$run: Collision: $pf: $k with $xkey\n";
	    }
	    return;
	}

	# print STDERR "Edge $pf value $e from $k\n";

	$edges{$pf} = $k;
	push(@{$nodeneigh[$pf1]}, [$pf2, $e]);
	push(@{$nodeneigh[$pf2]}, [$pf1, $e]);
    }

    # Now we need to assign values to each vertex, so that for each
    # edge, the sum of the values for the two vertices give the value
    # for the edge (which is our hash index.)  If we find an impossible
    # situation, the graph was cyclic.
    @nodeval = (undef) x $gsize;

    for ($i = 0; $i < $gsize; $i++) {
	if (scalar(@{$nodeneigh[$i]})) {
	    # This vertex has neighbors (is used)
	    if (!defined($nodeval[$i])) {
		# First vertex in a cluster
		unless (walk_graph(\@nodeval, \@nodeneigh, $i, 0)) {
		    if (defined($run)) {
			print STDERR "$run: Graph is cyclic\n";
		    }
		    return;
		}
	    }
	}
    }

    # for ($i = 0; $i < $n; $i++) {
    #	print STDERR "Vertex ", $i, ": ", $g[$i], "\n";
    # }

    if (defined($run)) {
	printf STDERR "$run: Done: n = $n, sv = [0x%08x, 0x%08x]\n",
	$$sv[0], $$sv[1];
    }

    return ($n, $sv, \@nodeval);
}

#
# Driver for generating the function
#
# gen_perfect_hash(\%data)
#
sub gen_perfect_hash($) {
    my($href) = @_;
    my @keys = keys(%{$href});
    my @hashinfo;
    my ($n, $i, $j, $sv, $maxj);
    my $run = 1;

    # Minimal power of 2 value for N with enough wiggle room.
    # The scaling constant must be larger than 0.5 in order for the
    # algorithm to ever terminate. The higher the scaling constant,
    # the more space does the hash take up, but the less likely is it
    # that an invalid token will require a string comparison.
    my $room = int(scalar(@keys)*1.6);
    $n = 1;
    while ($n < $room) {
	$n <<= 1;
    }

    # Number of times to try...
    $maxj = scalar @random_sv_vectors;

    for ($i = 0; $i < 4; $i++) {
	printf STDERR "%d vectors, trying n = %d...\n",
		scalar @keys, $n;
	for ($j = 0; $j < $maxj; $j++) {
	    $sv = $random_sv_vectors[$j];
	    @hashinfo = gen_hash_n($n, $sv, $href, $run++);
	    return @hashinfo if (@hashinfo);
	}
	$n <<= 1;
    }

    return;
}

#
# Verify that the hash table is actually correct...
#
sub verify_hash_table($$)
{
    my ($href, $hashinfo) = @_;
    my ($n, $sv, $g) = @{$hashinfo};
    my $k;
    my $err = 0;

    foreach $k (keys(%$href)) {
	my ($pf1, $pf2) = prehash($k, $n, $sv);
	my $g1 = ${$g}[$pf1];
	my $g2 = ${$g}[$pf2];

	if ($g1+$g2 != ${$href}{$k}) {
	    printf STDERR "%s(%d,%d): %d+%d = %d != %d\n",
	    $k, $pf1, $pf2, $g1, $g2, $g1+$g2, ${$href}{$k};
	    $err = 1;
	} else {
	    # printf STDERR "%s: %d+%d = %d ok\n",
	    # $k, $g1, $g2, $g1+$g2;
	}
    }

    die "$0: hash validation error\n" if ($err);
}

1;