2007-08-31 04:15:25 +08:00
|
|
|
# -*- perl -*-
|
|
|
|
#
|
|
|
|
# Perfect Minimal Hash Generator written in Perl, which produces
|
|
|
|
# C output.
|
|
|
|
#
|
|
|
|
|
2007-09-01 02:10:23 +08:00
|
|
|
require 'random_sv_vectors.ph';
|
2007-10-03 08:40:00 +08:00
|
|
|
require 'crc64.ph';
|
2007-08-31 04:15:25 +08:00
|
|
|
|
|
|
|
#
# Compute the prehash for a key
#
# prehash(key, N, sv)
#
#   key - the key to hash
#   N   - number of vertices in each half of the graph; it is used as
#         a mask (N-1), so it is expected to be a power of two
#   sv  - seed value handed to crc64() together with the key
#
# Returns a two-element list (k1, k2) of vertex numbers in the range
# 0 .. 2*N-1.  k1 is always even and k2 is always odd, so the two ends
# of an edge can never be the same vertex.
#
sub prehash($$$) {
    my($key, $n, $sv) = @_;

    # crc64() is defined in crc64.ph; its result is used here as a
    # two-element list, element 0 being the high word and element 1
    # the low word.
    my @c = crc64($sv, $key);
    my $mask = $n - 1;

    # Create a bipartite graph...
    # (declared with "my" so the results do not leak into package globals)
    my $k1 = (($c[1] & $mask) << 1) + 0; # low word
    my $k2 = (($c[0] & $mask) << 1) + 1; # high word

    return ($k1, $k2);
}
|
|
|
|
|
|
|
|
#
# Walk the assignment graph, return true on success
#
# walk_graph(\@nodeval, \@nodeneigh, vertex, value)
#
# Assigns "value" to "vertex" in @nodeval and then visits every edge
# leaving that vertex.  Each entry of $nodeneigh[vertex] is a pair
# [neighbor, edge value].  A neighbor without a value yet is assigned
# (edge value - value) recursively; a neighbor which already has one
# must satisfy value + neighbor value == edge value.
#
# Returns 1 if every reachable edge is satisfied, 0 as soon as one
# is not.
#
sub walk_graph($$$$) {
    my($nodeval,$nodeneigh,$n,$v) = @_;

    $nodeval->[$n] = $v;

    for my $edge (@{ $nodeneigh->[$n] }) {
        my ($peer, $want) = @{$edge};
        my $peerval = $nodeval->[$peer];

        if (!defined($peerval)) {
            # Unvisited neighbor: its value is forced by this edge
            walk_graph($nodeval, $nodeneigh, $peer, $want - $v)
                or return 0;
            next;
        }

        # Already visited: a mismatch means a cyclic graph with collision
        return 0 if ($v + $peerval != $want);
    }

    return 1;
}
|
|
|
|
|
|
|
|
#
# Generate the function assuming a given N.
#
# gen_hash_n(N, sv, \%data, run)
#
#   N      - number of vertices in each half of the graph
#   sv     - seed value; passed on to prehash() and printed as
#            $$sv[0], $$sv[1] in the "Done" message
#   \%data - reference to a hash mapping each key to its desired
#            hash value
#   run    - if defined, diagnostics tagged with this value are
#            written to STDERR
#
# Returns the list (N, sv, \@nodeval) on success, where @nodeval holds
# one value per vertex (undef for unused vertices).  Returns an empty
# list if two keys with different values map to the same edge, or if
# the graph cannot be assigned consistently.
#
sub gen_hash_n($$$$) {
    my($n, $sv, $href, $run) = @_;
    my @keys = keys(%{$href});
    my $gsize = 2*$n;
    my @nodeval;
    my @nodeneigh;
    my %edges;

    for (my $i = 0; $i < $gsize; $i++) {
        $nodeneigh[$i] = [];
    }

    foreach my $k (@keys) {
        my ($pf1, $pf2) = prehash($k, $n, $sv);
        ($pf1,$pf2) = ($pf2,$pf1) if ($pf1 > $pf2); # Canonicalize order

        my $pf = "$pf1,$pf2";
        my $e = ${$href}{$k};
        my $xkey;

        if (defined($xkey = $edges{$pf})) {
            next if ($e == ${$href}{$xkey}); # Duplicate hash, safe to ignore
            if (defined($run)) {
                print STDERR "$run: Collision: $pf: $k with $xkey\n";
            }
            return;
        }

        # print STDERR "Edge $pf value $e from $k\n";

        # Remember which key owns this edge and record it on both ends
        $edges{$pf} = $k;
        push(@{$nodeneigh[$pf1]}, [$pf2, $e]);
        push(@{$nodeneigh[$pf2]}, [$pf1, $e]);
    }

    # Now we need to assign values to each vertex, so that for each
    # edge, the sum of the values for the two vertices give the value
    # for the edge (which is our hash index.)  If we find an impossible
    # situation, the graph was cyclic.
    @nodeval = (undef) x $gsize;

    for (my $i = 0; $i < $gsize; $i++) {
        if (scalar(@{$nodeneigh[$i]})) {
            # This vertex has neighbors (is used)
            if (!defined($nodeval[$i])) {
                # First vertex in a cluster
                unless (walk_graph(\@nodeval, \@nodeneigh, $i, 0)) {
                    if (defined($run)) {
                        print STDERR "$run: Graph is cyclic\n";
                    }
                    return;
                }
            }
        }
    }

    # for (my $i = 0; $i < $gsize; $i++) {
    #     print STDERR "Vertex ", $i, ": ", $nodeval[$i], "\n";
    # }

    if (defined($run)) {
        printf STDERR "$run: Done: n = $n, sv = [0x%08x, 0x%08x]\n",
            $$sv[0], $$sv[1];
    }

    return ($n, $sv, \@nodeval);
}
|
|
|
|
|
|
|
|
#
# Driver for generating the function
#
# gen_perfect_hash(\%data)
#
#   \%data - reference to a hash mapping each key to its desired
#            hash value
#
# Tries up to four successive power-of-two sizes, and for each size
# every seed in @random_sv_vectors, until gen_hash_n() succeeds.
# Progress is reported on STDERR.
#
# Returns whatever gen_hash_n() returned for the first successful
# attempt, i.e. (N, sv, \@nodeval), or an empty list if every attempt
# failed.
#
sub gen_perfect_hash($) {
    my($href) = @_;
    my @keys = keys(%{$href});
    my @hashinfo;
    # Parenthesized so that all five are lexicals, not only $n
    my ($n, $i, $j, $sv, $maxj);
    my $run = 1;

    # The seed table is a package variable set up by random_sv_vectors.ph
    our @random_sv_vectors;

    # Minimal power of 2 value for N with enough wiggle room.
    # The scaling constant must be larger than 0.5 in order for the
    # algorithm to ever terminate.
    my $room = int(scalar(@keys)*0.8);
    $n = 1;
    while ($n < $room) {
        $n <<= 1;
    }

    # Number of times to try...
    $maxj = scalar @random_sv_vectors;

    for ($i = 0; $i < 4; $i++) {
        printf STDERR "%d vectors, trying n = %d...\n",
            scalar @keys, $n;
        for ($j = 0; $j < $maxj; $j++) {
            $sv = $random_sv_vectors[$j];
            @hashinfo = gen_hash_n($n, $sv, $href, $run++);
            return @hashinfo if (@hashinfo);
        }
        # No seed worked at this size; double it and try again
        $n <<= 1;
    }

    return;
}
|
|
|
|
|
|
|
|
#
# Verify that the hash table is actually correct...
#
# verify_hash_table(\%data, \@hashinfo)
#
#   \%data     - reference to a hash mapping each key to its expected
#                hash value
#   \@hashinfo - reference to the list [N, sv, \@g], where @g holds
#                the per-vertex values
#
# For every key the two vertices given by prehash() are looked up in
# @g and their sum is compared with the expected value.  Each mismatch
# is reported on STDERR; if there was any, the program dies once all
# keys have been checked.
#
sub verify_hash_table($$)
{
    my ($href, $hashinfo) = @_;
    my ($n, $sv, $g) = @{$hashinfo};
    my $err = 0;

    for my $key (keys(%$href)) {
        my ($pf1, $pf2) = prehash($key, $n, $sv);
        my $g1 = $g->[$pf1];
        my $g2 = $g->[$pf2];
        my $want = $href->{$key};

        # printf STDERR "%s: %d+%d = %d ok\n",
        #     $key, $g1, $g2, $g1+$g2;
        next if ($g1+$g2 == $want);

        printf STDERR "%s(%d,%d): %d+%d = %d != %d\n",
            $key, $pf1, $pf2, $g1, $g2, $g1+$g2, $want;
        $err = 1;
    }

    die "$0: hash validation error\n" if ($err);
}
|
|
|
|
|
|
|
|
1;
|