nasm/nasmlib/perfhash.pl

#!/usr/bin/perl
## --------------------------------------------------------------------------
##
##   Copyright 1996-2017 The NASM Authors - All Rights Reserved
##   See the file AUTHORS included with the NASM distribution for
##   the specific copyright holders.
##
##   Redistribution and use in source and binary forms, with or without
##   modification, are permitted provided that the following
##   conditions are met:
##
##   * Redistributions of source code must retain the above copyright
##     notice, this list of conditions and the following disclaimer.
##   * Redistributions in binary form must reproduce the above
##     copyright notice, this list of conditions and the following
##     disclaimer in the documentation and/or other materials provided
##     with the distribution.
##
##     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
##     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
##     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
##     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
##     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
##     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
##     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
##     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
##     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
##     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
##     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
##     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
##     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
##
## --------------------------------------------------------------------------

#
# Generate a perfect hash for general case-insensitive string-to-enum
# lookup.  This generates an enum and the corresponding hash, but
# relies on a common function to parse the hash.
#
# Usage:
#      perfhash.pl h foohash.dat foohash.h (to generate C header)
#      perfhash.pl c foohash.dat foohash.c (to generate C source)
#

use strict;

require 'phash.ph';

sub basename($) {
    my($s) = @_;
    $s =~ s/^.*[^-[:alnum:]_\.]//;	# Remove path component as best we can
    return $s;
}

sub intval($) {
    my($s) = @_;

    if ($s =~ /^0/) {
	return oct($s);		# Handles octal or hexadecimal
    } elsif ($s =~ /^\-(0.*)$/) {
	return -oct($1);
    } else {
	return $s + 0;		# Forcibly convert to number
    }
}

my($output, $infile, $outfile) = @ARGV;
my $me = basename($0);

# The following special things are allowed in the input file:
# #<space> or ; begins a comment
# #include filename
# #name str
#    The name of the hash
# #prefix str
#    Defines the prefix before enum
# #guard str
#    Defines the header guard string
# #special str [= value]
#    Generate an enum value without a corresponding string; not capitalized.
# #header str
#    Indicates the name of the .h file to include from the .c file
# #errval str
#    Define the value to be returned if a string is not found
#    (defaults to -1).  This can be any constant C expression,
#    including one of the enum values.
#
# Regular lines are just str [= value]
#
# Enumeration is generated in the order listed in the file, just as in C
# specifying a value causes the values to increase by 1 from that point on
# unless specified.

my $name;
my $prefix;
my $guard;
my $hfile;

my %strings = ();
my %specials = ();
my $next_value = 0;
my $errval = '-1';

my @incstack =  ();
my @filenames = ($infile);
my @linenums =  (0);
my $dd = undef;
my $err = 0;

while (scalar(@filenames)) {
    if (!defined($dd)) {
	open($dd, '<', $filenames[-1])
	    or die "$0: cannot open: $filenames[-1]: $!\n";
    }

    my $line = <$dd>;
    if (!defined($line)) {
	close($dd);
	$dd = pop @incstack;
	pop @filenames;
	pop @linenums;
	next;
    }

    $linenums[-1]++;

    chomp $line;
    $line =~ s/\s*(|\;.*|\#\s.*|\#)$//; # Remove comments and trailing space
    $line =~ s/^\s+//;			# Remove leading space
    if ($line eq '') {
	# Do nothing
    } elsif ($line =~ /^\#name\s+([[:alnum:]_]+)$/) {
	$name = $1;
    } elsif ($line =~ /^\#prefix\s+([[:alnum:]_]+)$/) {
	$prefix = $1;
    } elsif ($line =~ /^\#guard\s+([[:alnum:]_]+)$/) {
	$guard = $1;
    } elsif ($line =~ /^\#errval\s+(\S.*)$/) {
	$errval = $1;
    } elsif ($line =~ /^\#header\s+(\"(.+)\"|\S+)$/) {
	$hfile = ($2 ne '') ? $2 : $1;
    } elsif ($line =~ /^\#include\s+(\"(.+)\"|\S+)$/) {
	push @incstack, $dd;
	push @filenames, (($2 ne '') ? $2 : $1);
	push @linenums, 0;
	undef $dd;		# Open a new file
    } elsif ($line =~ /^(|\#special\s+)(\S+)\s*(|=\s*(\-?(0[Xx][[:xdigit:]]+|0[0-7]*|[0-9]+)))$/) {
	$next_value = intval($4) if ($4 ne '');
	if ($1 eq '') {
	    $strings{$2} = $next_value++;
	} else {
	    $specials{$2} = $next_value++;
	}
    } else {
	printf STDERR "%s:%d:%s syntax error: \"%s\"\n",
	    $filenames[-1], $linenums[-1],
	    (scalar(@incstack) == 1) ? '' : "(from $infile)", $line;
	$err++;
    }
}

exit 1 if ($err);

# Default name, prefix, and header guard name
if (!defined($name)) {
    $name = basename($infile);
    $name =~ s/(\..*)$//;		# Strip extension, if any
}
if (!defined($prefix)) {
    $prefix = "\U${name}\E_";
}
if (!defined($hfile)) {
    $hfile = $outfile;
    $hfile =~ s/\.c$/\.h/;
}
if (!defined($guard)) {
    $guard = basename($hfile);
    $guard =~ s/[^[:alnum:]_]/_/g;
    $guard =~ s/__+/_/g;
    $guard =  "\U$guard";
}

# Verify input.  We can't have more than one constant with the same
# enumeration value, nor the same enumeration string.
if (scalar(keys(%strings)) == 0) {
    die "$0: $infile: no strings to hash!\n";
}

my %enums;
my %enumvals;
my %stringbyval;
my $max_enum;
my $tbllen = 0;
my $tbloffs;
foreach my $s (keys(%strings)) {
    my $es = "${prefix}\U${s}";
    $es =~ s/[^[:alnum:]_]/_/g;
    $es =~ s/__+/_/g;
    my $v = $strings{$s};
    $stringbyval{$v} = $s;
    if (defined($enums{$es})) {
	printf STDERR "%s: string \"%s\" duplicates existing enum %s\n",
	    $infile, $s, $es;
	$err++;
    } else {
	$enums{$es} = $v;
    }
    if (defined($enumvals{$v})) {
	printf STDERR "%s: string \"%s\" duplicates existing enum constant %d\n", $v;
	$err++;
    } else {
	$enumvals{$v} = $es;
    }
    $max_enum = $v if (!defined($max_enum) || $v > $max_enum);
    $tbloffs = $v if (!defined($tbloffs) || $v < $tbloffs);
    $tbllen = $v+1 if (!defined($tbllen) || $v >= $tbllen);
}
foreach my $s (keys(%specials)) {
    my $es = $prefix . $s;	# No string mangling here
    my $v = $specials{$s};
    if (defined($enums{$es})) {
	printf STDERR "%s: special \"%s\" duplicates existing enum %s\n",
	    $infile, $s, $es;
	$err++;
    } else {
	$enums{$es} = $v;
    }
    if (defined ($enumvals{$v})) {
	printf STDERR "%s: special \"%s\" duplicates existing enum constant %d\n", $v;
	$err++;
    } else {
	$enumvals{$v} = $es;
    }
    $max_enum = $v if ($v > $max_enum || !defined($max_enum));
}

$tbllen -= $tbloffs;
if ($tbllen > 65536) {
    printf STDERR "%s: span of enumeration values too large\n";
    $err++;
}

exit 1 if ($err);

open(F, '>', $outfile)
    or die "$0: cannot create: ${outfile}: $!\n";

if ($output eq 'h') {
    print F "/*\n";
    print F " * This file is generated from $infile\n";
    print F " * by $me; do not edit.\n";
    print F " */\n";
    print F "\n";

    print F "#ifndef $guard\n";
    print F "#define $guard 1\n\n";
    print F "#include \"perfhash.h\"\n\n";

    my $c = '{';
    $next_value = 0;
    print F "enum ${name} ";
    foreach my $v (sort { $a <=> $b } keys(%enumvals)) {
	my $s = $enumvals{$v};
	print F "$c\n    $s";
	print F " = $v" if ($v != $next_value);
	$next_value = $v + 1;
	$c = ',';
    }
    print F "\n};\n\n";
    print F "extern const struct perfect_hash ${name}_hash;\n";
    printf F "extern const char * const %s_tbl[%d];\n", $name, $tbllen;

    print F "\nstatic inline enum ${name} ${name}_find(const char *str)\n";
    print F "{\n";
    print F "    return perfhash_find(&${name}_hash, str);\n";
    print F "}\n";

    print F "\nstatic inline const char * ${name}_name(enum ${name} x)\n";
    print F "{\n";
    printf F  "    size_t ix = (size_t)x - (%d);\n", $tbloffs;
    printf F "    if (ix >= %d)\n", $tbllen;
    print F  "        return NULL;\n";
    print F  "    return ${name}_tbl[ix];\n";
    print F  "}\n";

    print F "\nstatic inline const char * ${name}_dname(enum ${name} x)\n";
    print F "{\n";
    print F "    const char *y = ${name}_name(x);\n";
    print F "    return y ? y : invalid_enum_str(x);\n";
    print F "}\n";

    print F "\n#endif /* $guard */\n";
} elsif ($output eq 'c') {
    # The strings we hash must all be lower case, even if the string
    # table doesn't contain them that way.

    my %lcstrings;
    foreach my $s (keys(%strings)) {
	my $ls = "\L$s";
	if (defined($lcstrings{$ls})) {
	    printf STDERR "%s: strings \"%s\" and \"%s\" differ only in case\n",
		$infile, $s, $strings{$lcstrings{$s}};
	} else {
	    $lcstrings{$ls} = $strings{$s} - $tbloffs;
	}
    }

    my @hashinfo = gen_perfect_hash(\%lcstrings);
    if (!@hashinfo) {
	die "$0: no hash found\n";
    }

    # Paranoia...
    verify_hash_table(\%lcstrings, \@hashinfo);

    my ($n, $sv, $g) = @hashinfo;

    die if ($n & ($n-1));

    print F "/*\n";
    print F " * This file is generated from $infile\n";
    print F " * by $me; do not edit.\n";
    print F " */\n";
    print F "\n";

    print F "#include \"$hfile\"\n\n";

    printf F "const char * const %s_tbl[%d] = ", $name, $tbllen;
    my $c = '{';
    for (my $i = $tbloffs; $i < $tbloffs+$tbllen; $i++) {
	printf F "%s\n    %s", $c,
	    defined($stringbyval{$i}) ? '"'.$stringbyval{$i}.'"' : 'NULL';
	$c = ',';
    }
    print F "\n};\n\n";

    print F "#define UNUSED_HASH_ENTRY (65536/3)\n\n";

    printf F "static const int16_t %s_hashvals[%d] = ", $name, $n*2;
    $c = '{';
    for (my $i = 0; $i < $n; $i++) {
	my $h = ${$g}[$i*2+0];
	print F "$c\n    ", defined($h) ? $h : 'UNUSED_HASH_ENTRY';
	$c = ',';
    }
    for (my $i = 0; $i < $n; $i++) {
	my $h = ${$g}[$i*2+1];
	print F "$c\n    ", defined($h) ? $h : 'UNUSED_HASH_ENTRY';
	$c = ',';
    }
    print F "\n};\n\n";

    print F "const struct perfect_hash ${name}_hash = {\n";
    printf F "    UINT64_C(0x%08x%08x),\n", $$sv[0], $$sv[1]; # crcinit
    printf F "    UINT32_C(0x%x),\n", $n-1;		      # hashmask
    printf F "    UINT32_C(%u),\n", $tbllen;		      # tbllen
    printf F "    %d,\n", $tbloffs;			      # tbloffs
    printf F "    (%s),\n", $errval;			      # errval
    printf F "    ${name}_hashvals,\n";			      # hashvals
    printf F "    ${name}_tbl\n";			      # strings
    print F "};\n";
}