openldap/build/unproto/tok_io.c
1998-08-09 00:43:13 +00:00

613 lines
15 KiB
C

/*++
/* NAME
/* tok_io 3
/* SUMMARY
/* token I/O
/* PACKAGE
/* unproto
/* SYNOPSIS
/* #include "token.h"
/*
/* struct token *tok_get()
/*
/* void tok_flush(t)
/* struct token *t;
/*
/* void tok_show(t)
/* struct token *t;
/*
/* void tok_show_ch(t)
/* struct token *t;
/*
/* void put_str(s)
/* char *s;
/*
/* void put_ch(c)
/* int c;
/*
/* void put_nl()
/*
/* char *in_path;
/* int in_line;
/* DESCRIPTION
/* These functions read from stdin and write to stdout. The
/* tokenizer keeps track of where the token appeared in the input
/* stream; on output, this information is used to preserve correct
/* line number information (even after lots of token lookahead or
/* after function-header rewriting) so that diagnostics from the
/* next compiler stage make sense.
/*
/* tok_get() reads the next token from standard input. It returns
/* a null pointer when the end of input is reached.
/*
/* tok_show() displays the contents of a (possibly composite) token
/* on the standard output.
/*
/* tok_show_ch() displays the contents of a single-character token
/* on the standard output. The character should not be a newline.
/*
/* tok_flush() displays the contents of a (possibly composite) token
/* on the standard output and makes it available for re-use.
/*
/* put_str() writes a null-terminated string to standard output.
/* There should be no newline characters in the string argument.
/*
/* put_ch() writes one character to standard output. The character
/* should not be a newline.
/*
/* put_nl() outputs a newline character and adjusts the program's idea of
/* the current output line.
/*
/* The in_path and in_line variables contain the file name and
/* line number of the most recently read token.
/* BUGS
/* The tokenizer is just good enough for the unproto filter.
/* As a benefit, it is quite fast.
/* AUTHOR(S)
/* Wietse Venema
/* Eindhoven University of Technology
/* Department of Mathematics and Computer Science
/* Den Dolech 2, P.O. Box 513, 5600 MB Eindhoven, The Netherlands
/* LAST MODIFICATION
/* 92/01/15 21:52:59
/* VERSION/RELEASE
/* 1.3
/*--*/
static char io_sccsid[] = "@(#) tok_io.c 1.3 92/01/15 21:52:59";
/* C library */
#include <stdio.h>
#include <ctype.h>
extern char *strchr();
extern char *malloc();
extern char *realloc();
extern char *strcpy();
/* Application-specific stuff */
#include "token.h"
#include "vstring.h"
#include "error.h"
extern char *strsave(); /* XXX need include file */
/*
 * Stuff to keep track of original source file name and position.
 *
 * The in_* pair describes where the tokenizer currently is in the input; it
 * is overwritten whenever do_control() parses a "# line path" control line,
 * and in_line is bumped for every newline consumed. The out_* pair describes
 * what has been written so far; CHECK_LINE_CONTROL() compares the two worlds
 * (by pointer for the path, by value for the line) and calls
 * fix_line_control() when they disagree.
 */
static char def_path[] = ""; /* default path name, used until line control is seen */
char *in_path = def_path; /* current input file name */
int in_line = 1; /* current input line number */
static char *out_path = def_path; /* last name in output line control */
static int out_line = 1; /* current output line number */
int last_ch; /* type of last output; '\n' means output is at start of line */
/* Forward declarations */
static int read_quoted();
static void read_comment();
static int backslash_newline();
static char *read_hex();
static char *read_octal();
static void fix_line_control();
/*
* Character input with one level of pushback. The INPUT() macro recursively
* strips backslash-newline pairs from the input stream. The UNPUT() macro
* should be used only for characters obtained through the INPUT() macro.
*
* After skipping a backslash-newline pair, the input line counter is not
* updated, and we continue with the same logical source line. We just
* update a counter with the number of backslash-newline sequences that must
* be accounted for (backslash_newline() updates the counter). At the end of
* the logical source line, an appropriate number of newline characters is
* pushed back (in tok_get()). I do not know how GCC handles this, but it
* seems to produce the same output.
*
* Because backslash_newline() recursively calls itself (through the INPUT()
* macro), we will run out of stack space, given a sufficiently long
* sequence of backslash-newline pairs.
*/
/*
 * Push-back storage is an int, not a char: the values handed to UNPUT() come
 * from getchar() and therefore range over 0..255 plus EOF. Storing them in a
 * plain char would turn byte 0xFF into EOF where char is signed (ending the
 * input prematurely) and would turn EOF into byte 0xFF where char is
 * unsigned.
 */
static int in_char = 0;			/* push-back storage */
static int in_flag = 0;			/* pushback available */
static int nl_compensate = 0;		/* line continuation kluge */

/*
 * INPUT(c) - store the next input character (or EOF) in the lvalue c and
 * yield it as the value of the expression. A pushed-back character is
 * returned first; otherwise a backslash that is followed by a newline is
 * skipped (via backslash_newline()), and a backslash followed by anything
 * else is returned as is, with the character after it put back on stdin.
 *
 * UNPUT(c) - push back one character for the next INPUT(). There is room
 * for only one character.
 */
#define INPUT(c) (in_flag ? (in_flag = 0, (c) = in_char) : \
		  ((c) = getchar()) != '\\' ? (c) : \
		  ((c) = getchar()) != '\n' ? (ungetc((c), stdin), (c) = '\\') : \
		  ((c) = backslash_newline()))
#define UNPUT(c) (in_flag = 1, in_char = (c))
/*
 * Directives that should be ignored. The table only exists when the build
 * defines IGNORE_DIRECTIVES; that macro supplies the initializers
 * (presumably a comma-separated list of directive names as string literals
 * -- confirm against the build configuration). The trailing 0 terminates the
 * list for the lookup loop in do_control().
 */
#ifdef IGNORE_DIRECTIVES
static char *ignore_directives[] = {
IGNORE_DIRECTIVES,
0,
};
#endif
/*
 * Modified string and ctype stuff.
 *
 * STREQUAL compares the first characters before calling strcmp(). ISALNUM
 * and ISALPHA accept '_' as well, so that they match C identifiers. ISSPACE
 * excludes newline because newline is a token of its own. ISHEX uses
 * isxdigit(): the former strchr() lookup also matched the terminating null
 * of its digit string, so that a NUL input byte was accepted as a hex digit.
 * All macros may evaluate their argument more than once.
 */
#define STREQUAL(x,y)	(*(x) == *(y) && strcmp((x),(y)) == 0)
#define ISALNUM(c)	(isalnum(c) || (c) == '_')
#define ISALPHA(c)	(isalpha(c) || (c) == '_')
#define ISSPACE(c)	(isspace(c) && (c) != '\n')
#define ISDOT(c)	((c) == '.')
#define ISHEX(c)	(isxdigit(c) != 0)
#define ISOCTAL(c)	(isdigit(c) && (c) != '8' && (c) != '9')
/*
 * COLLECT(v,c,cond) - collect all characters that satisfy one condition.
 *
 * On entry c holds the first character of the token; it is stored at the
 * start of the string buffer of v. Further characters are then read into c
 * with INPUT() and appended with VS_ADDCH() for as long as cond (an
 * expression in terms of c, re-evaluated for every character) holds. The
 * first character that fails the test is pushed back with UNPUT(); EOF just
 * ends the loop. The result is null-terminated. A zero result from
 * VS_ADDCH() is reported as a fatal "out of memory" error.
 *
 * The expansion is a brace block that declares its own locals (vs, cp), so
 * it can only be used where a statement is allowed, and c must be an lvalue.
 */
#define COLLECT(v,c,cond) { \
register struct vstring *vs = v; \
register char *cp = vs->str; \
*cp++ = c; \
while (INPUT(c) != EOF) { \
if (cond) { \
if (VS_ADDCH(vs, cp, c) == 0) \
fatal("out of memory"); \
} else { \
UNPUT(c); \
break; \
} \
} \
*cp = 0; \
}
/*
 * CHECK_LINE_CONTROL(p,l) - ensure that output line information is correct.
 *
 * Compares the given path (by pointer, not by content) and line number with
 * what was last written, and calls fix_line_control() only when they differ.
 */
#define CHECK_LINE_CONTROL(p,l) { if (out_path != (p) || out_line != (l)) \
fix_line_control((p),(l)); }
/*
 * do_control - parse control line
 *
 * Called by tok_get() after it has consumed a '#' at the start of a line.
 * Three kinds of line are recognized:
 *
 * - "# number pathname junk" line control: copied to the output, after which
 * the input and output positions are both set to the given line number and
 * path.
 *
 * - a directive listed in ignore_directives (only when IGNORE_DIRECTIVES is
 * defined): discarded up to and including the newline.
 *
 * - anything else: copied to the output preceded by '#'.
 *
 * The result is always 0; the caller does not use it.
 */
static int do_control()
{
    struct token *t;
    int     line;
    char   *path = in_path;		/* kept when no quoted path follows */
    int     tokno;			/* token type, saved before release */

    /* Make sure that the directive shows up in the right place. */

    CHECK_LINE_CONTROL(in_path, in_line);

    while ((t = tok_get()) != 0) {
	switch (t->tokno) {

	case TOK_WSPACE:
	    /* Ignore blanks after "#" token. */
	    tok_free(t);
	    break;

	case TOK_NUMBER:

	    /*
	     * Line control is of the form: number pathname junk. Since we
	     * have no idea what junk the preprocessor may generate, we copy
	     * all line control tokens to stdout.
	     */

	    put_str("# ");
	    line = atoi(t->vstr->str);		/* extract line number */
	    tok_flush(t);
	    while ((t = tok_get()) != 0 && t->tokno == TOK_WSPACE)
		tok_flush(t);			/* copy white space */
	    if (t != 0) {			/* extract path name */
		if (t->tokno == '"')
		    path = strsave(t->vstr->str);

		/*
		 * The token type is saved first: tok_flush() hands the token
		 * back for re-use, so it must not be looked at afterwards.
		 */
		do {
		    tokno = t->tokno;
		    tok_flush(t);		/* copy until newline */
		} while (tokno != '\n' && (t = tok_get()) != 0);
	    }
	    out_line = in_line = line;		/* synchronize */
	    out_path = in_path = path;		/* synchronize */
	    return (0);

#ifdef IGNORE_DIRECTIVES

	case TOK_WORD:

	    /*
	     * Optionally ignore other #directives. This is only a partial
	     * solution, because the preprocessor will still see them.
	     */
	    {
		char  **cpp;
		char   *cp = t->vstr->str;

		for (cpp = ignore_directives; *cpp; cpp++) {
		    if (STREQUAL(cp, *cpp)) {
			do {
			    tokno = t->tokno;	/* save before release */
			    tok_free(t);
			} while (tokno != '\n' && (t = tok_get()) != 0);
			return (0);
		    }
		}
	    }
	    /* FALLTHROUGH */
#endif

	default:
	    /* Pass through. */
	    put_ch('#');
	    do {
		tokno = t->tokno;		/* save before release */
		tok_flush(t);
	    } while (tokno != '\n' && (t = tok_get()) != 0);
	    return (0);

	case 0:

	    /*
	     * Token type 0. tok_get() reports end of input with a null
	     * pointer, so this is not reached at EOF; it looks reachable
	     * only for a NUL byte in the input. Behavior is left as it was.
	     */
	    put_ch('#');
	    return (0);
	}
    }

    /* Hit EOF right after the "#" (and optional blanks), punt. */

    put_ch('#');
    return (0);
}
/*
 * backslash_newline - fix up things after reading a backslash-newline pair
 *
 * Invoked from the INPUT() macro once it has swallowed a backslash-newline
 * pair. Notes that one more newline must be compensated for later, then
 * fetches and returns the character that follows (which may involve skipping
 * further backslash-newline pairs through the same macro).
 */
static int backslash_newline()
{
    register int next;

    nl_compensate += 1;
    INPUT(next);
    return (next);
}
/*
 * tok_get - get next token
 *
 * Reads one token from standard input and returns it, or returns a null
 * pointer at end of input. The token records the input path and line where
 * it started (path, line) and the input line where it ended (end_line).
 *
 * Token types produced here: TOK_WSPACE (blanks other than newline, and
 * whole comments), TOK_WORD (identifiers), TOK_NUMBER (digit strings),
 * TOK_OTHER (non-ASCII characters, runs of dots, backslash pairs and
 * unterminated literals), the quote character itself for a properly
 * terminated string or character literal, and the character itself for
 * any other single character, including newline.
 *
 * A '#' that directly follows a newline token (or starts the input) is not
 * returned; the whole control line is handled by do_control() instead.
 */

/*
 * Type of the token most recently returned. It starts out as newline so
 * that a '#' at the very beginning of the input counts as a control line.
 */
static int last_tokno = '\n';
struct token *tok_get()
{
register struct token *t;
register int c;
int d;
/*
* Get one from the pool and fill it in. The loop is here in case we hit
* a preprocessor control line, which happens in a minority of all cases.
* We update the token input path and line info *after* backslash-newline
* processing or the newline compensation would go wrong.
*/
t = tok_alloc();
for (;;) {
if ((INPUT(c)) == EOF) {
/* End of input: give the unused token back and report EOF. */
tok_free(t);
return (0);
/*
* The comma expression below stamps the token with the current input
* position before the character is classified; only the isascii() test
* decides whether this branch is taken.
*/
} else if ((t->line = in_line, t->path = in_path), !isascii(c)) {
/* Non-ASCII character: pass it on as a one-character token. */
t->vstr->str[0] = c;
t->vstr->str[1] = 0;
t->tokno = TOK_OTHER;
break;
} else if (ISSPACE(c)) {
COLLECT(t->vstr, c, ISSPACE(c));
t->tokno = TOK_WSPACE;
break;
} else if (ISALPHA(c)) {
COLLECT(t->vstr, c, ISALNUM(c));
t->tokno = TOK_WORD;
break;
} else if (isdigit(c)) {
COLLECT(t->vstr, c, isdigit(c));
t->tokno = TOK_NUMBER;
break;
} else if (c == '"' || c == '\'') {
t->tokno = read_quoted(t->vstr, c); /* detect missing end quote */
break;
} else if (ISDOT(c)) {
COLLECT(t->vstr, c, ISDOT(c));
t->tokno = TOK_OTHER;
break;
} else if (c == '#' && last_tokno == '\n') {
/* Control line: handled elsewhere, then try again with the same token. */
do_control();
continue;
} else {
t->vstr->str[0] = c;
if (c == '\n') {
in_line++;
/*
* Hand back one extra newline for each backslash-newline pair that
* was skipped on this logical line, one per call, so that the line
* count of the output stays in step with the input.
*/
if (nl_compensate > 0) { /* compensation for bs-nl */
UNPUT('\n');
nl_compensate--;
}
} else if (c == '/') {
if ((INPUT(d)) == '*') {
t->vstr->str[1] = d; /* comment */
read_comment(t->vstr);
t->tokno = TOK_WSPACE;
break;
} else {
/* Not a comment: put the lookahead back, '/' is an ordinary token. */
if (d != EOF)
UNPUT(d);
}
} else if (c == '\\') {
/* Backslash and the character after it form one two-character token. */
t->vstr->str[1] = (INPUT(c) == EOF ? 0 : c);
t->vstr->str[2] = 0;
t->tokno = TOK_OTHER;
break;
}
/* Any other single character: the token type is the character itself. */
t->vstr->str[1] = 0;
t->tokno = c;
break;
}
}
last_tokno = t->tokno;
/* A comment token may have advanced in_line past the starting line. */
t->end_line = in_line;
return (t);
}
/*
 * read_quoted - read string or character literal, canonicalize escapes
 *
 * vs receives the literal text, starting with the opening quote ch that the
 * caller has already consumed. The result is the token type: the quote
 * character when the matching closing quote was found, TOK_OTHER when the
 * literal was cut short by a newline (which is pushed back) or by end of
 * input. The latter keeps broken literals out of string concatenation.
 *
 * Escapes are rewritten as follows: \a gets the BELL text, \x goes through
 * read_hex(), and octal escapes go through read_octal() except inside
 * character constants. Any other escaped character is copied unchanged.
 */
static int read_quoted(vs, ch)
register struct vstring *vs;
int     ch;
{
    register char *dst = vs->str;
    register int in;
    int     tokno = TOK_OTHER;		/* until the closing quote is seen */

    *dst++ = ch;

    for (;;) {
	if (INPUT(in) == EOF)
	    break;
	if (in == '\n') {			/* newline in string */
	    UNPUT(in);
	    break;
	}
	if (VS_ADDCH(vs, dst, in) == 0)		/* store character */
	    fatal("out of memory");
	if (in == ch) {				/* closing quote */
	    tokno = in;
	    break;
	}
	if (in != '\\')
	    continue;

	/* The backslash has been stored; now deal with what follows it. */

	if ((INPUT(in)) == EOF)			/* EOF, punt */
	    break;
	switch (in) {
	case 'a':				/* \a -> audible bell */
	    if ((dst = vs_strcpy(vs, dst, BELL)) == 0)
		fatal("out of memory");
	    break;
	case 'x':				/* \xhh -> \nnn */
	    dst = read_hex(vs, dst);
	    break;
	default:
	    if (ISOCTAL(in) && ch != '\'') {
		dst = read_octal(vs, dst, in);	/* canonicalize \octal */
	    } else {
		if (VS_ADDCH(vs, dst, in) == 0)	/* \other: leave alone */
		    fatal("out of memory");
	    }
	    break;
	}
    }
    *dst = 0;
    return (tokno);
}
/*
 * read_comment - stuff a whole comment into one huge token
 *
 * The caller has stored the opening slash and star in the first two
 * positions of vs. Everything up to and including the closing star-slash
 * is appended, the input line counter is advanced for every newline inside
 * the comment, and the result is null-terminated. End of input silently
 * ends an unterminated comment.
 *
 * A backslash gets no special treatment: it does not escape anything inside
 * a comment, so a comment that ends in backslash-star-slash is still closed
 * by that star-slash. (Backslash-newline pairs never get here; INPUT()
 * strips them.)
 */
static void read_comment(vs)
register struct vstring *vs;
{
    register char *cp = vs->str + 2;	/* skip slash star */
    register int c;
    register int d;

    while (INPUT(c) != EOF) {
	if (VS_ADDCH(vs, cp, c) == 0)
	    fatal("out of memory");
	if (c == '*') {
	    if ((INPUT(d)) == '/') {		/* end of comment */
		if (VS_ADDCH(vs, cp, d) == 0)
		    fatal("out of memory");
		break;
	    } else {
		if (d != EOF)			/* may be another star */
		    UNPUT(d);
	    }
	} else if (c == '\n') {
	    in_line++;
	}
    }
    *cp = 0;
}
/*
 * read_hex - rewrite hex escape to three-digit octal escape
 *
 * Called after backslash-x has been read; the backslash is already in the
 * output string. cp is the current write position in vs. The result is the
 * updated write position.
 */
static char *read_hex(vs, cp)
struct vstring *vs;
register char *cp;
{
    register int c;
    register int i;
    char    buf[BUFSIZ];
    int     limit = (int) sizeof(buf) - 1;	/* leave room for the null */
    int     len;
    unsigned val;

    /*
     * Eat up all subsequent hex digits. Complain later when there are too
     * many. One byte of the buffer is reserved for the terminator, so that
     * an absurdly long digit string cannot write past the end.
     */

    for (i = 0; i < limit; i++) {
	if (INPUT(c) == EOF || !ISHEX(c))
	    break;
	buf[i] = c;
    }
    buf[i] = 0;

    /*
     * When the loop stopped before the buffer was full, c holds a lookahead
     * that is not part of the escape. Give it back, unless it is EOF, which
     * is not a character.
     */

    if (i < limit && c != EOF)
	UNPUT(c);

    /*
     * Convert hex form to three-digit octal form. The three-digit form is
     * used so that strings can be concatenated without problems. Complain
     * about malformed input; truncate the result to at most three octal
     * digits.
     */

    if (i == 0) {
	error("\\x escape sequence without hexadecimal digits");
	if (VS_ADDCH(vs, cp, 'x') == 0)
	    fatal("out of memory");
    } else {
	(void) sscanf(buf, "%x", &val);
	sprintf(buf, "%03o", val);
	if ((len = strlen(buf)) > 3)
	    error("\\x escape sequence yields non-character value");
	if ((cp = vs_strcpy(vs, cp, buf + len - 3)) == 0)
	    fatal("out of memory");
    }
    return (cp);
}
/*
 * read_octal - convert octal escape to three-digit format
 *
 * Called after a backslash and the first octal digit c have been read; the
 * backslash is already in the output string. cp is the current write
 * position in vs. The result is the updated write position.
 *
 * obuf provides two '0' characters in front of the digit area, so that a
 * short escape can be padded by simply starting the copy further to the
 * left.
 */
static char obuf[] = "00123";

static char *read_octal(vs, cp, c)
register struct vstring *vs;
register char *cp;
register int c;
{
    register int i;

#define buf_input (obuf + 2)

    /* Eat up at most three octal digits. */

    buf_input[0] = c;
    for (i = 1; i < 3 && (INPUT(c) != EOF) && ISOCTAL(c); i++)
	buf_input[i] = c;
    buf_input[i] = 0;

    /*
     * With fewer than three digits, c holds a lookahead that is not part of
     * the escape. Give it back, unless it is EOF, which is not a character.
     */

    if (i < 3 && c != EOF)
	UNPUT(c);

    /*
     * Leave three-digit octal escapes alone. Convert one-digit and two-digit
     * octal escapes to three-digit form by prefixing them with a suitable
     * number of '0' characters. This is done so that strings can be
     * concatenated without problems.
     */

    if ((cp = vs_strcpy(vs, cp, buf_input + i - 3)) == 0)
	fatal("out of memory");
    return (cp);
}
/*
 * put_nl - emit newline and adjust output line count
 *
 * Writes a newline through put_ch() and then advances out_line by one.
 */
void    put_nl()
{
    put_ch('\n');
    out_line += 1;
}
/*
 * fix_line_control - to adjust path and/or line count info in output
 *
 * Brings the output position in line with the given path and line number.
 * An unfinished output line is terminated first. A forward jump within the
 * same file is made up for with newlines; a change of file or a jump
 * backwards gets a "# line path" control line instead.
 *
 * This function is called sporadically, so it does not matter that it
 * repeats the comparison that CHECK_LINE_CONTROL() already made.
 */
static void fix_line_control(path, line)
register char *path;
register int line;
{
    if (last_ch != '\n')			/* terminate open line */
	put_nl();

    if (path == out_path && line >= out_line) {	/* forward jump */
	while (out_line < line)
	    put_nl();
	return;
    }

    /* File switch or back jump. */

    out_line = line;
    out_path = path;
    printf("# %d %s\n", line, path);
    last_ch = '\n';
}
/*
 * tok_show_ch - output single-character token (not newline)
 *
 * The token type doubles as the character to be written. The output
 * position is brought in line with the token first.
 */
void    tok_show_ch(t)
register struct token *t;
{
    int     ch = t->tokno;

    CHECK_LINE_CONTROL(t->path, t->line);
    put_ch(ch);					/* show token contents */
}
/*
 * tok_show - output (possibly composite) token
 *
 * A plain token is written as its text, followed by whatever tokens hang off
 * its head pointer (trailing blanks). A TOK_LIST token is written as a
 * sequence of sub-lists: each one is a single-character token ('(' or ','
 * or ')') followed by the tokens of that list element.
 */
void    tok_show(t)
register struct token *t;
{
    register struct token *p;

    if (t->tokno != TOK_LIST) {
	register char *text = t->vstr->str;

	/*
	 * Single-character tokens get the cheaper output routine; this was
	 * found to be worth the trouble. Either kind of token may move the
	 * output to another line.
	 */

	CHECK_LINE_CONTROL(t->path, t->line);
	if (text[1] != 0) {
	    put_str(text);			/* multi_character token */
	} else {
	    put_ch(*text);			/* single-character token */
	}
	out_line = t->end_line;			/* may span multiple lines */

	p = t->head;
	while (p) {
	    tok_show(p);			/* trailing blanks */
	    p = p->next;
	}
    } else {
	register struct token *s;

	/*
	 * Everything here goes through tok_xxx() primitives, which look
	 * after the line control information themselves.
	 */

	s = t->head;
	while (s) {
	    tok_show_ch(s);			/* '(' or ',' or ')' */
	    p = s->head;
	    while (p) {
		tok_show(p);			/* show list element */
		p = p->next;
	    }
	    s = s->next;
	}
    }
}