mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-02-23 14:09:39 +08:00
613 lines
15 KiB
C
613 lines
15 KiB
C
/*++
|
|
/* NAME
|
|
/* tok_io 3
|
|
/* SUMMARY
|
|
/* token I/O
|
|
/* PACKAGE
|
|
/* unproto
|
|
/* SYNOPSIS
|
|
/* #include "token.h"
|
|
/*
|
|
/* struct token *tok_get()
|
|
/*
|
|
/* void tok_flush(t)
|
|
/* struct token *t;
|
|
/*
|
|
/* void tok_show(t)
|
|
/* struct token *t;
|
|
/*
|
|
/* void tok_show_ch(t)
|
|
/* struct token *t;
|
|
/*
|
|
/* void put_str(s)
|
|
/* char *s;
|
|
/*
|
|
/* void put_ch(c)
|
|
/* int c;
|
|
/*
|
|
/* void put_nl()
|
|
/*
|
|
/* char *in_path;
|
|
/* int in_line;
|
|
/* DESCRIPTION
|
|
/* These functions read from stdin and write to stdout. The
|
|
/* tokenizer keeps track of where the token appeared in the input
|
|
/* stream; on output, this information is used to preserve correct
|
|
/* line number information (even after lots of token lookahead or
|
|
/* after function-header rewriting) so that diagnostics from the
|
|
/* next compiler stage make sense.
|
|
/*
|
|
/* tok_get() reads the next token from standard input. It returns
|
|
/* a null pointer when the end of input is reached.
|
|
/*
|
|
/* tok_show() displays the contents of a (possibly composite) token
|
|
/* on the standard output.
|
|
/*
|
|
/* tok_show_ch() displays the contents of a single-character token
|
|
/* on the standard output. The character should not be a newline.
|
|
/*
|
|
/* tok_flush() displays the contents of a (possibly composite) token
|
|
/* on the standard output and makes it available for re-use.
|
|
/*
|
|
/* put_str() writes a null-terminated string to standard output.
|
|
/* There should be no newline characters in the string argument.
|
|
/*
|
|
/* put_ch() writes one character to standard output. The character
|
|
/* should not be a newline.
|
|
/*
|
|
/* put_nl() outputs a newline character and adjusts the program's idea of
|
|
/* the current output line.
|
|
/*
|
|
/* The in_path and in_line variables contain the file name and
|
|
/* line number of the most recently read token.
|
|
/* BUGS
|
|
/* The tokenizer is just good enough for the unproto filter.
|
|
/* As a benefit, it is quite fast.
|
|
/* AUTHOR(S)
|
|
/* Wietse Venema
|
|
/* Eindhoven University of Technology
|
|
/* Department of Mathematics and Computer Science
|
|
/* Den Dolech 2, P.O. Box 513, 5600 MB Eindhoven, The Netherlands
|
|
/* LAST MODIFICATION
|
|
/* 92/01/15 21:52:59
|
|
/* VERSION/RELEASE
|
|
/* 1.3
|
|
/*--*/
|
|
|
|
/* SCCS-style identification string for this source file. */
static char io_sccsid[] = "@(#) tok_io.c 1.3 92/01/15 21:52:59";
|
|
|
|
/* C library */
|
|
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
|
|
extern char *strchr();
|
|
extern char *malloc();
|
|
extern char *realloc();
|
|
extern char *strcpy();
|
|
|
|
/* Application-specific stuff */
|
|
|
|
#include "token.h"
|
|
#include "vstring.h"
|
|
#include "error.h"
|
|
|
|
extern char *strsave(); /* XXX need include file */
|
|
|
|
/*
 * Source position bookkeeping. The input position (in_path, in_line) is
 * exported; the output position (out_path, out_line) is private to this
 * file. Both positions start at line 1 of the empty default path name.
 */

static char def_path[] = "";		/* default (empty) path name */

/* Input side: position of the most recently read token. */
char   *in_path = def_path;		/* current input file name */
int     in_line = 1;			/* current input line number */

/* Output side: what the output stream currently believes. */
static char *out_path = def_path;	/* last name in output line control */
static int out_line = 1;		/* current output line number */

int     last_ch;			/* type of last output */
|
|
|
|
/*
 * Forward declarations of file-local helpers. These are old-style (K&R)
 * declarations without parameter lists, like the definitions below.
 */

static int backslash_newline();
static int read_quoted();
static void read_comment();
static char *read_hex();
static char *read_octal();
static void fix_line_control();
|
|
|
|
/*
|
|
* Character input with one level of pushback. The INPUT() macro recursively
|
|
* strips backslash-newline pairs from the input stream. The UNPUT() macro
|
|
* should be used only for characters obtained through the INPUT() macro.
|
|
*
|
|
* After skipping a backslash-newline pair, the input line counter is not
|
|
* updated, and we continue with the same logical source line. We just
|
|
* update a counter with the number of backslash-newline sequences that must
|
|
* be accounted for (backslash_newline() updates the counter). At the end of
|
|
* the logical source line, an appropriate number of newline characters is
|
|
* pushed back (in tok_get()). I do not know how GCC handles this, but it
|
|
* seems to produce the same output.
|
|
*
|
|
* Because backslash_newline() recursively calls itself (through the INPUT()
|
|
* macro), we will run out of stack space, given a sufficiently long
|
|
* sequence of backslash-newline pairs.
|
|
*/
|
|
|
|
/*
 * Push-back state for INPUT()/UNPUT().
 * 
 * The push-back slot is an int, not a char: INPUT() must hand back exactly
 * the value that was given to UNPUT(). With a plain char slot, a byte with
 * the high bit set would come back negative on signed-char platforms, and
 * the byte 0xff would be indistinguishable from EOF.
 */

static int in_char = 0;			/* push-back storage */
static int in_flag = 0;			/* pushback available */
static int nl_compensate = 0;		/* line continuation kluge */

/*
 * INPUT(c) stores the next input character in c and yields it. A pushed-back
 * character is returned first. A backslash that is followed by a newline is
 * handed to backslash_newline(); a backslash followed by anything else is
 * returned as is, after the look-ahead character is given back to stdio.
 * 
 * UNPUT(c) saves one character for the next INPUT().
 */

#define INPUT(c) (in_flag ? (in_flag = 0, c = in_char) : \
		    (c = getchar()) != '\\' ? c : \
		    (c = getchar()) != '\n' ? (ungetc(c, stdin), c = '\\') : \
		    (c = backslash_newline()))
#define UNPUT(c) (in_flag = 1, in_char = (c))
|
|
|
|
/*
 * Optional table of directive names that do_control() discards instead of
 * copying to the output. The table exists only when IGNORE_DIRECTIVES is
 * defined at build time, and it is terminated by a null pointer.
 */

#if defined(IGNORE_DIRECTIVES)

static char *ignore_directives[] = {
    IGNORE_DIRECTIVES,
    (char *) 0,
};

#endif
|
|
|
|
/*
 * Modified string and ctype stuff.
 * 
 * STREQUAL() compares the first characters before calling strcmp(). The
 * character class macros treat '_' as a letter and exclude newline from
 * white space. All macro arguments are parenthesized so that arbitrary
 * expressions can be passed; arguments may be evaluated more than once.
 * 
 * ISHEX() uses isxdigit() rather than a strchr() lookup in a digit string:
 * strchr() also matches the string terminator, which would classify a null
 * byte as a hexadecimal digit.
 */

#define STREQUAL(x,y)	(*(x) == *(y) && strcmp((x),(y)) == 0)

#define ISALNUM(c)	(isalnum(c) || (c) == '_')
#define ISALPHA(c)	(isalpha(c) || (c) == '_')
#define ISSPACE(c)	(isspace(c) && (c) != '\n')
#define ISDOT(c)	((c) == '.')
#define ISHEX(c)	(isxdigit(c) != 0)
#define ISOCTAL(c)	(isdigit(c) && (c) != '8' && (c) != '9')
|
|
|
|
/*
 * COLLECT(v,c,cond) - gather a run of characters into vstring v.
 * 
 * The character already held in c is stored first. More characters are then
 * read into c with INPUT() and appended for as long as cond holds. The first
 * character that fails cond is pushed back with UNPUT(); at end of input
 * nothing is pushed back. The result is null terminated. Running out of
 * memory is fatal. The macro expands to a braced statement block.
 */

#define COLLECT(v,c,cond) { \
	register struct vstring *collect_vs = (v); \
	register char *collect_cp = collect_vs->str; \
	*collect_cp++ = (c); \
	for (;;) { \
	    if (INPUT(c) == EOF) \
		break; \
	    if (!(cond)) { \
		UNPUT(c); \
		break; \
	    } \
	    if (VS_ADDCH(collect_vs, collect_cp, c) == 0) \
		fatal("out of memory"); \
	} \
	*collect_cp = 0; \
    }
|
|
|
|
/*
 * CHECK_LINE_CONTROL(p,l) - make the output position agree with (p,l).
 * 
 * Nothing happens when the recorded output path and line already match;
 * otherwise fix_line_control() is called to repair the difference. Paths
 * are compared by pointer value, not by content.
 */

#define CHECK_LINE_CONTROL(p,l) { \
	if (!(out_path == (p) && out_line == (l))) \
	    fix_line_control((p), (l)); \
    }
|
|
|
|
/* do_control - parse control line */
|
|
|
|
static int do_control()
|
|
{
|
|
struct token *t;
|
|
int line;
|
|
char *path;
|
|
|
|
/* Make sure that the directive shows up in the right place. */
|
|
|
|
CHECK_LINE_CONTROL(in_path, in_line);
|
|
|
|
while (t = tok_get()) {
|
|
switch (t->tokno) {
|
|
|
|
case TOK_WSPACE:
|
|
/* Ignore blanks after "#" token. */
|
|
tok_free(t);
|
|
break;
|
|
|
|
case TOK_NUMBER:
|
|
|
|
/*
|
|
* Line control is of the form: number pathname junk. Since we
|
|
* have no idea what junk the preprocessor may generate, we copy
|
|
* all line control tokens to stdout.
|
|
*/
|
|
|
|
put_str("# ");
|
|
line = atoi(t->vstr->str); /* extract line number */
|
|
tok_flush(t);
|
|
while ((t = tok_get()) && t->tokno == TOK_WSPACE)
|
|
tok_flush(t); /* copy white space */
|
|
if (t) { /* extract path name */
|
|
path = (t->tokno == '"') ? strsave(t->vstr->str) : in_path;
|
|
do {
|
|
tok_flush(t); /* copy until newline */
|
|
} while (t->tokno != '\n' && (t = tok_get()));
|
|
}
|
|
out_line = in_line = line; /* synchronize */
|
|
out_path = in_path = path; /* synchronize */
|
|
return;
|
|
|
|
#ifdef IGNORE_DIRECTIVES
|
|
|
|
case TOK_WORD:
|
|
|
|
/*
|
|
* Optionally ignore other #directives. This is only a partial
|
|
* solution, because the preprocessor will still see them.
|
|
*/
|
|
{
|
|
char **cpp;
|
|
char *cp = t->vstr->str;
|
|
|
|
for (cpp = ignore_directives; *cpp; cpp++) {
|
|
if (STREQUAL(cp, *cpp)) {
|
|
do {
|
|
tok_free(t);
|
|
} while (t->tokno != '\n' && (t = tok_get()));
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
/* FALLTHROUGH */
|
|
#endif
|
|
default:
|
|
/* Pass through. */
|
|
put_ch('#');
|
|
do {
|
|
tok_flush(t);
|
|
} while (t->tokno != '\n' && (t = tok_get()));
|
|
return;
|
|
|
|
case 0:
|
|
/* Hit EOF, punt. */
|
|
put_ch('#');
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* backslash_newline - fix up things after reading a backslash-newline pair */
|
|
|
|
static int backslash_newline()
|
|
{
|
|
register int c;
|
|
|
|
nl_compensate++;
|
|
return (INPUT(c));
|
|
}
|
|
|
|
/* tok_get - get next token */
|
|
|
|
static int last_tokno = '\n';
|
|
|
|
struct token *tok_get()
|
|
{
|
|
register struct token *t;
|
|
register int c;
|
|
int d;
|
|
|
|
/*
|
|
* Get one from the pool and fill it in. The loop is here in case we hit
|
|
* a preprocessor control line, which happens in a minority of all cases.
|
|
* We update the token input path and line info *after* backslash-newline
|
|
* processing or the newline compensation would go wrong.
|
|
*/
|
|
|
|
t = tok_alloc();
|
|
|
|
for (;;) {
|
|
if ((INPUT(c)) == EOF) {
|
|
tok_free(t);
|
|
return (0);
|
|
} else if ((t->line = in_line, t->path = in_path), !isascii(c)) {
|
|
t->vstr->str[0] = c;
|
|
t->vstr->str[1] = 0;
|
|
t->tokno = TOK_OTHER;
|
|
break;
|
|
} else if (ISSPACE(c)) {
|
|
COLLECT(t->vstr, c, ISSPACE(c));
|
|
t->tokno = TOK_WSPACE;
|
|
break;
|
|
} else if (ISALPHA(c)) {
|
|
COLLECT(t->vstr, c, ISALNUM(c));
|
|
t->tokno = TOK_WORD;
|
|
break;
|
|
} else if (isdigit(c)) {
|
|
COLLECT(t->vstr, c, isdigit(c));
|
|
t->tokno = TOK_NUMBER;
|
|
break;
|
|
} else if (c == '"' || c == '\'') {
|
|
t->tokno = read_quoted(t->vstr, c); /* detect missing end quote */
|
|
break;
|
|
} else if (ISDOT(c)) {
|
|
COLLECT(t->vstr, c, ISDOT(c));
|
|
t->tokno = TOK_OTHER;
|
|
break;
|
|
} else if (c == '#' && last_tokno == '\n') {
|
|
do_control();
|
|
continue;
|
|
} else {
|
|
t->vstr->str[0] = c;
|
|
if (c == '\n') {
|
|
in_line++;
|
|
if (nl_compensate > 0) { /* compensation for bs-nl */
|
|
UNPUT('\n');
|
|
nl_compensate--;
|
|
}
|
|
} else if (c == '/') {
|
|
if ((INPUT(d)) == '*') {
|
|
t->vstr->str[1] = d; /* comment */
|
|
read_comment(t->vstr);
|
|
t->tokno = TOK_WSPACE;
|
|
break;
|
|
} else {
|
|
if (d != EOF)
|
|
UNPUT(d);
|
|
}
|
|
} else if (c == '\\') {
|
|
t->vstr->str[1] = (INPUT(c) == EOF ? 0 : c);
|
|
t->vstr->str[2] = 0;
|
|
t->tokno = TOK_OTHER;
|
|
break;
|
|
}
|
|
t->vstr->str[1] = 0;
|
|
t->tokno = c;
|
|
break;
|
|
}
|
|
}
|
|
last_tokno = t->tokno;
|
|
t->end_line = in_line;
|
|
return (t);
|
|
}
|
|
|
|
/* read_quoted - read string or character literal, canonicalize escapes */
|
|
|
|
static int read_quoted(vs, ch)
|
|
register struct vstring *vs;
|
|
int ch;
|
|
{
|
|
register char *cp = vs->str;
|
|
register int c;
|
|
int ret = TOK_OTHER;
|
|
|
|
*cp++ = ch;
|
|
|
|
/*
|
|
* Clobber the token type in case of a premature newline or EOF. This
|
|
* prevents us from attempting to concatenate string constants with
|
|
* broken ones that have no closing quote.
|
|
*/
|
|
|
|
while (INPUT(c) != EOF) {
|
|
if (c == '\n') { /* newline in string */
|
|
UNPUT(c);
|
|
break;
|
|
}
|
|
if (VS_ADDCH(vs, cp, c) == 0) /* store character */
|
|
fatal("out of memory");
|
|
if (c == ch) { /* closing quote */
|
|
ret = c;
|
|
break;
|
|
}
|
|
if (c == '\\') { /* parse escape sequence */
|
|
if ((INPUT(c)) == EOF) { /* EOF, punt */
|
|
break;
|
|
} else if (c == 'a') { /* \a -> audible bell */
|
|
if ((cp = vs_strcpy(vs, cp, BELL)) == 0)
|
|
fatal("out of memory");
|
|
} else if (c == 'x') { /* \xhh -> \nnn */
|
|
cp = read_hex(vs, cp);
|
|
} else if (ISOCTAL(c) && ch != '\'') {
|
|
cp = read_octal(vs, cp, c); /* canonicalize \octal */
|
|
} else {
|
|
if (VS_ADDCH(vs, cp, c) == 0) /* \other: leave alone */
|
|
fatal("out of memory");
|
|
}
|
|
}
|
|
}
|
|
*cp = 0;
|
|
return (ret);
|
|
}
|
|
|
|
/* read_comment - stuff a whole comment into one huge token */
|
|
|
|
static void read_comment(vs)
|
|
register struct vstring *vs;
|
|
{
|
|
register char *cp = vs->str + 2; /* skip slash star */
|
|
register int c;
|
|
register int d;
|
|
|
|
while (INPUT(c) != EOF) {
|
|
if (VS_ADDCH(vs, cp, c) == 0)
|
|
fatal("out of memory");
|
|
if (c == '*') {
|
|
if ((INPUT(d)) == '/') {
|
|
if (VS_ADDCH(vs, cp, d) == 0)
|
|
fatal("out of memory");
|
|
break;
|
|
} else {
|
|
if (d != EOF)
|
|
UNPUT(d);
|
|
}
|
|
} else if (c == '\n') {
|
|
in_line++;
|
|
} else if (c == '\\') {
|
|
if ((INPUT(d)) != EOF && VS_ADDCH(vs, cp, d) == 0)
|
|
fatal("out of memory");
|
|
}
|
|
}
|
|
*cp = 0;
|
|
}
|
|
|
|
/*
 * read_hex - rewrite hex escape to three-digit octal escape
 * 
 * Called after the "\x" of an escape sequence was read. The hexadecimal
 * digits that follow are consumed, and their value is appended at cp in vs
 * as three octal digits. Without any digits, an error is reported and a
 * plain 'x' is appended. The result is the updated write pointer.
 */

static char *read_hex(vs, cp)
struct vstring *vs;
register char *cp;
{
    register int c = 0;
    register int i;
    char    buf[BUFSIZ];
    int     limit = (int) sizeof(buf) - 1;	/* room for terminator */
    int     len;
    unsigned val = 0;

    /*
     * Eat up all subsequent hex digits. Complain later when there are too
     * many. One byte of the buffer is reserved for the string terminator.
     */

    for (i = 0; i < limit && (INPUT(c) != EOF) && ISHEX(c); i++)
	buf[i] = c;
    buf[i] = 0;

    /*
     * When the loop stopped before the buffer was full, c holds a character
     * that is not part of the escape: give it back. End of input cannot be
     * pushed back as a character and is simply read again later.
     */

    if (i < limit && c != EOF)
	UNPUT(c);

    /*
     * Convert hex form to three-digit octal form. The three-digit form is
     * used so that strings can be concatenated without problems. Complain
     * about malformed input; truncate the result to at most three octal
     * digits.
     */

    if (i == 0) {
	error("\\x escape sequence without hexadecimal digits");
	if (VS_ADDCH(vs, cp, 'x') == 0)
	    fatal("out of memory");
    } else {
	if (sscanf(buf, "%x", &val) != 1)
	    val = 0;				/* nothing usable was read */
	sprintf(buf, "%03o", val);
	if ((len = strlen(buf)) > 3)
	    error("\\x escape sequence yields non-character value");
	if ((cp = vs_strcpy(vs, cp, buf + len - 3)) == 0)
	    fatal("out of memory");
    }
    return (cp);
}
|
|
|
|
/*
 * read_octal - convert octal escape to three-digit format
 * 
 * Called with the first octal digit of an escape sequence in c. Up to two
 * more octal digits are consumed, and the escape is appended at cp in vs,
 * padded on the left with '0' characters to exactly three digits. The
 * result is the updated write pointer.
 * 
 * obuf holds two '0' pad characters, followed by room for three digits and
 * the string terminator.
 */

static char obuf[] = "00123";

static char *read_octal(vs, cp, c)
register struct vstring *vs;
register char *cp;
register int c;
{
    register int i;

#define buf_input (obuf + 2)

    /* Eat up at most three octal digits. */

    buf_input[0] = c;
    for (i = 1; i < 3 && (INPUT(c) != EOF) && ISOCTAL(c); i++)
	buf_input[i] = c;
    buf_input[i] = 0;

    /*
     * With fewer than three digits, c holds a character that is not part of
     * the escape: give it back. End of input cannot be pushed back as a
     * character and is simply read again later.
     */

    if (i < 3 && c != EOF)
	UNPUT(c);

    /*
     * Leave three-digit octal escapes alone. Convert one-digit and two-digit
     * octal escapes to three-digit form by prefixing them with a suitable
     * number of '0' characters. This is done so that strings can be
     * concatenated without problems.
     */

    if ((cp = vs_strcpy(vs, cp, buf_input + i - 3)) == 0)
	fatal("out of memory");
    return (cp);
}
|
|
|
|
/* put_nl - emit newline and adjust output line count */
|
|
|
|
void put_nl()
|
|
{
|
|
put_ch('\n');
|
|
out_line++;
|
|
}
|
|
|
|
/* fix_line_control - to adjust path and/or line count info in output */
|
|
|
|
static void fix_line_control(path, line)
|
|
register char *path;
|
|
register int line;
|
|
{
|
|
|
|
/*
|
|
* This function is called sporadically, so it should not be a problem
|
|
* that we repeat some of the tests that preceded this function call.
|
|
*
|
|
* Emit a newline if we are not at the start of a line.
|
|
*
|
|
* If we switch files, or if we jump backwards, emit line control. If we
|
|
* jump forward, emit the proper number of newlines to compensate.
|
|
*/
|
|
|
|
if (last_ch != '\n') /* terminate open line */
|
|
put_nl();
|
|
if (path != out_path || line < out_line) { /* file switch or back jump */
|
|
printf("# %d %s\n", out_line = line, out_path = path);
|
|
last_ch = '\n';
|
|
} else { /* forward jump */
|
|
while (line > out_line)
|
|
put_nl();
|
|
}
|
|
}
|
|
|
|
/* tok_show_ch - output single-character token (not newline) */
|
|
|
|
void tok_show_ch(t)
|
|
register struct token *t;
|
|
{
|
|
CHECK_LINE_CONTROL(t->path, t->line);
|
|
|
|
put_ch(t->tokno); /* show token contents */
|
|
}
|
|
|
|
/* tok_show - output (possibly composite) token */
|
|
|
|
void tok_show(t)
|
|
register struct token *t;
|
|
{
|
|
register struct token *p;
|
|
|
|
if (t->tokno == TOK_LIST) {
|
|
register struct token *s;
|
|
|
|
/*
|
|
* This branch is completely in terms of tok_xxx() primitives, so
|
|
* there is no need to check the line control information.
|
|
*/
|
|
|
|
for (s = t->head; s; s = s->next) {
|
|
tok_show_ch(s); /* '(' or ',' or ')' */
|
|
for (p = s->head; p; p = p->next)
|
|
tok_show(p); /* show list element */
|
|
}
|
|
} else {
|
|
register char *cp = t->vstr->str;
|
|
|
|
/*
|
|
* Measurements show that it pays off to give special treatment to
|
|
* single-character tokens. Note that both types of token may cause a
|
|
* change of output line number.
|
|
*/
|
|
|
|
CHECK_LINE_CONTROL(t->path, t->line);
|
|
if (cp[1] == 0) {
|
|
put_ch(*cp); /* single-character token */
|
|
} else {
|
|
put_str(cp); /* multi_character token */
|
|
}
|
|
out_line = t->end_line; /* may span multiple lines */
|
|
for (p = t->head; p; p = p->next)
|
|
tok_show(p); /* trailing blanks */
|
|
}
|
|
}
|