netcdf-c/oc2/daplex.c
dmh 370bd15166 Github issue 134:
There is an ambiguity in the DAP2 spec.  Section A.2 of the
dap2 spec says:
  "...The backslash character (.\.) MAY be used as
   a single-character quoting mechanism only within
   quoted-string and comment constructs.
      quoted-pair = "\" CHAR
   ..."
The underlying problem was to allow for " chars inside
strings by using \". However, this definition is overbroad.
It is not stated:
1. if the backslash is to be left in the string or not.
2. There is also an unstated, but related issue of what
   to do about e.g. '\n';convert to newline or not.

This change is to conform to libdap and it does the following:
1. The backslash is left in the string
2. Things like \n are left as is and it is assumed that
   higher level code will decide what to do with e.g. \n.
2015-10-20 15:44:26 -06:00

374 lines
8.8 KiB
C

/* Copyright 2009, UCAR/Unidata and OPeNDAP, Inc.
See the COPYRIGHT file for more information. */
#include "config.h"
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#include "dapparselex.h"
#include "daptab.h"
/* URLCVT: when defined, the word scanner in daplex() converts %HH escapes
   to the byte they encode while collecting a word. */
#undef URLCVT /* NEVER turn this on */
/* DAP2ENCODE: selects the backslash handling used inside quoted strings
   in daplex(). When it is not defined, the alternative ("Non-standard")
   code translates \r \n \f \t and \xDD escapes instead. */
#define DAP2ENCODE
#ifdef DAP2ENCODE
/* KEEPSLASH: keep the backslash of a \c pair in the token text. */
#define KEEPSLASH
#endif
/* Forward */
static void dumptoken(DAPlexstate* lexstate);
static void dapaddyytext(DAPlexstate* lex, int c);
/* tohex() is only declared (and defined) when DAP2ENCODE is off.
   NOTE(review): the URLCVT code path in daplex() also calls tohex(), so
   enabling URLCVT together with DAP2ENCODE would presumably not compile. */
#ifndef DAP2ENCODE
static int tohex(int c);
#endif
/****************************************************/
#ifdef INFORMATIONAL
/* The non-alphanumeric printable ascii characters (reference only) */
static char ascii[] = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~";
/* Define the set of legal nonalphanum characters as specified in the DAP2 spec. */
static char* daplegal ="_!~*'-\"";
#endif
/* Single-character tokens: daplex() returns any of these characters
   as the token value itself. Shared by all three lexer modes. */
static char* ddsworddelims =
"{}[]:;=,";
/* Character classes for words. The "...chars1" sets list the characters
   that may start a word; the "...charsn" sets list the characters that may
   continue one. dapsetwordchars() installs one pair into the lexer state. */
/* DDS: first character, then continuation characters (adds '#') */
static char* ddswordchars1 =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*";
static char* ddswordcharsn =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*#";
/* DAS: continuation characters (the DDS set plus ':'); DAS mode reuses
   ddswordchars1 for the first character */
static char* daswordcharsn =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\.*#:";
/* CE: first and continuation characters are the same set; unlike the
   DDS sets it contains neither '.' nor '*' */
static char* cewordchars1 =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\";
static char* cewordcharsn =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-+_/%\\";
/* Current sets of legal characters */
/*
static char* wordchars1 = NULL;
static char* wordcharsn = NULL;
static char* worddelims = NULL;
*/
/* Keyword spellings recognized by daplex(). The comparison there is
   case-insensitive (strcasecmp). The list is NULL terminated.
   keywords[i] is reported as token keytokens[i], so the two tables
   must be kept in the same order and with the same number of entries. */
static char* keywords[] = {
"alias",
"array",
"attributes",
"byte",
"dataset",
"error",
"float32",
"float64",
"grid",
"int16",
"int32",
"maps",
"sequence",
"string",
"structure",
"uint16",
"uint32",
"url",
"code",
"message",
"program_type",
"program",
NULL /* mark end of the keywords list */
};
/* Token codes parallel to keywords[]; this table has no terminator and
   is only indexed with a position at which keywords[] matched. */
static int keytokens[] = {
SCAN_ALIAS,
SCAN_ARRAY,
SCAN_ATTR,
SCAN_BYTE,
SCAN_DATASET,
SCAN_ERROR,
SCAN_FLOAT32,
SCAN_FLOAT64,
SCAN_GRID,
SCAN_INT16,
SCAN_INT32,
SCAN_MAPS,
SCAN_SEQUENCE,
SCAN_STRING,
SCAN_STRUCTURE,
SCAN_UINT16,
SCAN_UINT32,
SCAN_URL,
SCAN_CODE,
SCAN_MESSAGE,
SCAN_PTYPE,
SCAN_PROG
};
/**************************************************/
/**
 * Hand-written lexer for the DAP2 parser: scan one token from the input.
 *
 * Scanning resumes at lexstate->next. A newline increments
 * lexstate->lineno, any other character <= ' ' (and DEL) is skipped, and
 * '#' starts a comment that runs to the end of the line. A character that
 * matches none of the token classes below is silently skipped.
 *
 * Token classes:
 *  - a character in lexstate->worddelims: the character itself is the
 *    token value and no token text is recorded;
 *  - a double-quoted string: WORD_STRING; the quotes are not part of the
 *    token text. With DAP2ENCODE+KEEPSLASH a \c pair is copied verbatim;
 *  - a run that starts with a lexstate->wordchars1 character and continues
 *    with lexstate->wordcharsn characters: the matching keytokens[] entry
 *    if the text equals a keywords[] entry (ignoring case), else WORD_WORD.
 *
 * On return lexstate->next points at the first unconsumed character and
 * never beyond the terminating NUL of the input.
 *
 * @param lvalp  if non-NULL, receives a duplicate of the token text, or
 *               NULL when the text is empty (delimiters, empty strings,
 *               end of input). The duplicate is also pushed on
 *               lexstate->reclaim.
 * @param state  parser state; its lexstate member holds the lexer state.
 * @return the token code; 0 when the input is exhausted.
 */
int
daplex(YYSTYPE* lvalp, DAPparsestate* state)
{
    DAPlexstate* lexstate = state->lexstate;
    int token;
    int c;
    unsigned int i;
    char* p;
    char* tmp;
    YYSTYPE lval = NULL;

    token = 0;
    ocbytesclear(lexstate->yytext);
    /* invariant: p always points to current char; every branch must leave
       p on the last character it consumed because the loop header then
       advances p by one. */
    for(p=lexstate->next;token==0&&(c=*p);p++) {
        if(c == '\n') {
            lexstate->lineno++;
        } else if(c <= ' ' || c == '\177') {
            /* whitespace: ignore */
        } else if(c == '#') {
            /* single line comment */
            while((c=*(++p))) {if(c == '\n') break;}
            /* p is now on the newline or on the terminating NUL. Back up
               one so that the loop header re-visits that character: a
               newline is then counted in lineno, and a NUL ends the scan
               without stepping past the end of the input. */
            p--;
        } else if(strchr(lexstate->worddelims,c) != NULL) {
            /* don't put in lexstate->yytext to avoid memory leak */
            token = c;
        } else if(c == '"') {
            int more = 1;
            /* We have a string token; will be reported as WORD_STRING */
            while(more && (c=*(++p))) {
                if(c == '"') {
                    more = 0;
                    continue;
                }
#ifdef DAP2ENCODE
                if(c == '\\') {
                    /* Resolve spec ambiguity about handling of \c:
                       1. !KEEPSLASH: convert \c to c for any character c
                       2. KEEPSLASH: convert \c to \c for any character c;
                          that is, keep the backslash.
                       It is clear that the problem being addressed was \".
                       But it is unclear what to do about \n: convert to
                       Ascii LF or leave as \n.
                       This code will leave as \n and assume higher levels
                       of code will address the issue.
                    */
#ifdef KEEPSLASH
                    dapaddyytext(lexstate,c);
#endif
                    c=*(++p);
                    /* backslash was the last character of the input */
                    if(c == '\0') more = 0;
                }
#else /*Non-standard*/
                /* NOTE(review): this branch is not compiled while
                   DAP2ENCODE is defined; it is kept unchanged. */
                switch (c) {
                case '\\':
                    c=*(++p);
                    switch (c) {
                    case 'r': c = '\r'; break;
                    case 'n': c = '\n'; break;
                    case 'f': c = '\f'; break;
                    case 't': c = '\t'; break;
                    case 'x': {
                        int d1,d2;
                        c = '?';
                        ++p;
                        d1 = tohex(*p++);
                        if(d1 < 0) {
                            daperror(state,"Illegal \\xDD in TOKEN_STRING");
                        } else {
                            d2 = tohex(*p++);
                            if(d2 < 0) {
                                daperror(state,"Illegal \\xDD in TOKEN_STRING");
                            } else {
                                c=(((unsigned int)d1)<<4) | (unsigned int)d2;
                            }
                        }
                    } break;
                    default: break;
                    }
                    break;
                default: break;
                }
#endif /*!DAP2ENCODE*/
                if(more) dapaddyytext(lexstate,c);
            }
            /* Unterminated string: p stopped on the terminating NUL rather
               than on a closing quote. Back up so the loop header leaves
               p on the NUL instead of one past it. */
            if(*p == '\0') p--;
            token=WORD_STRING;
        } else if(strchr(lexstate->wordchars1,c) != NULL) {
            int isdatamark = 0;
            /* we have a WORD_WORD */
            dapaddyytext(lexstate,c);
            while((c=*(++p))) {
#ifdef URLCVT
                if(c == '%' && p[1] != 0 && p[2] != 0
                    && strchr(hexdigits,p[1]) != NULL
                    && strchr(hexdigits,p[2]) != NULL) {
                    int d1,d2;
                    d1 = tohex(p[1]);
                    d2 = tohex(p[2]);
                    if(d1 >= 0 || d2 >= 0) {
                        c=(((unsigned int)d1)<<4) | (unsigned int)d2;
                        p+=2;
                    }
                } else {
                    if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;}
                }
                dapaddyytext(lexstate,c);
#else
                /* first non-word character: leave p on the last word char */
                if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;}
                dapaddyytext(lexstate,c);
#endif
            }
            /* Special check for Data: */
            /* NOTE(review): after the loop above p is on the last word
               character or on the terminating NUL, so the *p == ':' test
               looks unreachable for the text "Data"; kept as-is, confirm
               before relying on SCAN_DATA being produced here. */
            tmp = ocbytescontents(lexstate->yytext);
            if(strcmp(tmp,"Data")==0 && *p == ':') {
                dapaddyytext(lexstate,*p); p++;
                if(p[0] == '\n') {
                    token = SCAN_DATA;
                    isdatamark = 1;
                    p++;
                } else if(p[0] == '\r' && p[1] == '\n') {
                    token = SCAN_DATA;
                    isdatamark = 1;
                    p+=2;
                }
            }
            if(!isdatamark) {
                /* check for keyword */
                token=WORD_WORD; /* assume */
                for(i=0;;i++) {
                    if(keywords[i] == NULL) break;
                    if(strcasecmp(keywords[i],tmp)==0) {
                        token=keytokens[i];
                        break;
                    }
                }
            }
            /* The word ran up to the end of the input (the scan loop ended
               on the terminating NUL, not via break): back up so the loop
               header leaves p on the NUL instead of one past it. */
            if(c == '\0') p--;
        } else { /* illegal */
        }
    }
    lexstate->next = p;
    /* Remember the token for later reporting; copies at most
       MAX_TOKEN_LENGTH characters. */
    strncpy(lexstate->lasttokentext,ocbytescontents(lexstate->yytext),MAX_TOKEN_LENGTH);
    lexstate->lasttoken = token;
    if(ocdebug >= 2)
        dumptoken(lexstate);
    /*Put return value onto Bison stack*/
    if(ocbyteslength(lexstate->yytext) == 0)
        lval = NULL;
    else {
        lval = ocbytesdup(lexstate->yytext);
        /* remember the duplicate on the reclaim list */
        oclistpush(lexstate->reclaim,(void*)lval);
    }
    if(lvalp) *lvalp = lval;
    return token; /* Return the type of the token. */
}
/* Append the character c to the text of the token being scanned
   (lex->yytext). */
static void
dapaddyytext(DAPlexstate* lex, int c)
{
ocbytesappend(lex->yytext,c);
}
#ifndef DAP2ENCODE
/* Map one hexadecimal digit character (either case) to its value 0..15.
   Any other character yields -1. */
static int
tohex(int c)
{
    int value = -1;

    if(c >= '0' && c <= '9') {
        value = c - '0';
    } else if(c >= 'A' && c <= 'F') {
        value = 10 + (c - 'A');
    } else if(c >= 'a' && c <= 'f') {
        value = 10 + (c - 'a');
    }
    return value;
}
#endif
/* Debug aid: write the text of the current token to stderr,
   bracketed by '|' so leading/trailing blanks are visible. */
static void
dumptoken(DAPlexstate* lexstate)
{
    const char* text = ocbytescontents(lexstate->yytext);

    fprintf(stderr,"TOKEN = |%s|\n",text);
}
/*
Simple lexer
*/
/*
Select the character classes the lexer uses for words.
kind 0 installs the DDS sets, kind 1 the DAS continuation set
(with the DDS first-character set), kind 2 the CE sets.
Any other kind leaves lexstate untouched.
*/
void
dapsetwordchars(DAPlexstate* lexstate, int kind)
{
    if(kind < 0 || kind > 2)
        return; /* unknown kind: nothing to change */

    /* all three modes share the same single-character delimiters */
    lexstate->worddelims = ddsworddelims;

    if(kind == 2) {
        lexstate->wordchars1 = cewordchars1;
        lexstate->wordcharsn = cewordcharsn;
    } else {
        /* DDS and DAS differ only in the continuation characters */
        lexstate->wordchars1 = ddswordchars1;
        lexstate->wordcharsn = (kind == 0) ? ddswordcharsn : daswordcharsn;
    }
}
/*
Create a lexer state that scans a private copy of input.

input:     NUL terminated text to scan; NULL is treated as the empty string.
lexstatep: receives the new state. If it is NULL nothing is built.
           *lexstatep is set to NULL when any allocation fails, so a
           caller never receives a partially constructed state.

The new state starts with the DDS character classes.
*/
void
daplexinit(char* input, DAPlexstate** lexstatep)
{
    DAPlexstate* lexstate;
    if(lexstatep == NULL) return; /* no point in building it */
    lexstate = (DAPlexstate*)malloc(sizeof(DAPlexstate));
    *lexstatep = lexstate;
    if(lexstate == NULL) return;
    memset((void*)lexstate,0,sizeof(DAPlexstate));
    /* strdup(NULL) is undefined, so substitute an empty input */
    lexstate->input = strdup(input != NULL ? input : "");
    lexstate->yytext = ocbytesnew();
    lexstate->reclaim = oclistnew();
    if(lexstate->input == NULL
       || lexstate->yytext == NULL
       || lexstate->reclaim == NULL) {
        /* daplex() dereferences all three members unconditionally:
           undo the partial construction and report failure through
           *lexstatep, exactly as for a failed malloc above. */
        if(lexstate->input != NULL) free(lexstate->input);
        if(lexstate->yytext != NULL) ocbytesfree(lexstate->yytext);
        if(lexstate->reclaim != NULL) oclistfree(lexstate->reclaim);
        free(lexstate);
        *lexstatep = NULL;
        return;
    }
    lexstate->next = lexstate->input;
    dapsetwordchars(lexstate,0); /* Assume DDS */
}
/*
Release a lexer state built by daplexinit() and set *lexstatep to NULL.

Frees the copy of the input, every string still on the reclaim list
(token texts pushed by daplex() and names pushed by dapdecode()), the
list itself, the token buffer and finally the state.
A NULL lexstatep or a NULL *lexstatep is accepted and ignored.
*/
void
daplexcleanup(DAPlexstate** lexstatep)
{
    DAPlexstate* lexstate;
    /* same guard as daplexinit(): tolerate a missing out-pointer */
    if(lexstatep == NULL) return;
    lexstate = *lexstatep;
    if(lexstate == NULL) return;
    if(lexstate->input != NULL) ocfree(lexstate->input);
    if(lexstate->reclaim != NULL) {
        while(oclistlength(lexstate->reclaim) > 0) {
            char* word = (char*)oclistpop(lexstate->reclaim);
            free(word); /* free(NULL) is a no-op */
        }
        oclistfree(lexstate->reclaim);
    }
    ocbytesfree(lexstate->yytext);
    free(lexstate);
    *lexstatep = NULL;
}
/* Dap identifiers will come to us with some
characters escaped using the URL notation of
%HH. The assumption here is that any character
that is encoded is left encoded, except as follows:
1. if the encoded character is in fact a legal DAP2 character
(alphanum+"_!~*'-\"") then it is decoded, otherwise not.
*/
/* The characters that dapdecode() passes to ocuridecodeonly() when
   DECODE_IDENTIFIERS is not defined: the alphanumerics plus _!~*'-"
   NOTE(review): presumably only %HH escapes that encode one of these
   characters are decoded - confirm against ocuridecodeonly(). */
#ifndef DECODE_IDENTIFIERS
static char* decodelist =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_!~*'-\"";
#endif
/*
Return a decoded version of name.
With DECODE_IDENTIFIERS defined the name goes through ocuridecode();
otherwise through ocuridecodeonly() restricted to decodelist.
The result is also pushed on lexstate->reclaim.
*/
char*
dapdecode(DAPlexstate* lexstate, char* name)
{
#ifdef DECODE_IDENTIFIERS
    char* result = ocuridecode(name);
#else
    char* result = ocuridecodeonly(name,decodelist);
#endif

    oclistpush(lexstate->reclaim,(void*)result);
    return result;
}