diff --git a/ChangeLog b/ChangeLog index a2233424..fbcc99cf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Fri Sep 26 15:50:44 CEST 2003 Daniel Veillard + + * doc/libxml2-api.xml: rebuilt the API + * xmllint.c doc/xmllint.1 doc/xmllint.xml: added the new options + --nocdata and --nsclean to remove CDATA sections and superfluous + namespace declarations + * parser.c SAX2.c: implementation of the 2 new options + Fri Sep 26 14:41:53 CEST 2003 Daniel Veillard * HTMLparser.c testHTML.c xmllint.c include/libxml/HTMLparser.h: diff --git a/SAX2.c b/SAX2.c index ead3d33a..0625b8a9 100644 --- a/SAX2.c +++ b/SAX2.c @@ -1608,9 +1608,11 @@ xmlSAX2TextNode(xmlParserCtxtPtr ctxt, const xmlChar *str, int len) { if (ctxt->dictNames) { xmlChar cur = str[len]; - if ((len <= 3) && ((cur == '"') || (cur == '\'') || (cur == '<'))) { + if ((len <= 3) && ((cur == '"') || (cur == '\'') || + ((cur == '<') && (str[len + 1] != '!')))) { intern = xmlDictLookup(ctxt->dict, str, len); - } else if (IS_BLANK(*str) && (len < 60) && (cur == '<')) { + } else if (IS_BLANK(*str) && (len < 60) && (cur == '<') && + (str[len + 1] != '!')) { int i; for (i = 1;i < len;i++) { diff --git a/doc/libxml2-api.xml b/doc/libxml2-api.xml index 326d2ff3..b120ac3f 100644 --- a/doc/libxml2-api.xml +++ b/doc/libxml2-api.xml @@ -986,6 +986,11 @@ + + + + + @@ -995,6 +1000,13 @@ + + + + + + + @@ -1025,6 +1037,12 @@ + + + + + + @@ -1059,11 +1077,13 @@ + + @@ -2687,6 +2707,11 @@ + + + + + @@ -2913,11 +2938,13 @@ - + + + @@ -3131,6 +3158,7 @@ if necessary or NULL'/> + @@ -3498,7 +3526,8 @@ actually an xmlCharEncoding'/> - + @@ -4353,6 +4382,64 @@ actually an xmlCharEncoding'/> + + parse an XML in-memory document and build a tree. This reuses the existing @ctxt parser context + + + + + + + + + parse an XML from a file descriptor and build a tree. This reuses the existing @ctxt parser context + + + + + + + + + parse an XML file from the filesystem or the network. 
This reuses the existing @ctxt parser context + + + + + + + + parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context + + + + + + + + + + + parse an XML in-memory document and build a tree. This reuses the existing @ctxt parser context + + + + + + + + + + Reset a parser context + + + + + Applies the options to the parser context + + + + Initialize the default SAX handler @@ -4550,6 +4637,48 @@ actually an xmlCharEncoding'/> + + parse an XML in-memory document and build a tree. + + + + + + + + parse an XML from a file descriptor and build a tree. + + + + + + + + parse an XML file from the filesystem or the network. + + + + + + + parse an HTML document from I/O functions and source and build a tree. + + + + + + + + + + parse an XML in-memory document and build a tree. + + + + + + + Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree. 
diff --git a/doc/xmllint.1 b/doc/xmllint.1 index cf1bd8ba..9e1640ae 100644 --- a/doc/xmllint.1 +++ b/doc/xmllint.1 @@ -24,11 +24,16 @@ xmllint \- command line XML tool .nf \fBxmllint\fR [\fB--version\fR | \fB--debug\fR | \fB--shell\fR | \fB--debugent\fR | - \fB--copy\fR | \fB--recover\fR | \fB--noent\fR | \fB--noout\fR | \fB--htmlout\fR - | \fB--nowrap\fR | \fB--valid\fR | \fB--postvalid\fR | \fB--dtdvalid - \fIURL\fR\fR | \fB--dtdvalidfpi \fIFPI\fR\fR | \fB--timing\fR | \fB--repeat\fR - | \fB--insert\fR | \fB--compress\fR | \fB--html\fR | \fB--push\fR | \fB--memory\fR | \fB--nowarning\fR | \fB--noblanks\fR | \fB--format\fR | \fB--testIO\fR | \fB--encode \fIencoding\fR\fR | \fB--catalogs\fR | \fB--nocatalogs\fR | \fB--auto\fR | \fB--xinclude\fR | \fB--loaddtd\fR | \fB--dtdattr\fR | \fB--dropdtd\fR | \fB--stream\fR | \fB--chkregister\fR | \fB--relaxng\fR | \fB--schema\fR | \fB--nonet\fR] [\fBxmlfile\fR] - + \fB--copy\fR | \fB--recover\fR | \fB--noent\fR | \fB--nocdata\fR | \fB--nsclean\fR + | \fB--noout\fR | \fB--htmlout\fR | \fB--nowrap\fR | \fB--valid\fR | + \fB--postvalid\fR | \fB--dtdvalid \fIURL\fR\fR | \fB--dtdvalidfpi \fIFPI\fR\fR + | \fB--timing\fR | \fB--repeat\fR | \fB--insert\fR | \fB--compress\fR + | \fB--html\fR | \fB--push\fR | \fB--memory\fR | \fB--nowarning\fR | + \fB--noblanks\fR | \fB--format\fR | \fB--testIO\fR | \fB--encode \fIencoding\fR\fR + | \fB--catalogs\fR | \fB--nocatalogs\fR | \fB--auto\fR | \fB--xinclude\fR + | \fB--loaddtd\fR | \fB--dtdattr\fR | \fB--dropdtd\fR | \fB--stream\fR + | \fB--chkregister\fR | \fB--relaxng\fR | \fB--schema\fR | \fB--nonet\fR] + [\fBxmlfile\fR] .fi .SH "INTRODUCTION" @@ -69,6 +74,14 @@ Output any parsable portions of an invalid document. \fB--noent\fR Substitute entity values for entity references. By default, xmllint leaves entity references in place. +.TP +\fB--nocdata\fR +Replace CDATA sections with equivalent text nodes. + +.TP +\fB--nsclean\fR +Remove redundant namespace declarations. 
+ .TP \fB--noout\fR Suppress output. By default, xmllint outputs the result tree. diff --git a/doc/xmllint.xml b/doc/xmllint.xml index e45aa8a2..304be80b 100644 --- a/doc/xmllint.xml +++ b/doc/xmllint.xml @@ -50,6 +50,8 @@ --copy --recover --noent + --nocdata + --nsclean --noout --htmlout --nowrap @@ -173,6 +175,24 @@ + + + + + Replace CDATA sections with equivalent text nodes. + + + + + + + + + Remove redundant namespace declarations. + + + + diff --git a/parser.c b/parser.c index 2472f545..83c69378 100644 --- a/parser.c +++ b/parser.c @@ -675,11 +675,24 @@ xmlEntityPtr xmlParseStringEntityRef(xmlParserCtxtPtr ctxt, * * Pushes a new parser namespace on top of the ns stack * - * Returns -1 in case of error, the index in the stack otherwise + * Returns -1 in case of error, the index in the stack otherwise, + * and -2 if the namespace should be discarded. */ static int nsPush(xmlParserCtxtPtr ctxt, const xmlChar *prefix, const xmlChar *URL) { + if (ctxt->options & XML_PARSE_NSCLEAN) { + int i; + for (i = 0;i < ctxt->nsNr;i += 2) { + if (ctxt->nsTab[i] == prefix) { + /* in scope */ + if (ctxt->nsTab[i + 1] == URL) + return(-2); + /* out of scope keep it */ + break; + } + } + } if ((ctxt->nsMax == 0) || (ctxt->nsTab == NULL)) { ctxt->nsMax = 10; ctxt->nsNr = 0; @@ -12096,6 +12109,14 @@ xmlCtxtUseOptions(xmlParserCtxtPtr ctxt, int options) } else { ctxt->dictNames = 1; } + if (options & XML_PARSE_NOCDATA) { + ctxt->sax->cdataBlock = NULL; + options -= XML_PARSE_NOCDATA; + } + if (options & XML_PARSE_NSCLEAN) { + ctxt->options |= XML_PARSE_NSCLEAN; + options -= XML_PARSE_NSCLEAN; + } return (options); } diff --git a/python/libxml2class.txt b/python/libxml2class.txt index 2bb38c5d..7214e36c 100644 --- a/python/libxml2class.txt +++ b/python/libxml2class.txt @@ -11,6 +11,10 @@ htmlHandleOmittedElem() htmlIsScriptAttribute() htmlParseDoc() htmlParseFile() +htmlReadDoc() +htmlReadFd() +htmlReadFile() +htmlReadMemory() # functions from module HTMLtree htmlIsBooleanAttr() @@ 
-812,6 +816,12 @@ Class parserCtxt(parserCtxtCore) wellFormed() # functions from module HTMLparser + htmlCtxtReadDoc() + htmlCtxtReadFd() + htmlCtxtReadFile() + htmlCtxtReadMemory() + htmlCtxtReset() + htmlCtxtUseOptions() htmlFreeParserCtxt() htmlParseCharRef() htmlParseChunk() diff --git a/win32/libxml2.def.src b/win32/libxml2.def.src index 247ffdff..c1a1bd40 100644 --- a/win32/libxml2.def.src +++ b/win32/libxml2.def.src @@ -236,6 +236,27 @@ htmlCreateMemoryParserCtxt htmlCreatePushParserCtxt #endif #ifdef LIBXML_HTML_ENABLED +htmlCtxtReadDoc +#endif +#ifdef LIBXML_HTML_ENABLED +htmlCtxtReadFd +#endif +#ifdef LIBXML_HTML_ENABLED +htmlCtxtReadFile +#endif +#ifdef LIBXML_HTML_ENABLED +htmlCtxtReadIO +#endif +#ifdef LIBXML_HTML_ENABLED +htmlCtxtReadMemory +#endif +#ifdef LIBXML_HTML_ENABLED +htmlCtxtReset +#endif +#ifdef LIBXML_HTML_ENABLED +htmlCtxtUseOptions +#endif +#ifdef LIBXML_HTML_ENABLED htmlDefaultSAXHandlerInit #endif #ifdef LIBXML_HTML_ENABLED @@ -332,6 +353,21 @@ htmlParseEntityRef htmlParseFile #endif #ifdef LIBXML_HTML_ENABLED +htmlReadDoc +#endif +#ifdef LIBXML_HTML_ENABLED +htmlReadFd +#endif +#ifdef LIBXML_HTML_ENABLED +htmlReadFile +#endif +#ifdef LIBXML_HTML_ENABLED +htmlReadIO +#endif +#ifdef LIBXML_HTML_ENABLED +htmlReadMemory +#endif +#ifdef LIBXML_HTML_ENABLED htmlSAXParseDoc #endif #ifdef LIBXML_HTML_ENABLED diff --git a/xmllint.c b/xmllint.c index 9ef1905f..77d6cd2e 100644 --- a/xmllint.c +++ b/xmllint.c @@ -1293,6 +1293,8 @@ static void usage(const char *name) { #endif printf("\t--nowarning : do not emit warnings from parser/validator\n"); printf("\t--noblanks : drop (ignorable?) 
blanks spaces\n"); + printf("\t--nocdata : replace cdata section with text nodes\n"); + printf("\t--nsclean : remove redundant namespace declarations\n"); printf("\t--format : reformat/reindent the input\n"); printf("\t--testIO : test user I/O support\n"); printf("\t--encode encoding : output in the given encoding\n"); @@ -1370,6 +1372,12 @@ main(int argc, char **argv) { (!strcmp(argv[i], "--noent"))) { noent++; options |= XML_PARSE_NOENT; + } else if ((!strcmp(argv[i], "-nsclean")) || + (!strcmp(argv[i], "--nsclean"))) { + options |= XML_PARSE_NSCLEAN; + } else if ((!strcmp(argv[i], "-nocdata")) || + (!strcmp(argv[i], "--nocdata"))) { + options |= XML_PARSE_NOCDATA; } else if ((!strcmp(argv[i], "-nodict")) || (!strcmp(argv[i], "--nodict"))) { options |= XML_PARSE_NODICT;