diff --git a/ChangeLog b/ChangeLog index 37d8414f..85adc27a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Wed Jul 7 09:28:43 CEST 1999 Daniel Veillard + + * HTMLparser.[ch], HTMLtree.[ch]: more work for HTML parsing and + output. + * Makefile.am, test/HTML/*, result/HTML/*: added HTMLtests targetestHTMLt + Wed Jul 7 00:25:42 CEST 1999 Daniel Veillard * parser.h : Oops removed the binary compatibility problem diff --git a/HTMLparser.c b/HTMLparser.c index 119daa20..5259f6e8 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -34,7 +34,7 @@ #include "valid.h" #include "parserInternals.h" -#define DEBUG */ +/* #define DEBUG */ /************************************************************************ * * @@ -351,7 +351,6 @@ htmlInitAutoClose(void) { htmlElemDescPtr htmlTagLookup(const CHAR *tag) { int i = 0; - int cnt; for (i = 0; i < (sizeof(html40ElementTable) / sizeof(html40ElementTable[0]));i++) { @@ -408,7 +407,6 @@ htmlCheckAutoClose(const CHAR *new, const CHAR *old) { */ void htmlAutoClose(htmlParserCtxtPtr ctxt, const CHAR *new) { - const CHAR *old; while ((ctxt->node != NULL) && (htmlCheckAutoClose(new, ctxt->node->name))) { @@ -1933,7 +1931,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { atts = (const CHAR **) malloc(maxatts * sizeof(CHAR *)); if (atts == NULL) { fprintf(stderr, "malloc of %ld byte failed\n", - maxatts * sizeof(CHAR *)); + maxatts * (long)sizeof(CHAR *)); return(NULL); } } else if (nbatts + 2 < maxatts) { @@ -1941,7 +1939,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { atts = (const CHAR **) realloc(atts, maxatts * sizeof(CHAR *)); if (atts == NULL) { fprintf(stderr, "realloc of %ld byte failed\n", - maxatts * sizeof(CHAR *)); + maxatts * (long)sizeof(CHAR *)); return(NULL); } } diff --git a/HTMLparser.h b/HTMLparser.h index 23ff9afc..749566ca 100644 --- a/HTMLparser.h +++ b/HTMLparser.h @@ -48,8 +48,10 @@ typedef struct htmlEntityDesc { /* * There is only few public functions. */ -htmlEntityDescPtr -htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str); +htmlElemDescPtr htmlTagLookup(const CHAR *tag); +htmlEntityDescPtr htmlEntityLookup(const CHAR *name); + +htmlEntityDescPtr htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str); int htmlParseCharRef(htmlParserCtxtPtr ctxt); void htmlParseElement(htmlParserCtxtPtr ctxt); diff --git a/HTMLtree.c b/HTMLtree.c index a8e91da7..0d4b45f1 100644 --- a/HTMLtree.c +++ b/HTMLtree.c @@ -12,14 +12,11 @@ #include #include /* for memset() only ! */ -#include "tree.h" +#include "HTMLparser.h" +#include "HTMLtree.h" #include "entities.h" #include "valid.h" -#define HTML_TEXT_NODE XML_TEXT_NODE -#define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE -#define HTML_COMMENT_NODE XML_COMMENT_NODE - /** * htmlDtdDump: * @buf: the HTML buffer output @@ -46,23 +43,6 @@ htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) { xmlBufferWriteChar(buf, " SYSTEM "); xmlBufferWriteQuotedString(buf, cur->SystemID); } - if ((cur->entities == NULL) && (cur->elements == NULL) && - (cur->attributes == NULL) && (cur->notations == NULL)) { - xmlBufferWriteChar(buf, ">\n"); - return; - } - xmlBufferWriteChar(buf, " [\n"); - if (cur->entities != NULL) - xmlDumpEntitiesTable(buf, (xmlEntitiesTablePtr) cur->entities); - if (cur->notations != NULL) - xmlDumpNotationTable(buf, (xmlNotationTablePtr) cur->notations); - if (cur->elements != NULL) - xmlDumpElementTable(buf, (xmlElementTablePtr) cur->elements); - if (cur->attributes != NULL) - xmlDumpAttributeTable(buf, (xmlAttributeTablePtr) cur->attributes); - xmlBufferWriteChar(buf, "]"); - - /* TODO !!! a lot more things to dump ... */ xmlBufferWriteChar(buf, ">\n"); } @@ -116,30 +96,23 @@ htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { static void -htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level); +htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur); /** * htmlNodeListDump: * @buf: the HTML buffer output * @doc: the document * @cur: the first node - * @level: the imbrication level for indenting * * Dump an HTML node list, recursive behaviour,children are printed too. */ static void -htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level) { - int i; - +htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { if (cur == NULL) { fprintf(stderr, "htmlNodeListDump : node == NULL\n"); return; } while (cur != NULL) { - if ((cur->type != HTML_TEXT_NODE) && - (cur->type != HTML_ENTITY_REF_NODE)) { - xmlBufferWriteChar(buf, "\n"); - } - htmlNodeDump(buf, doc, cur, level); + htmlNodeDump(buf, doc, cur); cur = cur->next; } } @@ -149,22 +122,26 @@ htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level) { * @buf: the HTML buffer output * @doc: the document * @cur: the current node - * @level: the imbrication level for indenting * * Dump an HTML node, recursive behaviour,children are printed too. */ static void -htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level) { +htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { int i; + htmlElemDescPtr info; if (cur == NULL) { fprintf(stderr, "htmlNodeDump : node == NULL\n"); return; } + /* + * Special cases. + */ if (cur->type == HTML_TEXT_NODE) { if (cur->content != NULL) { CHAR *buffer; + /* uses the HTML encoding routine !!!!!!!!!! */ buffer = xmlEncodeEntitiesReentrant(doc, cur->content); if (buffer != NULL) { xmlBufferWriteCHAR(buf, buffer); @@ -188,20 +165,38 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level) { return; } - xmlBufferWriteChar(buf, "<"); - if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { - xmlBufferWriteCHAR(buf, cur->ns->prefix); - xmlBufferWriteChar(buf, ":"); - } + /* + * Get specific HTmL info for taht node. + */ + info = htmlTagLookup(cur->name); + xmlBufferWriteChar(buf, "<"); xmlBufferWriteCHAR(buf, cur->name); - if (cur->nsDef) - xmlNsListDump(buf, cur->nsDef); if (cur->properties != NULL) htmlAttrListDump(buf, doc, cur->properties); + if (info->empty) { + xmlBufferWriteChar(buf, ">"); + if (cur->next != NULL) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE)) + xmlBufferWriteChar(buf, "\n"); + } + return; + } if ((cur->content == NULL) && (cur->childs == NULL)) { - xmlBufferWriteChar(buf, "/>\n"); + if (info->endTag != 0) + xmlBufferWriteChar(buf, ">"); + else { + xmlBufferWriteChar(buf, ">name); + xmlBufferWriteChar(buf, ">"); + } + if (cur->next != NULL) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE)) + xmlBufferWriteChar(buf, "\n"); + } return; } xmlBufferWriteChar(buf, ">"); @@ -215,16 +210,22 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level) { } } if (cur->childs != NULL) { - htmlNodeListDump(buf, doc, cur->childs, level + 1); + if ((cur->childs->type != HTML_TEXT_NODE) && + (cur->childs->type != HTML_ENTITY_REF_NODE)) + xmlBufferWriteChar(buf, "\n"); + htmlNodeListDump(buf, doc, cur->childs); + if ((cur->last->type != HTML_TEXT_NODE) && + (cur->last->type != HTML_ENTITY_REF_NODE)) + xmlBufferWriteChar(buf, "\n"); } xmlBufferWriteChar(buf, "ns != NULL) && (cur->ns->prefix != NULL)) { - xmlBufferWriteCHAR(buf, cur->ns->prefix); - xmlBufferWriteChar(buf, ":"); - } - xmlBufferWriteCHAR(buf, cur->name); - xmlBufferWriteChar(buf, ">\n"); + xmlBufferWriteChar(buf, ">"); + if (cur->next != NULL) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE)) + xmlBufferWriteChar(buf, "\n"); + } } /** @@ -236,29 +237,12 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int level) { */ static void htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) { - xmlBufferWriteChar(buf, "version != NULL) - xmlBufferWriteQuotedString(buf, cur->version); - else - xmlBufferWriteChar(buf, "\"1.0\""); - if (cur->encoding != NULL) { - xmlBufferWriteChar(buf, " encoding="); - xmlBufferWriteQuotedString(buf, cur->encoding); - } - switch (cur->standalone) { - case 0: - xmlBufferWriteChar(buf, " standalone=\"no\""); - break; - case 1: - xmlBufferWriteChar(buf, " standalone=\"yes\""); - break; - } - xmlBufferWriteChar(buf, "?>\n"); if (cur->intSubset != NULL) htmlDtdDump(buf, cur); if (cur->root != NULL) { - htmlNodeDump(buf, cur, cur->root, 0); + htmlNodeDump(buf, cur, cur->root); } + xmlBufferWriteChar(buf, "\n"); } /** diff --git a/HTMLtree.h b/HTMLtree.h new file mode 100644 index 00000000..09206dbb --- /dev/null +++ b/HTMLtree.h @@ -0,0 +1,34 @@ +/* + * tree.h : describes the structures found in an tree resulting + * from an XML parsing. + * + * See Copyright for the status of this software. + * + * Daniel.Veillard@w3.org + */ + +#ifndef __HTML_TREE_H__ +#define __HTML_TREE_H__ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "tree.h" + +#define HTML_TEXT_NODE XML_TEXT_NODE +#define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE +#define HTML_COMMENT_NODE XML_COMMENT_NODE + +void htmlDocDumpMemory(xmlDocPtr cur, CHAR**mem, int *size); +void htmlDocDump(FILE *f, xmlDocPtr cur); +int htmlSaveFile(const char *filename, xmlDocPtr cur); + +#ifdef __cplusplus +} +#endif + +#endif /* __HTML_TREE_H__ */ + diff --git a/Makefile.am b/Makefile.am index e06d4640..0f239c3c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -61,7 +61,24 @@ check-local: tests testall : tests SVGtests SAXtests -tests : tester +tests: HTMLtests XMLtests +HTMLtests : testHTML + @(DIR=`pwd`; cd $(srcdir) ; \ + for i in test/HTML/* ; do \ + if [ ! -d $$i ] ; then \ + if [ ! -f result/HTML/`basename $$i` ] ; then \ + echo New test file `basename $$i` ; \ + $$DIR/testHTML $$i > result/HTML/`basename $$i` ; \ + else \ + echo Testing `basename $$i` ; \ + $$DIR/testHTML $$i > result.`basename $$i` ; \ + diff result/HTML/`basename $$i` result.`basename $$i` ; \ + $$DIR/testHTML result.`basename $$i` > result2.`basename $$i` ; \ + diff result.`basename $$i` result2.`basename $$i` ; \ + rm result.`basename $$i` result2.`basename $$i` ; \ + fi ; fi ; done) + +XMLtests : tester @(DIR=`pwd`; cd $(srcdir) ; \ for i in test/* ; do \ if [ ! -d $$i ] ; then \ diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index 23ff9afc..749566ca 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -48,8 +48,10 @@ typedef struct htmlEntityDesc { /* * There is only few public functions. */ -htmlEntityDescPtr -htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str); +htmlElemDescPtr htmlTagLookup(const CHAR *tag); +htmlEntityDescPtr htmlEntityLookup(const CHAR *name); + +htmlEntityDescPtr htmlParseEntityRef(htmlParserCtxtPtr ctxt, CHAR **str); int htmlParseCharRef(htmlParserCtxtPtr ctxt); void htmlParseElement(htmlParserCtxtPtr ctxt); diff --git a/include/libxml/HTMLtree.h b/include/libxml/HTMLtree.h new file mode 100644 index 00000000..09206dbb --- /dev/null +++ b/include/libxml/HTMLtree.h @@ -0,0 +1,34 @@ +/* + * tree.h : describes the structures found in an tree resulting + * from an XML parsing. + * + * See Copyright for the status of this software. + * + * Daniel.Veillard@w3.org + */ + +#ifndef __HTML_TREE_H__ +#define __HTML_TREE_H__ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "tree.h" + +#define HTML_TEXT_NODE XML_TEXT_NODE +#define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE +#define HTML_COMMENT_NODE XML_COMMENT_NODE + +void htmlDocDumpMemory(xmlDocPtr cur, CHAR**mem, int *size); +void htmlDocDump(FILE *f, xmlDocPtr cur); +int htmlSaveFile(const char *filename, xmlDocPtr cur); + +#ifdef __cplusplus +} +#endif + +#endif /* __HTML_TREE_H__ */ + diff --git a/result/HTML/Down.html b/result/HTML/Down.html new file mode 100644 index 00000000..5504cd4e --- /dev/null +++ b/result/HTML/Down.html @@ -0,0 +1,12 @@ + + + +This service is temporary down + + +

Sorry, this service is temporary down

+We are doing our best to get it back on-line, + +

The W3C system administrators

+ + diff --git a/result/HTML/test2.html b/result/HTML/test2.html new file mode 100644 index 00000000..c462702c --- /dev/null +++ b/result/HTML/test2.html @@ -0,0 +1,41 @@ + + + +Linux Today + + +
+ + + + + +
+ +Atipa Linux solutions. Your reliable cluster, server, and workstation solution. Win a Free Celeron Linux Workstation! + + +Linux Today Logo +
+ +linux.com partner +

+

+ +[ headlines | +features | +commercial | +security | +jobs | +volt | +contribute/submit | +advertise | +search | +site digests | +mailing lists | +about us | +link us ] +
+

+ + diff --git a/result/HTML/test3.html b/result/HTML/test3.html new file mode 100644 index 00000000..0c47a2e8 --- /dev/null +++ b/result/HTML/test3.html @@ -0,0 +1,88 @@ + + + + + + +

Component Package diagram ProblemDomain

+ +

+


+
+
+Stereotype problem domain
+
+Alias Problem Domain
+
+Note +
The Problem Domain package is the model behind the Human +
Interface, thats stores and manipulates the Family Tree. +
+ + +
+

+


+
+
+

Class HumanInterface.FamilyFrame +

+
+
+

Class ProblemDomain.Birth +

+
+
+

Class ProblemDomain.Death +

+
+
+

Class ProblemDomain.Divorce +

+
+
+

Class ProblemDomain.Family +

+
+
+

Class ProblemDomain.Individual +

+
+
+

Class ProblemDomain.LifeEvent +

+
+
+

Class ProblemDomain.Marriage +

+
+
+

Class ProblemDomain.Note +

+
+
+

+Links +

+ + + + + + + diff --git a/test/HTML/Down.html b/test/HTML/Down.html new file mode 100644 index 00000000..0f366479 --- /dev/null +++ b/test/HTML/Down.html @@ -0,0 +1,14 @@ + + + + This service is temporary down + + + +

Sorry, this service is temporary down

+We are doing our best to get it back on-line, + +

The W3C system administrators

+ + diff --git a/test/HTML/test2.html b/test/HTML/test2.html new file mode 100644 index 00000000..c8fd44c5 --- /dev/null +++ b/test/HTML/test2.html @@ -0,0 +1,33 @@ + + Linux Today + + +
+ + + + + + +
+Atipa Linux solutions. Your reliable cluster, server, and workstation solution. Win a Free Celeron Linux Workstation! + + Linux Today Logo
linux.com partner

+ +[ headlines | +features | +commercial | +security | +jobs | +volt | +contribute/submit | +advertise | +search | +site digests | +mailing lists | +about us | +link us ] +
+

+ + diff --git a/test/HTML/test3.html b/test/HTML/test3.html new file mode 100644 index 00000000..af1f1908 --- /dev/null +++ b/test/HTML/test3.html @@ -0,0 +1,34 @@ + + + + +

Component Package diagram ProblemDomain

+


+
+
Stereotype problem domain
+
Alias Problem Domain
+
Note
The Problem Domain package is the model behind the Human +
Interface, thats stores and manipulates the Family Tree. +
+


+
+ +

Class HumanInterface.FamilyFrame

+

Class ProblemDomain.Birth

+

Class ProblemDomain.Death

+

Class ProblemDomain.Divorce

+

Class ProblemDomain.Family

+

Class ProblemDomain.Individual

+

Class ProblemDomain.LifeEvent

+

Class ProblemDomain.Marriage

+

Class ProblemDomain.Note

+
+ +

Links

+ + + + + + + diff --git a/testHTML.c b/testHTML.c index 4b214fac..27afe35e 100644 --- a/testHTML.c +++ b/testHTML.c @@ -27,7 +27,7 @@ #include #include "HTMLparser.h" -#include "tree.h" +#include "HTMLtree.h" #include "debugXML.h" static int debug = 0; @@ -80,7 +80,7 @@ void parseAndPrintFile(char *filename) { * print it. */ if (!debug) - xmlDocDump(stdout, doc); + htmlDocDump(stdout, doc); else xmlDebugDumpDocument(stdout, doc); @@ -111,7 +111,7 @@ void parseAndPrintBuffer(CHAR *buf) { * print it. */ if (!debug) - xmlDocDump(stdout, doc); + htmlDocDump(stdout, doc); else xmlDebugDumpDocument(stdout, doc);