Add support for parsing of large XML data (>= 10MB)

This commit adds XML_PARSE_HUGE to the libxml2 functions used in core
for the parsing of XML objects, raising the original 10MB limit imposed
by libxml2.

In most code paths of upstream, XML_MAX_TEXT_LENGTH (10^7) is the
historical limit that gets upgraded to XML_MAX_HUGE_LENGTH (10^9) once
XML_PARSE_HUGE is given to the parser calls.  These are still limited by
any palloc() calls for text, up to 1GB.

This makes it possible for the backend to handle XML objects larger
than 10MB in general, and also raises the depth limit.  This change
affects the contrib module xml2, the xml data type and SQL/XML.

Author: Dmitry Koval
Reviewed-by: Tom Lane, Michael Paquier
Discussion: https://postgr.es/m/18274-98d16bc03520665f@postgresql.org
This commit is contained in:
Michael Paquier 2024-01-17 14:03:55 +09:00
parent 65c5864d7f
commit 2197d06224
3 changed files with 33 additions and 13 deletions

View File

@ -381,7 +381,7 @@ pgxml_xpath(text *document, xmlChar *xpath, xpath_workspace *workspace)
{ {
workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document), workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document),
docsize, NULL, NULL, docsize, NULL, NULL,
XML_PARSE_NOENT); XML_PARSE_HUGE | XML_PARSE_NOENT);
if (workspace->doctree != NULL) if (workspace->doctree != NULL)
{ {
workspace->ctxt = xmlXPathNewContext(workspace->doctree); workspace->ctxt = xmlXPathNewContext(workspace->doctree);
@ -626,7 +626,7 @@ xpath_table(PG_FUNCTION_ARGS)
if (xmldoc) if (xmldoc)
doctree = xmlReadMemory(xmldoc, strlen(xmldoc), doctree = xmlReadMemory(xmldoc, strlen(xmldoc),
NULL, NULL, NULL, NULL,
XML_PARSE_NOENT); XML_PARSE_HUGE | XML_PARSE_NOENT);
else /* treat NULL as not well-formed */ else /* treat NULL as not well-formed */
doctree = NULL; doctree = NULL;

View File

@ -87,7 +87,7 @@ xslt_process(PG_FUNCTION_ARGS)
/* Parse document */ /* Parse document */
doctree = xmlReadMemory((char *) VARDATA_ANY(doct), doctree = xmlReadMemory((char *) VARDATA_ANY(doct),
VARSIZE_ANY_EXHDR(doct), NULL, NULL, VARSIZE_ANY_EXHDR(doct), NULL, NULL,
XML_PARSE_NOENT); XML_PARSE_HUGE | XML_PARSE_NOENT);
if (doctree == NULL) if (doctree == NULL)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION,
@ -96,7 +96,7 @@ xslt_process(PG_FUNCTION_ARGS)
/* Same for stylesheet */ /* Same for stylesheet */
ssdoc = xmlReadMemory((char *) VARDATA_ANY(ssheet), ssdoc = xmlReadMemory((char *) VARDATA_ANY(ssheet),
VARSIZE_ANY_EXHDR(ssheet), NULL, NULL, VARSIZE_ANY_EXHDR(ssheet), NULL, NULL,
XML_PARSE_NOENT); XML_PARSE_HUGE | XML_PARSE_NOENT);
if (ssdoc == NULL) if (ssdoc == NULL)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION,

View File

@ -1688,8 +1688,8 @@ xml_doctype_in_content(const xmlChar *str)
* xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode).
* *
* If parsed_nodes isn't NULL and the input is not an XML document, the list * If parsed_nodes isn't NULL and the input is not an XML document, the list
* of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * of parsed nodes from the xmlParseInNodeContext call will be returned to
* to *parsed_nodes. * *parsed_nodes.
* *
* Errors normally result in ereport(ERROR), but if escontext is an * Errors normally result in ereport(ERROR), but if escontext is an
* ErrorSaveContext, then "safe" errors are reported there instead, and the * ErrorSaveContext, then "safe" errors are reported there instead, and the
@ -1795,7 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
doc = xmlCtxtReadDoc(ctxt, utf8string, doc = xmlCtxtReadDoc(ctxt, utf8string,
NULL, NULL,
"UTF-8", "UTF-8",
XML_PARSE_NOENT | XML_PARSE_DTDATTR XML_PARSE_NOENT | XML_PARSE_DTDATTR | XML_PARSE_HUGE
| (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS)); | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS));
if (doc == NULL || xmlerrcxt->err_occurred) if (doc == NULL || xmlerrcxt->err_occurred)
{ {
@ -1828,10 +1828,30 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
/* allow empty content */ /* allow empty content */
if (*(utf8string + count)) if (*(utf8string + count))
{ {
res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, const char *data;
utf8string + count, xmlNodePtr root;
parsed_nodes); xmlNodePtr lst;
if (res_code != 0 || xmlerrcxt->err_occurred) xmlParserErrors xml_error;
data = (const char *) (utf8string + count);
/*
* Create a fake root node. The xmlNewDoc() function creates
* an XML document without any nodes, and this is required for
* xmlParseInNodeContext() that is able to handle
* XML_PARSE_HUGE.
*/
root = xmlNewNode(NULL, (const xmlChar *) "content-root");
if (root == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate xml node");
xmlDocSetRootElement(doc, root);
/* Try to parse string with using root node context. */
xml_error = xmlParseInNodeContext(root, data, strlen(data),
XML_PARSE_HUGE,
parsed_nodes ? parsed_nodes : &lst);
if (xml_error != XML_ERR_OK || xmlerrcxt->err_occurred)
{ {
xml_errsave(escontext, xmlerrcxt, xml_errsave(escontext, xmlerrcxt,
ERRCODE_INVALID_XML_CONTENT, ERRCODE_INVALID_XML_CONTENT,
@ -4344,7 +4364,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context"); "could not allocate parser context");
doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len, doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
len - xmldecl_len, NULL, NULL, 0); len - xmldecl_len, NULL, NULL, XML_PARSE_HUGE);
if (doc == NULL || xmlerrcxt->err_occurred) if (doc == NULL || xmlerrcxt->err_occurred)
xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML document"); "could not parse XML document");
@ -4675,7 +4695,7 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value)
PG_TRY(); PG_TRY();
{ {
doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0); doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, XML_PARSE_HUGE);
if (doc == NULL || xtCxt->xmlerrcxt->err_occurred) if (doc == NULL || xtCxt->xmlerrcxt->err_occurred)
xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML document"); "could not parse XML document");