/* Parser interface for DOM-based parser (libxml) rather than stream-based SAX-type parser */ #include "postgres.h" #include "fmgr.h" #include "executor/spi.h" #include "funcapi.h" #include "miscadmin.h" #include "lib/stringinfo.h" /* libxml includes */ #include #include #include #include #include PG_MODULE_MAGIC; /* declarations */ static void *pgxml_palloc(size_t size); static void *pgxml_repalloc(void *ptr, size_t size); static void pgxml_pfree(void *ptr); static char *pgxml_pstrdup(const char *string); static void pgxml_errorHandler(void *ctxt, const char *msg,...); void elog_error(int level, char *explain, int force); void pgxml_parser_init(void); static xmlChar *pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlChar * toptagname, xmlChar * septagname, xmlChar * plainsep); text *pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar * toptag, xmlChar * septag, xmlChar * plainsep); xmlChar *pgxml_texttoxmlchar(text *textstring); static xmlXPathObjectPtr pgxml_xpath(text *document, xmlChar * xpath); Datum xml_is_well_formed(PG_FUNCTION_ARGS); Datum xml_encode_special_chars(PG_FUNCTION_ARGS); Datum xpath_nodeset(PG_FUNCTION_ARGS); Datum xpath_string(PG_FUNCTION_ARGS); Datum xpath_number(PG_FUNCTION_ARGS); Datum xpath_bool(PG_FUNCTION_ARGS); Datum xpath_list(PG_FUNCTION_ARGS); Datum xpath_table(PG_FUNCTION_ARGS); /* Global variables */ char *errbuf; /* per line error buffer */ char *pgxml_errorMsg = NULL; /* overall error message */ /* Convenience macros */ #define GET_TEXT(cstrp) DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(cstrp))) #define GET_STR(textp) DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(textp))) #define ERRBUF_SIZE 200 /* memory handling passthrough functions (e.g. palloc, pstrdup are currently macros, and the others might become so...) */ static void * pgxml_palloc(size_t size) { /* elog(DEBUG1,"Alloc %d in CMC %p",size,CurrentMemoryContext); */ return palloc(size); } static void * pgxml_repalloc(void *ptr, size_t size) { /* elog(DEBUG1,"ReAlloc in CMC %p",CurrentMemoryContext);*/ return repalloc(ptr, size); } static void pgxml_pfree(void *ptr) { /* elog(DEBUG1,"Free in CMC %p",CurrentMemoryContext); */ pfree(ptr); } static char * pgxml_pstrdup(const char *string) { return pstrdup(string); } /* The error handling function. This formats an error message and sets * a flag - an ereport will be issued prior to return */ static void pgxml_errorHandler(void *ctxt, const char *msg,...) { va_list args; va_start(args, msg); vsnprintf(errbuf, ERRBUF_SIZE, msg, args); va_end(args); /* Now copy the argument across */ if (pgxml_errorMsg == NULL) pgxml_errorMsg = pstrdup(errbuf); else { int32 xsize = strlen(pgxml_errorMsg); pgxml_errorMsg = repalloc(pgxml_errorMsg, (size_t) (xsize + strlen(errbuf) + 1)); strncpy(&pgxml_errorMsg[xsize - 1], errbuf, strlen(errbuf)); pgxml_errorMsg[xsize + strlen(errbuf) - 1] = '\0'; } memset(errbuf, 0, ERRBUF_SIZE); } /* This function reports the current message at the level specified */ void elog_error(int level, char *explain, int force) { if (force || (pgxml_errorMsg != NULL)) { if (pgxml_errorMsg == NULL) { ereport(level, (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), errmsg(explain))); } else { ereport(level, (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), errmsg("%s:%s", explain, pgxml_errorMsg))); pfree(pgxml_errorMsg); } } } void pgxml_parser_init() { /* * This code could also set parser settings from user-supplied info. * Quite how these settings are made is another matter :) */ xmlMemSetup(pgxml_pfree, pgxml_palloc, pgxml_repalloc, pgxml_pstrdup); xmlInitParser(); xmlSetGenericErrorFunc(NULL, pgxml_errorHandler); xmlSubstituteEntitiesDefault(1); xmlLoadExtDtdDefaultValue = 1; pgxml_errorMsg = NULL; errbuf = palloc(200); memset(errbuf, 0, 200); } /* Returns true if document is well-formed */ PG_FUNCTION_INFO_V1(xml_is_well_formed); Datum xml_is_well_formed(PG_FUNCTION_ARGS) { /* called as xml_is_well_formed(document) */ xmlDocPtr doctree; text *t = PG_GETARG_TEXT_P(0); /* document buffer */ int32 docsize = VARSIZE(t) - VARHDRSZ; pgxml_parser_init(); doctree = xmlParseMemory((char *) VARDATA(t), docsize); if (doctree == NULL) { xmlCleanupParser(); PG_RETURN_BOOL(false); /* i.e. not well-formed */ } xmlCleanupParser(); xmlFreeDoc(doctree); PG_RETURN_BOOL(true); } /* Encodes special characters (<, >, &, " and \r) as XML entities */ PG_FUNCTION_INFO_V1(xml_encode_special_chars); Datum xml_encode_special_chars(PG_FUNCTION_ARGS) { text *tin = PG_GETARG_TEXT_P(0); text *tout; int32 ressize; xmlChar *ts, *tt; ts = pgxml_texttoxmlchar(tin); tt = xmlEncodeSpecialChars(NULL, ts); pfree(ts); ressize = strlen(tt); tout = (text *) palloc(ressize + VARHDRSZ); memcpy(VARDATA(tout), tt, ressize); VARATT_SIZEP(tout) = ressize + VARHDRSZ; xmlFree(tt); PG_RETURN_TEXT_P(tout); } static xmlChar * pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlChar * toptagname, xmlChar * septagname, xmlChar * plainsep) { /* Function translates a nodeset into a text representation */ /* * iterates over each node in the set and calls xmlNodeDump to write it to * an xmlBuffer -from which an xmlChar * string is returned. */ /* each representation is surrounded by ... */ /* * plainsep is an ordinary (not tag) seperator - if used, then nodes are * cast to string as output method */ xmlBufferPtr buf; xmlChar *result; int i; buf = xmlBufferCreate(); if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) { xmlBufferWriteChar(buf, "<"); xmlBufferWriteCHAR(buf, toptagname); xmlBufferWriteChar(buf, ">"); } if (nodeset != NULL) { for (i = 0; i < nodeset->nodeNr; i++) { if (plainsep != NULL) { xmlBufferWriteCHAR(buf, xmlXPathCastNodeToString(nodeset->nodeTab[i])); /* If this isn't the last entry, write the plain sep. */ if (i < (nodeset->nodeNr) - 1) xmlBufferWriteChar(buf, plainsep); } else { if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) { xmlBufferWriteChar(buf, "<"); xmlBufferWriteCHAR(buf, septagname); xmlBufferWriteChar(buf, ">"); } xmlNodeDump(buf, nodeset->nodeTab[i]->doc, nodeset->nodeTab[i], 1, 0); if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) { xmlBufferWriteChar(buf, ""); } } } } if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) { xmlBufferWriteChar(buf, ""); } result = xmlStrdup(buf->content); xmlBufferFree(buf); return result; } /* Translate a PostgreSQL "varlena" -i.e. a variable length parameter * into the libxml2 representation */ xmlChar * pgxml_texttoxmlchar(text *textstring) { xmlChar *res; int32 txsize; txsize = VARSIZE(textstring) - VARHDRSZ; res = (xmlChar *) palloc(txsize + 1); memcpy((char *) res, VARDATA(textstring), txsize); res[txsize] = '\0'; return res; } /* Public visible XPath functions */ /* This is a "raw" xpath function. Check that it returns child elements * properly */ PG_FUNCTION_INFO_V1(xpath_nodeset); Datum xpath_nodeset(PG_FUNCTION_ARGS) { xmlChar *xpath, *toptag, *septag; int32 pathsize; text *xpathsupp, *xpres; /* PG_GETARG_TEXT_P(0) is document buffer */ xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2)); septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3)); pathsize = VARSIZE(xpathsupp) - VARHDRSZ; xpath = pgxml_texttoxmlchar(xpathsupp); xpres = pgxml_result_to_text( pgxml_xpath(PG_GETARG_TEXT_P(0), xpath), toptag, septag, NULL); /* xmlCleanupParser(); done by result_to_text routine */ pfree(xpath); if (xpres == NULL) PG_RETURN_NULL(); PG_RETURN_TEXT_P(xpres); } /* The following function is almost identical, but returns the elements in */ /* a list. */ PG_FUNCTION_INFO_V1(xpath_list); Datum xpath_list(PG_FUNCTION_ARGS) { xmlChar *xpath, *plainsep; int32 pathsize; text *xpathsupp, *xpres; /* PG_GETARG_TEXT_P(0) is document buffer */ xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ plainsep = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2)); pathsize = VARSIZE(xpathsupp) - VARHDRSZ; xpath = pgxml_texttoxmlchar(xpathsupp); xpres = pgxml_result_to_text( pgxml_xpath(PG_GETARG_TEXT_P(0), xpath), NULL, NULL, plainsep); /* xmlCleanupParser(); done by result_to_text routine */ pfree(xpath); if (xpres == NULL) PG_RETURN_NULL(); PG_RETURN_TEXT_P(xpres); } PG_FUNCTION_INFO_V1(xpath_string); Datum xpath_string(PG_FUNCTION_ARGS) { xmlChar *xpath; int32 pathsize; text *xpathsupp, *xpres; /* PG_GETARG_TEXT_P(0) is document buffer */ xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ pathsize = VARSIZE(xpathsupp) - VARHDRSZ; /* * We encapsulate the supplied path with "string()" = 8 chars + 1 for NUL * at end */ /* We could try casting to string using the libxml function? */ xpath = (xmlChar *) palloc(pathsize + 9); memcpy((char *) (xpath + 7), VARDATA(xpathsupp), pathsize); strncpy((char *) xpath, "string(", 7); xpath[pathsize + 7] = ')'; xpath[pathsize + 8] = '\0'; xpres = pgxml_result_to_text( pgxml_xpath(PG_GETARG_TEXT_P(0), xpath), NULL, NULL, NULL); xmlCleanupParser(); pfree(xpath); if (xpres == NULL) PG_RETURN_NULL(); PG_RETURN_TEXT_P(xpres); } PG_FUNCTION_INFO_V1(xpath_number); Datum xpath_number(PG_FUNCTION_ARGS) { xmlChar *xpath; int32 pathsize; text *xpathsupp; float4 fRes; xmlXPathObjectPtr res; /* PG_GETARG_TEXT_P(0) is document buffer */ xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ pathsize = VARSIZE(xpathsupp) - VARHDRSZ; xpath = pgxml_texttoxmlchar(xpathsupp); res = pgxml_xpath(PG_GETARG_TEXT_P(0), xpath); pfree(xpath); if (res == NULL) { xmlCleanupParser(); PG_RETURN_NULL(); } fRes = xmlXPathCastToNumber(res); xmlCleanupParser(); if (xmlXPathIsNaN(fRes)) PG_RETURN_NULL(); PG_RETURN_FLOAT4(fRes); } PG_FUNCTION_INFO_V1(xpath_bool); Datum xpath_bool(PG_FUNCTION_ARGS) { xmlChar *xpath; int32 pathsize; text *xpathsupp; int bRes; xmlXPathObjectPtr res; /* PG_GETARG_TEXT_P(0) is document buffer */ xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ pathsize = VARSIZE(xpathsupp) - VARHDRSZ; xpath = pgxml_texttoxmlchar(xpathsupp); res = pgxml_xpath(PG_GETARG_TEXT_P(0), xpath); pfree(xpath); if (res == NULL) { xmlCleanupParser(); PG_RETURN_BOOL(false); } bRes = xmlXPathCastToBoolean(res); xmlCleanupParser(); PG_RETURN_BOOL(bRes); } /* Core function to evaluate XPath query */ xmlXPathObjectPtr pgxml_xpath(text *document, xmlChar * xpath) { xmlDocPtr doctree; xmlXPathContextPtr ctxt; xmlXPathObjectPtr res; xmlXPathCompExprPtr comppath; int32 docsize; docsize = VARSIZE(document) - VARHDRSZ; pgxml_parser_init(); doctree = xmlParseMemory((char *) VARDATA(document), docsize); if (doctree == NULL) { /* not well-formed */ return NULL; } ctxt = xmlXPathNewContext(doctree); ctxt->node = xmlDocGetRootElement(doctree); /* compile the path */ comppath = xmlXPathCompile(xpath); if (comppath == NULL) { xmlCleanupParser(); xmlFreeDoc(doctree); elog_error(ERROR, "XPath Syntax Error", 1); return NULL; } /* Now evaluate the path expression. */ res = xmlXPathCompiledEval(comppath, ctxt); xmlXPathFreeCompExpr(comppath); if (res == NULL) { xmlXPathFreeContext(ctxt); /* xmlCleanupParser(); */ xmlFreeDoc(doctree); return NULL; } /* xmlFreeDoc(doctree); */ return res; } text * pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar * toptag, xmlChar * septag, xmlChar * plainsep) { xmlChar *xpresstr; int32 ressize; text *xpres; if (res == NULL) { xmlCleanupParser(); return NULL; } switch (res->type) { case XPATH_NODESET: xpresstr = pgxmlNodeSetToText(res->nodesetval, toptag, septag, plainsep); break; case XPATH_STRING: xpresstr = xmlStrdup(res->stringval); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); xpresstr = xmlStrdup(""); } /* Now convert this result back to text */ ressize = strlen(xpresstr); xpres = (text *) palloc(ressize + VARHDRSZ); memcpy(VARDATA(xpres), xpresstr, ressize); VARATT_SIZEP(xpres) = ressize + VARHDRSZ; /* Free various storage */ xmlCleanupParser(); /* xmlFreeDoc(doctree); -- will die at end of tuple anyway */ xmlFree(xpresstr); elog_error(ERROR, "XPath error", 0); return xpres; } /* xpath_table is a table function. It needs some tidying (as do the * other functions here! */ PG_FUNCTION_INFO_V1(xpath_table); Datum xpath_table(PG_FUNCTION_ARGS) { /* SPI (input tuple) support */ SPITupleTable *tuptable; HeapTuple spi_tuple; TupleDesc spi_tupdesc; /* Output tuple (tuplestore) support */ Tuplestorestate *tupstore = NULL; TupleDesc ret_tupdesc; HeapTuple ret_tuple; ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; AttInMetadata *attinmeta; MemoryContext per_query_ctx; MemoryContext oldcontext; /* Function parameters */ char *pkeyfield = GET_STR(PG_GETARG_TEXT_P(0)); char *xmlfield = GET_STR(PG_GETARG_TEXT_P(1)); char *relname = GET_STR(PG_GETARG_TEXT_P(2)); char *xpathset = GET_STR(PG_GETARG_TEXT_P(3)); char *condition = GET_STR(PG_GETARG_TEXT_P(4)); char **values; xmlChar **xpaths; xmlChar *pos; xmlChar *pathsep = "|"; int numpaths; int ret; int proc; int i; int j; int rownr; /* For issuing multiple rows from one original * document */ int had_values; /* To determine end of nodeset results */ StringInfoData query_buf; /* We only have a valid tuple description in table function mode */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (rsinfo->expectedDesc == NULL) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table must be called as a table function"))); /* * We want to materialise because it means that we don't have to carry * libxml2 parser state between invocations of this function */ if (!(rsinfo->allowedModes & SFRM_Materialize)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table requires Materialize mode, but it is not " "allowed in this context"))); /* * The tuplestore must exist in a higher context than this function call * (per_query_ctx is used) */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); /* * Create the tuplestore - work_mem is the max in-memory size before a * file is created on disk to hold it. */ tupstore = tuplestore_begin_heap(true, false, work_mem); MemoryContextSwitchTo(oldcontext); /* get the requested return tuple description */ ret_tupdesc = CreateTupleDescCopy(rsinfo->expectedDesc); /* * At the moment we assume that the returned attributes make sense for the * XPath specififed (i.e. we trust the caller). It's not fatal if they get * it wrong - the input function for the column type will raise an error * if the path result can't be converted into the correct binary * representation. */ attinmeta = TupleDescGetAttInMetadata(ret_tupdesc); /* Set return mode and allocate value space. */ rsinfo->returnMode = SFRM_Materialize; rsinfo->setDesc = ret_tupdesc; values = (char **) palloc(ret_tupdesc->natts * sizeof(char *)); xpaths = (xmlChar **) palloc(ret_tupdesc->natts * sizeof(xmlChar *)); /* Split XPaths. xpathset is a writable CString. */ /* Note that we stop splitting once we've done all needed for tupdesc */ numpaths = 0; pos = xpathset; do { xpaths[numpaths] = pos; pos = strstr(pos, pathsep); if (pos != NULL) { *pos = '\0'; pos++; } numpaths++; } while ((pos != NULL) && (numpaths < (ret_tupdesc->natts - 1))); /* Now build query */ initStringInfo(&query_buf); /* Build initial sql statement */ appendStringInfo(&query_buf, "SELECT %s, %s FROM %s WHERE %s", pkeyfield, xmlfield, relname, condition ); if ((ret = SPI_connect()) < 0) elog(ERROR, "xpath_table: SPI_connect returned %d", ret); if ((ret = SPI_exec(query_buf.data, 0)) != SPI_OK_SELECT) elog(ERROR, "xpath_table: SPI execution failed for query %s", query_buf.data); proc = SPI_processed; /* elog(DEBUG1,"xpath_table: SPI returned %d rows",proc); */ tuptable = SPI_tuptable; spi_tupdesc = tuptable->tupdesc; /* Switch out of SPI context */ MemoryContextSwitchTo(oldcontext); /* Check that SPI returned correct result. If you put a comma into one of * the function parameters, this will catch it when the SPI query returns * e.g. 3 columns. */ if (spi_tupdesc->natts != 2) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("expression returning multiple columns is not valid in parameter list"), errdetail("Expected two columns in SPI result, got %d.", spi_tupdesc->natts))); } /* Setup the parser. Beware that this must happen in the same context as the * cleanup - which means that any error from here on must do cleanup to * ensure that the entity table doesn't get freed by being out of context. */ pgxml_parser_init(); /* For each row i.e. document returned from SPI */ for (i = 0; i < proc; i++) { char *pkey; char *xmldoc; xmlDocPtr doctree; xmlXPathContextPtr ctxt; xmlXPathObjectPtr res; xmlChar *resstr; xmlXPathCompExprPtr comppath; /* Extract the row data as C Strings */ spi_tuple = tuptable->vals[i]; pkey = SPI_getvalue(spi_tuple, spi_tupdesc, 1); xmldoc = SPI_getvalue(spi_tuple, spi_tupdesc, 2); /* * Clear the values array, so that not-well-formed documents return * NULL in all columns. */ /* Note that this also means that spare columns will be NULL. */ for (j = 0; j < ret_tupdesc->natts; j++) values[j] = NULL; /* Insert primary key */ values[0] = pkey; /* Parse the document */ if (xmldoc) doctree = xmlParseMemory(xmldoc, strlen(xmldoc)); else /* treat NULL as not well-formed */ doctree = NULL; if (doctree == NULL) { /* not well-formed, so output all-NULL tuple */ ret_tuple = BuildTupleFromCStrings(attinmeta, values); oldcontext = MemoryContextSwitchTo(per_query_ctx); tuplestore_puttuple(tupstore, ret_tuple); MemoryContextSwitchTo(oldcontext); heap_freetuple(ret_tuple); } else { /* New loop here - we have to deal with nodeset results */ rownr = 0; do { /* Now evaluate the set of xpaths. */ had_values = 0; for (j = 0; j < numpaths; j++) { ctxt = xmlXPathNewContext(doctree); ctxt->node = xmlDocGetRootElement(doctree); xmlSetGenericErrorFunc(ctxt, pgxml_errorHandler); /* compile the path */ comppath = xmlXPathCompile(xpaths[j]); if (comppath == NULL) { xmlCleanupParser(); xmlFreeDoc(doctree); elog_error(ERROR, "XPath Syntax Error", 1); PG_RETURN_NULL(); /* Keep compiler happy */ } /* Now evaluate the path expression. */ res = xmlXPathCompiledEval(comppath, ctxt); xmlXPathFreeCompExpr(comppath); if (res != NULL) { switch (res->type) { case XPATH_NODESET: /* We see if this nodeset has enough nodes */ if ((res->nodesetval != NULL) && (rownr < res->nodesetval->nodeNr)) { resstr = xmlXPathCastNodeToString(res->nodesetval->nodeTab[rownr]); had_values = 1; } else resstr = NULL; break; case XPATH_STRING: resstr = xmlStrdup(res->stringval); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); resstr = xmlStrdup(""); } /* * Insert this into the appropriate column in the * result tuple. */ values[j + 1] = resstr; } xmlXPathFreeContext(ctxt); } /* Now add the tuple to the output, if there is one. */ if (had_values) { ret_tuple = BuildTupleFromCStrings(attinmeta, values); oldcontext = MemoryContextSwitchTo(per_query_ctx); tuplestore_puttuple(tupstore, ret_tuple); MemoryContextSwitchTo(oldcontext); heap_freetuple(ret_tuple); } rownr++; } while (had_values); } xmlFreeDoc(doctree); if (pkey) pfree(pkey); if (xmldoc) pfree(xmldoc); } xmlCleanupParser(); /* Needed to flag completeness in 7.3.1. 7.4 defines it as a no-op. */ tuplestore_donestoring(tupstore); SPI_finish(); rsinfo->setResult = tupstore; /* * SFRM_Materialize mode expects us to return a NULL Datum. The actual * tuples are in our tuplestore and passed back through rsinfo->setResult. * rsinfo->setDesc is set to the tuple description that we actually used * to build our tuples with, so the caller can verify we did what it was * expecting. */ return (Datum) 0; }