2001-07-30 22:59:02 +08:00
|
|
|
/********************************************************
|
|
|
|
* Interface code to parse an XML document using expat
|
|
|
|
********************************************************/
|
|
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "fmgr.h"
|
|
|
|
|
|
|
|
#include "expat.h"
|
|
|
|
#include "pgxml.h"
|
|
|
|
|
|
|
|
/* Memory management - we make expat use standard pg MM */
|
|
|
|
|
|
|
|
/* Allocator callback table: filled in by pgxml_mhs_init() and handed to
   XML_ParserCreate_MM() whenever a parser is created. */
XML_Memory_Handling_Suite mhs;
|
|
|
|
|
|
|
|
/* passthrough functions (palloc is a macro) */
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/*
 * pgxml_palloc
 *		malloc-style allocation callback for expat, backed by palloc().
 *
 * palloc is a macro, so its address cannot be stored in the memory
 * handling suite directly; this wrapper gives expat a real function.
 *
 * size: number of bytes requested.
 * Returns whatever palloc() returns for that size.
 */
static void *
pgxml_palloc(size_t size)
{
	void	   *chunk = palloc(size);

	return chunk;
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/*
 * pgxml_repalloc
 *		realloc-style callback for expat, backed by repalloc().
 *
 * ptr:  chunk previously obtained from pgxml_palloc/pgxml_repalloc, or NULL.
 * size: new size in bytes.
 * Returns the (possibly moved) chunk.
 *
 * A realloc-style callback may legitimately be invoked with a NULL pointer,
 * meaning "allocate a fresh chunk".  repalloc() does not accept NULL, so
 * that case is routed to palloc() instead.
 */
static void *
pgxml_repalloc(void *ptr, size_t size)
{
	if (ptr == NULL)
		return palloc(size);

	return repalloc(ptr, size);
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/*
 * pgxml_pfree
 *		free-style callback for expat, backed by pfree().
 *
 * ptr: chunk previously obtained from pgxml_palloc/pgxml_repalloc, or NULL.
 *
 * A free-style callback must tolerate NULL (free(NULL) is a no-op), whereas
 * pfree() does not accept NULL, so NULL is filtered out here.  The function
 * returns void, so no value is returned from pfree().
 */
static void
pgxml_pfree(void *ptr)
{
	if (ptr != NULL)
		pfree(ptr);
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static void
|
|
|
|
pgxml_mhs_init()
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
mhs.malloc_fcn = pgxml_palloc;
|
|
|
|
mhs.realloc_fcn = pgxml_repalloc;
|
|
|
|
mhs.free_fcn = pgxml_pfree;
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/*
 * pgxml_handler_init
 *		Placeholder for installing expat handlers chosen by the user.
 *
 * Intentionally empty for now: how user-supplied settings would be
 * communicated to this code has not been decided yet.
 */
static void
pgxml_handler_init()
{
	/* nothing to set up yet */
}
|
|
|
|
|
|
|
|
/* Returns true if document is well-formed */
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(pgxml_parse);
|
|
|
|
|
|
|
|
Datum
|
|
|
|
pgxml_parse(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
/* called as pgxml_parse(document) */
|
|
|
|
XML_Parser p;
|
|
|
|
text *t = PG_GETARG_TEXT_P(0); /* document buffer */
|
|
|
|
int32 docsize = VARSIZE(t) - VARHDRSZ;
|
|
|
|
|
|
|
|
pgxml_mhs_init();
|
|
|
|
|
|
|
|
pgxml_handler_init();
|
|
|
|
|
|
|
|
p = XML_ParserCreate_MM(NULL, &mhs, NULL);
|
|
|
|
if (!p)
|
|
|
|
{
|
|
|
|
elog(ERROR, "pgxml: Could not create expat parser");
|
|
|
|
PG_RETURN_NULL(); /* seems appropriate if we couldn't parse */
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!XML_Parse(p, (char *) VARDATA(t), docsize, 1))
|
|
|
|
{
|
|
|
|
/*
|
2002-03-06 14:10:59 +08:00
|
|
|
* elog(WARNING, "Parse error at line %d:%s",
|
2001-10-25 13:50:21 +08:00
|
|
|
* XML_GetCurrentLineNumber(p),
|
|
|
|
* XML_ErrorString(XML_GetErrorCode(p)));
|
|
|
|
*/
|
|
|
|
XML_ParserFree(p);
|
|
|
|
PG_RETURN_BOOL(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
XML_ParserFree(p);
|
|
|
|
PG_RETURN_BOOL(true);
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* XPath handling functions */
|
|
|
|
|
|
|
|
/* XPath support here covers only a very skeletal subset of XPath!
   It was easy to program, though... */
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/* The first function below is the core routine that builds a result set.
   The functions actually called by the user manipulate that result set
   in various ways.
*/
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static XPath_Results *
|
|
|
|
build_xpath_results(text *doc, text *pathstr)
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
XPath_Results *xpr;
|
|
|
|
char *res;
|
|
|
|
pgxml_udata *udata;
|
|
|
|
XML_Parser p;
|
|
|
|
int32 docsize;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
xpr = (XPath_Results *) palloc((sizeof(XPath_Results)));
|
|
|
|
memset((void *) xpr, 0, sizeof(XPath_Results));
|
|
|
|
xpr->rescount = 0;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
docsize = VARSIZE(doc) - VARHDRSZ;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/* res isn't going to be the real return type, it is just a buffer */
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
res = (char *) palloc(docsize);
|
|
|
|
memset((void *) res, 0, docsize);
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
xpr->resbuf = res;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
udata = (pgxml_udata *) palloc((sizeof(pgxml_udata)));
|
|
|
|
memset((void *) udata, 0, sizeof(pgxml_udata));
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
udata->currentpath[0] = '\0';
|
|
|
|
udata->textgrab = 0;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
udata->path = (char *) palloc(VARSIZE(pathstr));
|
|
|
|
memcpy(udata->path, VARDATA(pathstr), VARSIZE(pathstr) - VARHDRSZ);
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
udata->path[VARSIZE(pathstr) - VARHDRSZ] = '\0';
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
udata->resptr = res;
|
|
|
|
udata->reslen = 0;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
udata->xpres = xpr;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/* Now fire up the parser */
|
|
|
|
pgxml_mhs_init();
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
p = XML_ParserCreate_MM(NULL, &mhs, NULL);
|
|
|
|
if (!p)
|
|
|
|
{
|
|
|
|
elog(ERROR, "pgxml: Could not create expat parser");
|
|
|
|
pfree(xpr);
|
|
|
|
pfree(udata->path);
|
|
|
|
pfree(udata);
|
|
|
|
pfree(res);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
XML_SetUserData(p, (void *) udata);
|
|
|
|
|
|
|
|
/* Set the handlers */
|
|
|
|
|
|
|
|
XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler);
|
|
|
|
XML_SetCharacterDataHandler(p, pgxml_charhandler);
|
|
|
|
|
|
|
|
if (!XML_Parse(p, (char *) VARDATA(doc), docsize, 1))
|
|
|
|
{
|
|
|
|
/*
|
2002-03-06 14:10:59 +08:00
|
|
|
* elog(WARNING, "Parse error at line %d:%s",
|
2001-10-25 13:50:21 +08:00
|
|
|
* XML_GetCurrentLineNumber(p),
|
|
|
|
* XML_ErrorString(XML_GetErrorCode(p)));
|
|
|
|
*/
|
|
|
|
XML_ParserFree(p);
|
|
|
|
pfree(xpr);
|
|
|
|
pfree(udata->path);
|
|
|
|
pfree(udata);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
pfree(udata->path);
|
|
|
|
pfree(udata);
|
|
|
|
XML_ParserFree(p);
|
|
|
|
return xpr;
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(pgxml_xpath);
|
|
|
|
|
|
|
|
Datum
|
|
|
|
pgxml_xpath(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
/* called as pgxml_xpath(document,pathstr, index) for the moment */
|
|
|
|
|
|
|
|
XPath_Results *xpresults;
|
|
|
|
text *restext;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
text *t = PG_GETARG_TEXT_P(0); /* document buffer */
|
|
|
|
text *t2 = PG_GETARG_TEXT_P(1);
|
|
|
|
int32 ind = PG_GETARG_INT32(2) - 1;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
xpresults = build_xpath_results(t, t2);
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
/*
|
|
|
|
* This needs to be changed depending on the mechanism for returning
|
|
|
|
* our set of results.
|
|
|
|
*/
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
if (xpresults == NULL) /* parse error (not WF or parser failure) */
|
|
|
|
PG_RETURN_NULL();
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
if (ind >= (xpresults->rescount))
|
|
|
|
PG_RETURN_NULL();
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
restext = (text *) palloc(xpresults->reslens[ind] + VARHDRSZ);
|
|
|
|
memcpy(VARDATA(restext), xpresults->results[ind], xpresults->reslens[ind]);
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
VARATT_SIZEP(restext) = xpresults->reslens[ind] + VARHDRSZ;
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
pfree(xpresults->resbuf);
|
|
|
|
pfree(xpresults);
|
2001-07-30 22:59:02 +08:00
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
PG_RETURN_TEXT_P(restext);
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static void
|
|
|
|
pgxml_pathcompare(void *userData)
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
char *matchpos;
|
|
|
|
|
|
|
|
matchpos = strstr(UD->currentpath, UD->path);
|
|
|
|
|
|
|
|
if (matchpos == NULL)
|
|
|
|
{ /* Should we have more logic here ? */
|
|
|
|
if (UD->textgrab)
|
|
|
|
{
|
|
|
|
UD->textgrab = 0;
|
|
|
|
pgxml_finalisegrabbedtext(userData);
|
|
|
|
}
|
|
|
|
return;
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
2001-10-25 13:50:21 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* OK, we have a match of some sort. Now we need to check that our
|
|
|
|
* match is anchored to the *end* of the string AND that it is
|
|
|
|
* immediately preceded by a '/'
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This test wouldn't work if strlen (UD->path) overran the length of
|
|
|
|
* the currentpath, but that's not possible because we got a match!
|
|
|
|
*/
|
|
|
|
|
|
|
|
if ((matchpos + strlen(UD->path))[0] == '\0')
|
|
|
|
{
|
|
|
|
if ((UD->path)[0] == '/')
|
|
|
|
{
|
|
|
|
if (matchpos == UD->currentpath)
|
|
|
|
UD->textgrab = 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if ((matchpos - 1)[0] == '/')
|
|
|
|
UD->textgrab = 1;
|
|
|
|
}
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static void
|
|
|
|
pgxml_starthandler(void *userData, const XML_Char * name,
|
|
|
|
const XML_Char ** atts)
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
char sepstr[] = "/";
|
|
|
|
|
|
|
|
if ((strlen(name) + strlen(UD->currentpath)) > MAXPATHLENGTH - 2)
|
2002-03-06 14:10:59 +08:00
|
|
|
elog(WARNING, "Path too long");
|
2001-10-25 13:50:21 +08:00
|
|
|
else
|
|
|
|
{
|
|
|
|
strncat(UD->currentpath, sepstr, 1);
|
|
|
|
strcat(UD->currentpath, name);
|
|
|
|
}
|
|
|
|
if (UD->textgrab)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Depending on user preference, should we "reconstitute" the
|
|
|
|
* element into the result text?
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
else
|
|
|
|
pgxml_pathcompare(userData);
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static void
|
|
|
|
pgxml_endhandler(void *userData, const XML_Char * name)
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
/*
|
|
|
|
* Start by removing the current element off the end of the
|
|
|
|
* currentpath
|
|
|
|
*/
|
|
|
|
|
|
|
|
char *sepptr;
|
|
|
|
|
|
|
|
sepptr = strrchr(UD->currentpath, '/');
|
|
|
|
if (sepptr == NULL)
|
|
|
|
{
|
|
|
|
elog(ERROR, "There's a problem...");
|
|
|
|
sepptr = UD->currentpath;
|
|
|
|
}
|
|
|
|
if (strcmp(name, sepptr + 1) != 0)
|
|
|
|
{
|
2002-03-06 14:10:59 +08:00
|
|
|
elog(WARNING, "Wanted [%s], got [%s]", sepptr, name);
|
2001-10-25 13:50:21 +08:00
|
|
|
/* unmatched entry, so do nothing */
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
sepptr[0] = '\0'; /* Chop that element off the end */
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UD->textgrab)
|
|
|
|
pgxml_pathcompare(userData);
|
2001-07-30 22:59:02 +08:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static void
|
|
|
|
pgxml_charhandler(void *userData, const XML_Char * s, int len)
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
if (UD->textgrab)
|
|
|
|
{
|
|
|
|
if (len > 0)
|
|
|
|
{
|
|
|
|
memcpy(UD->resptr, s, len);
|
|
|
|
UD->resptr += len;
|
|
|
|
UD->reslen += len;
|
|
|
|
}
|
|
|
|
}
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|
2001-10-25 13:50:21 +08:00
|
|
|
|
2001-07-30 22:59:02 +08:00
|
|
|
/* Should I be using PG list types here? */
|
|
|
|
|
2001-10-25 13:50:21 +08:00
|
|
|
static void
|
|
|
|
pgxml_finalisegrabbedtext(void *userData)
|
2001-07-30 22:59:02 +08:00
|
|
|
{
|
2001-10-25 13:50:21 +08:00
|
|
|
/* In res/reslen, we have a single result. */
|
|
|
|
UD->xpres->results[UD->xpres->rescount] = UD->resptr - UD->reslen;
|
|
|
|
UD->xpres->reslens[UD->xpres->rescount] = UD->reslen;
|
|
|
|
UD->reslen = 0;
|
|
|
|
UD->xpres->rescount++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This effectively concatenates all the results together but we do
|
|
|
|
* know where one ends and the next begins
|
|
|
|
*/
|
2001-07-30 22:59:02 +08:00
|
|
|
}
|