/* postgresql/contrib/xml/pgxml.c */

/********************************************************
 * Interface code to parse an XML document using expat
 ********************************************************/

#include "postgres.h"
#include "fmgr.h"

#include "expat.h"
#include "pgxml.h"

/* Memory management - we make expat use standard pg MM.
   This suite is filled in by pgxml_mhs_init() and handed to
   XML_ParserCreate_MM() each time a parser is created. */

XML_Memory_Handling_Suite mhs;

/* passthrough functions (palloc is a macro) */

/* Allocation hook for expat: forwards the request to palloc.  A real
   function is needed because its address is stored in the memory
   handling suite. */
static void *pgxml_palloc(size_t size)
{
  void *chunk;

  chunk = palloc(size);
  return chunk;
}

/* Reallocation hook for expat: forwards the request to repalloc and
   hands back whatever pointer repalloc returns. */
static void *pgxml_repalloc(void *ptr, size_t size)
{
  void *resized;

  resized = repalloc(ptr, size);
  return resized;
}

/* Release hook for expat: forwards the pointer to pfree.
   Nothing is returned; a void function must not return an expression
   in ISO C, so pfree is simply called as a statement. */
static void pgxml_pfree(void *ptr)
{
  pfree(ptr);
}

/* Point the three slots of the global memory handling suite at the
   pass-through wrappers, so that a parser created with this suite
   allocates, resizes and frees through them. */
static void pgxml_mhs_init()
{
  mhs.free_fcn    = pgxml_pfree;      /* release */
  mhs.realloc_fcn = pgxml_repalloc;   /* resize  */
  mhs.malloc_fcn  = pgxml_palloc;     /* allocate */
}

/* Placeholder for handler configuration; currently does nothing. */
static void pgxml_handler_init()
{
  /* This code should set up the relevant handlers from user-supplied
     settings. How those settings would be supplied has not been
     decided yet. */
}

/* Returns true if document is well-formed */

PG_FUNCTION_INFO_V1(pgxml_parse);

Datum
pgxml_parse(PG_FUNCTION_ARGS)
{
  /* called as pgxml_parse(document) */
  XML_Parser p;
  text *t = PG_GETARG_TEXT_P(0); /*document buffer */
  int32 docsize = VARSIZE(t) - VARHDRSZ;

  pgxml_mhs_init();

  pgxml_handler_init();

  p = XML_ParserCreate_MM(NULL,&mhs,NULL);
  if (! p) {
    elog(ERROR, "pgxml: Could not create expat parser");
    PG_RETURN_NULL(); /* seems appropriate if we couldn't parse */
  }

  if (! XML_Parse(p, (char *)VARDATA(t) , docsize, 1)) {
    /*    elog(NOTICE, "Parse error at line %d:%s",
	    XML_GetCurrentLineNumber(p),
	    XML_ErrorString(XML_GetErrorCode(p))); */
    XML_ParserFree(p);
    PG_RETURN_BOOL(false);
  }

  XML_ParserFree(p);
  PG_RETURN_BOOL(true);
}

/* XPath handling functions */

/* XPath support here is for a very skeletal kind of XPath!
   It was easy to program though... */

/* This first is the core function that builds a result set. The
   actual functions called by the user manipulate that result set
   in various ways.
*/

/*
 * Parse 'doc' with expat and collect the text matched by 'pathstr'.
 *
 * doc     - text datum holding the XML document
 * pathstr - text datum holding the path expression; it is copied into a
 *           NUL-terminated buffer for the handlers
 *
 * Returns a freshly palloc'd XPath_Results whose resbuf member points
 * at a separately palloc'd buffer of the document's length; both are
 * left for the caller to release.  Returns NULL when XML_Parse reports
 * a failure, in which case everything allocated here has been freed.
 * If the parser cannot be created, the allocations are released and an
 * ERROR is raised.
 */
static XPath_Results *build_xpath_results(text *doc, text *pathstr)
{
  XPath_Results *xpr;
  char *res;
  pgxml_udata *udata;
  XML_Parser p;
  int32 docsize;
  int32 pathlen;
  int parsed_ok;

  docsize = VARSIZE(doc) - VARHDRSZ;
  pathlen = VARSIZE(pathstr) - VARHDRSZ;

  /* Zeroing the result set also leaves rescount at 0. */
  xpr = (XPath_Results *) palloc(sizeof(XPath_Results));
  memset((void *) xpr, 0, sizeof(XPath_Results));

  /* res isn't going to be the real return type, it is just a buffer
     into which the character handler appends grabbed text. */
  res = (char *) palloc(docsize);
  memset((void *) res, 0, docsize);
  xpr->resbuf = res;

  /* Zeroing the user data leaves currentpath empty and textgrab/reslen
     at 0. */
  udata = (pgxml_udata *) palloc(sizeof(pgxml_udata));
  memset((void *) udata, 0, sizeof(pgxml_udata));

  /* NUL-terminated copy of the path: payload bytes plus terminator. */
  udata->path = (char *) palloc(pathlen + 1);
  memcpy(udata->path, VARDATA(pathstr), pathlen);
  udata->path[pathlen] = '\0';

  udata->resptr = res;
  udata->xpres = xpr;

  /* Now fire up the parser */
  pgxml_mhs_init();

  p = XML_ParserCreate_MM(NULL, &mhs, NULL);
  if (! p) {
    /* Release everything before reporting, so the cleanup happens
       whether or not elog(ERROR) hands control back. */
    pfree(udata->path);
    pfree(udata);
    pfree(res);
    pfree(xpr);
    elog(ERROR, "pgxml: Could not create expat parser");
    return NULL;
  }
  XML_SetUserData(p, (void *) udata);

  /* Set the handlers */
  XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler);
  XML_SetCharacterDataHandler(p, pgxml_charhandler);

  /* The whole document is supplied at once, so the final flag is set. */
  parsed_ok = (XML_Parse(p, (char *) VARDATA(doc), docsize, 1) != 0);

  /* The parser and the per-parse state are not needed on either path. */
  XML_ParserFree(p);
  pfree(udata->path);
  pfree(udata);

  if (! parsed_ok) {
    /* The original failure path freed xpr but not the buffer it
       pointed at; release both. */
    pfree(res);
    pfree(xpr);
    return NULL;
  }

  return xpr;
}


PG_FUNCTION_INFO_V1(pgxml_xpath);

/*
 * pgxml_xpath(document, pathstr, index)
 *
 * Builds the result set for 'pathstr' over 'document' and returns the
 * index'th result (counting from 1) as a new text value.
 *
 * Returns SQL NULL when build_xpath_results returns NULL (parse error
 * or parser failure) or when the index is below 1 or beyond the number
 * of results.
 */
Datum
pgxml_xpath(PG_FUNCTION_ARGS)
{
  XPath_Results *xpresults;
  text *restext;

  text *t = PG_GETARG_TEXT_P(0); /* document buffer */
  text *t2 = PG_GETARG_TEXT_P(1); /* path expression */
  int32 position = PG_GETARG_INT32(2); /* 1-based, as given by the caller */
  int32 ind;

  xpresults = build_xpath_results(t, t2);

  /* This needs to be changed depending on the mechanism for returning
     our set of results. */

  if (xpresults == NULL)   /* parse error (not WF or parser failure) */
    {
      PG_RETURN_NULL();
    }

  /* Validate the 1-based position before converting it: positions of 0
     or below would otherwise become negative array indexes, and
     subtracting from the smallest int32 would overflow. */
  if (position < 1 || position > (xpresults->rescount))
    {
      pfree(xpresults->resbuf);
      pfree(xpresults);
      PG_RETURN_NULL();
    }
  ind = position - 1;

  /* Copy the selected slice out of the shared buffer into a new text. */
  restext = (text *) palloc(xpresults->reslens[ind] + VARHDRSZ);
  memcpy(VARDATA(restext), xpresults->results[ind], xpresults->reslens[ind]);

  VARATT_SIZEP(restext) = xpresults->reslens[ind] + VARHDRSZ;

  pfree(xpresults->resbuf);
  pfree(xpresults);

  PG_RETURN_TEXT_P(restext);
}


/*
 * Compare the current element path with the requested path and update
 * the text-grabbing state in the user data.
 *
 * - If the requested path does not occur anywhere in the current path
 *   and text was being grabbed, grabbing is switched off and the
 *   collected text is recorded as one result.
 * - If the requested path is a suffix of the current path, grabbing is
 *   switched on when either the requested path starts with '/' and
 *   covers the whole current path, or it does not start with '/' and
 *   the suffix is immediately preceded by a '/'.
 * - Otherwise the state is left untouched.
 */
static void pgxml_pathcompare(void *userData)
{
  char *tail;
  size_t pathlen;
  size_t curlen;

  if (strstr(UD->currentpath, UD->path) == NULL) {
    /* Should we have more logic here ? */
    if (UD->textgrab) {
      UD->textgrab=0;
      pgxml_finalisegrabbedtext(userData);
    }
    return;
  }

  /* The path occurs somewhere, so it cannot be longer than the current
     path and the subtraction below cannot wrap. */
  pathlen = strlen(UD->path);
  curlen = strlen(UD->currentpath);
  tail = UD->currentpath + (curlen - pathlen);

  /* Test the suffix position directly.  Checking only the first
     occurrence found by strstr would miss a match at the end whenever
     the path also occurs earlier (e.g. "b" within "/bb/b"). */
  if (strcmp(tail, UD->path) != 0)
    return;

  if ((UD->path)[0]=='/') {
    /* Absolute path: must account for the entire current path. */
    if (tail == UD->currentpath) {
      UD->textgrab=1;
    }
  } else {
    /* Relative path: must start right after a separator.  The pointer
       comparison keeps tail[-1] inside the buffer. */
    if (tail > UD->currentpath && tail[-1]=='/') {
      UD->textgrab=1;
    }
  }
}

/*
 * Element-start callback: appends "/<name>" to the current path when
 * the combined length stays within MAXPATHLENGTH-2, otherwise emits a
 * NOTICE and leaves the path as it was.  Afterwards, if text is not
 * already being grabbed, the path is compared with the requested one.
 * The attribute list is not used.
 */
static void pgxml_starthandler(void *userData, const XML_Char *name,
			const XML_Char **atts)
{
  size_t combined = strlen(name) + strlen(UD->currentpath);

  if (combined <= MAXPATHLENGTH-2) {
    strcat(UD->currentpath, "/");
    strcat(UD->currentpath, name);
  } else {
    elog(NOTICE,"Path too long");
  }

  /* While grabbing, a nested element is not re-matched.  Depending on
     user preference, the element could be "reconstituted" into the
     result text at this point; nothing is done for now. */
  if (!UD->textgrab) {
    pgxml_pathcompare(userData);
  }
}

/*
 * Element-end callback: removes the last component from the current
 * path when it equals the closing element's name, otherwise emits a
 * NOTICE naming the component that was expected.  If text is being
 * grabbed, the shortened path is then compared with the requested one.
 */
static void pgxml_endhandler(void *userData, const XML_Char *name)
{
  char *sepptr;

  /* Locate the last separator; the component after it is the element
     that should be closing now. */
  sepptr=strrchr(UD->currentpath,'/');
  if (sepptr==NULL) {
    elog(ERROR,"There's a problem...");
    sepptr=UD->currentpath;
  }

  if (strcmp(name, sepptr+1) == 0) {
    sepptr[0]='\0'; /* Chop that element off the end */
  } else {
    /* Unmatched entry, so the path is left alone.  Report the same
       string that was compared (without the leading separator). */
    elog(NOTICE,"Wanted [%s], got [%s]",sepptr+1,name);
  }

  if (UD->textgrab) {
    pgxml_pathcompare(userData);
  }

}

/*
 * Character-data callback: while text is being grabbed, appends the
 * 'len' bytes at 's' to the result buffer and advances the write
 * pointer and the running length of the current result.
 *
 * NOTE(review): no capacity check is made here; the buffer size is not
 * available in the user data -- confirm the buffer can never be
 * overrun (e.g. by expanded entities).
 */
static void pgxml_charhandler(void *userData, const XML_Char *s, int len)
{
  if (!UD->textgrab || len <= 0)
    return;

  memcpy(UD->resptr,s,len);
  UD->reslen += len;
  UD->resptr += len;
}
/* Should I be using PG list types here? */

static void pgxml_finalisegrabbedtext(void *userData)
{
  /* In res/reslen, we have a single result. */
  UD->xpres->results[UD->xpres->rescount]= UD->resptr - UD->reslen;
  UD->xpres->reslens[UD->xpres->rescount]= UD->reslen;
  UD->reslen=0;
  UD->xpres->rescount++;

  /* This effectively concatenates all the results together but we
     do know where one ends and the next begins */
}