2006-05-31 22:05:31 +08:00
|
|
|
/*
|
2006-10-04 08:30:14 +08:00
|
|
|
* lexize stream of lexemes
|
2006-05-31 22:05:31 +08:00
|
|
|
* Teodor Sigaev <teodor@sigaev.ru>
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <locale.h>
|
|
|
|
|
|
|
|
#include "ts_cfg.h"
|
|
|
|
#include "dict.h"
|
|
|
|
|
|
|
|
void
|
2006-10-04 08:30:14 +08:00
|
|
|
LexizeInit(LexizeData * ld, TSCfgInfo * cfg)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
ld->cfg = cfg;
|
|
|
|
ld->curDictId = InvalidOid;
|
|
|
|
ld->posDict = 0;
|
|
|
|
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
|
|
|
|
ld->waste.head = ld->waste.tail = NULL;
|
2006-10-04 08:30:14 +08:00
|
|
|
ld->lastRes = NULL;
|
|
|
|
ld->tmpRes = NULL;
|
2006-05-31 22:05:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 08:30:14 +08:00
|
|
|
LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
|
|
|
|
{
|
|
|
|
if (list->tail)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
list->tail->next = newpl;
|
|
|
|
list->tail = newpl;
|
2006-10-04 08:30:14 +08:00
|
|
|
}
|
|
|
|
else
|
2006-05-31 22:05:31 +08:00
|
|
|
list->head = list->tail = newpl;
|
|
|
|
newpl->next = NULL;
|
|
|
|
}
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
static ParsedLex *
|
|
|
|
LPLRemoveHead(ListParsedLex * list)
|
|
|
|
{
|
|
|
|
ParsedLex *res = list->head;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (list->head)
|
2006-05-31 22:05:31 +08:00
|
|
|
list->head = list->head->next;
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (list->head == NULL)
|
2006-05-31 22:05:31 +08:00
|
|
|
list->tail = NULL;
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
2006-10-04 08:30:14 +08:00
|
|
|
LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
|
|
|
|
{
|
|
|
|
ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
|
2006-05-31 22:05:31 +08:00
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
|
2006-05-31 22:05:31 +08:00
|
|
|
newpl->type = type;
|
|
|
|
newpl->lemm = lemm;
|
|
|
|
newpl->lenlemm = lenlemm;
|
|
|
|
LPLAddTail(&ld->towork, newpl);
|
|
|
|
ld->curSub = ld->towork.tail;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 08:30:14 +08:00
|
|
|
RemoveHead(LexizeData * ld)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
|
|
|
|
|
|
|
|
ld->posDict = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 08:30:14 +08:00
|
|
|
setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
|
|
|
|
{
|
|
|
|
if (correspondLexem)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
*correspondLexem = ld->waste.head;
|
2006-10-04 08:30:14 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ParsedLex *tmp,
|
|
|
|
*ptr = ld->waste.head;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
while (ptr)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
tmp = ptr->next;
|
|
|
|
pfree(ptr);
|
|
|
|
ptr = tmp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 08:30:14 +08:00
|
|
|
moveToWaste(LexizeData * ld, ParsedLex * stop)
|
|
|
|
{
|
|
|
|
bool go = true;
|
|
|
|
|
|
|
|
while (ld->towork.head && go)
|
|
|
|
{
|
|
|
|
if (ld->towork.head == stop)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
ld->curSub = stop->next;
|
|
|
|
go = false;
|
|
|
|
}
|
|
|
|
RemoveHead(ld);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 08:30:14 +08:00
|
|
|
setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
|
|
|
|
{
|
|
|
|
if (ld->tmpRes)
|
|
|
|
{
|
|
|
|
TSLexeme *ptr;
|
|
|
|
|
|
|
|
for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
|
|
|
|
pfree(ptr->lexeme);
|
|
|
|
pfree(ld->tmpRes);
|
2006-05-31 22:05:31 +08:00
|
|
|
}
|
|
|
|
ld->tmpRes = res;
|
|
|
|
ld->lastRes = lex;
|
|
|
|
}
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
TSLexeme *
|
|
|
|
LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
ListDictionary *map;
|
|
|
|
DictInfo *dict;
|
|
|
|
TSLexeme *res;
|
|
|
|
|
|
|
|
if (ld->curDictId == InvalidOid)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* usial mode: dictionary wants only one word, but we should keep in
|
|
|
|
* mind that we should go through all stack
|
2006-05-31 22:05:31 +08:00
|
|
|
*/
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
while (ld->towork.head)
|
|
|
|
{
|
|
|
|
ParsedLex *curVal = ld->towork.head;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
|
|
|
map = ld->cfg->map + curVal->type;
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
/* skip this type of lexeme */
|
|
|
|
RemoveHead(ld);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
for (i = ld->posDict; i < map->len; i++)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
dict = finddict(DatumGetObjectId(map->dict_id[i]));
|
|
|
|
|
|
|
|
ld->dictState.isend = ld->dictState.getnext = false;
|
|
|
|
ld->dictState.private = NULL;
|
2006-10-04 08:30:14 +08:00
|
|
|
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
|
|
|
|
&(dict->lexize_info),
|
|
|
|
PointerGetDatum(dict->dictionary),
|
|
|
|
PointerGetDatum(curVal->lemm),
|
|
|
|
Int32GetDatum(curVal->lenlemm),
|
|
|
|
PointerGetDatum(&ld->dictState)
|
|
|
|
));
|
|
|
|
|
|
|
|
if (ld->dictState.getnext)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* dictinary wants next word, so setup and store current
|
|
|
|
* position and go to multiword mode
|
2006-05-31 22:05:31 +08:00
|
|
|
*/
|
2006-10-04 08:30:14 +08:00
|
|
|
|
2006-05-31 22:05:31 +08:00
|
|
|
ld->curDictId = DatumGetObjectId(map->dict_id[i]);
|
2006-10-04 08:30:14 +08:00
|
|
|
ld->posDict = i + 1;
|
2006-05-31 22:05:31 +08:00
|
|
|
ld->curSub = curVal->next;
|
2006-10-04 08:30:14 +08:00
|
|
|
if (res)
|
2006-05-31 22:05:31 +08:00
|
|
|
setNewTmpRes(ld, curVal, res);
|
|
|
|
return LexizeExec(ld, correspondLexem);
|
|
|
|
}
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (!res) /* dictionary doesn't know this lexeme */
|
2006-05-31 22:05:31 +08:00
|
|
|
continue;
|
2006-10-04 08:30:14 +08:00
|
|
|
|
2006-05-31 22:05:31 +08:00
|
|
|
RemoveHead(ld);
|
|
|
|
setCorrLex(ld, correspondLexem);
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
RemoveHead(ld);
|
2006-10-04 08:30:14 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{ /* curDictId is valid */
|
2006-05-31 22:05:31 +08:00
|
|
|
dict = finddict(ld->curDictId);
|
2006-10-04 08:30:14 +08:00
|
|
|
|
2006-05-31 22:05:31 +08:00
|
|
|
/*
|
|
|
|
* Dictionary ld->curDictId asks us about following words
|
|
|
|
*/
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
while (ld->curSub)
|
|
|
|
{
|
|
|
|
ParsedLex *curVal = ld->curSub;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
|
|
|
map = ld->cfg->map + curVal->type;
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (curVal->type != 0)
|
|
|
|
{
|
|
|
|
bool dictExists = false;
|
2006-05-31 22:05:31 +08:00
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (curVal->type >= ld->cfg->len || map->len == 0)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
/* skip this type of lexeme */
|
|
|
|
ld->curSub = curVal->next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-10-04 08:30:14 +08:00
|
|
|
* We should be sure that current type of lexeme is recognized
|
|
|
|
* by our dictinonary: we just check is it exist in list of
|
|
|
|
* dictionaries ?
|
2006-05-31 22:05:31 +08:00
|
|
|
*/
|
2006-10-04 08:30:14 +08:00
|
|
|
for (i = 0; i < map->len && !dictExists; i++)
|
|
|
|
if (ld->curDictId == DatumGetObjectId(map->dict_id[i]))
|
2006-05-31 22:05:31 +08:00
|
|
|
dictExists = true;
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (!dictExists)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
/*
|
|
|
|
* Dictionary can't work with current tpe of lexeme,
|
|
|
|
* return to basic mode and redo all stored lexemes
|
|
|
|
*/
|
|
|
|
ld->curDictId = InvalidOid;
|
|
|
|
return LexizeExec(ld, correspondLexem);
|
|
|
|
}
|
2006-10-04 08:30:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ld->dictState.isend = (curVal->type == 0) ? true : false;
|
2006-05-31 22:05:31 +08:00
|
|
|
ld->dictState.getnext = false;
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
|
|
|
|
&(dict->lexize_info),
|
|
|
|
PointerGetDatum(dict->dictionary),
|
|
|
|
PointerGetDatum(curVal->lemm),
|
|
|
|
Int32GetDatum(curVal->lenlemm),
|
|
|
|
PointerGetDatum(&ld->dictState)
|
|
|
|
));
|
2006-05-31 22:05:31 +08:00
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (ld->dictState.getnext)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
/* Dictionary wants one more */
|
|
|
|
ld->curSub = curVal->next;
|
2006-10-04 08:30:14 +08:00
|
|
|
if (res)
|
2006-05-31 22:05:31 +08:00
|
|
|
setNewTmpRes(ld, curVal, res);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
if (res || ld->tmpRes)
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
/*
|
2006-10-04 08:30:14 +08:00
|
|
|
* Dictionary normalizes lexemes, so we remove from stack all
|
|
|
|
* used lexemes , return to basic mode and redo end of stack
|
|
|
|
* (if it exists)
|
2006-05-31 22:05:31 +08:00
|
|
|
*/
|
2006-10-04 08:30:14 +08:00
|
|
|
if (res)
|
|
|
|
{
|
|
|
|
moveToWaste(ld, ld->curSub);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2006-05-31 22:05:31 +08:00
|
|
|
res = ld->tmpRes;
|
2006-10-04 08:30:14 +08:00
|
|
|
moveToWaste(ld, ld->lastRes);
|
2006-05-31 22:05:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* reset to initial state */
|
|
|
|
ld->curDictId = InvalidOid;
|
|
|
|
ld->posDict = 0;
|
|
|
|
ld->lastRes = NULL;
|
|
|
|
ld->tmpRes = NULL;
|
|
|
|
setCorrLex(ld, correspondLexem);
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2006-10-04 08:30:14 +08:00
|
|
|
/*
|
|
|
|
* Dict don't want next lexem and didn't recognize anything, redo
|
|
|
|
* from ld->towork.head
|
|
|
|
*/
|
2006-05-31 22:05:31 +08:00
|
|
|
ld->curDictId = InvalidOid;
|
|
|
|
return LexizeExec(ld, correspondLexem);
|
2006-10-04 08:30:14 +08:00
|
|
|
}
|
2006-05-31 22:05:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
setCorrLex(ld, correspondLexem);
|
|
|
|
return NULL;
|
|
|
|
}
|