/*-------------------------------------------------------------------------
 *
 * dict_xsyn.c
 *	  Extended synonym dictionary
 *
 * Copyright (c) 2007-2022, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/dict_xsyn/dict_xsyn.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"

#include <ctype.h>

#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"

PG_MODULE_MAGIC;
|
|
|
|
|
|
|
|
/*
 * One dictionary entry: a lookup word plus the full synonym line it was
 * read from.
 */
typedef struct
{
	char	   *key;			/* Word */
	char	   *value;			/* Unparsed list of synonyms, including the
								 * word itself */
} Syn;
|
|
|
|
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
int len;
|
|
|
|
Syn *syn;
|
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
bool matchorig;
|
2007-10-16 05:36:50 +08:00
|
|
|
bool keeporig;
|
2009-08-06 02:06:49 +08:00
|
|
|
bool matchsynonyms;
|
|
|
|
bool keepsynonyms;
|
2007-10-16 05:36:50 +08:00
|
|
|
} DictSyn;
|
|
|
|
|
|
|
|
|
|
|
|
/* Function-manager declarations for the dictionary's two entry points */
PG_FUNCTION_INFO_V1(dxsyn_init);
PG_FUNCTION_INFO_V1(dxsyn_lexize);
|
|
|
|
|
|
|
|
static char *
|
|
|
|
find_word(char *in, char **end)
|
|
|
|
{
|
|
|
|
char *start;
|
|
|
|
|
|
|
|
*end = NULL;
|
|
|
|
while (*in && t_isspace(in))
|
|
|
|
in += pg_mblen(in);
|
|
|
|
|
|
|
|
if (!*in || *in == '#')
|
|
|
|
return NULL;
|
|
|
|
start = in;
|
|
|
|
|
|
|
|
while (*in && !t_isspace(in))
|
|
|
|
in += pg_mblen(in);
|
|
|
|
|
|
|
|
*end = in;
|
|
|
|
|
|
|
|
return start;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
compare_syn(const void *a, const void *b)
|
|
|
|
{
|
2011-09-12 02:54:32 +08:00
|
|
|
return strcmp(((const Syn *) a)->key, ((const Syn *) b)->key);
|
2007-10-16 05:36:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2017-10-31 22:34:31 +08:00
|
|
|
read_dictionary(DictSyn *d, const char *filename)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
|
|
|
char *real_filename = get_tsearch_config_filename(filename, "rules");
|
2008-06-19 04:55:42 +08:00
|
|
|
tsearch_readline_state trst;
|
2007-10-16 05:36:50 +08:00
|
|
|
char *line;
|
|
|
|
int cur = 0;
|
|
|
|
|
2008-06-19 04:55:42 +08:00
|
|
|
if (!tsearch_readline_begin(&trst, real_filename))
|
2007-10-16 05:36:50 +08:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
|
|
|
errmsg("could not open synonym file \"%s\": %m",
|
|
|
|
real_filename)));
|
|
|
|
|
2008-06-19 04:55:42 +08:00
|
|
|
while ((line = tsearch_readline(&trst)) != NULL)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
|
|
|
char *value;
|
|
|
|
char *key;
|
2009-08-06 02:06:49 +08:00
|
|
|
char *pos;
|
|
|
|
char *end;
|
2007-10-16 05:36:50 +08:00
|
|
|
|
|
|
|
if (*line == '\0')
|
|
|
|
continue;
|
|
|
|
|
|
|
|
value = lowerstr(line);
|
|
|
|
pfree(line);
|
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
pos = value;
|
|
|
|
while ((key = find_word(pos, &end)) != NULL)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
2009-08-06 02:06:49 +08:00
|
|
|
/* Enlarge syn structure if full */
|
|
|
|
if (cur == d->len)
|
|
|
|
{
|
|
|
|
d->len = (d->len > 0) ? 2 * d->len : 16;
|
|
|
|
if (d->syn)
|
|
|
|
d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
|
|
|
|
else
|
|
|
|
d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
|
|
|
|
}
|
2007-10-16 05:36:50 +08:00
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
/* Save first word only if we will match it */
|
|
|
|
if (pos != value || d->matchorig)
|
|
|
|
{
|
|
|
|
d->syn[cur].key = pnstrdup(key, end - key);
|
|
|
|
d->syn[cur].value = pstrdup(value);
|
2007-10-16 05:36:50 +08:00
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
cur++;
|
|
|
|
}
|
|
|
|
|
|
|
|
pos = end;
|
2007-10-16 05:36:50 +08:00
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
/* Don't bother scanning synonyms if we will not match them */
|
|
|
|
if (!d->matchsynonyms)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
pfree(value);
|
2007-10-16 05:36:50 +08:00
|
|
|
}
|
|
|
|
|
2008-06-19 04:55:42 +08:00
|
|
|
tsearch_readline_end(&trst);
|
2007-10-16 05:36:50 +08:00
|
|
|
|
|
|
|
d->len = cur;
|
|
|
|
if (cur > 1)
|
|
|
|
qsort(d->syn, d->len, sizeof(Syn), compare_syn);
|
|
|
|
|
|
|
|
pfree(real_filename);
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
dxsyn_init(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
List *dictoptions = (List *) PG_GETARG_POINTER(0);
|
|
|
|
DictSyn *d;
|
|
|
|
ListCell *l;
|
2009-08-06 02:06:49 +08:00
|
|
|
char *filename = NULL;
|
2007-10-16 05:36:50 +08:00
|
|
|
|
|
|
|
d = (DictSyn *) palloc0(sizeof(DictSyn));
|
|
|
|
d->len = 0;
|
|
|
|
d->syn = NULL;
|
2009-08-06 02:06:49 +08:00
|
|
|
d->matchorig = true;
|
2007-10-16 05:36:50 +08:00
|
|
|
d->keeporig = true;
|
2009-08-06 02:06:49 +08:00
|
|
|
d->matchsynonyms = false;
|
|
|
|
d->keepsynonyms = true;
|
2007-10-16 05:36:50 +08:00
|
|
|
|
|
|
|
foreach(l, dictoptions)
|
|
|
|
{
|
|
|
|
DefElem *defel = (DefElem *) lfirst(l);
|
|
|
|
|
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers.
We have a lot of code in which option names, which from the user's
viewpoint are logically keywords, are passed through the grammar as plain
identifiers, and then matched to string literals during command execution.
This approach avoids making words into lexer keywords unnecessarily. Some
places matched these strings using plain strcmp, some using pg_strcasecmp.
But the latter should be unnecessary since identifiers would have been
downcased on their way through the parser. Aside from any efficiency
concerns (probably not a big factor), the lack of consistency in this area
creates a hazard of subtle bugs due to different places coming to different
conclusions about whether two option names are the same or different.
Hence, standardize on using strcmp() to match any option names that are
expected to have been fed through the parser.
This does create a user-visible behavioral change, which is that while
formerly all of these would work:
alter table foo set (fillfactor = 50);
alter table foo set (FillFactor = 50);
alter table foo set ("fillfactor" = 50);
alter table foo set ("FillFactor" = 50);
now the last case will fail because that double-quoted identifier is
different from the others. However, none of our documentation says that
you can use a quoted identifier in such contexts at all, and we should
discourage doing so since it would break if we ever decide to parse such
constructs as true lexer keywords rather than poor man's substitutes.
So this shouldn't create a significant compatibility issue for users.
Daniel Gustafsson, reviewed by Michael Paquier, small changes by me
Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 07:25:02 +08:00
|
|
|
if (strcmp(defel->defname, "matchorig") == 0)
|
2009-08-06 02:06:49 +08:00
|
|
|
{
|
|
|
|
d->matchorig = defGetBoolean(defel);
|
|
|
|
}
|
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers.
We have a lot of code in which option names, which from the user's
viewpoint are logically keywords, are passed through the grammar as plain
identifiers, and then matched to string literals during command execution.
This approach avoids making words into lexer keywords unnecessarily. Some
places matched these strings using plain strcmp, some using pg_strcasecmp.
But the latter should be unnecessary since identifiers would have been
downcased on their way through the parser. Aside from any efficiency
concerns (probably not a big factor), the lack of consistency in this area
creates a hazard of subtle bugs due to different places coming to different
conclusions about whether two option names are the same or different.
Hence, standardize on using strcmp() to match any option names that are
expected to have been fed through the parser.
This does create a user-visible behavioral change, which is that while
formerly all of these would work:
alter table foo set (fillfactor = 50);
alter table foo set (FillFactor = 50);
alter table foo set ("fillfactor" = 50);
alter table foo set ("FillFactor" = 50);
now the last case will fail because that double-quoted identifier is
different from the others. However, none of our documentation says that
you can use a quoted identifier in such contexts at all, and we should
discourage doing so since it would break if we ever decide to parse such
constructs as true lexer keywords rather than poor man's substitutes.
So this shouldn't create a significant compatibility issue for users.
Daniel Gustafsson, reviewed by Michael Paquier, small changes by me
Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 07:25:02 +08:00
|
|
|
else if (strcmp(defel->defname, "keeporig") == 0)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
|
|
|
d->keeporig = defGetBoolean(defel);
|
|
|
|
}
|
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers.
We have a lot of code in which option names, which from the user's
viewpoint are logically keywords, are passed through the grammar as plain
identifiers, and then matched to string literals during command execution.
This approach avoids making words into lexer keywords unnecessarily. Some
places matched these strings using plain strcmp, some using pg_strcasecmp.
But the latter should be unnecessary since identifiers would have been
downcased on their way through the parser. Aside from any efficiency
concerns (probably not a big factor), the lack of consistency in this area
creates a hazard of subtle bugs due to different places coming to different
conclusions about whether two option names are the same or different.
Hence, standardize on using strcmp() to match any option names that are
expected to have been fed through the parser.
This does create a user-visible behavioral change, which is that while
formerly all of these would work:
alter table foo set (fillfactor = 50);
alter table foo set (FillFactor = 50);
alter table foo set ("fillfactor" = 50);
alter table foo set ("FillFactor" = 50);
now the last case will fail because that double-quoted identifier is
different from the others. However, none of our documentation says that
you can use a quoted identifier in such contexts at all, and we should
discourage doing so since it would break if we ever decide to parse such
constructs as true lexer keywords rather than poor man's substitutes.
So this shouldn't create a significant compatibility issue for users.
Daniel Gustafsson, reviewed by Michael Paquier, small changes by me
Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 07:25:02 +08:00
|
|
|
else if (strcmp(defel->defname, "matchsynonyms") == 0)
|
2009-08-06 02:06:49 +08:00
|
|
|
{
|
|
|
|
d->matchsynonyms = defGetBoolean(defel);
|
|
|
|
}
|
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers.
We have a lot of code in which option names, which from the user's
viewpoint are logically keywords, are passed through the grammar as plain
identifiers, and then matched to string literals during command execution.
This approach avoids making words into lexer keywords unnecessarily. Some
places matched these strings using plain strcmp, some using pg_strcasecmp.
But the latter should be unnecessary since identifiers would have been
downcased on their way through the parser. Aside from any efficiency
concerns (probably not a big factor), the lack of consistency in this area
creates a hazard of subtle bugs due to different places coming to different
conclusions about whether two option names are the same or different.
Hence, standardize on using strcmp() to match any option names that are
expected to have been fed through the parser.
This does create a user-visible behavioral change, which is that while
formerly all of these would work:
alter table foo set (fillfactor = 50);
alter table foo set (FillFactor = 50);
alter table foo set ("fillfactor" = 50);
alter table foo set ("FillFactor" = 50);
now the last case will fail because that double-quoted identifier is
different from the others. However, none of our documentation says that
you can use a quoted identifier in such contexts at all, and we should
discourage doing so since it would break if we ever decide to parse such
constructs as true lexer keywords rather than poor man's substitutes.
So this shouldn't create a significant compatibility issue for users.
Daniel Gustafsson, reviewed by Michael Paquier, small changes by me
Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 07:25:02 +08:00
|
|
|
else if (strcmp(defel->defname, "keepsynonyms") == 0)
|
2009-08-06 02:06:49 +08:00
|
|
|
{
|
|
|
|
d->keepsynonyms = defGetBoolean(defel);
|
|
|
|
}
|
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers.
We have a lot of code in which option names, which from the user's
viewpoint are logically keywords, are passed through the grammar as plain
identifiers, and then matched to string literals during command execution.
This approach avoids making words into lexer keywords unnecessarily. Some
places matched these strings using plain strcmp, some using pg_strcasecmp.
But the latter should be unnecessary since identifiers would have been
downcased on their way through the parser. Aside from any efficiency
concerns (probably not a big factor), the lack of consistency in this area
creates a hazard of subtle bugs due to different places coming to different
conclusions about whether two option names are the same or different.
Hence, standardize on using strcmp() to match any option names that are
expected to have been fed through the parser.
This does create a user-visible behavioral change, which is that while
formerly all of these would work:
alter table foo set (fillfactor = 50);
alter table foo set (FillFactor = 50);
alter table foo set ("fillfactor" = 50);
alter table foo set ("FillFactor" = 50);
now the last case will fail because that double-quoted identifier is
different from the others. However, none of our documentation says that
you can use a quoted identifier in such contexts at all, and we should
discourage doing so since it would break if we ever decide to parse such
constructs as true lexer keywords rather than poor man's substitutes.
So this shouldn't create a significant compatibility issue for users.
Daniel Gustafsson, reviewed by Michael Paquier, small changes by me
Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 07:25:02 +08:00
|
|
|
else if (strcmp(defel->defname, "rules") == 0)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
2009-08-06 02:06:49 +08:00
|
|
|
/* we can't read the rules before parsing all options! */
|
|
|
|
filename = defGetString(defel);
|
2007-10-16 05:36:50 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
|
|
errmsg("unrecognized xsyn parameter: \"%s\"",
|
|
|
|
defel->defname)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
if (filename)
|
|
|
|
read_dictionary(d, filename);
|
|
|
|
|
2007-10-16 05:36:50 +08:00
|
|
|
PG_RETURN_POINTER(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
dxsyn_lexize(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
|
|
|
|
char *in = (char *) PG_GETARG_POINTER(1);
|
|
|
|
int length = PG_GETARG_INT32(2);
|
|
|
|
Syn word;
|
|
|
|
Syn *found;
|
|
|
|
TSLexeme *res = NULL;
|
|
|
|
|
|
|
|
if (!length || d->len == 0)
|
|
|
|
PG_RETURN_POINTER(NULL);
|
|
|
|
|
|
|
|
/* Create search pattern */
|
|
|
|
{
|
|
|
|
char *temp = pnstrdup(in, length);
|
|
|
|
|
|
|
|
word.key = lowerstr(temp);
|
|
|
|
pfree(temp);
|
|
|
|
word.value = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Look for matching syn */
|
|
|
|
found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
|
|
|
|
pfree(word.key);
|
|
|
|
|
|
|
|
if (!found)
|
|
|
|
PG_RETURN_POINTER(NULL);
|
|
|
|
|
|
|
|
/* Parse string of synonyms and return array of words */
|
|
|
|
{
|
2009-08-06 02:06:49 +08:00
|
|
|
char *value = found->value;
|
|
|
|
char *syn;
|
|
|
|
char *pos;
|
|
|
|
char *end;
|
2007-10-16 05:36:50 +08:00
|
|
|
int nsyns = 0;
|
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
res = palloc(sizeof(TSLexeme));
|
2007-10-16 05:36:50 +08:00
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
pos = value;
|
|
|
|
while ((syn = find_word(pos, &end)) != NULL)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
|
|
|
res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
|
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
/* The first word is output only if keeporig=true */
|
|
|
|
if (pos != value || d->keeporig)
|
2007-10-16 05:36:50 +08:00
|
|
|
{
|
2009-08-06 02:06:49 +08:00
|
|
|
res[nsyns].lexeme = pnstrdup(syn, end - syn);
|
2011-11-04 07:17:48 +08:00
|
|
|
res[nsyns].nvariant = 0;
|
|
|
|
res[nsyns].flags = 0;
|
2007-10-16 05:36:50 +08:00
|
|
|
nsyns++;
|
|
|
|
}
|
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
pos = end;
|
2007-10-16 05:36:50 +08:00
|
|
|
|
2009-08-06 02:06:49 +08:00
|
|
|
/* Stop if we are not to output the synonyms */
|
|
|
|
if (!d->keepsynonyms)
|
|
|
|
break;
|
2007-10-16 05:36:50 +08:00
|
|
|
}
|
2009-08-06 02:06:49 +08:00
|
|
|
res[nsyns].lexeme = NULL;
|
2007-10-16 05:36:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
PG_RETURN_POINTER(res);
|
|
|
|
}
|