From 1157f3cc81d21540c71f7647821a456692041290 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Tue, 20 Nov 2007 02:25:22 +0000 Subject: [PATCH] Change descriptions of entity and tag objects to "XML entity" and "XML tag". Allow tag and entity names that follow XML rules. Provide for hexadecimal as well as decimal numeric entities. Adjust code names to coincide with new descriptions. --- doc/src/sgml/textsearch.sgml | 8 +-- src/backend/tsearch/wparser_def.c | 89 +++++++++++++++++---------- src/test/regress/expected/tsearch.out | 4 +- 3 files changed, 63 insertions(+), 38 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index b43872cca5..61583df3a2 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -1862,12 +1862,12 @@ LIMIT 10; tag - HTML tag - <A HREF="dictionaries.html"> + XML tag + <a href="dictionaries.html"> entity - HTML entity + XML entity &amp; diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 3f95f60579..b80175456d 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.10 2007/11/15 22:25:16 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -50,7 +50,7 @@ #define DECIMAL 20 #define SIGNEDINT 21 #define UNSIGNEDINT 22 -#define HTMLENTITY 23 +#define XMLENTITY 23 #define LASTNUM 23 @@ -95,7 +95,7 @@ static const char *const lex_descr[] = { "Hyphenated word part, all letters", "Hyphenated word part, all ASCII", "Space symbols", - "HTML tag", + "XML tag", "Protocol head", "Hyphenated word, letters and digits", "Hyphenated word, all ASCII", @@ -105,7 +105,7 @@ static const char *const lex_descr[] = { "Decimal notation", "Signed integer", "Unsigned integer", - "HTML entity" + "XML entity" }; @@ -132,11 +132,13 @@ typedef enum TPS_InMantissaFirst, TPS_InMantissaSign, TPS_InMantissa, - TPS_InHTMLEntityFirst, - TPS_InHTMLEntity, - TPS_InHTMLEntityNumFirst, - TPS_InHTMLEntityNum, - TPS_InHTMLEntityEnd, + TPS_InXMLEntityFirst, + TPS_InXMLEntity, + TPS_InXMLEntityNumFirst, + TPS_InXMLEntityNum, + TPS_InXMLEntityHexNumFirst, + TPS_InXMLEntityHexNum, + TPS_InXMLEntityEnd, TPS_InTagFirst, TPS_InXMLBegin, TPS_InTagCloseFirst, @@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = { {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, - {p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL}, + {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL}, {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL}, @@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = { {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityFirst[] = { +static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL}, - {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, + {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntity[] = { +static const TParserStateActionItem actionTPS_InXMLEntity[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, - {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, + {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = { +static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, + {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityNum[] = { +static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, - {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, + {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityEnd[] = { - {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, HTMLENTITY, NULL} +static const TParserStateActionItem actionTPS_InXMLEntityNum[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = { + {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL} }; static const TParserStateActionItem actionTPS_InTagFirst[] = { @@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = { static const TParserStateActionItem actionTPS_InXMLBegin[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, /* words[i].type)) + if (XMLHLIDIGNORE(prs->words[i].type)) prs->words[i].replace = 1; } diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index b6f8f05d22..eb00402075 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -222,7 +222,7 @@ SELECT * FROM ts_token_type('default'); 10 | hword_part | Hyphenated word part, all letters 11 | hword_asciipart | Hyphenated word part, all ASCII 12 | blank | Space symbols - 13 | tag | HTML tag + 13 | tag | XML tag 14 | protocol | Protocol head 15 | numhword | Hyphenated word, letters and digits 16 | asciihword | Hyphenated word, all ASCII @@ -232,7 +232,7 @@ SELECT * FROM ts_token_type('default'); 20 | float | Decimal notation 21 | int | Signed integer 22 | uint | Unsigned integer - 23 | entity | HTML entity + 23 | entity | XML entity (23 rows) SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf qwer jf sdjk ewr1> ewri2