From 92e05bc6a5e2c8972bd128cbb9914b4149d58709 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Tue, 18 Aug 2009 10:34:39 +0000 Subject: [PATCH] Unaccent dictionary. --- contrib/Makefile | 3 +- contrib/README | 4 + contrib/unaccent/Makefile | 24 ++ contrib/unaccent/expected/unaccent.out | 58 +++++ contrib/unaccent/sql/unaccent.sql | 19 ++ contrib/unaccent/unaccent.c | 318 ++++++++++++++++++++++++ contrib/unaccent/unaccent.rules | 187 ++++++++++++++ contrib/unaccent/unaccent.sql.in | 33 +++ contrib/unaccent/uninstall_unaccent.sql | 9 + doc/src/sgml/contrib.sgml | 3 +- doc/src/sgml/filelist.sgml | 3 +- doc/src/sgml/unaccent.sgml | 150 +++++++++++ 12 files changed, 808 insertions(+), 3 deletions(-) create mode 100644 contrib/unaccent/Makefile create mode 100644 contrib/unaccent/expected/unaccent.out create mode 100644 contrib/unaccent/sql/unaccent.sql create mode 100644 contrib/unaccent/unaccent.c create mode 100644 contrib/unaccent/unaccent.rules create mode 100644 contrib/unaccent/unaccent.sql.in create mode 100644 contrib/unaccent/uninstall_unaccent.sql create mode 100644 doc/src/sgml/unaccent.sgml diff --git a/contrib/Makefile b/contrib/Makefile index 85cabd8618..8543b5287f 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -1,4 +1,4 @@ -# $PostgreSQL: pgsql/contrib/Makefile,v 1.88 2009/08/07 20:50:21 petere Exp $ +# $PostgreSQL: pgsql/contrib/Makefile,v 1.89 2009/08/18 10:34:39 teodor Exp $ subdir = contrib top_builddir = .. @@ -39,6 +39,7 @@ SUBDIRS = \ tablefunc \ test_parser \ tsearch2 \ + unaccent \ vacuumlo ifeq ($(with_openssl),yes) diff --git a/contrib/README b/contrib/README index 1ae49adc70..a8396a5bfa 100644 --- a/contrib/README +++ b/contrib/README @@ -169,6 +169,10 @@ tsearch2 - Pavel Stehule , based on code originally by Teodor Sigaev and Oleg Bartunov . +unaccent - + Unaccent dictionary for text search + Teodor Sigaev and Oleg Bartunov . 
+ uuid-ossp - UUID generation functions by Peter Eisentraut diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile new file mode 100644 index 0000000000..91b04fc275 --- /dev/null +++ b/contrib/unaccent/Makefile @@ -0,0 +1,24 @@ +# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $ + +MODULE_big = unaccent +OBJS = unaccent.o + +DATA_built = unaccent.sql +DATA = uninstall_unaccent.sql +DATA_TSEARCH = unaccent.rules +REGRESS = unaccent + + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/unaccent +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# Override REGRESS_OPTS: the regression test needs a UTF8-encoded database +REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out new file mode 100644 index 0000000000..8d197c50be --- /dev/null +++ b/contrib/unaccent/expected/unaccent.out @@ -0,0 +1,58 @@ +SET client_min_messages = warning; +\set ECHO none +RESET client_min_messages; +SET client_encoding TO 'KOI8'; +SELECT unaccent('foobar'); + unaccent +---------- + foobar +(1 row) + +SELECT unaccent('L肆'); + unaccent +---------- + 盘肆 +(1 row) + +SELECT unaccent('出殡'); + unaccent +---------- + 弼殡 +(1 row) + +SELECT unaccent('unaccent', 'foobar'); + unaccent +---------- + foobar +(1 row) + +SELECT unaccent('unaccent', 'L肆'); + unaccent +---------- + 盘肆 +(1 row) + +SELECT unaccent('unaccent', '出殡'); + unaccent +---------- + 弼殡 +(1 row) + +SELECT ts_lexize('unaccent', 'foobar'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('unaccent', 'L肆'); + ts_lexize +----------- + {盘肆} +(1 row) + +SELECT ts_lexize('unaccent', '出殡'); + ts_lexize +----------- + {弼殡} +(1 row) + diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql new file mode 100644 index 
0000000000..71ab5bb435 --- /dev/null +++ b/contrib/unaccent/sql/unaccent.sql @@ -0,0 +1,19 @@ +SET client_min_messages = warning; +\set ECHO none +\i unaccent.sql +\set ECHO all +RESET client_min_messages; + +SET client_encoding TO 'KOI8'; + +SELECT unaccent('foobar'); +SELECT unaccent('L肆'); +SELECT unaccent('出殡'); + +SELECT unaccent('unaccent', 'foobar'); +SELECT unaccent('unaccent', 'L肆'); +SELECT unaccent('unaccent', '出殡'); + +SELECT ts_lexize('unaccent', 'foobar'); +SELECT ts_lexize('unaccent', 'L肆'); +SELECT ts_lexize('unaccent', '出殡'); diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c new file mode 100644 index 0000000000..7b5086b958 --- /dev/null +++ b/contrib/unaccent/unaccent.c @@ -0,0 +1,318 @@ +/*------------------------------------------------------------------------- + * + * unaccent.c + * Text search unaccent dictionary + * + * Copyright (c) 2009, PostgreSQL Global Development Group + * + * IDENTIFICATION + * $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "catalog/namespace.h" +#include "commands/defrem.h" +#include "mb/pg_wchar.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_public.h" +#include "utils/builtins.h" + +PG_MODULE_MAGIC; + +/* + * Unaccent dictionary uses uncompressed suffix tree to find a + * character to replace. Each node of tree is an array of + * SuffixChar struct with length = 256 (n-th element of array + * corresponds to byte) + */ +typedef struct SuffixChar { + struct SuffixChar *nextChar; + char *replaceTo; + int replacelen; +} SuffixChar; + +/* + * placeChar - put str into tree's structure, byte by byte. 
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+	/*
+	 * Walk down the tree one byte of str per level.  "link" always points
+	 * at the pointer that owns the current level: first the caller's root,
+	 * afterwards the nextChar field of the cell picked one level up.  A
+	 * missing level is therefore created and hooked in at the same spot.
+	 */
+	SuffixChar **link = &node;
+	SuffixChar *cell;
+
+	for (;;)
+	{
+		/* each level is a zero-filled array with one cell per byte value */
+		if ( *link == NULL )
+			*link = palloc0(sizeof(SuffixChar) * 256);
+
+		cell = *link + *str;
+		if ( lenstr == 1 )
+			break;
+
+		link = &cell->nextChar;
+		str++;
+		lenstr--;
+	}
+
+	/* cell belongs to the last byte of str: the replacement is stored here */
+	if ( cell->replaceTo )
+		elog(WARNING, "duplicate TO argument, use first one");
+	else
+	{
+		cell->replacelen = replacelen;
+		cell->replaceTo = palloc( replacelen );
+		memcpy(cell->replaceTo, replaceTo, replacelen);
+	}
+
+	/* the root passed in, or the freshly allocated one if it was NULL */
+	return node;
+}
+
+/*
+ * initSuffixTree  - create suffix tree from file. Function converts
+ * UTF8-encoded file into current encoding.
+ */
+static SuffixChar*
+initSuffixTree(char *filename)
+{
+	SuffixChar *rootSuffixTree = NULL;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state trst;
+	bool skip;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do
+	{
+		char	src[4096];
+		char	trg[4096];
+		int		srclen;
+		int		trglen;
+		char   *line = NULL;
+
+		skip = true;
+
+		PG_TRY();
+		{
+			/*
+			 * pg_do_encoding_conversion() (called by tsearch_readline())
+			 * will emit exception if it finds untranslatable characters in current locale.
+			 * We just skip such characters.
+			 */
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/*
+				 * src and trg are 4096-byte arrays, so each field is capped
+				 * at 4095 bytes to leave room for the terminating NUL.
+				 */
+				if ( sscanf(line, "%4095s\t%4095s\n", src, trg)!=2 )
+				{
+					/* not a "source target" pair: release the line and ignore it */
+					pfree(line);
+					continue;
+				}
+
+				srclen = strlen(src);
+				trglen = strlen(trg);
+
+				rootSuffixTree = placeChar(rootSuffixTree,
+											(unsigned char*)src, srclen,
+											trg, trglen);
+				pfree(line);
+			}
+
+			/*
+			 * Clear skip only once the read loop has run to completion.
+			 * Clearing it after every stored rule made loading stop at the
+			 * first untranslatable line that followed a good one, and a
+			 * file without any usable rule kept the outer loop spinning
+			 * forever.
+			 */
+			skip = false;
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			/* copy the error in the context that was current on entry */
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+			{
+				/* drop the error; skip is still true, so reading resumes */
+				FlushErrorState();
+			}
+			else
+			{
+				/* anything else is not ours to swallow */
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while(skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - find multibyte character in tree
+ *
+ * Follows the srclen bytes at src through the tree, one level per byte.
+ * Returns the cell reached by the last byte, or NULL when a level needed
+ * on the way does not exist.  The returned cell may still have no
+ * replacement (replaceTo == NULL), so callers must check that field.
+ */
+static SuffixChar *
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+	while( node )
+	{
+		node = node + *src;
+		if ( srclen == 1 )
+			return node;
+
+		src++;
+		srclen--;
+		node = node->nextChar;
+	}
+
+	return NULL;
+}
+
+/*
+ * unaccent_init - dictionary init method
+ *
+ * Argument 0 is the list of dictionary options (DefElem nodes).  Exactly
+ * one "Rules" option (matched case-insensitively) is required; its value
+ * is handed to initSuffixTree().  Any other option, a repeated Rules
+ * option, or a missing one raises ERRCODE_INVALID_PARAMETER_VALUE.
+ * Returns the root of the tree built from the rules file.
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum	unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	/* initialized so no path can return an indeterminate pointer */
+	SuffixChar *rootSuffixTree = NULL;
+	bool		fileloaded = false;
+	ListCell   *l;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp("Rules", defel->defname) == 0)
+		{
+			if (fileloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Rules parameters")));
+			rootSuffixTree = initSuffixTree(defGetString(defel));
+			fileloaded = true;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	if (!fileloaded)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+	}
+
+	PG_RETURN_POINTER(rootSuffixTree);
+}
+
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum	unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	/*
+	 * Dictionary lexize method.  Argument 0 is the root of the replacement
+	 * tree, argument 1 points at the input bytes and argument 2 is their
+	 * length.  Returns NULL when no character of the input has a
+	 * replacement.  Otherwise returns a zero-filled two-element TSLexeme
+	 * array whose first entry carries the rewritten, NUL-terminated string
+	 * and the TSL_FILTER flag.
+	 *
+	 * The output buffer starts at len * max-encoding-length + 1 bytes and
+	 * is grown on demand: the length of a replacement comes from the rules
+	 * file and is not bounded by the encoding's maximum character length,
+	 * so a fixed-size buffer could be overrun.
+	 */
+	SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart = srcchar;
+	int			charlen;
+	int			trgsize = 0;	/* bytes allocated for res->lexeme */
+	int			trglen = 0;		/* bytes of res->lexeme filled so far */
+	TSLexeme   *res = NULL;
+	SuffixChar *node;
+
+	while( srcchar - srcstart < len )
+	{
+		const char *piece;		/* bytes to append for this character */
+		int			piecelen;
+
+		charlen = pg_mblen(srcchar);
+
+		/* never step past the end of the input if its last character is cut short */
+		if ( charlen > len - (srcchar - srcstart) )
+			charlen = len - (srcchar - srcstart);
+
+		node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+		if ( node && node->replaceTo )
+		{
+			if ( !res )
+			{
+				/* allocate res only if it's needed */
+				trgsize = len * pg_database_encoding_max_length() + 1 /* \0 */;
+				res = palloc0(sizeof(TSLexeme) * 2);
+				res->lexeme = palloc( trgsize );
+				res->flags = TSL_FILTER;
+
+				/* carry over the unchanged characters seen before this one */
+				trglen = srcchar - srcstart;
+				if ( trglen > 0 )
+					memcpy(res->lexeme, srcstart, trglen);
+			}
+			piece = node->replaceTo;
+			piecelen = node->replacelen;
+		}
+		else
+		{
+			piece = srcchar;
+			piecelen = charlen;
+		}
+
+		/* nothing is copied until the first replacement has been found */
+		if ( res )
+		{
+			/* keep room for the piece plus the final \0 */
+			if ( trglen + piecelen + 1 > trgsize )
+			{
+				while ( trglen + piecelen + 1 > trgsize )
+					trgsize *= 2;
+				res->lexeme = repalloc(res->lexeme, trgsize);
+			}
+			memcpy( res->lexeme + trglen, piece, piecelen );
+			trglen += piecelen;
+		}
+
+		srcchar += charlen;
+	}
+
+	if ( res )
+		res->lexeme[trglen] = '\0';
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * Function-like wrapper for dictionary
+ *
+ * Called with one argument (the text) it looks up the dictionary named
+ * "unaccent"; called with two, argument 0 is the dictionary OID and
+ * argument 1 the text.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum	unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	text	   *str;
+	int			strArg;
+	Oid			dictOid;
+	TSDictionaryCacheEntry *dict;
+	TSLexeme   *res;
+
+	if (PG_NARGS() == 1)
+	{
+		dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+		strArg = 0;
+	}
+	else
+	{
+		dictOid = PG_GETARG_OID(0);
+		strArg = 1;
+	}
+	str = PG_GETARG_TEXT_P(strArg);
+
+	dict = lookup_ts_dictionary_cache(dictOid);
+
+	/* run the dictionary's lexize method on the raw bytes of the text */
+	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+													 PointerGetDatum(dict->dictData),
+													 PointerGetDatum(VARDATA(str)),
+													 Int32GetDatum(VARSIZE(str) - VARHDRSZ),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(str, strArg);
+
+	if ( res == NULL )
+	{
+		/* no result from the dictionary: hand back a copy of the input */
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+	else if ( res->lexeme == NULL )
+	{
+		
pfree(res); + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else + { + text *txt = cstring_to_text(res->lexeme); + + pfree(res->lexeme); + pfree(res); + + PG_RETURN_TEXT_P(txt); + } +} diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules new file mode 100644 index 0000000000..cc2f7a6585 --- /dev/null +++ b/contrib/unaccent/unaccent.rules @@ -0,0 +1,187 @@ +脌 A +脕 A +脗 A +脙 A +脛 A +脜 A +脝 A +脿 a +谩 a +芒 a +茫 a +盲 a +氓 a +忙 a +膧 A +膩 a +膫 A +膬 a +膭 A +膮 a +脟 C +莽 c +膯 C +膰 c +膱 C +膲 c +膴 C +膵 c +膶 C +膷 c +膸 D +膹 d +膼 D +膽 d +脠 E +脡 E +脢 E +脣 E +猫 e +茅 e +锚 e +毛 e +膾 E +膿 e +臄 E +臅 e +臇 E +臈 e +臉 E +臋 e +臍 E +臎 e +臏 G +臐 g +臑 G +臒 g +臓 G +摹 g +蘑 G +模 g +膜 H +磨 h +摩 H +魔 h +抹 I +脤 I +脥 I +脦 I +脧 I +矛 i +铆 i +卯 i +茂 i +末 i +莫 I +墨 i +默 I +沫 i +漠 I +寞 i +陌 I +谋 i +牟 I +某 i +拇 J +牡 j +亩 K +姆 k +母 k +墓 L +暮 l +幕 L +募 l +慕 L +木 l +目 L +艀 l +艁 L +艂 l +脩 N +帽 n +艃 N +艅 n +艆 N +艈 n +艊 N +艌 n +艍 n +艎 N +艐 n +脪 O +脫 O +脭 O +脮 O +脰 O +貌 o +贸 o +么 o +玫 o +枚 o +艑 O +艒 o +艓 O +艔 o +艕 O +艖 o +艗 E +艙 e +脴 O +酶 o +艛 R +艜 r +艝 R +艞 r +艠 R +艡 r +脽 S +艢 S +艣 s +艤 S +艥 s +艦 S +艧 s +艩 S +拧 s +泞 T +牛 t +扭 T +钮 t +纽 T +脓 t +脵 U +脷 U +脹 U +脺 U +霉 u +煤 u +没 u +眉 u +浓 U +农 u +弄 U +奴 u +努 U +怒 u +女 U +暖 u +虐 U +疟 u +挪 U +懦 u +糯 W +诺 w +脻 Y +媒 y +每 y +哦 Y +欧 y +鸥 Y +殴 Z +藕 z +呕 Z +偶 z +沤 Z +啪 z +褢 械 +衼 袝 diff --git a/contrib/unaccent/unaccent.sql.in b/contrib/unaccent/unaccent.sql.in new file mode 100644 index 0000000000..ba981398fa --- /dev/null +++ b/contrib/unaccent/unaccent.sql.in @@ -0,0 +1,33 @@ +/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */ + +CREATE OR REPLACE FUNCTION unaccent(regdictionary, text) + RETURNS text + AS 'MODULE_PATHNAME', 'unaccent_dict' + LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE; + +CREATE OR REPLACE FUNCTION unaccent(text) + RETURNS text + AS 'MODULE_PATHNAME', 'unaccent_dict' + LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE; + +CREATE OR REPLACE FUNCTION unaccent_init(internal) + 
RETURNS internal + AS 'MODULE_PATHNAME', 'unaccent_init' + LANGUAGE C; + +CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal) + RETURNS internal + AS 'MODULE_PATHNAME', 'unaccent_lexize' + LANGUAGE C; + +CREATE TEXT SEARCH TEMPLATE unaccent ( + INIT = unaccent_init, + LEXIZE = unaccent_lexize +); + + +CREATE TEXT SEARCH DICTIONARY unaccent ( + TEMPLATE = unaccent, + RULES = 'unaccent' +); + diff --git a/contrib/unaccent/uninstall_unaccent.sql b/contrib/unaccent/uninstall_unaccent.sql new file mode 100644 index 0000000000..89e3627fc8 --- /dev/null +++ b/contrib/unaccent/uninstall_unaccent.sql @@ -0,0 +1,9 @@ +/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */ + +DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE; +DROP FUNCTION IF EXISTS unaccent(text) CASCADE; +DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE; +DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE; +DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE; +DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE; + diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index 0ef92b4896..cffbc55249 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -1,4 +1,4 @@ - + Additional Supplied Modules @@ -113,6 +113,7 @@ psql -d dbname -f SHAREDIR/contrib/module.sql &tablefunc; &test-parser; &tsearch2; + &unaccent; &uuid-ossp; &vacuumlo; &xml2; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 7e194f7bcc..bee66008b6 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -1,4 +1,4 @@ - + @@ -126,6 +126,7 @@ + diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml new file mode 100644 index 0000000000..b3c7bbee48 --- /dev/null +++ b/doc/src/sgml/unaccent.sgml @@ -0,0 +1,150 @@ + + unaccent + + + unaccent + + + + unaccent removes accents (diacritic signs) from a lexeme. 
+ It's a filtering dictionary, which means its output is + always passed to the next dictionary (if any), contrary to the standard + behaviour. Currently, it supports the most important accents from European + languages. + + + + Limitation: The current implementation of the unaccent + dictionary cannot be used as a normalizing dictionary for + the thesaurus dictionary. + + + + Configuration + + + An unaccent dictionary accepts the following options: + + + + + RULES is the base name of the file containing the list of + translation rules. This file must be stored in + $SHAREDIR/tsearch_data/ (where $SHAREDIR means + the PostgreSQL installation's shared-data directory). + Its name must end in .rules (which is not to be included in + the RULES parameter). + + + + + The rules file has the following format: + + + + + Each line represents a pair: character_with_accent character_without_accent + +À A +Á A +Â A +Ã A +Ä A +Å A +Æ A + + + + + + + Look at unaccent.rules, which is installed in + $SHAREDIR/tsearch_data/, for an example. + + + + + Usage + + + Running the installation script creates a text search template + unaccent and a dictionary unaccent + based on it, with default parameters. You can alter the + parameters, for example + + +=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules'); + + + or create new dictionaries based on the template. + + + + To test the dictionary, you can try + + +=# select ts_lexize('unaccent','Hôtel'); + ts_lexize +----------- + {Hotel} +(1 row) + + + + + Filtering dictionaries are useful for the correct operation of the + ts_headline function. + +=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french ); +=# ALTER TEXT SEARCH CONFIGURATION fr + ALTER MAPPING FOR hword, hword_part, word + WITH unaccent, french_stem; +=# select to_tsvector('fr','Hôtels de la Mer'); + to_tsvector +------------------- + 'hotel':1 'mer':4 +(1 row) + +=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels'); + ?column? 
+---------- + t +(1 row) +=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels')); + ts_headline +------------------------ + <b>Hôtel</b> de la Mer +(1 row) + + + + + + + Function + + + The unaccent function removes accents (diacritic signs) from + the argument string. Basically, it's a wrapper around + the unaccent dictionary. + + + + unaccent + + + + unaccent(dictionary, + string) + returns text + + + + +SELECT unaccent('unaccent','Hôtel'); +SELECT unaccent('Hôtel'); + + + + + 