Unaccent dictionary.

2025-01-12 18:34:36 +08:00 · 2009-08-18 10:34:39 +00:00 · 2009-08-18 10:34:39 +00:00 · 92e05bc6a5
commit 92e05bc6a5
parent a88a48011c
12 changed files with 808 additions and 3 deletions
--- a/contrib/Makefile
+++ b/contrib/Makefile
@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.88 2009/08/07 20:50:21 petere Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.89 2009/08/18 10:34:39 teodor Exp $

 subdir = contrib
 top_builddir = ..
@ -39,6 +39,7 @@ SUBDIRS = \
 		tablefunc	\
 		test_parser	\
 		tsearch2	\
+		unaccent	\
 		vacuumlo

 ifeq ($(with_openssl),yes)
--- a/contrib/README
+++ b/contrib/README
@ -169,6 +169,10 @@ tsearch2 -
 	Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
 	Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.

+unaccent -
+	Unaccent dictionary for text search
+	Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
+
 uuid-ossp -
 	UUID generation functions
 	by Peter Eisentraut <peter_e@gmx.net>
--- a/contrib/unaccent/Makefile
+++ b/contrib/unaccent/Makefile
@ -0,0 +1,24 @@
+# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $
+
+MODULE_big = unaccent
+OBJS = unaccent.o
+
+DATA_built = unaccent.sql
+DATA = uninstall_unaccent.sql
+DATA_TSEARCH = unaccent.rules
+REGRESS = unaccent
+
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+# subdir must name this module's own directory for the in-tree build
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Redefine REGRESS_OPTS because the regression test needs a UTF8 database
+REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@ -0,0 +1,58 @@
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+SET client_encoding TO 'KOI8';
+SELECT unaccent('foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('Ｌ肆');
+ unaccent 
+----------
+ 盘肆
+(1 row)
+
+SELECT unaccent('出殡');
+ unaccent 
+----------
+ 弼殡
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', 'Ｌ肆');
+ unaccent 
+----------
+ 盘肆
+(1 row)
+
+SELECT unaccent('unaccent', '出殡');
+ unaccent 
+----------
+ 弼殡
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('unaccent', 'Ｌ肆');
+ ts_lexize 
+-----------
+ {盘肆}
+(1 row)
+
+SELECT ts_lexize('unaccent', '出殡');
+ ts_lexize 
+-----------
+ {弼殡}
+(1 row)
+
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@ -0,0 +1,19 @@
+SET client_min_messages = warning;
+\set ECHO none
+\i unaccent.sql
+\set ECHO all
+RESET client_min_messages;
+
+SET client_encoding TO 'KOI8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('Ｌ肆');
+SELECT unaccent('出殡');
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', 'Ｌ肆');
+SELECT unaccent('unaccent', '出殡');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', 'Ｌ肆');
+SELECT ts_lexize('unaccent', '出殡');
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *    Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * Unaccent dictionary uses an uncompressed tree, keyed byte by byte from
+ * the first byte of the source character, to find a character to replace.
+ * Each node of the tree is an array of SuffixChar structs with
+ * length = 256 (the n-th element of the array corresponds to byte value n).
+ */
+typedef struct SuffixChar {
+	/* child node (array of 256 entries) for the next byte, or NULL */
+	struct SuffixChar	*nextChar;
+	/* replacement bytes, or NULL if no rule ends here; not NUL-terminated */
+	char				*replaceTo;
+	/* number of bytes stored in replaceTo */
+	int					replacelen;
+} SuffixChar;
+
+/*
+ * placeChar - register one rule in the tree, descending one byte of str
+ * per level and creating missing nodes on the way.
+ *
+ * node is the root of the (sub)tree, or NULL if it does not exist yet.
+ * str/lenstr give the source bytes; the descent stops only when lenstr
+ * reaches 1, so callers must pass lenstr >= 1.  replaceTo/replacelen give
+ * the replacement bytes, which are copied without a terminating NUL.
+ *
+ * Returns the root, which is newly allocated when node was NULL.
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+	SuffixChar **link = &node;
+	SuffixChar  *slot;
+
+	for (;;)
+	{
+		/* create this level on first use, with all 256 entries zeroed */
+		if ( *link == NULL )
+			*link = palloc0(sizeof(SuffixChar) * 256);
+
+		slot = *link + *str;
+
+		if ( lenstr == 1 )
+			break;
+
+		/* move down to the level that handles the next byte */
+		link = &slot->nextChar;
+		str++;
+		lenstr--;
+	}
+
+	if ( slot->replaceTo == NULL )
+	{
+		slot->replacelen = replacelen;
+		slot->replaceTo = palloc( replacelen );
+		memcpy(slot->replaceTo, replaceTo, replacelen);
+	}
+	else
+		elog(WARNING, "duplicate TO argument, use first one");
+
+	return node;
+}
+
+/*
+ * initSuffixTree - create the replacement tree from a rules file.
+ *
+ * filename is the base name of the file; it is resolved with
+ * get_tsearch_config_filename() using the "rules" extension.  The file is
+ * UTF8-encoded and is converted into the current encoding while reading.
+ * Each usable line holds two whitespace-separated tokens: the source
+ * character and its replacement.  Lines that do not scan as two tokens
+ * are ignored.
+ *
+ * Returns the root of the tree, or NULL if no rule was loaded.  Raises an
+ * ERROR if the file cannot be opened.
+ */
+static SuffixChar*
+initSuffixTree(char *filename) 
+{
+	/* volatile: both are modified inside PG_TRY and read after it */
+	SuffixChar *volatile rootSuffixTree = NULL;
+	volatile bool	skip;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state	trst;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do	
+	{
+		char	src[4096];
+		char	trg[4096];
+
+		/*
+		 * pg_do_encoding_conversion() (called by tsearch_readline())
+		 * will emit exception if it finds untranslatable characters in
+		 * current locale.  We just skip such lines and resume reading
+		 * with the next one.
+		 */
+		skip = true;
+
+		PG_TRY();
+		{
+			char   *line;
+
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/*
+				 * Both tokens are parts of line, so a line shorter than
+				 * the buffers cannot overflow them in sscanf().
+				 */
+				if ( strlen(line) >= sizeof(src) )
+					elog(WARNING, "unaccent rule line is too long, skipped");
+				else if ( sscanf(line, "%s\t%s\n", src, trg) == 2 )
+					rootSuffixTree = placeChar(rootSuffixTree, 
+												(unsigned char*)src, (int) strlen(src), 
+												trg, (int) strlen(trg));
+
+				pfree(line);
+			}
+
+			/* end of file reached without an error: stop retrying */
+			skip = false;
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+			{
+				FlushErrorState();
+			}
+			else
+			{
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while(skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - look up one multibyte character in the tree.
+ *
+ * Walks one level per byte of src.  Returns the entry reached at the last
+ * byte (its replaceTo may still be NULL), or NULL when the walk runs out
+ * of tree before the last byte.
+ */
+static SuffixChar * 
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+	SuffixChar *level;
+	int			pos = 0;
+
+	for ( level = node; level != NULL; pos++ )
+	{
+		SuffixChar *entry = level + src[pos];
+
+		/* last byte of the character: this entry is the answer */
+		if ( srclen - pos == 1 )
+			return entry;
+
+		level = entry->nextChar;
+	}
+
+	return NULL;
+}
+
+/*
+ * unaccent_init - dictionary init method.
+ *
+ * Argument 0 is the list of dictionary options (DefElem nodes).  The only
+ * recognized option is "Rules" (matched case-insensitively), which names
+ * the rules file to load.  It must be given exactly once; any other
+ * option, a repeated Rules option, or a missing one raises an ERROR.
+ *
+ * Returns the root of the tree built by initSuffixTree().
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum       unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List       *dictoptions = (List *) PG_GETARG_POINTER(0);
+	SuffixChar *rootSuffixTree = NULL;
+	bool        fileloaded = false;
+	ListCell   *l;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp("Rules", defel->defname) == 0)
+		{
+			if (fileloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Rules parameters")));
+
+			rootSuffixTree = initSuffixTree(defGetString(defel));
+			fileloaded = true;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	if (!fileloaded)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+	}
+
+	PG_RETURN_POINTER(rootSuffixTree);
+}
+
+/*
+ * unaccent_lexize - dictionary lexize method.
+ *
+ * Argument 0 is the tree returned by unaccent_init, argument 1 points to
+ * the input bytes and argument 2 is their length.
+ *
+ * Returns NULL when no character of the input has a replacement.
+ * Otherwise returns a two-element TSLexeme array: the first element holds
+ * the NUL-terminated rewritten string and is flagged TSL_FILTER, the
+ * second is left zeroed.
+ */
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum       unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+	char       *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart;
+	char	   *trgbuf = NULL;
+	int			trglen = 0;		/* bytes written to trgbuf so far */
+	int			trgsize = 0;	/* allocated size of trgbuf */
+	int			charlen;
+	TSLexeme   *res = NULL;
+	SuffixChar *node;
+
+	srcstart = srcchar;
+	while( srcchar - srcstart < len )
+	{
+		const char *piece = NULL;
+		int			piecelen = 0;
+
+		charlen = pg_mblen(srcchar);
+
+		node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+		if ( node  && node->replaceTo )
+		{
+			if ( !res )
+			{
+				/* allocate res only if it's needed */
+				res = palloc0(sizeof(TSLexeme) * 2);
+				res->flags = TSL_FILTER;
+
+				/* initial estimate only; the buffer grows below if needed */
+				trgsize = len * pg_database_encoding_max_length() + 1 /* \0 */;
+				trgbuf = palloc( trgsize );
+
+				/* copy the part of the input that needed no replacement */
+				if ( srcchar != srcstart )
+				{
+					memcpy(trgbuf, srcstart, srcchar - srcstart);
+					trglen = srcchar - srcstart;
+				}
+			}
+			piece = node->replaceTo;
+			piecelen = node->replacelen;
+		}
+		else if ( res )
+		{
+			piece = srcchar;
+			piecelen = charlen;
+		}
+
+		if ( piece )
+		{
+			/*
+			 * A replacement may be longer than the character it replaces,
+			 * so make sure the piece and the trailing \0 still fit.
+			 */
+			if ( trglen + piecelen + 1 > trgsize )
+			{
+				while ( trglen + piecelen + 1 > trgsize )
+					trgsize *= 2;
+				trgbuf = repalloc( trgbuf, trgsize );
+			}
+			memcpy( trgbuf + trglen, piece, piecelen );
+			trglen += piecelen;
+		}
+
+		srcchar += charlen;
+	}
+
+	if ( res )
+	{
+		trgbuf[trglen] = '\0';
+		res->lexeme = trgbuf;
+	}
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * unaccent_dict - function-like wrapper for the dictionary.
+ *
+ * Called with one argument (text), the dictionary named "unaccent" is
+ * looked up; called with two, argument 0 is the dictionary OID and
+ * argument 1 the text.  The text is passed to the dictionary's lexize
+ * method.  If that yields a lexeme it is returned as text; otherwise a
+ * copy of the input is returned.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum       unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	/* position of the text argument: 0 for unaccent(text), else 1 */
+	int		textArgNo = (PG_NARGS() == 1) ? 0 : 1;
+	Oid		dictOid;
+	text	*input;
+	text	*result;
+	TSDictionaryCacheEntry	*dict;
+	TSLexeme *lexemes;
+
+	if (textArgNo == 0)
+		dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+	else
+		dictOid = PG_GETARG_OID(0);
+
+	input = PG_GETARG_TEXT_P(textArgNo);
+
+	dict = lookup_ts_dictionary_cache(dictOid);
+
+	lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+													 PointerGetDatum(dict->dictData),
+													 PointerGetDatum(VARDATA(input)),
+													 Int32GetDatum(VARSIZE(input) - VARHDRSZ),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(input, textArgNo);
+
+	if ( lexemes != NULL && lexemes->lexeme != NULL )
+	{
+		/* the dictionary rewrote the string: hand back its output */
+		result = cstring_to_text(lexemes->lexeme);
+
+		pfree(lexemes->lexeme);
+		pfree(lexemes);
+	}
+	else
+	{
+		/* nothing was replaced: return the input unchanged */
+		if ( lexemes != NULL )
+			pfree(lexemes);
+
+		result = PG_GETARG_TEXT_P_COPY(textArgNo);
+	}
+
+	PG_RETURN_TEXT_P(result);
+}
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@ -0,0 +1,187 @@
+À	A
+Á	A
+Â	A
+Ã	A
+Ä	A
+Å	A
+Æ	A
+à	a
+á	a
+â	a
+ã	a
+ä	a
+å	a
+æ	a
+Ā	A
+ā	a
+Ă	A
+ă	a
+Ą	A
+ą	a
+Ç	C
+ç	c
+Ć	C
+ć	c
+Ĉ	C
+ĉ	c
+Ċ	C
+ċ	c
+Č	C
+č	c
+Ď	D
+ď	d
+Đ	D
+đ	d
+È	E
+É	E
+Ê	E
+Ë	E
+è	e
+é	e
+ê	e
+ë	e
+Ē	E
+ē	e
+Ĕ	E
+ĕ	e
+Ė	E
+ė	e
+Ę	E
+ę	e
+Ě	E
+ě	e
+Ĝ	G
+ĝ	g
+Ğ	G
+ğ	g
+Ġ	G
+ġ	g
+Ģ	G
+ģ	g
+Ĥ	H
+ĥ	h
+Ħ	H
+ħ	h
+Ĩ	I
+Ì	I
+Í	I
+Î	I
+Ï	I
+ì	i
+í	i
+î	i
+ï	i
+ĩ	i
+Ī	I
+ī	i
+Ĭ	I
+ĭ	i
+Į	I
+į	i
+İ	I
+ı	i
+Ĳ	I
+ĳ	i
+Ĵ	J
+ĵ	j
+Ķ	K
+ķ	k
+ĸ	k
+Ĺ	L
+ĺ	l
+Ļ	L
+ļ	l
+Ľ	L
+ľ	l
+Ŀ	L
+ŀ	l
+Ł	L
+ł	l
+Ñ	N
+ñ	n
+Ń	N
+ń	n
+Ņ	N
+ņ	n
+Ň	N
+ň	n
+ŉ	n
+Ŋ	N
+ŋ	n
+Ò	O
+Ó	O
+Ô	O
+Õ	O
+Ö	O
+ò	o
+ó	o
+ô	o
+õ	o
+ö	o
+Ō	O
+ō	o
+Ŏ	O
+ŏ	o
+Ő	O
+ő	o
+Œ	E
+œ	e
+Ø	O
+ø	o
+Ŕ	R
+ŕ	r
+Ŗ	R
+ŗ	r
+Ř	R
+ř	r
+ß	S
+Ś	S
+ś	s
+Ŝ	S
+ŝ	s
+Ş	S
+ş	s
+Š	S
+š	s
+Ţ	T
+ţ	t
+Ť	T
+ť	t
+Ŧ	T
+ŧ	t
+Ù	U
+Ú	U
+Û	U
+Ü	U
+ù	u
+ú	u
+û	u
+ü	u
+Ũ	U
+ũ	u
+Ū	U
+ū	u
+Ŭ	U
+ŭ	u
+Ů	U
+ů	u
+Ű	U
+ű	u
+Ų	U
+ų	u
+Ŵ	W
+ŵ	w
+Ý	Y
+ý	y
+ÿ	y
+Ŷ	Y
+ŷ	y
+Ÿ	Y
+Ź	Z
+ź	z
+Ż	Z
+ż	z
+Ž	Z
+ž	z
+ё	е
+Ё	Е
--- a/contrib/unaccent/unaccent.sql.in
+++ b/contrib/unaccent/unaccent.sql.in
@ -0,0 +1,33 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- unaccent(dictionary, text): run text through the given dictionary.
+-- STABLE rather than IMMUTABLE: the result depends on the dictionary's
+-- rules, which can be changed with ALTER TEXT SEARCH DICTIONARY.
+CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C RETURNS NULL ON NULL INPUT STABLE;
+
+-- unaccent(text): same, using the dictionary named "unaccent".
+CREATE OR REPLACE FUNCTION unaccent(text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C RETURNS NULL ON NULL INPUT STABLE;
+
+-- Init method of the template; implemented by C function unaccent_init.
+CREATE OR REPLACE FUNCTION unaccent_init(internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_init'
+	LANGUAGE C;
+
+-- Lexize method of the template; implemented by C function unaccent_lexize.
+CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_lexize'
+	LANGUAGE C;
+
+-- Text search template built from the two methods above.
+CREATE TEXT SEARCH TEMPLATE unaccent (
+    INIT = unaccent_init,
+	LEXIZE = unaccent_lexize
+);
+
+
+-- Default dictionary based on the template, using the rules file whose
+-- base name is 'unaccent'.
+CREATE TEXT SEARCH DICTIONARY unaccent (
+	TEMPLATE = unaccent,
+	RULES    = 'unaccent'
+);
+
--- a/contrib/unaccent/uninstall_unaccent.sql
+++ b/contrib/unaccent/uninstall_unaccent.sql
@ -0,0 +1,9 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- Drop the unaccent functions, dictionary, template and support functions.
+-- IF EXISTS lets the script run when some objects are already gone.
+DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
+DROP FUNCTION IF EXISTS unaccent(text) CASCADE;
+DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
+DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;
+DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
+DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE;
+
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.13 2009/04/27 16:27:35 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.14 2009/08/18 10:34:39 teodor Exp $ -->

 <appendix id="contrib">
 <title>Additional Supplied Modules</title>
@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
 &tablefunc;
 &test-parser;
 &tsearch2;
+ &unaccent;
 &uuid-ossp;
 &vacuumlo;
 &xml2;
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.63 2009/08/17 22:14:44 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.64 2009/08/18 10:34:39 teodor Exp $ -->

 <!entity history    SYSTEM "history.sgml">
 <!entity info       SYSTEM "info.sgml">
@ -126,6 +126,7 @@
 <!entity tablefunc       SYSTEM "tablefunc.sgml">
 <!entity test-parser     SYSTEM "test-parser.sgml">
 <!entity tsearch2        SYSTEM "tsearch2.sgml">
+<!entity unaccent      SYSTEM "unaccent.sgml">
 <!entity uuid-ossp       SYSTEM "uuid-ossp.sgml">
 <!entity vacuumlo        SYSTEM "vacuumlo.sgml">
 <!entity xml2            SYSTEM "xml2.sgml"> 
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@ -0,0 +1,150 @@
+<sect1 id="unaccent">
+ <title>unaccent</title>
+
+ <indexterm zone="unaccent">
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <para>
+  <filename>unaccent</> removes accents (diacritic signs) from a lexeme.
+  It is a filtering dictionary, which means that its output is
+  always passed to the next dictionary (if any), contrary to the standard
+  behaviour. Currently, it supports the most important accents from European
+  languages.
+ </para>
+
+ <para>
+  Limitation: the current implementation of the <filename>unaccent</>
+  dictionary cannot be used as a normalizing dictionary for the
+  <filename>thesaurus</filename> dictionary.
+ </para>
+ 
+ <sect2>
+  <title>Configuration</title>
+
+  <para>
+   An <literal>unaccent</> dictionary accepts the following options:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     <literal>RULES</> is the base name of the file containing the list of
+     translation rules.  This file must be stored in
+     <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
+     the <productname>PostgreSQL</> installation's shared-data directory).
+     Its name must end in <literal>.rules</> (which is not to be included in
+     the <literal>RULES</> parameter).
+    </para>
+   </listitem>
+  </itemizedlist>
+  <para>
+   The rules file has the following format:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     Each line represents a pair: character_with_accent  character_without_accent
+    <programlisting>
+&Agrave;	A
+&Aacute; 	A
+&Acirc; 	A
+&Atilde;	A
+&Auml;  	A
+&Aring;		A
+&AElig; 	A
+    </programlisting>
+    </para>
+   </listitem>
+  </itemizedlist>
+
+  <para>
+   Look at <filename>unaccent.rules</>, which is installed in
+   <filename>$SHAREDIR/tsearch_data/</>, for an example.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Usage</title>
+
+  <para>
+   Running the installation script creates a text search template
+   <literal>unaccent</> and a dictionary <literal>unaccent</>
+   based on it, with default parameters.  You can alter the
+   parameters, for example
+
+<programlisting>
+=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
+</programlisting>
+
+   or create new dictionaries based on the template.
+  </para>
+
+  <para>
+   To test the dictionary, you can try
+
+<programlisting>
+=# select ts_lexize('unaccent','Hôtel');
+ ts_lexize 
+-----------
+ {Hotel}
+(1 row)
+</programlisting>
+  </para>
+  
+  <para>
+  Filtering dictionaries are useful for making the
+  <function>ts_headline</function> function work correctly.
+<programlisting>
+=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
+=# ALTER TEXT SEARCH CONFIGURATION fr
+	ALTER MAPPING FOR hword, hword_part, word
+	WITH unaccent, french_stem;
+=# select to_tsvector('fr','Hôtels de la Mer');
+    to_tsvector    
+-------------------
+ 'hotel':1 'mer':4
+(1 row)
+
+=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
+ ?column? 
+----------
+ t
+(1 row)
+=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
+      ts_headline       
+------------------------
+ &lt;b&gt;Hôtel&lt;/b&gt; de la Mer
+(1 row)
+
+</programlisting>
+  </para>
+ </sect2>
+
+ <sect2>
+ <title>Function</title>
+
+ <para>
+  The <function>unaccent</> function removes accents (diacritic signs) from
+  its argument string. Basically, it is a wrapper around the
+  <filename>unaccent</> dictionary.
+ </para>
+
+ <indexterm>
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <synopsis>
+   unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
+   </optional> <replaceable class="PARAMETER">string</replaceable>) 
+  returns <type>text</type>
+ </synopsis>  
+
+ <para>
+<programlisting>
+SELECT unaccent('unaccent','Hôtel');
+SELECT unaccent('Hôtel');
+</programlisting>
+ </para>
+ </sect2>
+
+</sect1>