From 92e05bc6a5e2c8972bd128cbb9914b4149d58709 Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Tue, 18 Aug 2009 10:34:39 +0000
Subject: [PATCH] Unaccent dictionary.

---
 contrib/Makefile                        |   3 +-
 contrib/README                          |   4 +
 contrib/unaccent/Makefile               |  24 ++
 contrib/unaccent/expected/unaccent.out  |  58 +++++
 contrib/unaccent/sql/unaccent.sql       |  19 ++
 contrib/unaccent/unaccent.c             | 318 ++++++++++++++++++++++++
 contrib/unaccent/unaccent.rules         | 187 ++++++++++++++
 contrib/unaccent/unaccent.sql.in        |  33 +++
 contrib/unaccent/uninstall_unaccent.sql |   9 +
 doc/src/sgml/contrib.sgml               |   3 +-
 doc/src/sgml/filelist.sgml              |   3 +-
 doc/src/sgml/unaccent.sgml              | 150 +++++++++++
 12 files changed, 808 insertions(+), 3 deletions(-)
 create mode 100644 contrib/unaccent/Makefile
 create mode 100644 contrib/unaccent/expected/unaccent.out
 create mode 100644 contrib/unaccent/sql/unaccent.sql
 create mode 100644 contrib/unaccent/unaccent.c
 create mode 100644 contrib/unaccent/unaccent.rules
 create mode 100644 contrib/unaccent/unaccent.sql.in
 create mode 100644 contrib/unaccent/uninstall_unaccent.sql
 create mode 100644 doc/src/sgml/unaccent.sgml

diff --git a/contrib/Makefile b/contrib/Makefile
index 85cabd8618..8543b5287f 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.88 2009/08/07 20:50:21 petere Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.89 2009/08/18 10:34:39 teodor Exp $
 
 subdir = contrib
 top_builddir = ..
@@ -39,6 +39,7 @@ SUBDIRS = \
 		tablefunc	\
 		test_parser	\
 		tsearch2	\
+		unaccent	\
 		vacuumlo
 
 ifeq ($(with_openssl),yes)
diff --git a/contrib/README b/contrib/README
index 1ae49adc70..a8396a5bfa 100644
--- a/contrib/README
+++ b/contrib/README
@@ -169,6 +169,10 @@ tsearch2 -
 	Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
 	Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
 
+unaccent -
+	Unaccent dictionary for text search
+	by Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
+
 uuid-ossp -
 	UUID generation functions
 	by Peter Eisentraut <peter_e@gmx.net>
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile
new file mode 100644
index 0000000000..91b04fc275
--- /dev/null
+++ b/contrib/unaccent/Makefile
@@ -0,0 +1,24 @@
+# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $
+
+MODULE_big = unaccent
+OBJS = unaccent.o
+
+DATA_built = unaccent.sql
+DATA = uninstall_unaccent.sql
+DATA_TSEARCH = unaccent.rules
+REGRESS = unaccent
+
+# Build through PGXS when USE_PGXS is set, otherwise inside the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Redefine REGRESS_OPTS because the regression test needs a UTF8 database.
+REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
new file mode 100644
index 0000000000..8d197c50be
--- /dev/null
+++ b/contrib/unaccent/expected/unaccent.out
@@ -0,0 +1,58 @@
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+SET client_encoding TO 'KOI8';
+SELECT unaccent('foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('Ｌ肆');
+ unaccent 
+----------
+ 盘肆
+(1 row)
+
+SELECT unaccent('出殡');
+ unaccent 
+----------
+ 弼殡
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent 
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', 'Ｌ肆');
+ unaccent 
+----------
+ 盘肆
+(1 row)
+
+SELECT unaccent('unaccent', '出殡');
+ unaccent 
+----------
+ 弼殡
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('unaccent', 'Ｌ肆');
+ ts_lexize 
+-----------
+ {盘肆}
+(1 row)
+
+SELECT ts_lexize('unaccent', '出殡');
+ ts_lexize 
+-----------
+ {弼殡}
+(1 row)
+
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
new file mode 100644
index 0000000000..71ab5bb435
--- /dev/null
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -0,0 +1,19 @@
+SET client_min_messages = warning;
+\set ECHO none
+\i unaccent.sql
+\set ECHO all
+RESET client_min_messages;
+
+SET client_encoding TO 'KOI8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('Ｌ肆');
+SELECT unaccent('出殡');
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', 'Ｌ肆');
+SELECT unaccent('unaccent', '出殡');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', 'Ｌ肆');
+SELECT ts_lexize('unaccent', '出殡');
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 0000000000..7b5086b958
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *    Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * The unaccent dictionary looks up characters to replace in an uncompressed
+ * tree keyed by the bytes of each (possibly multibyte) source character,
+ * first byte at the root.  Each node of the tree is an array of 256
+ * SuffixChar structs; the n-th element corresponds to byte value n.
+ */
+typedef struct SuffixChar {
+	struct SuffixChar	*nextChar;	/* child node for the next byte, or NULL */
+	char				*replaceTo;	/* replacement bytes (not NUL-terminated), or NULL */
+	int					replacelen;	/* length of replaceTo in bytes */
+} SuffixChar;
+
+/*
+ * placeChar - add one rule: walk (creating as needed) the nodes along the
+ * lenstr bytes of str and store a copy of replaceTo in the final entry.
+ * Duplicates keep the first rule; returns the root (new if node is NULL).
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+	SuffixChar **link = &node;	/* slot holding the node being visited */
+	SuffixChar	*entry;
+
+	for (;;)
+	{
+		if ( *link == NULL )	/* missing node: create it zero-filled */
+			*link = palloc0(sizeof(SuffixChar) * 256);
+		entry = *link + *str;
+		if ( lenstr == 1 )
+			break;
+		link = &entry->nextChar;	/* descend for the next byte */
+		str++;
+		lenstr--;
+	}
+
+	if ( entry->replaceTo )
+		elog(WARNING, "duplicate TO argument, use first one");
+	else
+	{
+		entry->replacelen = replacelen;
+		entry->replaceTo = palloc( replacelen );
+		memcpy(entry->replaceTo, replaceTo, replacelen);
+	}
+
+	return node;
+}
+
+/*
+ * initSuffixTree - build the replacement tree from a UTF8-encoded rules
+ * file, which is converted into the current encoding while being read.
+ *
+ * Every line holding two whitespace-separated tokens "source target" is
+ * added with placeChar(); other lines are ignored.  Returns the root of the
+ * tree, or NULL if the file contains no usable rule.
+ */
+static SuffixChar*
+initSuffixTree(char *filename)
+{
+	/* volatile: modified inside PG_TRY and read after a longjmp to PG_CATCH */
+	SuffixChar *volatile rootSuffixTree = NULL;
+	volatile bool	skip;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state	trst;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do
+	{
+		char	src[4096];
+		char	trg[4096];
+		char   *line = NULL;
+
+		/* cleared only when this pass reads through to end of file */
+		skip = true;
+
+		PG_TRY();
+		{
+			/*
+			 * tsearch_readline() throws ERRCODE_UNTRANSLATABLE_CHARACTER when
+			 * a line cannot be converted to the current encoding; such lines
+			 * are skipped in PG_CATCH and reading resumes with the next one.
+			 */
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/* bounded %s: an over-long token must not overrun src/trg */
+				if ( sscanf(line, "%4095s\t%4095s\n", src, trg) == 2 )
+					rootSuffixTree = placeChar(rootSuffixTree,
+											(unsigned char*)src, strlen(src),
+											trg, strlen(trg));
+				pfree(line);	/* also frees lines that held no rule */
+			}
+
+			/* whole file consumed without error: leave the outer loop */
+			skip = false;
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+				FlushErrorState();
+			else
+			{
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while(skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - look up the srclen bytes at src in the tree rooted at node.
+ * Returns the entry reached by the last byte (its replaceTo may be NULL),
+ * or NULL when the path runs out of nodes first.
+ */
+static SuffixChar *
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+	SuffixChar *entry;
+
+	for ( ; node != NULL; node = entry->nextChar, src++, srclen-- )
+	{
+		entry = &node[*src];
+		if ( srclen == 1 )
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * unaccent_init - dictionary INIT method.  Requires exactly one "Rules"
+ * option (case-insensitive name); returns the tree built from that file.
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum       unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List       *dictoptions = (List *) PG_GETARG_POINTER(0);
+	SuffixChar *rootSuffixTree = NULL;	/* set once the Rules option is seen */
+	bool        fileloaded = false;
+	ListCell   *l;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp("Rules", defel->defname) == 0)
+		{
+			if (fileloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Rules parameters")));
+			rootSuffixTree = initSuffixTree(defGetString(defel));
+			fileloaded = true;
+		}
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+	}
+
+	if (!fileloaded)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+
+	PG_RETURN_POINTER(rootSuffixTree);
+}
+
+/*
+ * unaccent_lexize - dictionary LEXIZE method: (tree from unaccent_init, input
+ * bytes, input length).  Returns NULL if no character of the input has a rule,
+ * else a TSL_FILTER lexeme with every matching character replaced.
+ */
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum       unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+	char       *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart = srcchar, *piece;
+	int			charlen, piecelen, bufsize = 0, used = 0;
+	TSLexeme   *res = NULL;
+	SuffixChar *node;
+
+	for ( ; srcchar - srcstart < len; srcchar += charlen )
+	{
+		piece = srcchar;		/* default: copy the character unchanged */
+		piecelen = charlen = pg_mblen(srcchar);
+		node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+		if ( node  && node->replaceTo )
+		{
+			piece = node->replaceTo;
+			piecelen = node->replacelen;
+		}
+		else if ( !res )
+			continue;			/* nothing replaced so far: no output yet */
+		if ( !res )
+		{
+			/* allocate res only if it's needed: bufsize bytes, prefix copied as-is */
+			res = palloc0(sizeof(TSLexeme) * 2);
+			res->flags = TSL_FILTER;
+			used = srcchar - srcstart;
+			bufsize = len + piecelen + 1 /* \0 */;
+			res->lexeme = palloc( bufsize );
+			memcpy(res->lexeme, srcstart, used);
+		}
+		while ( used + piecelen + 1 > bufsize )	/* a rule's target may exceed its source */
+			res->lexeme = repalloc(res->lexeme, bufsize *= 2);
+		memcpy(res->lexeme + used, piece, piecelen);
+		used += piecelen;
+	}
+	if ( res )
+		res->lexeme[used] = '\0';
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * unaccent_dict - SQL-callable wrapper around an unaccent dictionary.
+ *
+ * Called either as (text), which uses the dictionary named "unaccent", or
+ * as (dictionary OID, text).  Runs the dictionary's lexize function on the
+ * text and returns its output, or a copy of the input if it produced none.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum       unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	/* the text is always the last argument: position 0 or 1 */
+	int		strArg = (PG_NARGS() == 1) ? 0 : 1;
+	Oid		dictOid;
+	text	*input;
+	text	*output;
+	TSDictionaryCacheEntry	*entry;
+	TSLexeme *lexemes;
+
+	/* without an explicit dictionary argument, look up "unaccent" by name */
+	if (strArg == 0)
+		dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+	else
+		dictOid = PG_GETARG_OID(0);
+
+	input = PG_GETARG_TEXT_P(strArg);
+
+	entry = lookup_ts_dictionary_cache(dictOid);
+
+	/* call lexize(dictData, text bytes, byte length, NULL) */
+	lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(entry->lexize),
+													 PointerGetDatum(entry->dictData),
+													 PointerGetDatum(VARDATA(input)),
+													 Int32GetDatum(VARSIZE(input) - VARHDRSZ),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(input, strArg);
+
+	/* no lexeme produced: hand back a fresh copy of the argument */
+	if ( lexemes == NULL || lexemes->lexeme == NULL )
+	{
+		if ( lexemes != NULL )
+			pfree(lexemes);
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+
+	/* convert the NUL-terminated lexeme to text, then release the lexize result */
+	output = cstring_to_text(lexemes->lexeme);
+
+	pfree(lexemes->lexeme);
+	pfree(lexemes);
+
+	PG_RETURN_TEXT_P(output);
+}
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
new file mode 100644
index 0000000000..cc2f7a6585
--- /dev/null
+++ b/contrib/unaccent/unaccent.rules
@@ -0,0 +1,187 @@
+脌	A
+脕	A
+脗	A
+脙	A
+脛	A
+脜	A
+脝	A
+脿	a
+谩	a
+芒	a
+茫	a
+盲	a
+氓	a
+忙	a
+膧	A
+膩	a
+膫	A
+膬	a
+膭	A
+膮	a
+脟	C
+莽	c
+膯	C
+膰	c
+膱	C
+膲	c
+膴	C
+膵	c
+膶	C
+膷	c
+膸	D
+膹	d
+膼	D
+膽	d
+脠	E
+脡	E
+脢	E
+脣	E
+猫	e
+茅	e
+锚	e
+毛	e
+膾	E
+膿	e
+臄	E
+臅	e
+臇	E
+臈	e
+臉	E
+臋	e
+臍	E
+臎	e
+臏	G
+臐	g
+臑	G
+臒	g
+臓	G
+摹	g
+蘑	G
+模	g
+膜	H
+磨	h
+摩	H
+魔	h
+抹	I
+脤	I
+脥	I
+脦	I
+脧	I
+矛	i
+铆	i
+卯	i
+茂	i
+末	i
+莫	I
+墨	i
+默	I
+沫	i
+漠	I
+寞	i
+陌	I
+谋	i
+牟	I
+某	i
+拇	J
+牡	j
+亩	K
+姆	k
+母	k
+墓	L
+暮	l
+幕	L
+募	l
+慕	L
+木	l
+目	L
+艀	l
+艁	L
+艂	l
+脩	N
+帽	n
+艃	N
+艅	n
+艆	N
+艈	n
+艊	N
+艌	n
+艍	n
+艎	N
+艐	n
+脪	O
+脫	O
+脭	O
+脮	O
+脰	O
+貌	o
+贸	o
+么	o
+玫	o
+枚	o
+艑	O
+艒	o
+艓	O
+艔	o
+艕	O
+艖	o
+艗	E
+艙	e
+脴	O
+酶	o
+艛	R
+艜	r
+艝	R
+艞	r
+艠	R
+艡	r
+脽	S
+艢	S
+艣	s
+艤	S
+艥	s
+艦	S
+艧	s
+艩	S
+拧	s
+泞	T
+牛	t
+扭	T
+钮	t
+纽	T
+脓	t
+脵	U
+脷	U
+脹	U
+脺	U
+霉	u
+煤	u
+没	u
+眉	u
+浓	U
+农	u
+弄	U
+奴	u
+努	U
+怒	u
+女	U
+暖	u
+虐	U
+疟	u
+挪	U
+懦	u
+糯	W
+诺	w
+脻	Y
+媒	y
+每	y
+哦	Y
+欧	y
+鸥	Y
+殴	Z
+藕	z
+呕	Z
+偶	z
+沤	Z
+啪	z
+褢	械
+衼	袝
diff --git a/contrib/unaccent/unaccent.sql.in b/contrib/unaccent/unaccent.sql.in
new file mode 100644
index 0000000000..ba981398fa
--- /dev/null
+++ b/contrib/unaccent/unaccent.sql.in
@@ -0,0 +1,33 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- unaccent() is STABLE, not IMMUTABLE: its result changes with the dictionary's rules.
+CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C RETURNS NULL ON NULL INPUT STABLE;
+
+CREATE OR REPLACE FUNCTION unaccent(text)
+	RETURNS text
+	AS 'MODULE_PATHNAME', 'unaccent_dict'
+	LANGUAGE C RETURNS NULL ON NULL INPUT STABLE;
+
+CREATE OR REPLACE FUNCTION unaccent_init(internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_init'
+	LANGUAGE C;
+
+CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME', 'unaccent_lexize'
+	LANGUAGE C;
+
+CREATE TEXT SEARCH TEMPLATE unaccent (
+	INIT = unaccent_init,
+	LEXIZE = unaccent_lexize
+);
+
+CREATE TEXT SEARCH DICTIONARY unaccent (
+	TEMPLATE = unaccent,
+	RULES    = 'unaccent'
+);
+
diff --git a/contrib/unaccent/uninstall_unaccent.sql b/contrib/unaccent/uninstall_unaccent.sql
new file mode 100644
index 0000000000..89e3627fc8
--- /dev/null
+++ b/contrib/unaccent/uninstall_unaccent.sql
@@ -0,0 +1,9 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
+DROP FUNCTION IF EXISTS unaccent(text) CASCADE;
+DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
+DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;
+DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
+DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE;
+
diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml
index 0ef92b4896..cffbc55249 100644
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.13 2009/04/27 16:27:35 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.14 2009/08/18 10:34:39 teodor Exp $ -->
 
 <appendix id="contrib">
  <title>Additional Supplied Modules</title>
@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
  &tablefunc;
  &test-parser;
  &tsearch2;
+ &unaccent;
  &uuid-ossp;
  &vacuumlo;
  &xml2;
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index 7e194f7bcc..bee66008b6 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.63 2009/08/17 22:14:44 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.64 2009/08/18 10:34:39 teodor Exp $ -->
 
 <!entity history    SYSTEM "history.sgml">
 <!entity info       SYSTEM "info.sgml">
@@ -126,6 +126,7 @@
 <!entity tablefunc       SYSTEM "tablefunc.sgml">
 <!entity test-parser     SYSTEM "test-parser.sgml">
 <!entity tsearch2        SYSTEM "tsearch2.sgml">
+<!entity unaccent        SYSTEM "unaccent.sgml">
 <!entity uuid-ossp       SYSTEM "uuid-ossp.sgml">
 <!entity vacuumlo        SYSTEM "vacuumlo.sgml">
 <!entity xml2            SYSTEM "xml2.sgml"> 
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml
new file mode 100644
index 0000000000..b3c7bbee48
--- /dev/null
+++ b/doc/src/sgml/unaccent.sgml
@@ -0,0 +1,150 @@
+<sect1 id="unaccent">
+ <title>unaccent</title>
+
+ <indexterm zone="unaccent">
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <para>
+  <filename>unaccent</> removes accents (diacritic signs) from a lexeme.
+  It's a filtering dictionary, which means its output is
+  always passed to the next dictionary (if any), contrary to the standard
+  behaviour. Currently, it supports the most important accents from European
+  languages.
+ </para>
+
+ <para>
+  Limitation: the current implementation of the <filename>unaccent</>
+  dictionary cannot be used as a normalizing dictionary for the
+  <filename>thesaurus</filename> dictionary.
+ </para>
+ 
+ <sect2>
+  <title>Configuration</title>
+
+  <para>
+   An <literal>unaccent</> dictionary accepts the following options:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     <literal>RULES</> is the base name of the file containing the list of
+     translation rules.  This file must be stored in
+     <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
+     the <productname>PostgreSQL</> installation's shared-data directory).
+     Its name must end in <literal>.rules</> (which is not to be included in
+     the <literal>RULES</> parameter).
+    </para>
+   </listitem>
+  </itemizedlist>
+  <para>
+   The rules file has the following format:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     Each line represents a pair: character_with_accent  character_without_accent
+    <programlisting>
+&Agrave;	A
+&Aacute; 	A
+&Acirc; 	A
+&Atilde;	A
+&Auml;  	A
+&Aring;		A
+&AElig; 	A
+    </programlisting>
+    </para>
+   </listitem>
+  </itemizedlist>
+
+  <para>
+   Look at <filename>unaccent.rules</>, which is installed in
+   <filename>$SHAREDIR/tsearch_data/</>, for an example.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Usage</title>
+
+  <para>
+   Running the installation script creates a text search template
+   <literal>unaccent</> and a dictionary <literal>unaccent</>
+   based on it, with default parameters.  You can alter the
+   parameters, for example
+
+<programlisting>
+=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
+</programlisting>
+
+   or create new dictionaries based on the template.
+  </para>
+
+  <para>
+   To test the dictionary, you can try
+
+<programlisting>
+=# select ts_lexize('unaccent','H&ocirc;tel');
+ ts_lexize 
+-----------
+ {Hotel}
+(1 row)
+</programlisting>
+  </para>
+  
+  <para>
+  Filtering dictionaries are useful for the correct operation of the
+  <function>ts_headline</function> function.
+<programlisting>
+=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
+=# ALTER TEXT SEARCH CONFIGURATION fr
+	ALTER MAPPING FOR hword, hword_part, word
+	WITH unaccent, french_stem;
+=# select to_tsvector('fr','H&ocirc;tels de la Mer');
+    to_tsvector    
+-------------------
+ 'hotel':1 'mer':4
+(1 row)
+
+=# select to_tsvector('fr','H&ocirc;tel de la Mer') @@ to_tsquery('fr','Hotels');
+ ?column? 
+----------
+ t
+(1 row)
+=# select ts_headline('fr','H&ocirc;tel de la Mer',to_tsquery('fr','Hotels'));
+      ts_headline       
+------------------------
+ &lt;b&gt;H&ocirc;tel&lt;/b&gt; de la Mer
+(1 row)
+
+</programlisting>
+  </para>
+ </sect2>
+
+ <sect2>
+ <title>Function</title>
+
+ <para>
+  The <function>unaccent</> function removes accents (diacritic signs) from
+  its argument string. Basically, it's a wrapper around the
+  <filename>unaccent</> dictionary.
+ </para>
+
+ <indexterm>
+  <primary>unaccent</primary>
+ </indexterm>
+
+ <synopsis>
+   unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
+   </optional> <replaceable class="PARAMETER">string</replaceable>) 
+  returns <type>text</type>
+ </synopsis>  
+
+ <para>
+<programlisting>
+SELECT unaccent('unaccent','H&ocirc;tel');
+SELECT unaccent('H&ocirc;tel');
+</programlisting>
+ </para>
+ </sect2>
+
+</sect1>