Snowball multibyte. It's a pity, but the Snowball sources are very different for multibyte and

single-byte encodings, so we need a separate Snowball stemmer for every encoding.

I hope this finalizes the multibyte support work in tsearch2, but testing is needed...
This commit is contained in:
Teodor Sigaev 2006-01-27 16:32:31 +00:00
parent 75c4747156
commit 5e2707c45f
8 changed files with 974 additions and 27 deletions

View File

@ -1,4 +1,4 @@
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.13 2006/01/27 16:32:31 teodor Exp $
MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS)
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
DATA = stopword/english.stop stopword/russian.stop
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
DATA_built = tsearch2.sql untsearch2.sql
DOCS = README.tsearch2
REGRESS = tsearch2

View File

@ -10,6 +10,7 @@
#include "snowball/header.h"
#include "snowball/english_stem.h"
#include "snowball/russian_stem.h"
#include "snowball/russian_stem_UTF8.h"
#include "ts_locale.h"
typedef struct
@ -23,8 +24,11 @@ typedef struct
PG_FUNCTION_INFO_V1(snb_en_init);
Datum snb_en_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_ru_init);
Datum snb_ru_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_ru_init_koi8);
Datum snb_ru_init_koi8(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_ru_init_utf8);
Datum snb_ru_init_utf8(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_lexize);
Datum snb_lexize(PG_FUNCTION_ARGS);
@ -64,7 +68,7 @@ snb_en_init(PG_FUNCTION_ARGS)
}
Datum
snb_ru_init(PG_FUNCTION_ARGS)
snb_ru_init_koi8(PG_FUNCTION_ARGS)
{
DictSnowball *d = (DictSnowball *) malloc(sizeof(DictSnowball));
@ -97,6 +101,40 @@ snb_ru_init(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(d);
}
/*
 * snb_ru_init_utf8
 *		Build the Snowball Russian dictionary that uses the UTF-8 stemmer.
 *
 * Argument 0 is optional; when it is non-NULL it is passed as a text value
 * to readstoplist() to load the stop-word list, which is then sorted.
 *
 * Returns a pointer to a malloc'ed, zero-initialised DictSnowball whose
 * environment comes from russian_UTF_8_create_env() and whose stem callback
 * is russian_UTF_8_stem.  Raises ERRCODE_OUT_OF_MEMORY if either the
 * structure or the stemmer environment cannot be allocated.
 */
Datum
snb_ru_init_utf8(PG_FUNCTION_ARGS)
{
	DictSnowball *d = (DictSnowball *) malloc(sizeof(DictSnowball));

	if (!d)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	memset(d, 0, sizeof(DictSnowball));
	d->stoplist.wordop = lowerstr;

	if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL)
	{
		text	   *in = PG_GETARG_TEXT_P(0);

		readstoplist(in, &(d->stoplist));
		sortstoplist(&(d->stoplist));
		PG_FREE_IF_COPY(in, 0);
	}

	d->z = russian_UTF_8_create_env();
	if (!d->z)
	{
		freestoplist(&(d->stoplist));

		/*
		 * d was obtained with malloc() above and is not handed back to the
		 * caller on this path, so release it here before raising the error.
		 */
		free(d);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}
	d->stem = russian_UTF_8_stem;

	PG_RETURN_POINTER(d);
}
Datum
snb_lexize(PG_FUNCTION_ARGS)
{

View File

@ -4,21 +4,21 @@
--
\set ECHO none
psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
psql:tsearch2.sql:145: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:244: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:251: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:337: NOTICE: type "tsvector" is not yet defined
psql:tsearch2.sql:158: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:257: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:264: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:370: NOTICE: type "tsvector" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined
psql:tsearch2.sql:375: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:429: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:559: NOTICE: type "gtsvector" is not yet defined
psql:tsearch2.sql:434: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:592: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:564: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1054: NOTICE: type "gtsq" is not yet defined
psql:tsearch2.sql:597: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1087: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:1059: NOTICE: argument type gtsq is only a shell
psql:tsearch2.sql:1092: NOTICE: argument type gtsq is only a shell
--tsvector
SELECT '1'::tsvector;
tsvector

View File

@ -1,6 +1,6 @@
# $PostgreSQL: pgsql/contrib/tsearch2/snowball/Makefile,v 1.8 2005/10/18 01:30:48 tgl Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/snowball/Makefile,v 1.9 2006/01/27 16:32:31 teodor Exp $
SUBOBJS = english_stem.o api.o russian_stem.o utilities.o
SUBOBJS = english_stem.o api.o russian_stem.o russian_stem_UTF8.o utilities.o
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)

View File

@ -0,0 +1,709 @@
/* This file was generated automatically by the Snowball to ANSI C compiler */
#include "header.h"
/*
 * Forward declarations.  The three russian_UTF_8_* functions are the
 * externally visible entry points (they are also declared in
 * russian_stem_UTF8.h); the r_* routines are file-local helpers that
 * russian_UTF_8_stem() calls, directly or through one another.
 */
extern int russian_UTF_8_stem(struct SN_env * z);
static int r_tidy_up(struct SN_env * z);
static int r_derivational(struct SN_env * z);
static int r_noun(struct SN_env * z);
static int r_verb(struct SN_env * z);
static int r_reflexive(struct SN_env * z);
static int r_adjectival(struct SN_env * z);
static int r_adjective(struct SN_env * z);
static int r_perfective_gerund(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
extern struct SN_env * russian_UTF_8_create_env(void);
extern void russian_UTF_8_close_env(struct SN_env * z);
/*
 * a_0: suffix table searched by r_perfective_gerund() through
 * find_among_b(z, a_0, 9).
 *
 * Each s_0_N array holds the UTF-8 bytes (two per Cyrillic letter) of one
 * suffix, in order: "вшись", "ывшись", "ившись", "в", "ыв", "ив", "вши",
 * "ывши", "ивши".
 *
 * In every entry the first field equals the byte length of the string.
 * NOTE(review): the third field looks like the index of a shorter entry that
 * is a suffix of this one (-1 if none) and the fourth like the code the
 * caller switches on; confirm against struct among in header.h.
 */
static symbol s_0_0[10] = { 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8, 0xD1, 0x81, 0xD1, 0x8C };
static symbol s_0_1[12] = { 0xD1, 0x8B, 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8, 0xD1, 0x81, 0xD1, 0x8C };
static symbol s_0_2[12] = { 0xD0, 0xB8, 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8, 0xD1, 0x81, 0xD1, 0x8C };
static symbol s_0_3[2] = { 0xD0, 0xB2 };
static symbol s_0_4[4] = { 0xD1, 0x8B, 0xD0, 0xB2 };
static symbol s_0_5[4] = { 0xD0, 0xB8, 0xD0, 0xB2 };
static symbol s_0_6[6] = { 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8 };
static symbol s_0_7[8] = { 0xD1, 0x8B, 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8 };
static symbol s_0_8[8] = { 0xD0, 0xB8, 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8 };
static struct among a_0[9] =
{
/* 0 */ { 10, s_0_0, -1, 1, 0},
/* 1 */ { 12, s_0_1, 0, 2, 0},
/* 2 */ { 12, s_0_2, 0, 2, 0},
/* 3 */ { 2, s_0_3, -1, 1, 0},
/* 4 */ { 4, s_0_4, 3, 2, 0},
/* 5 */ { 4, s_0_5, 3, 2, 0},
/* 6 */ { 6, s_0_6, -1, 1, 0},
/* 7 */ { 8, s_0_7, 6, 2, 0},
/* 8 */ { 8, s_0_8, 6, 2, 0}
};
/*
 * a_1: suffix table searched by r_adjective() through
 * find_among_b(z, a_1, 26).
 *
 * Each s_1_N array holds the UTF-8 bytes (two per Cyrillic letter) of one
 * suffix; the first field of each table entry is that byte length.  Every
 * entry carries -1 in the third field and 1 in the fourth, so
 * r_adjective() only ever has to handle the value 1.
 */
static symbol s_1_0[6] = { 0xD0, 0xB5, 0xD0, 0xBC, 0xD1, 0x83 };
static symbol s_1_1[6] = { 0xD0, 0xBE, 0xD0, 0xBC, 0xD1, 0x83 };
static symbol s_1_2[4] = { 0xD1, 0x8B, 0xD1, 0x85 };
static symbol s_1_3[4] = { 0xD0, 0xB8, 0xD1, 0x85 };
static symbol s_1_4[4] = { 0xD1, 0x83, 0xD1, 0x8E };
static symbol s_1_5[4] = { 0xD1, 0x8E, 0xD1, 0x8E };
static symbol s_1_6[4] = { 0xD0, 0xB5, 0xD1, 0x8E };
static symbol s_1_7[4] = { 0xD0, 0xBE, 0xD1, 0x8E };
static symbol s_1_8[4] = { 0xD1, 0x8F, 0xD1, 0x8F };
static symbol s_1_9[4] = { 0xD0, 0xB0, 0xD1, 0x8F };
static symbol s_1_10[4] = { 0xD1, 0x8B, 0xD0, 0xB5 };
static symbol s_1_11[4] = { 0xD0, 0xB5, 0xD0, 0xB5 };
static symbol s_1_12[4] = { 0xD0, 0xB8, 0xD0, 0xB5 };
static symbol s_1_13[4] = { 0xD0, 0xBE, 0xD0, 0xB5 };
static symbol s_1_14[6] = { 0xD1, 0x8B, 0xD0, 0xBC, 0xD0, 0xB8 };
static symbol s_1_15[6] = { 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB8 };
static symbol s_1_16[4] = { 0xD1, 0x8B, 0xD0, 0xB9 };
static symbol s_1_17[4] = { 0xD0, 0xB5, 0xD0, 0xB9 };
static symbol s_1_18[4] = { 0xD0, 0xB8, 0xD0, 0xB9 };
static symbol s_1_19[4] = { 0xD0, 0xBE, 0xD0, 0xB9 };
static symbol s_1_20[4] = { 0xD1, 0x8B, 0xD0, 0xBC };
static symbol s_1_21[4] = { 0xD0, 0xB5, 0xD0, 0xBC };
static symbol s_1_22[4] = { 0xD0, 0xB8, 0xD0, 0xBC };
static symbol s_1_23[4] = { 0xD0, 0xBE, 0xD0, 0xBC };
static symbol s_1_24[6] = { 0xD0, 0xB5, 0xD0, 0xB3, 0xD0, 0xBE };
static symbol s_1_25[6] = { 0xD0, 0xBE, 0xD0, 0xB3, 0xD0, 0xBE };
static struct among a_1[26] =
{
/* 0 */ { 6, s_1_0, -1, 1, 0},
/* 1 */ { 6, s_1_1, -1, 1, 0},
/* 2 */ { 4, s_1_2, -1, 1, 0},
/* 3 */ { 4, s_1_3, -1, 1, 0},
/* 4 */ { 4, s_1_4, -1, 1, 0},
/* 5 */ { 4, s_1_5, -1, 1, 0},
/* 6 */ { 4, s_1_6, -1, 1, 0},
/* 7 */ { 4, s_1_7, -1, 1, 0},
/* 8 */ { 4, s_1_8, -1, 1, 0},
/* 9 */ { 4, s_1_9, -1, 1, 0},
/* 10 */ { 4, s_1_10, -1, 1, 0},
/* 11 */ { 4, s_1_11, -1, 1, 0},
/* 12 */ { 4, s_1_12, -1, 1, 0},
/* 13 */ { 4, s_1_13, -1, 1, 0},
/* 14 */ { 6, s_1_14, -1, 1, 0},
/* 15 */ { 6, s_1_15, -1, 1, 0},
/* 16 */ { 4, s_1_16, -1, 1, 0},
/* 17 */ { 4, s_1_17, -1, 1, 0},
/* 18 */ { 4, s_1_18, -1, 1, 0},
/* 19 */ { 4, s_1_19, -1, 1, 0},
/* 20 */ { 4, s_1_20, -1, 1, 0},
/* 21 */ { 4, s_1_21, -1, 1, 0},
/* 22 */ { 4, s_1_22, -1, 1, 0},
/* 23 */ { 4, s_1_23, -1, 1, 0},
/* 24 */ { 6, s_1_24, -1, 1, 0},
/* 25 */ { 6, s_1_25, -1, 1, 0}
};
/*
 * a_2: suffix table searched by r_adjectival() through
 * find_among_b(z, a_2, 8), after r_adjective() has succeeded.
 *
 * The s_2_N arrays are the UTF-8 bytes of, in order: "вш", "ывш", "ивш",
 * "щ", "ющ", "ующ", "ем", "нн".  The fourth field of each entry is 1 or 2,
 * the two values r_adjectival() switches on.
 */
static symbol s_2_0[4] = { 0xD0, 0xB2, 0xD1, 0x88 };
static symbol s_2_1[6] = { 0xD1, 0x8B, 0xD0, 0xB2, 0xD1, 0x88 };
static symbol s_2_2[6] = { 0xD0, 0xB8, 0xD0, 0xB2, 0xD1, 0x88 };
static symbol s_2_3[2] = { 0xD1, 0x89 };
static symbol s_2_4[4] = { 0xD1, 0x8E, 0xD1, 0x89 };
static symbol s_2_5[6] = { 0xD1, 0x83, 0xD1, 0x8E, 0xD1, 0x89 };
static symbol s_2_6[4] = { 0xD0, 0xB5, 0xD0, 0xBC };
static symbol s_2_7[4] = { 0xD0, 0xBD, 0xD0, 0xBD };
static struct among a_2[8] =
{
/* 0 */ { 4, s_2_0, -1, 1, 0},
/* 1 */ { 6, s_2_1, 0, 2, 0},
/* 2 */ { 6, s_2_2, 0, 2, 0},
/* 3 */ { 2, s_2_3, -1, 1, 0},
/* 4 */ { 4, s_2_4, 3, 1, 0},
/* 5 */ { 6, s_2_5, 4, 2, 0},
/* 6 */ { 4, s_2_6, -1, 1, 0},
/* 7 */ { 4, s_2_7, -1, 1, 0}
};
/*
 * a_3: suffix table searched by r_reflexive() through
 * find_among_b(z, a_3, 2).  The two strings are the UTF-8 bytes of "сь"
 * and "ся"; both entries carry the value 1 in the fourth field.
 */
static symbol s_3_0[4] = { 0xD1, 0x81, 0xD1, 0x8C };
static symbol s_3_1[4] = { 0xD1, 0x81, 0xD1, 0x8F };
static struct among a_3[2] =
{
/* 0 */ { 4, s_3_0, -1, 1, 0},
/* 1 */ { 4, s_3_1, -1, 1, 0}
};
/*
 * a_4: suffix table searched by r_verb() through
 * find_among_b(z, a_4, 46).
 *
 * Each s_4_N array holds the UTF-8 bytes (two per Cyrillic letter) of one
 * suffix; the first field of each table entry is that byte length.  The
 * fourth field is 1 or 2, the two values r_verb() switches on.
 * NOTE(review): the third field looks like the index of a shorter entry that
 * is a suffix of this one (-1 if none); confirm against struct among in
 * header.h.
 */
static symbol s_4_0[4] = { 0xD1, 0x8B, 0xD1, 0x82 };
static symbol s_4_1[4] = { 0xD1, 0x8E, 0xD1, 0x82 };
static symbol s_4_2[6] = { 0xD1, 0x83, 0xD1, 0x8E, 0xD1, 0x82 };
static symbol s_4_3[4] = { 0xD1, 0x8F, 0xD1, 0x82 };
static symbol s_4_4[4] = { 0xD0, 0xB5, 0xD1, 0x82 };
static symbol s_4_5[6] = { 0xD1, 0x83, 0xD0, 0xB5, 0xD1, 0x82 };
static symbol s_4_6[4] = { 0xD0, 0xB8, 0xD1, 0x82 };
static symbol s_4_7[4] = { 0xD0, 0xBD, 0xD1, 0x8B };
static symbol s_4_8[6] = { 0xD0, 0xB5, 0xD0, 0xBD, 0xD1, 0x8B };
static symbol s_4_9[4] = { 0xD1, 0x82, 0xD1, 0x8C };
static symbol s_4_10[6] = { 0xD1, 0x8B, 0xD1, 0x82, 0xD1, 0x8C };
static symbol s_4_11[6] = { 0xD0, 0xB8, 0xD1, 0x82, 0xD1, 0x8C };
static symbol s_4_12[6] = { 0xD0, 0xB5, 0xD1, 0x88, 0xD1, 0x8C };
static symbol s_4_13[6] = { 0xD0, 0xB8, 0xD1, 0x88, 0xD1, 0x8C };
static symbol s_4_14[2] = { 0xD1, 0x8E };
static symbol s_4_15[4] = { 0xD1, 0x83, 0xD1, 0x8E };
static symbol s_4_16[4] = { 0xD0, 0xBB, 0xD0, 0xB0 };
static symbol s_4_17[6] = { 0xD1, 0x8B, 0xD0, 0xBB, 0xD0, 0xB0 };
static symbol s_4_18[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD0, 0xB0 };
static symbol s_4_19[4] = { 0xD0, 0xBD, 0xD0, 0xB0 };
static symbol s_4_20[6] = { 0xD0, 0xB5, 0xD0, 0xBD, 0xD0, 0xB0 };
static symbol s_4_21[6] = { 0xD0, 0xB5, 0xD1, 0x82, 0xD0, 0xB5 };
static symbol s_4_22[6] = { 0xD0, 0xB8, 0xD1, 0x82, 0xD0, 0xB5 };
static symbol s_4_23[6] = { 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5 };
static symbol s_4_24[8] = { 0xD1, 0x83, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5 };
static symbol s_4_25[8] = { 0xD0, 0xB5, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5 };
static symbol s_4_26[4] = { 0xD0, 0xBB, 0xD0, 0xB8 };
static symbol s_4_27[6] = { 0xD1, 0x8B, 0xD0, 0xBB, 0xD0, 0xB8 };
static symbol s_4_28[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD0, 0xB8 };
static symbol s_4_29[2] = { 0xD0, 0xB9 };
static symbol s_4_30[4] = { 0xD1, 0x83, 0xD0, 0xB9 };
static symbol s_4_31[4] = { 0xD0, 0xB5, 0xD0, 0xB9 };
static symbol s_4_32[2] = { 0xD0, 0xBB };
static symbol s_4_33[4] = { 0xD1, 0x8B, 0xD0, 0xBB };
static symbol s_4_34[4] = { 0xD0, 0xB8, 0xD0, 0xBB };
static symbol s_4_35[4] = { 0xD1, 0x8B, 0xD0, 0xBC };
static symbol s_4_36[4] = { 0xD0, 0xB5, 0xD0, 0xBC };
static symbol s_4_37[4] = { 0xD0, 0xB8, 0xD0, 0xBC };
static symbol s_4_38[2] = { 0xD0, 0xBD };
static symbol s_4_39[4] = { 0xD0, 0xB5, 0xD0, 0xBD };
static symbol s_4_40[4] = { 0xD0, 0xBB, 0xD0, 0xBE };
static symbol s_4_41[6] = { 0xD1, 0x8B, 0xD0, 0xBB, 0xD0, 0xBE };
static symbol s_4_42[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD0, 0xBE };
static symbol s_4_43[4] = { 0xD0, 0xBD, 0xD0, 0xBE };
static symbol s_4_44[6] = { 0xD0, 0xB5, 0xD0, 0xBD, 0xD0, 0xBE };
static symbol s_4_45[6] = { 0xD0, 0xBD, 0xD0, 0xBD, 0xD0, 0xBE };
static struct among a_4[46] =
{
/* 0 */ { 4, s_4_0, -1, 2, 0},
/* 1 */ { 4, s_4_1, -1, 1, 0},
/* 2 */ { 6, s_4_2, 1, 2, 0},
/* 3 */ { 4, s_4_3, -1, 2, 0},
/* 4 */ { 4, s_4_4, -1, 1, 0},
/* 5 */ { 6, s_4_5, 4, 2, 0},
/* 6 */ { 4, s_4_6, -1, 2, 0},
/* 7 */ { 4, s_4_7, -1, 1, 0},
/* 8 */ { 6, s_4_8, 7, 2, 0},
/* 9 */ { 4, s_4_9, -1, 1, 0},
/* 10 */ { 6, s_4_10, 9, 2, 0},
/* 11 */ { 6, s_4_11, 9, 2, 0},
/* 12 */ { 6, s_4_12, -1, 1, 0},
/* 13 */ { 6, s_4_13, -1, 2, 0},
/* 14 */ { 2, s_4_14, -1, 2, 0},
/* 15 */ { 4, s_4_15, 14, 2, 0},
/* 16 */ { 4, s_4_16, -1, 1, 0},
/* 17 */ { 6, s_4_17, 16, 2, 0},
/* 18 */ { 6, s_4_18, 16, 2, 0},
/* 19 */ { 4, s_4_19, -1, 1, 0},
/* 20 */ { 6, s_4_20, 19, 2, 0},
/* 21 */ { 6, s_4_21, -1, 1, 0},
/* 22 */ { 6, s_4_22, -1, 2, 0},
/* 23 */ { 6, s_4_23, -1, 1, 0},
/* 24 */ { 8, s_4_24, 23, 2, 0},
/* 25 */ { 8, s_4_25, 23, 2, 0},
/* 26 */ { 4, s_4_26, -1, 1, 0},
/* 27 */ { 6, s_4_27, 26, 2, 0},
/* 28 */ { 6, s_4_28, 26, 2, 0},
/* 29 */ { 2, s_4_29, -1, 1, 0},
/* 30 */ { 4, s_4_30, 29, 2, 0},
/* 31 */ { 4, s_4_31, 29, 2, 0},
/* 32 */ { 2, s_4_32, -1, 1, 0},
/* 33 */ { 4, s_4_33, 32, 2, 0},
/* 34 */ { 4, s_4_34, 32, 2, 0},
/* 35 */ { 4, s_4_35, -1, 2, 0},
/* 36 */ { 4, s_4_36, -1, 1, 0},
/* 37 */ { 4, s_4_37, -1, 2, 0},
/* 38 */ { 2, s_4_38, -1, 1, 0},
/* 39 */ { 4, s_4_39, 38, 2, 0},
/* 40 */ { 4, s_4_40, -1, 1, 0},
/* 41 */ { 6, s_4_41, 40, 2, 0},
/* 42 */ { 6, s_4_42, 40, 2, 0},
/* 43 */ { 4, s_4_43, -1, 1, 0},
/* 44 */ { 6, s_4_44, 43, 2, 0},
/* 45 */ { 6, s_4_45, 43, 1, 0}
};
/*
 * a_5: suffix table searched by r_noun() through
 * find_among_b(z, a_5, 36).
 *
 * Each s_5_N array holds the UTF-8 bytes (two per Cyrillic letter) of one
 * suffix; the first field of each table entry is that byte length.  Every
 * entry carries 1 in the fourth field, so r_noun() only ever has to handle
 * the value 1.
 */
static symbol s_5_0[2] = { 0xD1, 0x83 };
static symbol s_5_1[4] = { 0xD1, 0x8F, 0xD1, 0x85 };
static symbol s_5_2[6] = { 0xD0, 0xB8, 0xD1, 0x8F, 0xD1, 0x85 };
static symbol s_5_3[4] = { 0xD0, 0xB0, 0xD1, 0x85 };
static symbol s_5_4[2] = { 0xD1, 0x8B };
static symbol s_5_5[2] = { 0xD1, 0x8C };
static symbol s_5_6[2] = { 0xD1, 0x8E };
static symbol s_5_7[4] = { 0xD1, 0x8C, 0xD1, 0x8E };
static symbol s_5_8[4] = { 0xD0, 0xB8, 0xD1, 0x8E };
static symbol s_5_9[2] = { 0xD1, 0x8F };
static symbol s_5_10[4] = { 0xD1, 0x8C, 0xD1, 0x8F };
static symbol s_5_11[4] = { 0xD0, 0xB8, 0xD1, 0x8F };
static symbol s_5_12[2] = { 0xD0, 0xB0 };
static symbol s_5_13[4] = { 0xD0, 0xB5, 0xD0, 0xB2 };
static symbol s_5_14[4] = { 0xD0, 0xBE, 0xD0, 0xB2 };
static symbol s_5_15[2] = { 0xD0, 0xB5 };
static symbol s_5_16[4] = { 0xD1, 0x8C, 0xD0, 0xB5 };
static symbol s_5_17[4] = { 0xD0, 0xB8, 0xD0, 0xB5 };
static symbol s_5_18[2] = { 0xD0, 0xB8 };
static symbol s_5_19[4] = { 0xD0, 0xB5, 0xD0, 0xB8 };
static symbol s_5_20[4] = { 0xD0, 0xB8, 0xD0, 0xB8 };
static symbol s_5_21[6] = { 0xD1, 0x8F, 0xD0, 0xBC, 0xD0, 0xB8 };
static symbol s_5_22[8] = { 0xD0, 0xB8, 0xD1, 0x8F, 0xD0, 0xBC, 0xD0, 0xB8 };
static symbol s_5_23[6] = { 0xD0, 0xB0, 0xD0, 0xBC, 0xD0, 0xB8 };
static symbol s_5_24[2] = { 0xD0, 0xB9 };
static symbol s_5_25[4] = { 0xD0, 0xB5, 0xD0, 0xB9 };
static symbol s_5_26[6] = { 0xD0, 0xB8, 0xD0, 0xB5, 0xD0, 0xB9 };
static symbol s_5_27[4] = { 0xD0, 0xB8, 0xD0, 0xB9 };
static symbol s_5_28[4] = { 0xD0, 0xBE, 0xD0, 0xB9 };
static symbol s_5_29[4] = { 0xD1, 0x8F, 0xD0, 0xBC };
static symbol s_5_30[6] = { 0xD0, 0xB8, 0xD1, 0x8F, 0xD0, 0xBC };
static symbol s_5_31[4] = { 0xD0, 0xB0, 0xD0, 0xBC };
static symbol s_5_32[4] = { 0xD0, 0xB5, 0xD0, 0xBC };
static symbol s_5_33[6] = { 0xD0, 0xB8, 0xD0, 0xB5, 0xD0, 0xBC };
static symbol s_5_34[4] = { 0xD0, 0xBE, 0xD0, 0xBC };
static symbol s_5_35[2] = { 0xD0, 0xBE };
static struct among a_5[36] =
{
/* 0 */ { 2, s_5_0, -1, 1, 0},
/* 1 */ { 4, s_5_1, -1, 1, 0},
/* 2 */ { 6, s_5_2, 1, 1, 0},
/* 3 */ { 4, s_5_3, -1, 1, 0},
/* 4 */ { 2, s_5_4, -1, 1, 0},
/* 5 */ { 2, s_5_5, -1, 1, 0},
/* 6 */ { 2, s_5_6, -1, 1, 0},
/* 7 */ { 4, s_5_7, 6, 1, 0},
/* 8 */ { 4, s_5_8, 6, 1, 0},
/* 9 */ { 2, s_5_9, -1, 1, 0},
/* 10 */ { 4, s_5_10, 9, 1, 0},
/* 11 */ { 4, s_5_11, 9, 1, 0},
/* 12 */ { 2, s_5_12, -1, 1, 0},
/* 13 */ { 4, s_5_13, -1, 1, 0},
/* 14 */ { 4, s_5_14, -1, 1, 0},
/* 15 */ { 2, s_5_15, -1, 1, 0},
/* 16 */ { 4, s_5_16, 15, 1, 0},
/* 17 */ { 4, s_5_17, 15, 1, 0},
/* 18 */ { 2, s_5_18, -1, 1, 0},
/* 19 */ { 4, s_5_19, 18, 1, 0},
/* 20 */ { 4, s_5_20, 18, 1, 0},
/* 21 */ { 6, s_5_21, 18, 1, 0},
/* 22 */ { 8, s_5_22, 21, 1, 0},
/* 23 */ { 6, s_5_23, 18, 1, 0},
/* 24 */ { 2, s_5_24, -1, 1, 0},
/* 25 */ { 4, s_5_25, 24, 1, 0},
/* 26 */ { 6, s_5_26, 25, 1, 0},
/* 27 */ { 4, s_5_27, 24, 1, 0},
/* 28 */ { 4, s_5_28, 24, 1, 0},
/* 29 */ { 4, s_5_29, -1, 1, 0},
/* 30 */ { 6, s_5_30, 29, 1, 0},
/* 31 */ { 4, s_5_31, -1, 1, 0},
/* 32 */ { 4, s_5_32, -1, 1, 0},
/* 33 */ { 6, s_5_33, 32, 1, 0},
/* 34 */ { 4, s_5_34, -1, 1, 0},
/* 35 */ { 2, s_5_35, -1, 1, 0}
};
/*
 * a_6: suffix table searched by r_derivational() through
 * find_among_b(z, a_6, 2).  The two strings are the UTF-8 bytes of "ост"
 * and "ость"; both entries carry the value 1 in the fourth field.
 */
static symbol s_6_0[6] = { 0xD0, 0xBE, 0xD1, 0x81, 0xD1, 0x82 };
static symbol s_6_1[8] = { 0xD0, 0xBE, 0xD1, 0x81, 0xD1, 0x82, 0xD1, 0x8C };
static struct among a_6[2] =
{
/* 0 */ { 6, s_6_0, -1, 1, 0},
/* 1 */ { 8, s_6_1, -1, 1, 0}
};
/*
 * a_7: suffix table searched by r_tidy_up() through
 * find_among_b(z, a_7, 4).  The strings are the UTF-8 bytes of, in order:
 * "ейш", "ь", "ейше", "н".  The fourth field (1, 3, 1, 2) matches the three
 * cases r_tidy_up() switches on.
 */
static symbol s_7_0[6] = { 0xD0, 0xB5, 0xD0, 0xB9, 0xD1, 0x88 };
static symbol s_7_1[2] = { 0xD1, 0x8C };
static symbol s_7_2[8] = { 0xD0, 0xB5, 0xD0, 0xB9, 0xD1, 0x88, 0xD0, 0xB5 };
static symbol s_7_3[2] = { 0xD0, 0xBD };
static struct among a_7[4] =
{
/* 0 */ { 6, s_7_0, -1, 1, 0},
/* 1 */ { 2, s_7_1, -1, 3, 0},
/* 2 */ { 8, s_7_2, -1, 1, 0},
/* 3 */ { 2, s_7_3, -1, 2, 0}
};
/*
 * g_v is the grouping passed, together with the bounds 1072 and 1103, to
 * in_grouping_U()/out_grouping_U() in r_mark_regions().
 * NOTE(review): presumably a bitmap over code points 1072..1103
 * (U+0430..U+044F) selecting the Russian vowels; confirm against the
 * grouping helpers in utilities.c.
 *
 * s_0..s_9 are two-byte UTF-8 literals compared with eq_s_b(z, 2, ...):
 * s_0, s_2, s_4 = "а"; s_1, s_3, s_5 = "я"; s_6, s_7, s_8 = "н"; s_9 = "и".
 */
static unsigned char g_v[] = { 33, 65, 8, 232 };
static symbol s_0[] = { 0xD0, 0xB0 };
static symbol s_1[] = { 0xD1, 0x8F };
static symbol s_2[] = { 0xD0, 0xB0 };
static symbol s_3[] = { 0xD1, 0x8F };
static symbol s_4[] = { 0xD0, 0xB0 };
static symbol s_5[] = { 0xD1, 0x8F };
static symbol s_6[] = { 0xD0, 0xBD };
static symbol s_7[] = { 0xD0, 0xBD };
static symbol s_8[] = { 0xD0, 0xBD };
static symbol s_9[] = { 0xD0, 0xB8 };
/*
 * Set the two region marks I[0] and I[1] by scanning forward from the
 * current cursor.
 *
 * Both marks start out at the limit z->l.  Four "go past" scans are run in
 * sequence, alternating in_grouping_U() and out_grouping_U() on g_v
 * (bounds 1072..1103): I[0] is recorded after the first scan succeeds and
 * I[1] after the fourth.  If skip_utf8() returns a negative value during
 * any scan, the remaining marks keep the value they already have.
 *
 * The cursor is restored to its entry value before returning.
 * Always returns 1.
 */
static int r_mark_regions(struct SN_env * z) {
    const int saved_cursor = z->c;
    int step;

    z->I[0] = z->l;
    z->I[1] = z->l;

    for (step = 0; step < 4; step++) {
        for (;;) {
            int matched;
            int next;

            /* even steps look for a member of g_v, odd steps for a non-member */
            if (step % 2 == 0)
                matched = in_grouping_U(z, g_v, 1072, 1103);
            else
                matched = out_grouping_U(z, g_v, 1072, 1103);
            if (matched) break;

            next = skip_utf8(z->p, z->c, 0, z->l, 1);
            if (next < 0) goto restore;
            z->c = next;
        }
        if (step == 0) z->I[0] = z->c;      /* setmark pV */
    }
    z->I[1] = z->c;                         /* setmark p2 */

restore:
    z->c = saved_cursor;
    return 1;
}
/* Returns 1 when the cursor is at or beyond mark I[1], otherwise 0. */
static int r_R2(struct SN_env * z) {
    return (z->c >= z->I[1]) ? 1 : 0;
}
/*
 * Match a suffix from a_0 ending at the cursor and delete it.
 *
 * Sets z->ket/z->bra around the match.  When the table value is 1 the
 * suffix must additionally be preceded by s_0 or s_1; when it is 2 the
 * suffix is deleted unconditionally.
 *
 * Returns 0 if nothing matched (or the required preceding string is
 * missing), a negative value if slice_del() fails, and 1 otherwise.
 */
static int r_perfective_gerund(struct SN_env * z) {
    int among_var;
    int status;

    z->ket = z->c;
    among_var = find_among_b(z, a_0, 9);
    if (among_var == 0) return 0;
    z->bra = z->c;

    if (among_var == 1) {
        const int mark = z->l - z->c;

        if (!eq_s_b(z, 2, s_0)) {
            z->c = z->l - mark;
            if (!eq_s_b(z, 2, s_1)) return 0;
        }
    } else if (among_var != 2) {
        return 1;           /* no action defined for other values */
    }

    status = slice_del(z);
    return (status < 0) ? status : 1;
}
/*
 * Match a suffix from a_1 ending at the cursor and delete it.
 *
 * Sets z->ket/z->bra around the match.  Returns 0 if nothing matched, a
 * negative value if slice_del() fails, and 1 otherwise.
 */
static int r_adjective(struct SN_env * z) {
    int among_var;

    z->ket = z->c;
    among_var = find_among_b(z, a_1, 26);
    if (among_var == 0) return 0;
    z->bra = z->c;

    if (among_var == 1) {
        int status = slice_del(z);

        if (status < 0) return status;
    }
    return 1;
}
/*
 * Remove an adjective ending and then, optionally, a suffix from a_2 that
 * precedes it.
 *
 * r_adjective() must succeed first; its 0 or negative result is passed
 * straight back.  The a_2 step is a "try": if it does not apply, the cursor
 * is put back where it was when the try started and the routine still
 * returns 1.  For table value 1 the a_2 suffix must be preceded by s_2 or
 * s_3; for value 2 it is deleted unconditionally.
 *
 * The try mark (m_keep) and the mark of the inner "or" (m1) use distinct
 * names on purpose: when both were called "m" the inner one shadowed the
 * outer, so a failed s_3 test restored the cursor to the inner mark instead
 * of the start of the try.
 *
 * Returns 0 if r_adjective() found nothing, a negative value if a deletion
 * fails, and 1 otherwise.
 */
static int r_adjectival(struct SN_env * z) {
    int among_var;
    {   int ret = r_adjective(z);
        if (ret == 0) return 0; /* call adjective, line 102 */
        if (ret < 0) return ret;
    }
    {   int m_keep = z->l - z->c; /* try, line 109 */
        z->ket = z->c; /* [, line 110 */
        among_var = find_among_b(z, a_2, 8); /* substring, line 110 */
        if (!(among_var)) { z->c = z->l - m_keep; goto lab0; }
        z->bra = z->c; /* ], line 110 */
        switch(among_var) {
            case 0: { z->c = z->l - m_keep; goto lab0; }
            case 1:
                {   int m1 = z->l - z->c; /* or, line 115 */
                    if (!(eq_s_b(z, 2, s_2))) goto lab2;
                    goto lab1;
                lab2:
                    z->c = z->l - m1;
                    /* neither alternative matched: abandon the whole try */
                    if (!(eq_s_b(z, 2, s_3))) { z->c = z->l - m_keep; goto lab0; }
                }
            lab1:
                {   int ret;
                    ret = slice_del(z); /* delete, line 115 */
                    if (ret < 0) return ret;
                }
                break;
            case 2:
                {   int ret;
                    ret = slice_del(z); /* delete, line 122 */
                    if (ret < 0) return ret;
                }
                break;
        }
    lab0:
        ;
    }
    return 1;
}
/*
 * Match a suffix from a_3 ending at the cursor and delete it.
 *
 * Sets z->ket/z->bra around the match.  Returns 0 if nothing matched, a
 * negative value if slice_del() fails, and 1 otherwise.
 */
static int r_reflexive(struct SN_env * z) {
    int among_var;

    z->ket = z->c;
    among_var = find_among_b(z, a_3, 2);
    if (among_var == 0) return 0;
    z->bra = z->c;

    if (among_var == 1) {
        int status = slice_del(z);

        if (status < 0) return status;
    }
    return 1;
}
/*
 * Match a suffix from a_4 ending at the cursor and delete it.
 *
 * Sets z->ket/z->bra around the match.  When the table value is 1 the
 * suffix must additionally be preceded by s_4 or s_5; when it is 2 the
 * suffix is deleted unconditionally.
 *
 * Returns 0 if nothing matched (or the required preceding string is
 * missing), a negative value if slice_del() fails, and 1 otherwise.
 */
static int r_verb(struct SN_env * z) {
    int among_var;
    int status;

    z->ket = z->c;
    among_var = find_among_b(z, a_4, 46);
    if (among_var == 0) return 0;
    z->bra = z->c;

    if (among_var == 1) {
        const int mark = z->l - z->c;

        if (!eq_s_b(z, 2, s_4)) {
            z->c = z->l - mark;
            if (!eq_s_b(z, 2, s_5)) return 0;
        }
    } else if (among_var != 2) {
        return 1;           /* no action defined for other values */
    }

    status = slice_del(z);
    return (status < 0) ? status : 1;
}
/*
 * Match a suffix from a_5 ending at the cursor and delete it.
 *
 * Sets z->ket/z->bra around the match.  Returns 0 if nothing matched, a
 * negative value if slice_del() fails, and 1 otherwise.
 */
static int r_noun(struct SN_env * z) {
    int among_var;

    z->ket = z->c;
    among_var = find_among_b(z, a_5, 36);
    if (among_var == 0) return 0;
    z->bra = z->c;

    if (among_var == 1) {
        int status = slice_del(z);

        if (status < 0) return status;
    }
    return 1;
}
/*
 * Match a suffix from a_6 ending at the cursor and delete it, provided the
 * r_R2() test passes after the match.
 *
 * Sets z->ket/z->bra around the match.  Returns 0 if nothing matched or
 * r_R2() fails, a negative value if a callee reports an error, and 1
 * otherwise.
 */
static int r_derivational(struct SN_env * z) {
    int among_var;
    int status;

    z->ket = z->c;
    among_var = find_among_b(z, a_6, 2);
    if (among_var == 0) return 0;
    z->bra = z->c;

    status = r_R2(z);
    if (status <= 0) return status;

    if (among_var == 1) {
        status = slice_del(z);
        if (status < 0) return status;
    }
    return 1;
}
/*
 * Final clean-up step driven by table a_7.
 *
 * Sets z->ket/z->bra around the matched suffix, then acts on the table
 * value:
 *   1 - delete the suffix, then require s_6 followed by s_7 before the
 *       cursor and delete the s_6 slice;
 *   2 - require s_8 before the cursor, then delete the matched suffix;
 *   3 - delete the matched suffix.
 *
 * Returns 0 if nothing matched or a required string is missing, a negative
 * value if slice_del() fails, and 1 otherwise.
 */
static int r_tidy_up(struct SN_env * z) {
    int among_var;
    int status;

    z->ket = z->c;
    among_var = find_among_b(z, a_7, 4);
    if (among_var == 0) return 0;
    z->bra = z->c;

    switch (among_var) {
        case 1:
            status = slice_del(z);
            if (status < 0) return status;
            z->ket = z->c;
            if (!eq_s_b(z, 2, s_6)) return 0;
            z->bra = z->c;
            if (!eq_s_b(z, 2, s_7)) return 0;
            break;
        case 2:
            if (!eq_s_b(z, 2, s_8)) return 0;
            break;
        case 3:
            break;
        default:
            return 1;       /* no action defined for other values */
    }

    /* every handled case ends by deleting the current slice */
    status = slice_del(z);
    return (status < 0) ? status : 1;
}
/*
 * Stem the word held in the environment z.
 *
 * Steps, in order:
 *   1. r_mark_regions() sets the marks I[0] and I[1]; the cursor is
 *      restored afterwards.
 *   2. Processing switches to backward mode (cursor at z->l) with the
 *      backward limit z->lb temporarily raised to I[0]; the previous limit
 *      is kept in m3 and put back at the end.
 *   3. Either r_perfective_gerund() succeeds, or r_reflexive() is tried
 *      (optional) followed by the first of r_adjectival(), r_verb(),
 *      r_noun() that succeeds.
 *   4. An optional trailing s_9 is deleted.
 *   5. r_derivational() and r_tidy_up() are each run once; the cursor is
 *      restored after each.
 *
 * Returns 1 on completion, 0 if the cursor is below I[0] when the limit is
 * set, or the negative value reported by a helper or by slice_del().
 */
extern int russian_UTF_8_stem(struct SN_env * z) {
{ int c = z->c; /* do, line 201 */
{ int ret = r_mark_regions(z);
if (ret == 0) goto lab0; /* call mark_regions, line 201 */
if (ret < 0) return ret;
}
lab0:
z->c = c;
}
z->lb = z->c; z->c = z->l; /* backwards, line 202 */
{ int m3; /* setlimit, line 202 */
int m = z->l - z->c; (void) m;
if (z->c < z->I[0]) return 0;
z->c = z->I[0]; /* tomark, line 202 */
/* m3 remembers the old backward limit; it is restored just before the end */
m3 = z->lb; z->lb = z->c;
z->c = z->l - m;
{ int m = z->l - z->c; (void) m; /* do, line 203 */
{ int m = z->l - z->c; (void) m; /* or, line 204 */
{ int ret = r_perfective_gerund(z);
if (ret == 0) goto lab3; /* call perfective_gerund, line 204 */
if (ret < 0) return ret;
}
goto lab2;
lab3:
z->c = z->l - m;
{ int m = z->l - z->c; (void) m; /* try, line 205 */
{ int ret = r_reflexive(z);
if (ret == 0) { z->c = z->l - m; goto lab4; } /* call reflexive, line 205 */
if (ret < 0) return ret;
}
lab4:
;
}
/* first of adjectival / verb / noun that succeeds wins; cursor is reset between attempts */
{ int m = z->l - z->c; (void) m; /* or, line 206 */
{ int ret = r_adjectival(z);
if (ret == 0) goto lab6; /* call adjectival, line 206 */
if (ret < 0) return ret;
}
goto lab5;
lab6:
z->c = z->l - m;
{ int ret = r_verb(z);
if (ret == 0) goto lab7; /* call verb, line 206 */
if (ret < 0) return ret;
}
goto lab5;
lab7:
z->c = z->l - m;
{ int ret = r_noun(z);
if (ret == 0) goto lab1; /* call noun, line 206 */
if (ret < 0) return ret;
}
}
lab5:
;
}
lab2:
lab1:
z->c = z->l - m;
}
{ int m = z->l - z->c; (void) m; /* try, line 209 */
z->ket = z->c; /* [, line 209 */
if (!(eq_s_b(z, 2, s_9))) { z->c = z->l - m; goto lab8; }
z->bra = z->c; /* ], line 209 */
{ int ret;
ret = slice_del(z); /* delete, line 209 */
if (ret < 0) return ret;
}
lab8:
;
}
{ int m = z->l - z->c; (void) m; /* do, line 212 */
{ int ret = r_derivational(z);
if (ret == 0) goto lab9; /* call derivational, line 212 */
if (ret < 0) return ret;
}
lab9:
z->c = z->l - m;
}
{ int m = z->l - z->c; (void) m; /* do, line 213 */
{ int ret = r_tidy_up(z);
if (ret == 0) goto lab10; /* call tidy_up, line 213 */
if (ret < 0) return ret;
}
lab10:
z->c = z->l - m;
}
z->lb = m3;
}
z->c = z->lb;
return 1;
}
/*
 * Create a stemmer environment for the UTF-8 Russian stemmer by calling
 * SN_create_env(0, 2, 0) and return whatever it returns.
 * NOTE(review): the 2 presumably sizes the integer array for the I[0] and
 * I[1] marks used in this file; confirm against SN_create_env().
 */
extern struct SN_env * russian_UTF_8_create_env(void)
{
    struct SN_env * env = SN_create_env(0, 2, 0);

    return env;
}
/* Release an environment obtained from russian_UTF_8_create_env() by passing it to SN_close_env(). */
extern void russian_UTF_8_close_env(struct SN_env * z)
{
    SN_close_env(z);
}

View File

@ -0,0 +1,16 @@
/* This file was generated automatically by the Snowball to ANSI C compiler */
#ifdef __cplusplus
extern "C" {
#endif
/* Create a stemmer environment for the UTF-8 Russian stemmer. */
extern struct SN_env * russian_UTF_8_create_env(void);
/* Release an environment obtained from russian_UTF_8_create_env(). */
extern void russian_UTF_8_close_env(struct SN_env * z);
/* Stem the word held in z; returns 1 on completion, 0 or a negative value otherwise. */
extern int russian_UTF_8_stem(struct SN_env * z);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,151 @@
и
в
во
не
что
он
на
я
с
со
как
а
то
все
она
так
его
но
да
ты
к
у
же
вы
за
бы
по
только
ее
мне
было
вот
от
меня
еще
нет
о
из
ему
теперь
когда
даже
ну
вдруг
ли
если
уже
или
ни
быть
был
него
до
вас
нибудь
опять
уж
вам
ведь
там
потом
себя
ничего
ей
может
они
тут
где
есть
надо
ней
для
мы
тебя
их
чем
была
сам
чтоб
без
будто
чего
раз
тоже
себе
под
будет
ж
тогда
кто
этот
того
потому
этого
какой
совсем
ним
здесь
этом
один
почти
мой
тем
чтобы
нее
сейчас
были
куда
зачем
всех
никогда
можно
при
наконец
два
об
другой
хоть
после
над
больше
тот
через
эти
нас
про
всего
них
какая
много
разве
три
эту
моя
впрочем
хорошо
свою
этой
перед
иногда
лучше
чуть
том
нельзя
такой
им
более
всегда
конечно
всю
между

View File

@ -82,17 +82,30 @@ insert into pg_ts_dict select
'English Stemmer. Snowball.'
;
CREATE FUNCTION snb_ru_init(internal)
CREATE FUNCTION snb_ru_init_koi8(internal)
returns internal
as 'MODULE_PATHNAME'
language 'C';
insert into pg_ts_dict select
'ru_stem',
'snb_ru_init(internal)',
'ru_stem_koi8',
'snb_ru_init_koi8(internal)',
'contrib/russian.stop',
'snb_lexize(internal,internal,int4)',
'Russian Stemmer. Snowball.'
'Russian Stemmer. Snowball. KOI8 Encoding'
;
-- Expose the C function snb_ru_init_utf8 from the tsearch2 module to SQL.
CREATE FUNCTION snb_ru_init_utf8(internal)
returns internal
as 'MODULE_PATHNAME'
language 'C';
-- Register the 'ru_stem_utf8' dictionary: it uses snb_ru_init_utf8 with
-- 'contrib/russian.stop.utf8' and shares snb_lexize with the other
-- Snowball dictionaries.
insert into pg_ts_dict select
'ru_stem_utf8',
'snb_ru_init_utf8(internal)',
'contrib/russian.stop.utf8',
'snb_lexize(internal,internal,int4)',
'Russian Stemmer. Snowball. UTF8 Encoding'
;
CREATE FUNCTION spell_init(internal)
@ -270,6 +283,7 @@ CREATE FUNCTION show_curcfg()
insert into pg_ts_cfg values ('default', 'default','C');
insert into pg_ts_cfg values ('default_russian', 'default','ru_RU.KOI8-R');
insert into pg_ts_cfg values ('utf8_russian', 'default','ru_RU.UTF-8');
insert into pg_ts_cfg values ('simple', 'default');
insert into pg_ts_cfgmap values ('default', 'lword', '{en_stem}');
@ -292,24 +306,43 @@ insert into pg_ts_cfgmap values ('default', 'float', '{simple}');
insert into pg_ts_cfgmap values ('default', 'int', '{simple}');
insert into pg_ts_cfgmap values ('default', 'uint', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'lword', '{en_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlword', '{ru_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'word', '{ru_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'word', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'email', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'url', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'host', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'sfloat', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'version', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'part_hword', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'nlpart_hword', '{ru_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlpart_hword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'lpart_hword', '{en_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'hword', '{ru_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'hword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'lhword', '{en_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlhword', '{ru_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlhword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'uri', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'file', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'float', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'int', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'uint', '{simple}');
-- Dictionary mapping for the 'utf8_russian' configuration:
--   ru_stem_utf8 : nlword, word, nlpart_hword, hword, nlhword
--   en_stem      : lword, lpart_hword, lhword
--   simple       : every other token type listed below
insert into pg_ts_cfgmap values ('utf8_russian', 'lword', '{en_stem}');
insert into pg_ts_cfgmap values ('utf8_russian', 'nlword', '{ru_stem_utf8}');
insert into pg_ts_cfgmap values ('utf8_russian', 'word', '{ru_stem_utf8}');
insert into pg_ts_cfgmap values ('utf8_russian', 'email', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'url', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'host', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'sfloat', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'version', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'part_hword', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'nlpart_hword', '{ru_stem_utf8}');
insert into pg_ts_cfgmap values ('utf8_russian', 'lpart_hword', '{en_stem}');
insert into pg_ts_cfgmap values ('utf8_russian', 'hword', '{ru_stem_utf8}');
insert into pg_ts_cfgmap values ('utf8_russian', 'lhword', '{en_stem}');
insert into pg_ts_cfgmap values ('utf8_russian', 'nlhword', '{ru_stem_utf8}');
insert into pg_ts_cfgmap values ('utf8_russian', 'uri', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'file', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'float', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'int', '{simple}');
insert into pg_ts_cfgmap values ('utf8_russian', 'uint', '{simple}');
insert into pg_ts_cfgmap values ('simple', 'lword', '{simple}');
insert into pg_ts_cfgmap values ('simple', 'nlword', '{simple}');
insert into pg_ts_cfgmap values ('simple', 'word', '{simple}');