# coding:utf-8
# @File: chinese_analyzer.py
# @Author: 州的先生
# @Date: 2020/11/22
# Blog: zmister.com

from whoosh.compat import u, text_type
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.tokenizers import default_pattern
from whoosh.lang.porter import stem
from whoosh.analysis import Tokenizer, Token
from whoosh.util.text import rcompile
import jieba


class ChineseTokenizer(Tokenizer):
    """
    Extracts tokens from text. In the default (non-gaps) mode the text is
    segmented with jieba's full mode; when gaps=True the regular expression
    is used to split the text instead.

    >>> rex = ChineseTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    """

    def __init__(self, expression=default_pattern, gaps=False):
        r"""
        :param expression: A regular expression object or string, defaulting to
            rcompile(r"\w+(\.?\w+)*"). Each match of the expression equals a
            token. Group 0 (the entire matched text) is used as the text of the
            token. If you require more complicated handling of the match,
            simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression rather
            than matching it.
        """
        self.expression = rcompile(expression)
        self.gaps = gaps

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.expression.pattern == other.expression.pattern:
                return True
        return False

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            with start_pos=2 the tokens will be numbered 2, 3, 4, ... instead
            of 0, 1, 2, ...
        :param start_char: The offset of the first character of the first
            token. For example, with start_char=2 the text "aaa bbb" will have
            character spans (2, 5), (6, 9) instead of (0, 3), (4, 7).
        :param tokenize: if True, the text should be tokenized.
        """
        # The input must be a unicode string; raise otherwise.
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            # Emit the whole value as a single token.
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens.
            # Original regex-based loop, kept here for reference:
            # for pos, match in enumerate(self.expression.finditer(value)):
            #     t.text = match.group(0)
            #     t.boost = 1.0
            #     if keeporiginal:
            #         t.original = t.text
            #     t.stopped = False
            #     if positions:
            #         t.pos = start_pos + pos
            #     if chars:
            #         t.startchar = start_char + match.start()
            #         t.endchar = start_char + match.end()
            #     yield t

            # Segment the text with jieba in full mode and emit each word as a
            # token. Positions and offsets are taken from the first occurrence
            # of the word in the text (value.find), so repeated words share
            # the same offsets.
            seglist = jieba.cut(value, cut_all=True)
            for w in seglist:
                t.original = t.text = w
                t.boost = 1.0
                if positions:
                    t.pos = start_pos + value.find(w)
                if chars:
                    t.startchar = start_char + value.find(w)
                    t.endchar = start_char + value.find(w) + len(w)
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t


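# Usage sketch: in the default (non-gaps) mode the tokenizer delegates
# segmentation to jieba's full mode, so a Chinese sentence yields overlapping
# word tokens. The sample below is jieba's documented full-mode example; the
# exact output depends on the installed jieba version and dictionary.
#
#   >>> tokenizer = ChineseTokenizer()
#   >>> [t.text for t in tokenizer(u("我来到北京清华大学"))]
#   ['我', '来到', '北京', '清华', '清华大学', '华大', '大学']

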
def ChineseAnalyzer(expression=default_pattern, stoplist=None,
                    minsize=2, maxsize=None, gaps=False, stemfn=stem,
                    ignore=None, cachesize=50000):
    """Composes the ChineseTokenizer with a lowercase filter, an optional
    stop-word filter, and a stemming filter.

    >>> ana = ChineseAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression used to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        stop-word filtering.
    :param minsize: Words shorter than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression rather
        than matching it.
    :param ignore: A set of words to not stem.
    :param cachesize: The maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be, but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """
    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)
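

if __name__ == "__main__":
    # Minimal usage sketch, assuming jieba and Whoosh are installed. The
    # sample text below is an arbitrary placeholder.
    analyzer = ChineseAnalyzer()
    print([token.text for token in analyzer(u("MrDoc 是一个在线文档系统"))])

    # To plug the analyzer into a Whoosh index, pass it to a TEXT field
    # (field names here are hypothetical), e.g.:
    # from whoosh.fields import Schema, TEXT
    # schema = Schema(title=TEXT(analyzer=ChineseAnalyzer()),
    #                 content=TEXT(analyzer=ChineseAnalyzer()))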