ai-content-maker/.venv/Lib/site-packages/pypinyin/seg/mmseg.py

125 lines
4.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""最大正向匹配分词"""
from pypinyin.constants import PHRASES_DICT
class Seg(object):
    """Forward maximum matching word segmenter.

    A candidate word is grown one character at a time for as long as the
    candidate is contained in ``prefix_set``. The longest such candidate is
    emitted as a segment if it is acceptable; otherwise only its first
    character is emitted and matching restarts right after that character.

    :param prefix_set: container holding the prefixes of the known phrases.
        :meth:`cut` only uses ``in`` on it, :meth:`train` calls its
        ``train`` method.
    :type prefix_set: PrefixSet
    :param no_non_phrases: strict mode. When true, a multi-character
        segment is only emitted if it is a real phrase, never a mere
        prefix of one.
    :type no_non_phrases: bool
    :param phrases: container of real phrases consulted in strict mode.
        ``None`` (the default) means the module-level ``PHRASES_DICT``,
        looked up every time it is needed.
    """

    def __init__(self, prefix_set, no_non_phrases=False, phrases=None):
        self._prefix_set = prefix_set
        self._no_non_phrases = no_non_phrases
        self._phrases = phrases

    def _is_acceptable(self, word):
        """Return True if non-empty *word* may be emitted as a segment."""
        if not self._no_non_phrases:
            # Lenient mode: anything found in the prefix set is good enough.
            return True
        if self._phrases is None:
            return word in PHRASES_DICT
        return word in self._phrases

    def cut(self, text):
        """Segment *text*.

        :param text: text to segment
        :yield: the segments in order; every character of ``text`` is
            emitted exactly once
        """
        if not text:
            return
        prefix_set = self._prefix_set
        total = len(text)
        start = 0
        while start < total:
            # Extend the candidate one character at a time while it is
            # still a known prefix.
            end = start
            while end < total and text[start:end + 1] in prefix_set:
                end += 1
            matched = text[start:end]  # '' when even one char is unknown
            if matched and self._is_acceptable(matched):
                yield matched
                start = end
            else:
                # Nothing matched, or (strict mode) the match is only a
                # prefix of a phrase. Emit the first character and let the
                # rest take part in matching again, so that a word hidden
                # behind a misleading prefix is still found:
                #   phrases: 金融寡头, 行业; input: 金融行业
                #   output:  金 / 融 / 行业
                # The same fallback is applied when the match reaches the
                # very end of the text, so a phrase inside the tail is not
                # lost by splitting the whole tail into single characters.
                yield text[start:start + 1]
                start += 1

    def train(self, words):
        """Train the segmenter.

        :param words: iterable of words, passed on unchanged to the prefix
            set's ``train`` method
        """
        self._prefix_set.train(words)
class PrefixSet(object):
    """Collection of every non-empty leading substring of the trained words."""

    def __init__(self):
        self._set = set()

    def train(self, word_s):
        """Add all prefixes of the given words to the prefix set.

        :param word_s: the words to learn
        :type word_s: iterable
        :return: None
        """
        known = self._set
        for entry in word_s:
            # entry[:1], entry[:2], ..., entry itself
            known.update(entry[:stop] for stop in range(1, len(entry) + 1))

    def __contains__(self, key):
        return key in self._set
# Module-level prefix set, filled at import time with the prefixes of every
# key of the built-in PHRASES_DICT.
p_set = PrefixSet()
p_set.train(PHRASES_DICT.keys())
#: Forward maximum matching segmenter based on the built-in phrase
#: dictionary. Usage:
#:
#: .. code-block:: python
#:
#:     >>> from pypinyin.contrib.mmseg import seg
#:     >>> text = '你好,我是中国人,我爱我的祖国'
#:     >>> seg.cut(text)
#:     <generator object Seg.cut at 0x10b2df2b0>
#:     >>> list(seg.cut(text))
#:     ['你好', ',', '我', '是', '中国人', ',', '我', '爱',
#:      '我的', '祖', '国']
#:     >>> seg.train(['祖国', '我是'])
#:     >>> list(seg.cut(text))
#:     ['你好', ',', '我是', '中国人', ',', '我', '爱',
#:      '我的', '祖国']
#:     >>>
#:
#: NOTE(review): ``seg`` is created with ``no_non_phrases=True``, so a
#: trained word is only emitted as one segment if it is also a key of
#: ``PHRASES_DICT``; the post-``train`` output above may be outdated --
#: confirm against the current dictionary.
seg = Seg(p_set, no_non_phrases=True)
def retrain(seg_instance):
    """Train ``seg_instance`` again with the built-in phrase dictionary.

    Call this after custom phrases have been added to the dictionary, so
    that the segmenter learns the new entries.

    :type seg_instance: Seg
    """
    builtin_words = PHRASES_DICT.keys()
    seg_instance.train(builtin_words)