# encoding=utf-8
|
|
from __future__ import absolute_import
|
|
import os
|
|
import jieba
|
|
import jieba.posseg
|
|
from operator import itemgetter
|
|
|
|
def _get_module_path(path):
    """Return *path* resolved against this module's directory, normalized.

    The module directory (taken from ``__file__``) is joined onto the current
    working directory first, so a relative ``__file__`` still produces a path
    anchored at the cwd.
    """
    combined = os.path.join(os.getcwd(), os.path.dirname(__file__), path)
    return os.path.normpath(combined)
|
|
# Module-level alias for jieba's own `_get_abs_path` helper; this module
# applies it to caller-supplied file paths before checking os.path.isfile.
_get_abs_path = jieba._get_abs_path
|
|
|
|
# Path of the default IDF table: "idf.txt" resolved through _get_module_path,
# i.e. relative to the directory of this module.
DEFAULT_IDF = _get_module_path("idf.txt")
|
|
|
|
|
|
class KeywordExtractor(object):
    """Base class for keyword extractors that share a stop-word list.

    Subclasses implement extract_tags(). set_stop_words() mutates
    ``self.stop_words``, which this base class does not create: the instance
    must already carry a mutable set (for example ``STOP_WORDS.copy()``).
    """

    # Default English stop words; instances should work on a copy so that
    # additions do not leak into other extractors.
    STOP_WORDS = set((
        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
        "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
        "this", "then", "at", "have", "all", "not", "one", "has", "or",
    ))

    def set_stop_words(self, stop_words_path):
        """Add every line of a UTF-8 text file to ``self.stop_words``.

        Parameter:
            - stop_words_path: path of the stop-word file, one entry per line.
              It is passed through _get_abs_path before being opened.

        Raises:
            Exception: if the resolved path is not an existing file.
        """
        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)
        # The context manager releases the file handle even when reading or
        # decoding fails; the previous bare open() never closed it.
        with open(abs_path, 'rb') as stop_file:
            content = stop_file.read().decode('utf-8')
        self.stop_words.update(content.splitlines())

    def extract_tags(self, *args, **kwargs):
        """Extract keywords; must be overridden by concrete extractors."""
        raise NotImplementedError
|
|
|
|
|
|
class IDFLoader(object):
    """Load and cache a word -> IDF weight table from a UTF-8 text file.

    Each non-blank line of the file holds a word followed by its weight,
    separated by whitespace.
    """

    def __init__(self, idf_path=None):
        """Create an empty loader and, if idf_path is truthy, load that file."""
        self.path = ""
        self.idf_freq = {}
        self.median_idf = 0.0
        if idf_path:
            self.set_new_path(idf_path)

    def set_new_path(self, new_idf_path):
        """Load the table stored at new_idf_path unless it is already loaded.

        The loader's state (path, table, median) is replaced only after the
        whole file has been read and parsed, so a failed load leaves the
        previous table intact and can be retried with the same path.

        Raises:
            IOError/OSError: if the file cannot be opened.
            ValueError: if a non-blank line is not "<word> <number>".
        """
        if self.path == new_idf_path:
            return
        # Close the handle deterministically instead of leaking it.
        with open(new_idf_path, 'rb') as idf_file:
            content = idf_file.read().decode('utf-8')
        idf_freq = {}
        for line in content.splitlines():
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing one) carry no entry.
                continue
            # Split on the last whitespace run so tabs or repeated spaces
            # between word and weight are tolerated.
            word, freq = line.rsplit(None, 1)
            idf_freq[word] = float(freq)
        if idf_freq:
            # Upper median of the weights; callers use it for unknown words.
            median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
        else:
            # An empty table has no median; fall back to the initial value.
            median_idf = 0.0
        self.idf_freq = idf_freq
        self.median_idf = median_idf
        self.path = new_idf_path

    def get_idf(self):
        """Return the tuple (idf_freq dict, median_idf float)."""
        return self.idf_freq, self.median_idf
|
|
|
|
|
|
class TFIDF(KeywordExtractor):
    """Keyword extractor ranking tokens by term frequency times IDF weight."""

    def __init__(self, idf_path=None):
        """Bind jieba's default tokenizers and load an IDF table.

        Parameter:
            - idf_path: IDF file to load; DEFAULT_IDF is used when it is falsy.
        """
        self.tokenizer = jieba.dt
        self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
        idf_table, median = self.idf_loader.get_idf()
        self.idf_freq = idf_table
        self.median_idf = median

    def set_idf_path(self, idf_path):
        """Switch to another IDF file and refresh the cached table and median.

        Raises:
            Exception: if the resolved path is not an existing file.
        """
        resolved = _get_abs_path(idf_path)
        if not os.path.isfile(resolved):
            raise Exception("jieba: file does not exist: " + resolved)
        loader = self.idf_loader
        loader.set_new_path(resolved)
        self.idf_freq, self.median_idf = loader.get_idf()

    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
        """
        Rank the tokens of sentence by TF-IDF weight, highest first.

        Parameter:
            - topK: how many top entries to return; any falsy value
                    (such as `None`) returns all of them.
            - withWeight: if True, return a list of (entry, weight) tuples;
                          if False, return a list of entries only.
            - allowPOS: allowed POS flags, eg. ['ns', 'n', 'vn', 'v','nr'].
                        When non-empty, the POS tokenizer is used and tokens
                        whose flag is not listed are dropped.
            - withFlag: only has an effect when allowPOS is non-empty.
                        if True, entries are the token objects produced by
                        the POS tokenizer (carrying `.word` and `.flag`);
                        if False, entries are the plain words.
        """
        if allowPOS:
            allowPOS = frozenset(allowPOS)
            tokens = self.postokenizer.cut(sentence)
        else:
            tokens = self.tokenizer.cut(sentence)
        # Truthiness is taken after the frozenset conversion, matching every
        # later check on allowPOS.
        filter_pos = bool(allowPOS)
        keep_flag = filter_pos and withFlag

        # Term frequency: count every token that survives the filters.
        counts = {}
        for token in tokens:
            if filter_pos:
                if token.flag not in allowPOS:
                    continue
                if not withFlag:
                    token = token.word
            text = token.word if keep_flag else token
            if len(text.strip()) < 2:
                continue
            if text.lower() in self.stop_words:
                continue
            counts[token] = counts.get(token, 0.0) + 1.0

        # Scale each count by its IDF (median for unknown words) and
        # normalise by the total number of counted tokens.
        total = sum(counts.values())
        for key in counts:
            lookup = key.word if keep_flag else key
            idf = self.idf_freq.get(lookup, self.median_idf)
            counts[key] = counts[key] * (idf / total)

        # Stable sort by weight; the key-only list is derived from the same
        # ordering.
        ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
        tags = ranked if withWeight else [key for key, _ in ranked]
        return tags[:topK] if topK else tags
|