#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals

import sys
from collections import defaultdict
from operator import itemgetter

import jieba.posseg

from .tfidf import KeywordExtractor
from .._compat import *

class UndirectWeightedGraph:
    """Undirected weighted graph ranked with a TextRank (PageRank-style) iteration."""

    # Damping factor of the TextRank iteration.
    d = 0.85

    def __init__(self):
        # Adjacency list: node -> list of (start, end, weight) tuples.
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        """Add an undirected edge by storing it in both directions."""
        # use a tuple (start, end, weight) instead of an Edge object
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Run 10 TextRank iterations and return a dict node -> normalized weight.

        Weights are rescaled so the largest maps to 1.0; an empty graph
        yields an empty mapping.
        """
        ws = defaultdict(float)
        outSum = defaultdict(float)

        # `or 1.0` guards the empty-graph case against division by zero.
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)

        # Sort keys so the in-place (Gauss-Seidel) update order -- and
        # therefore the result -- is stable across runs.
        sorted_keys = sorted(self.graph.keys())
        for _ in range(10):  # 10 iterations converge well enough in practice
            for n in sorted_keys:
                s = 0.0
                for e in self.graph[n]:
                    # Neighbor's weight, shared proportionally to edge weight.
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s

        # Named attributes instead of opaque positional indexes
        # (sys.float_info[0] / sys.float_info[3]): start min at the
        # largest float and max at the smallest positive float, then
        # tighten over the computed weights.
        (min_rank, max_rank) = (sys.float_info.max, sys.float_info.min)

        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        for n, w in ws.items():
            # to unify the weights, don't *100.
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws

class TextRank(KeywordExtractor):
    """Keyword extractor implementing the TextRank algorithm on POS-filtered words."""

    def __init__(self):
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
        self.span = 5

    def pairfilter(self, wp):
        """Return True if the (word, flag) pair should take part in ranking."""
        # Guard clauses: allowed POS, at least two visible characters,
        # and not a stop word.
        if wp.flag not in self.pos_filt:
            return False
        if len(wp.word.strip()) < 2:
            return False
        return wp.word.lower() not in self.stop_words

    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        graph = UndirectWeightedGraph()
        cooccur = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        total = len(words)
        # Decide once whether co-occurrence keys are full pairs or bare words.
        keyed_by_pair = allowPOS and withFlag
        for i, wp in enumerate(words):
            if not self.pairfilter(wp):
                continue
            # Count co-occurrences within a sliding window of self.span words.
            for j in xrange(i + 1, min(i + self.span, total)):
                other = words[j]
                if not self.pairfilter(other):
                    continue
                if keyed_by_pair:
                    cooccur[(wp, other)] += 1
                else:
                    cooccur[(wp.word, other.word)] += 1

        for (u, v), weight in cooccur.items():
            graph.addEdge(u, v, weight)
        nodes_rank = graph.rank()

        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)

        return tags[:topK] if topK else tags

    extract_tags = textrank