#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import absolute_import, unicode_literals import sys from operator import itemgetter from collections import defaultdict import jieba.posseg from .tfidf import KeywordExtractor from .._compat import * class UndirectWeightedGraph: d = 0.85 def __init__(self): self.graph = defaultdict(list) def addEdge(self, start, end, weight): # use a tuple (start, end, weight) instead of a Edge object self.graph[start].append((start, end, weight)) self.graph[end].append((end, start, weight)) def rank(self): ws = defaultdict(float) outSum = defaultdict(float) wsdef = 1.0 / (len(self.graph) or 1.0) for n, out in self.graph.items(): ws[n] = wsdef outSum[n] = sum((e[2] for e in out), 0.0) # this line for build stable iteration sorted_keys = sorted(self.graph.keys()) for x in xrange(10): # 10 iters for n in sorted_keys: s = 0 for e in self.graph[n]: s += e[2] / outSum[e[1]] * ws[e[1]] ws[n] = (1 - self.d) + self.d * s (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3]) for w in itervalues(ws): if w < min_rank: min_rank = w if w > max_rank: max_rank = w for n, w in ws.items(): # to unify the weights, don't *100. ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) return ws class TextRank(KeywordExtractor): def __init__(self): self.tokenizer = self.postokenizer = jieba.posseg.dt self.stop_words = self.STOP_WORDS.copy() self.pos_filt = frozenset(('ns', 'n', 'vn', 'v')) self.span = 5 def pairfilter(self, wp): return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2 and wp.word.lower() not in self.stop_words) def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False): """ Extract keywords from sentence using TextRank algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. if the POS of w is not in this list, it will be filtered. - withFlag: if True, return a list of pair(word, weight) like posseg.cut if False, return a list of words """ self.pos_filt = frozenset(allowPOS) g = UndirectWeightedGraph() cm = defaultdict(int) words = tuple(self.tokenizer.cut(sentence)) for i, wp in enumerate(words): if self.pairfilter(wp): for j in xrange(i + 1, i + self.span): if j >= len(words): break if not self.pairfilter(words[j]): continue if allowPOS and withFlag: cm[(wp, words[j])] += 1 else: cm[(wp.word, words[j].word)] += 1 for terms, w in cm.items(): g.addEdge(terms[0], terms[1], w) nodes_rank = g.rank() if withWeight: tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True) else: tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True) if topK: return tags[:topK] else: return tags extract_tags = textrank