64 lines
1.7 KiB
Python
64 lines
1.7 KiB
Python
|
# Natural Language Toolkit: Dispersion Plots
|
||
|
#
|
||
|
# Copyright (C) 2001-2023 NLTK Project
|
||
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
||
|
# URL: <https://www.nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
A utility for displaying lexical dispersion.
|
||
|
"""
|
||
|
|
||
|
|
||
|
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Every target word gets its own horizontal row, and a ``|`` marker is
    drawn at each offset of ``text`` where that word occurs.  The first
    entry of ``words`` is shown on the top row and the last entry on the
    bottom row.

    :param text: The source text
    :type text: list(str) or iter(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: the title displayed above the plot
    :type title: str
    :return: a matplotlib Axes object that may still be modified before plotting
    :rtype: Axes
    :raise ImportError: if matplotlib is not installed
    """

    try:
        import matplotlib.pyplot as plt
    except ImportError as e:
        raise ImportError(
            "The plot function requires matplotlib to be installed. "
            "See https://matplotlib.org/"
        ) from e

    # The y axis grows upwards, so row 0 is the bottom row.  Reversing the
    # words puts the first target word on the top row.  The very same list
    # is used for the tick labels below so that every row of markers is
    # labelled with the word it actually belongs to.
    row_labels = list(reversed(words))
    word2y = {
        word.casefold() if ignore_case else word: y
        for y, word in enumerate(row_labels)
    }

    # Collect one (offset, row) point per occurrence of a target word.
    xs, ys = [], []
    for x, token in enumerate(text):
        token = token.casefold() if ignore_case else token
        y = word2y.get(token)
        if y is not None:
            xs.append(x)
            ys.append(y)

    _, ax = plt.subplots()
    ax.plot(xs, ys, "|")
    ax.set_yticks(list(range(len(row_labels))), row_labels, color="C0")
    # Leave half a row of padding below the first and above the last row.
    ax.set_ylim(-1, len(row_labels))
    ax.set_title(title)
    ax.set_xlabel("Word Offset")
    return ax
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Demo: plot where four names occur in the Gutenberg corpus file
    # "austen-sense.txt", then open the figure window.
    from matplotlib import pyplot as plt

    from nltk.corpus import gutenberg

    words = ["Elinor", "Marianne", "Edward", "Willoughby"]
    dispersion_plot(
        gutenberg.words("austen-sense.txt"),
        words,
    )
    plt.show()
|