# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
# Copyright (C) 2001-2023 NLTK Project
# Author: Sumukh Ghodke
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""Tkinter application that lists collocations (frequent word pairs) of a corpus.

A ``CollocationsModel`` scores the bigrams of the selected corpus in a
background thread and reports completion through a queue; a
``CollocationsView`` polls that queue from the Tk main loop and shows the
scored pairs one page at a time.
"""

import queue as q
import threading
from operator import itemgetter
from tkinter import (
    END,
    LEFT,
    SUNKEN,
    Button,
    Frame,
    IntVar,
    Label,
    Menu,
    OptionMenu,
    Scrollbar,
    StringVar,
    Text,
    Tk,
)
from tkinter.font import Font

from nltk.corpus import (
    alpino,
    brown,
    cess_cat,
    cess_esp,
    floresta,
    indian,
    mac_morpho,
    machado,
    nps_chat,
    sinica_treebank,
    treebank,
)
from nltk.probability import FreqDist
from nltk.util import in_idle

# Event names exchanged between the loader thread and the view. They must be
# distinct, otherwise ``_poll`` cannot tell a failed load from a successful one.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

# Delay, in milliseconds, between two checks of the event queue.
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"

# Display name -> zero-argument callable returning the words of that corpus.
# The callables are only invoked when the corpus is actually selected.
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    """Main window: corpus selector, results box, paging buttons, status line."""

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        """Build the window, start loading the default corpus and start polling."""
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        # The menubar is built before the widgets: it invokes the default
        # "Result Count" entry, which sets ``model.result_count``.
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Id of the pending ``after`` callback; needed to cancel it in destroy().
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        """Configure geometry, title and close handlers of the top-level window."""
        top.geometry("550x650+50+50")
        top.title("NLTK Collocations List")
        # Matches the "Ctrl-q" accelerator advertised by the File > Exit entry.
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        """Create the main frame and all of its child sections."""
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_corpus_select(self, parent):
        """Create the "Corpus:" label and the drop-down listing all corpora."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")

        # The default corpus is listed first, followed by the others (sorted).
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            *self.model.non_default_corpora(),
            command=self.corpus_selected,
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        """Create the status label shown at the bottom of the window."""
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_menubar(self):
        """Create the File and Edit menus and select the default result count."""
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label="20",
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="50",
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="100",
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        # Select the second entry ("50") and run its command, so that the
        # model starts with a result count.
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)
        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Copy the result count chosen in the Edit menu into the model."""
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        """Create the read-only text box for results with both scrollbars."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # A blank label pads the corner: with the pack layout manager there is
        # no other way to keep the two scrollbars from overlapping.
        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        """Create the Previous/Next buttons (initially disabled)."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev_button = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev_button.pack(side="left", anchor="center")
        self.next = next_button = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next_button.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        self.reset_current_page()

    def reset_current_page(self):
        """Mark that no page is displayed (page index -1)."""
        self.current_page = -1

    def _poll(self):
        """Handle at most one pending loader event, then reschedule itself."""
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        """Report a failed load and leave the view empty with paging disabled."""
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_results_box()
        self.freeze_editable()
        self.reset_current_page()

    def handle_corpus_loaded(self, event):
        """Show the first page of the freshly loaded corpus."""
        self.status["text"] = self.var.get() + " is loaded"
        self.clear_results_box()
        self.reset_current_page()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1
        # Refresh the buttons only once the first page is current, so that
        # "Next" reflects whether a second page actually exists.
        self.unfreeze_editable()

    def corpus_selected(self, *args):
        """Callback of the corpus drop-down: load the newly selected corpus."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        """Display the page before the current one."""
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        """Display the page after the current one."""
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        """Ask the model to load ``selection`` unless it is already selected."""
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        """Disable both paging buttons."""
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def clear_results_box(self):
        """Remove all text from the results box."""
        # The box is kept read-only, so it is enabled just for the deletion.
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def fire_event(self, event):
        """Queue ``event`` as a Tk event on the top-level window."""
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def destroy(self, *e):
        """Cancel polling and close the window; does nothing if already closed."""
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        """Run the Tk main loop, unless ``in_idle()`` reports we run inside IDLE."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        """Re-enable the paging buttons that apply to the current page."""
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable or disable Previous/Next according to the current page."""
        if self.current_page in (-1, 0):
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.is_last_page(self.current_page):
            self.next["state"] = "disabled"
        else:
            self.next["state"] = "normal"

    def write_results(self, results):
        """Write one "word1 word2" line per collocation into the results box."""
        self.results_box["state"] = "normal"
        for row, pair in enumerate(results, start=1):
            self.results_box.insert(f"{row}.0", pair[0] + " " + pair[1] + "\n")
        self.results_box["state"] = "disabled"


class CollocationsModel:
    """Holds the scored collocations of the selected corpus and pages over them."""

    def __init__(self, queue):
        """Create an empty model that reports load results on ``queue``."""
        self.result_count = None  # page size; assigned by the view's menu
        self.selected_corpus = None
        self.collocations = None  # list of (w1, w2) once a corpus is loaded
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.queue = queue
        self.reset_results()

    def reset_results(self):
        """Forget every page handed out so far."""
        self.result_pages = []
        self.results_returned = 0

    def load_corpus(self, name):
        """Start loading corpus ``name`` in a background thread."""
        self.selected_corpus = name
        self.collocations = None
        # Reset the paging state before the worker starts, so it is never
        # cleared after the worker has already published its result.
        self.reset_results()
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def non_default_corpora(self):
        """Return the sorted corpus names, without the default corpus."""
        names = list(self.CORPORA)
        names.remove(self.DEFAULT_CORPUS)
        names.sort()
        return names

    def is_last_page(self, number):
        """Return True if no collocations remain after page ``number``.

        ``number`` is a zero-based page index; -1 means that no page has been
        shown yet. When no corpus is loaded there is nothing to page through,
        so every page counts as the last one.
        """
        if self.collocations is None:
            return True
        generated = len(self.result_pages)
        if number < generated:
            # Count what the pages up to ``number`` really contain; the last
            # generated page may be shorter than ``result_count``.
            shown = sum(len(page) for page in self.result_pages[: number + 1])
        else:
            # Pages not generated yet would each take ``result_count`` items.
            shown = (
                self.results_returned
                + (number + 1 - generated) * self.result_count
            )
        return shown >= len(self.collocations)

    def next(self, page):
        """Return page ``page``, generating it (and any skipped pages) on demand."""
        while len(self.result_pages) <= page:
            start = self.results_returned
            self.result_pages.append(
                self.collocations[start : start + self.result_count]
            )
            self.results_returned += self.result_count
        return self.result_pages[page]

    def prev(self, page):
        """Return the already generated page ``page``; page -1 is empty."""
        if page == -1:
            return []
        return self.result_pages[page]

    class LoadCorpus(threading.Thread):
        """Worker thread that scores the bigrams of one corpus for a model."""

        def __init__(self, name, model):
            """Remember the corpus ``name`` to load and the ``model`` to fill."""
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            """Score the corpus bigrams, store them on the model, post an event."""
            try:
                words = self.model.CORPORA[self.name]()
                # Words of one or two characters are ignored.
                text = [w for w in words if len(w) > 2]
                # Frequency of each pair of adjacent remaining words.
                fd = FreqDist(zip(text, text[1:]))
                vocab = FreqDist(text)
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                # Any failure is turned into an error event so that the view
                # can report it instead of waiting forever.
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)


def app():
    """Open the collocations window and run its main loop."""
    c = CollocationsView()
    c.mainloop()


if __name__ == "__main__":
    app()

__all__ = ["app"]