148 lines
5.9 KiB
Python
148 lines
5.9 KiB
Python
|
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
#         Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
|
|||
|
|
|||
|
"""
The Legality Principle is a language-agnostic principle maintaining that syllable
onsets and codas (the beginnings and ends of syllables not including the vowel)
are only legal if they are found as word onsets or codas in the language. The English
word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found
word-initially in the English language (Bartlett et al.). This principle was first proposed
in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''.

Kahn further argues that there is a ''strong tendency to syllabify in such a way that
initial clusters are of maximal length, consistent with the general constraints on
word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
the longest legal onset is preferable---''Onset Maximization''.

The default implementation assumes an English vowel set, but the `vowels` attribute
can be set to IPA or any other alphabet's vowel set for the use-case.
Both a valid set of vowels and a text corpus of words in the language
are necessary to determine legal onsets and subsequently syllabify words.

The legality principle with onset maximization is a universal syllabification algorithm,
but that does not mean it performs equally across languages. Bartlett et al. (2009)
is a good benchmark for English accuracy if utilizing IPA (pg. 311).

References:

- Otto Jespersen. 1904. Lehrbuch der Phonetik.
  Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11.
- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976).
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
  In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
  Cambridge, MIT Press. pp. 107-136.
- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409–436.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
  In HLT-NAACL. pp. 308-316.
- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley.
"""
|
|||
|
|
|||
|
from collections import Counter
|
|||
|
|
|||
|
from nltk.tokenize.api import TokenizerI
|
|||
|
|
|||
|
|
|||
|
class LegalitySyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Legality Principle and Onset Maximization.

        >>> from nltk.tokenize import LegalitySyllableTokenizer
        >>> from nltk import word_tokenize
        >>> from nltk.corpus import words
        >>> text = "This is a wonderful sentence."
        >>> text_words = word_tokenize(text)
        >>> LP = LegalitySyllableTokenizer(words.words())
        >>> [LP.tokenize(word) for word in text_words]
        [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(
        self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
    ):
        """
        Store the configuration and derive the legal onsets from the corpus.

        :param tokenized_source_text: List of valid tokens in the language
        :type tokenized_source_text: list(str)
        :param vowels: Valid vowels in language or IPA representation
        :type vowels: str
        :param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset
        :type legal_frequency_threshold: float
        """
        # Both attributes must exist before find_legal_onsets() runs,
        # because it reads them.
        self.legal_frequency_threshold = legal_frequency_threshold
        self.vowels = vowels
        self.legal_onsets = self.find_legal_onsets(tokenized_source_text)

    def find_legal_onsets(self, words):
        """
        Collects the onset of every word and returns only those whose
        relative frequency is strictly above ``legal_frequency_threshold``.

        :param words: List of words in a language
        :type words: list(str)
        :return: Set of legal onsets
        :rtype: set(str)
        """
        onset_counts = Counter(self.onset(word) for word in words)
        # One onset is counted per word, so this equals the number of words.
        total = sum(onset_counts.values())
        return {
            cluster
            for cluster, count in onset_counts.items()
            if (count / total) > self.legal_frequency_threshold
        }

    def onset(self, word):
        """
        Returns the lowercased leading characters of ``word`` up to, but not
        including, its first vowel. A word without any vowel is returned
        lowercased in full.

        :param word: Single word or token
        :type word: str
        :return: String of characters of onset
        :rtype: str
        """
        lowered = word.lower()
        for index, char in enumerate(lowered):
            if char in self.vowels:
                return lowered[:index]
        return lowered

    def tokenize(self, token):
        """
        Splits ``token`` into syllables using the Legality Principle in
        combination with Onset Maximization.

        The token is scanned from its last character to its first. Each
        syllable is therefore assembled back to front and flipped at the end.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        finished = []  # completed syllables, each still in reversed order
        chunk = ""  # syllable under construction (reversed, original case)
        cluster = ""  # lowercased characters gathered after the vowel (reversed)
        seen_vowel = False
        # NOTE(review): this flag is never set back to False when a new
        # syllable is started -- confirm that this is intended.
        seen_onset = False

        for char in reversed(token):
            lowered = char.lower()
            is_vowel = lowered in self.vowels

            if not seen_vowel:
                # Still collecting the trailing part of the syllable up to
                # and including its first vowel (seen from the right).
                chunk += char
                seen_vowel = is_vowel
                continue

            if lowered + cluster[::-1] in self.legal_onsets:
                # Extending the cluster to the left still gives a legal onset.
                chunk += char
                cluster += lowered
                seen_onset = True
            elif is_vowel and not seen_onset:
                # Adjacent vowel before any onset consonant was accepted.
                # NOTE(review): the vowel is also added to the cluster used
                # for the legal-onset lookup above -- confirm intended.
                chunk += char
                cluster += lowered
            else:
                # Character cannot join this syllable: close it, start anew.
                finished.append(chunk)
                chunk = char
                cluster = ""
                seen_vowel = is_vowel

        finished.append(chunk)
        # Undo both reversals: character order within each syllable and the
        # order of the syllables themselves.
        return [part[::-1] for part in reversed(finished)]
|