152 lines
2.8 KiB
Python
152 lines
2.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import os
import re
|
|
|
|
# Symbols accepted in a pronunciation string, kept in sorted order.
# Two-letter codes such as "AA" are listed bare and with a 0/1/2 suffix
# (presumably the CMUdict stress markers); the remaining codes appear bare.
VALID_SYMBOLS = [
    "AA", "AA0", "AA1", "AA2",
    "AE", "AE0", "AE1", "AE2",
    "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2",
    "AW", "AW0", "AW1", "AW2",
    "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH",
    "EH", "EH0", "EH1", "EH2",
    "ER", "ER0", "ER1", "ER2",
    "EY", "EY0", "EY1", "EY2",
    "F", "G", "HH",
    "IH", "IH0", "IH1", "IH2",
    "IY", "IY0", "IY1", "IY2",
    "JH", "K", "L", "M", "N", "NG",
    "OW", "OW0", "OW1", "OW2",
    "OY", "OY0", "OY1", "OY2",
    "P", "R", "S", "SH", "T", "TH",
    "UH", "UH0", "UH1", "UH2",
    "UW", "UW0", "UW1", "UW2",
    "V", "W", "Y", "Z", "ZH",
]
|
|
|
|
|
|
class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Holds a mapping from upper-cased word to a list of pronunciation strings,
    each pronunciation being space-separated symbols.
    """

    def __init__(self, file_or_path, keep_ambiguous=True):
        """Load dictionary entries.

        Args:
            file_or_path: Either a filesystem path (``str`` or ``os.PathLike``),
                which is opened as latin-1 text, or an iterable of text lines
                such as an already-open file.
            keep_ambiguous: When false, words that have more than one
                pronunciation are dropped entirely.
        """
        # Accept pathlib.Path and other path-like objects as well as str;
        # anything else is assumed to be an iterable of lines.
        if isinstance(file_or_path, (str, os.PathLike)):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {
                word: prons for word, prons in entries.items() if len(prons) == 1
            }
        self._entries = entries

    def __len__(self):
        """Return the number of distinct words stored."""
        return len(self._entries)

    def lookup(self, word):
        """Return the list of ARPAbet pronunciations of ``word``.

        The lookup is case-insensitive (the word is upper-cased first).
        Returns ``None`` when the word is not in the dictionary.
        """
        return self._entries.get(word.upper())

    @staticmethod
    def get_arpabet(word, cmudict, punctuation_symbols):
        """Return ``word`` rewritten as ``{PRONUNCIATION}`` when it is known.

        At most one leading and one trailing character found in
        ``punctuation_symbols`` is peeled off before the lookup and re-attached
        around the result. The first pronunciation returned by
        ``cmudict.lookup`` is used. Words that are not found are returned
        unchanged (punctuation included).
        """
        first_symbol, last_symbol = "", ""
        if word and word[0] in punctuation_symbols:
            first_symbol = word[0]
            word = word[1:]
        # Re-check emptiness: the word may have been a single punctuation mark.
        if word and word[-1] in punctuation_symbols:
            last_symbol = word[-1]
            word = word[:-1]
        arpabet = cmudict.lookup(word)
        if arpabet is not None:
            return first_symbol + "{%s}" % arpabet[0] + last_symbol
        return first_symbol + word + last_symbol
|
|
|
|
|
|
# Matches a parenthesised run of digits such as "(1)". _parse_cmudict strips
# these from the headword so numbered variants of a word share one key.
_alt_re = re.compile(r"\([0-9]+\)")
|
|
|
|
|
|
def _parse_cmudict(file):
    """Parse CMUDict-style lines into ``{word: [pronunciation, ...]}``.

    Args:
        file: Iterable of text lines (for example an open file).

    Returns:
        Dict mapping each headword to the list of its pronunciations, in the
        order they were encountered.

    Only lines whose first character is ``A``-``Z`` or an apostrophe are
    considered. The headword and pronunciation are separated by two spaces;
    a ``(N)`` suffix on the headword is removed so that numbered variants
    accumulate under the same key. Lines without the separator, or whose
    pronunciation is rejected by ``_get_pronunciation``, are skipped.
    """
    cmudict = {}
    for line in file:
        if not line or not ("A" <= line[0] <= "Z" or line[0] == "'"):
            continue
        parts = line.split("  ")
        if len(parts) < 2:
            # No "WORD  PRONUNCIATION" separator: skip the malformed line
            # rather than failing the whole load with an IndexError.
            continue
        word = _alt_re.sub("", parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
|
|
|
|
|
|
def _get_pronunciation(s):
    """Validate a raw pronunciation string.

    Surrounding whitespace is stripped and the text is split on single
    spaces. Returns the symbols re-joined by single spaces when every one of
    them is in ``VALID_SYMBOLS``; otherwise returns ``None``.
    """
    symbols = s.strip().split(" ")
    if all(symbol in VALID_SYMBOLS for symbol in symbols):
        return " ".join(symbols)
    return None
|