# Listing metadata: 171 lines, 5.8 KiB, Python
# -*- coding: utf-8 -*-
|
|
|
|
|
|
import os, re, platform, sys, importlib
|
|
import subprocess
|
|
|
|
import nltk
|
|
from jamo import h2j
|
|
from nltk.corpus import cmudict
|
|
|
|
# For further info. about cmu dict, consult http://www.speech.cs.cmu.edu/cgi-bin/cmudict.
# Look up the CMU pronouncing dictionary archive in the local NLTK data;
# when the lookup fails with LookupError, download the corpus.
# NOTE: this runs at import time, so importing this module may trigger a download.
try:
    nltk.data.find('corpora/cmudict.zip')
except LookupError:
    nltk.download('cmudict')
|
|
|
|
from g2pkk.special import jyeo, ye, consonant_ui, josa_ui, vowel_ui, jamo, rieulgiyeok, rieulbieub, verb_nieun, balb, palatalize, modifying_rieul
|
|
from g2pkk.regular import link1, link2, link3, link4
|
|
from g2pkk.utils import annotate, compose, group, gloss, parse_table, get_rule_id2text
|
|
from g2pkk.english import convert_eng
|
|
from g2pkk.numerals import convert_num
|
|
|
|
|
|
class G2p(object):
    '''Callable converter that rewrites a Korean sentence into its pronunciation.

    Construction makes sure a MeCab binding is importable (installing it with
    pip when it is missing), instantiates the tagger and loads the rule table,
    the CMU dictionary and the rule descriptions. Calling the instance runs the
    pipeline documented in `__call__`.
    '''

    def __init__(self):
        self.check_mecab()
        self.mecab = self.get_mecab()
        self.table = parse_table()

        self.cmu = cmudict.dict()  # for English

        self.rule2text = get_rule_id2text()  # for comments of main rules
        self.idioms_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "idioms.txt")

    def load_module_func(self, module_name):
        '''Import the module called `module_name` and return the module object.

        For a dotted name the leaf module is returned (not the top-level package).
        '''
        return importlib.import_module(module_name)

    def _install_if_missing(self, module_name, package_name):
        '''Install `package_name` with pip when `module_name` cannot be found.

        The install is best effort: pip's exit status is not checked here, a
        failure surfaces later when `get_mecab` tries to load the module.
        '''
        # `import importlib` alone does not guarantee that the `util`
        # submodule is loaded, so import it explicitly before using it.
        import importlib.util

        if importlib.util.find_spec(module_name) is not None:
            return

        print(f'you have to install {package_name}. install it...')
        # Run pip through the current interpreter so that the package lands in
        # the environment this process is actually using.
        subprocess.call([sys.executable, "-m", "pip", "install", package_name])
        # Let the import system notice the files that were just installed.
        importlib.invalidate_caches()

    def check_mecab(self):
        '''Make sure the platform-specific MeCab binding is installed.

        Windows uses `eunjeon`; every other platform uses `python-mecab-ko`
        (imported as `mecab`).
        '''
        if platform.system() == 'Windows':
            self._install_if_missing("eunjeon", "eunjeon")
        else:
            self._install_if_missing("mecab", "python-mecab-ko")

    def get_mecab(self):
        '''Instantiate and return the platform-specific MeCab tagger.

        Raises:
            ImportError: if the binding cannot be imported or instantiated.
                The original error is attached as `__cause__`.
        '''
        if platform.system() == 'Windows':
            module_name, class_name, package_name = 'eunjeon', 'Mecab', 'eunjeon'
        else:
            module_name, class_name, package_name = 'mecab', 'MeCab', 'python-mecab-ko'

        try:
            module = self.load_module_func(module_name)
            return getattr(module, class_name)()
        except Exception as e:
            # Fail loudly here instead of handing back None, which would only
            # postpone the failure to the first use of `self.mecab`.
            raise ImportError(
                f'you have to install {package_name}. "pip install {package_name}"'
            ) from e

    def idioms(self, string, descriptive=False, verbose=False):
        '''Process each line in `idioms.txt`
        Each line is delimited by "===",
        and the left string is replaced by the right one.
        string: input string.
        descriptive: not used.
        verbose: boolean.

        Text after "#" on a line is ignored. The left string is applied as a
        regular expression (`re.sub`). Returns the rewritten string.

        >>> idioms("지금 mp3 파일을 다운받고 있어요")
        지금 엠피쓰리 파일을 다운받고 있어요
        '''
        rule = "from idioms.txt"
        out = string

        with open(self.idioms_path, 'r', encoding="utf8") as f:
            for line in f:
                line = line.split("#")[0].strip()
                if "===" in line:
                    str1, str2 = line.split("===")
                    out = re.sub(str1, str2, out)
                    gloss(verbose, out, string, rule)

        return out

    def __call__(self, string, descriptive=False, verbose=False, group_vowels=False, to_syl=True):
        '''Main function
        string: input string
        descriptive: boolean.
        verbose: boolean
        group_vowels: boolean. If True, the vowels of the identical sound are normalized.
        to_syl: boolean. If True, hangul letters or jamo are assembled to form syllables.

        Returns the converted string.

        For example, given an input string "나의 친구가 mp3 file 3개를 다운받고 있다",
        STEP 1. idioms
        -> 나의 친구가 엠피쓰리 file 3개를 다운받고 있다

        STEP 2. English to Hangul
        -> 나의 친구가 엠피쓰리 파일 3개를 다운받고 있다

        STEP 3. annotate
        -> 나의/J 친구가 엠피쓰리 파일 3개/B를 다운받고 있다

        STEP 4. Spell out arabic numbers
        -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다

        STEP 5. decompose
        -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다

        STEP 6-9. Hangul
        -> 나의 친구가 엠피쓰리 파일 세개를 다운받꼬 읻따
        '''
        # 1. idioms
        string = self.idioms(string, descriptive, verbose)

        # 2 English to Hangul
        string = convert_eng(string, self.cmu)

        # 3. annotate
        string = annotate(string, self.mecab)

        # 4. Spell out arabic numbers
        string = convert_num(string)

        # 5. decompose
        inp = h2j(string)

        # 6. special
        special_rules = (
            jyeo, ye, consonant_ui, josa_ui, vowel_ui,
            jamo, rieulgiyeok, rieulbieub, verb_nieun,
            balb, palatalize, modifying_rieul,
        )
        for func in special_rules:
            inp = func(inp, descriptive, verbose)
        # Drop the "/P", "/J", "/E", "/B" tags once the special rules have run.
        inp = re.sub("/[PJEB]", "", inp)

        # 7. regular table: batchim + onset
        for str1, str2, rule_ids in self.table:
            _inp = inp
            inp = re.sub(str1, str2, inp)

            if len(rule_ids) > 0:
                rule = "\n".join(self.rule2text.get(rule_id, "") for rule_id in rule_ids)
            else:
                rule = ""
            gloss(verbose, inp, _inp, rule)

        # 8 link
        for func in (link1, link2, link3, link4):
            inp = func(inp, descriptive, verbose)

        # 9. postprocessing
        if group_vowels:
            inp = group(inp)

        if to_syl:
            inp = compose(inp)
        return inp
|
|
|
|
if __name__ == "__main__":
    # Smoke run: build the converter and push one mixed
    # Korean / English / digit sentence through the whole pipeline.
    sample = "나의 친구가 mp3 file 3개를 다운받고 있다"
    g2p = G2p()
    g2p(sample)
|