# ai-content-maker/.venv/Lib/site-packages/g2pkk/utils.py
# (viewer metadata: 268 lines, 7.1 KiB, Python — commented out; it was pasted
# into the source as bare text and is not valid Python)

import re
from jamo import h2j, j2h
import os
############## English ##############
def adjust(arpabets):
    '''Normalize a list of ARPAbet symbols for the later jamo-mapping steps.

    Strips stress digits (e.g. "AH0" -> "AH"), fuses two-symbol affricates
    ("T S" -> "TS", "D Z" -> "DZ"), and rewrites a few vowel+R sequences
    ("AW ER" anywhere; "IH R" / "EH R" only word-finally) so each remaining
    symbol maps to one Korean sound.

    Args:
        arpabets: list of ARPAbet symbol strings.

    Returns:
        A new list of adjusted ARPAbet symbol strings.
    '''
    # Pad with spaces and a "$" end marker so every rule can anchor on
    # " SYM " and end-of-word rules (" IH R $") can fire.
    string = " " + " ".join(arpabets) + " $"
    # Raw string for the regex: "\d" in a plain literal is a SyntaxWarning
    # on Python 3.12+.
    string = re.sub(r"\d", "", string)  # drop stress markers
    string = string.replace(" T S ", " TS ")
    string = string.replace(" D Z ", " DZ ")
    string = string.replace(" AW ER ", " AWER ")
    string = string.replace(" IH R $", " IH ER ")
    string = string.replace(" EH R $", " EH ER ")
    string = string.replace(" $", "")
    return string.strip("$ ").split()
def to_choseong(arpabet):
    '''Arpabet to choseong or onset.

    Maps one ARPAbet consonant symbol to the Hangul choseong (syllable-initial
    consonant) jamo; symbols not in the table pass through unchanged. 'W' and
    'Y' stay as ASCII placeholders that `reconstruct` resolves later.

    NOTE(review): every value below except 'W' and 'Y' is an empty string —
    the non-ASCII jamo characters appear to have been stripped from this copy
    of the file (encoding/paste loss). As written this deletes consonants
    instead of transliterating them; restore the jamo values from the
    upstream package before relying on this function.
    '''
    d = \
        {'B': '',
         'CH': '',
         'D': '',
         'DH': '',
         'DZ': '',
         'F': '',
         'G': '',
         'HH': '',
         'JH': '',
         'K': '',
         'L': '',
         'M': '',
         'N': '',
         'NG': '',
         'P': '',
         'R': '',
         'S': '',
         'SH': '',
         'T': '',
         'TH': '',
         'TS': '',
         'V': '',
         'W': 'W',
         'Y': 'Y',
         'Z': '',
         'ZH': ''}
    # Unknown symbols (e.g. vowels) are returned unchanged.
    return d.get(arpabet, arpabet)
def to_jungseong(arpabet):
    '''Arpabet to jungseong or vowel.

    Maps one ARPAbet vowel symbol to Hangul jungseong (medial vowel) jamo;
    unmapped symbols pass through unchanged. Diphthongs map to a vowel jamo
    plus a following full syllable (e.g. 'AY' -> 'ᅡ이').

    NOTE(review): the single-vowel entries ('AA', 'AE', 'AH', 'AO', 'EH',
    'ER', 'IH', 'IY', 'OW', 'UH', 'UW') are empty strings — their jamo
    values appear to have been stripped from this copy of the file (the
    multi-character diphthong values survived). Restore the missing jamo
    from the upstream package before use.
    '''
    d = \
        {'AA': '',
         'AE': '',
         'AH': '',
         'AO': '',
         'AW': 'ᅡ우',
         'AWER': "ᅡ워",
         'AY': 'ᅡ이',
         'EH': '',
         'ER': '',
         'EY': 'ᅦ이',
         'IH': '',
         'IY': '',
         'OW': '',
         'OY': 'ᅩ이',
         'UH': '',
         'UW': ''}
    # Unknown symbols (e.g. consonants) are returned unchanged.
    return d.get(arpabet, arpabet)
def to_jongseong(arpabet):
    '''Arpabet to jongseong or coda.

    Maps one ARPAbet consonant symbol to the Hangul jongseong (syllable-final
    consonant) jamo; unmapped symbols pass through unchanged.

    NOTE(review): every value below is an empty string — the non-ASCII jamo
    characters appear to have been stripped from this copy of the file.
    As written this deletes coda consonants instead of transliterating them;
    restore the jamo values from the upstream package before use.
    '''
    d = \
        {'B': '',
         'CH': '',
         'D': '',
         'DH': '',
         'F': '',
         'G': '',
         'HH': '',
         'JH': '',
         'K': '',
         'L': '',
         'M': '',
         'N': '',
         'NG': '',
         'P': '',
         'R': '',
         'S': '',
         'SH': '',
         'T': '',
         'TH': '',
         'V': '',
         'W': '',
         'Y': '',
         'Z': '',
         'ZH': ''}
    # Unknown symbols (e.g. vowels) are returned unchanged.
    return d.get(arpabet, arpabet)
def reconstruct(string):
    '''Some postprocessing rules.

    Rewrites jamo sequences produced by the English-to-jamo mapping into
    final Korean forms — in particular it resolves the 'W'/'Y' glide
    placeholders left by `to_choseong` when they are followed by a vowel.

    NOTE(review): many replacement targets below are empty strings — the
    composed jamo they should rewrite to appear to have been stripped from
    this copy of the file. As written, those rules delete the glide and
    vowel entirely instead of rewriting them; restore the targets from the
    upstream package before use.
    '''
    # Each (str1, str2) pair is applied as a plain substring replacement,
    # in order.
    pairs = [("그W", "ᄀW"),
             ("흐W", "ᄒW"),
             ("크W", "ᄏW"),
             ("ᄂYᅥ", "니어"),
             ("ᄃYᅥ", "디어"),
             ("ᄅYᅥ", "리어"),
             ("Yᅵ", ""),
             ("Yᅡ", ""),
             ("Yᅢ", ""),
             ("Yᅥ", ""),
             ("Yᅦ", ""),
             ("Yᅩ", ""),
             ("Yᅮ", ""),
             ("Wᅡ", ""),
             ("Wᅢ", ""),
             ("Wᅥ", ""),
             ("Wᅩ", ""),
             ("Wᅮ", ""),
             ("Wᅦ", ""),
             ("Wᅵ", ""),
             ("ᅳᅵ", ""),
             ("Y", ""),
             ("W", "")
             ]
    for str1, str2 in pairs:
        string = string.replace(str1, str2)
    return string
############## Hangul ##############
def parse_table():
    '''Parse the main rule table.

    Reads table.csv (shipped next to this module). The header row lists
    onsets; each following row starts with a coda. Every non-empty cell
    yields one rewrite entry: (coda+onset, replacement, rule_ids), where
    rule ids come from an optional trailing "(id1/id2)" suffix.
    '''
    table_path = os.path.dirname(os.path.abspath(__file__)) + '/table.csv'
    with open(table_path, 'r', encoding='utf8') as f:
        rows = f.read().splitlines()

    onsets = rows[0].split(",")
    entries = []
    for row in rows[1:]:
        cells = row.split(",")
        coda = cells[0]
        # Column 0 is the coda label itself; data cells start at index 1.
        for col, onset in enumerate(onsets[1:], start=1):
            cell = cells[col]
            if not cell:
                continue
            if "(" in cell:
                # "replacement(id1/id2)" -> split off the rule-id suffix
                parts = cell.split("(")
                replacement = parts[0]
                rule_ids = parts[1][:-1].split("/")
            else:
                replacement = cell
                rule_ids = []
            entries.append((f"{coda}{onset}", replacement, rule_ids))
    return entries
############## Preprocessing ##############
def annotate(string, mecab):
    '''Attach POS-derived markers to a Hangul string.

    Uses the given mecab POS tagger to append "/J" (particle), "/E"
    (ending), "/P" (predicate) or "/B" (bound noun) after the characters
    that trigger pronunciation rules. If the tagger's tokens do not re-join
    into the input with spaces removed, the string is returned unmarked.

    NOTE(review): the literals in `char == ""` and `h2j(char)[-1] in ""`
    below are empty strings — the Korean characters they originally
    compared against appear to have been stripped from this copy of the
    file, so the /J and /E branches can never fire as written. Restore
    them from the upstream package.
    '''
    tokens = mecab.pos(string)
    # Bail out if tokenization lost or added characters relative to the input.
    if string.replace(" ", "") != "".join(token for token, _ in tokens):
        return string
    # Remember where the blanks were so the tag sequence can be re-aligned.
    blanks = [i for i, char in enumerate(string) if char == " "]
    tag_seq = []
    for token, tag in tokens:
        tag = tag.split("+")[-1]  # keep only the last sub-tag of a compound tag
        if tag == "NNBC":  # bound noun
            tag = "B"
        else:
            tag = tag[0]
        # Pad with "_" so the tag string aligns 1:1 with the token's characters.
        tag_seq.append("_" * (len(token) - 1) + tag)
    tag_seq = "".join(tag_seq)
    # Re-insert the blanks so tag_seq aligns with the original string.
    for i in blanks:
        tag_seq = tag_seq[:i] + " " + tag_seq[i:]
    annotated = ""
    for char, tag in zip(string, tag_seq):
        annotated += char
        if char == "" and tag == "J":  # NOTE(review): empty literal — always False
            annotated += "/J"
        elif tag == "E":
            if h2j(char)[-1] in "":  # NOTE(review): empty literal — always False
                annotated += "/E"
        elif tag == "V":
            # Mark predicates whose final jamo is one of these codas.
            if h2j(char)[-1] in "ᆫᆬᆷᆱᆰᆲᆴ":
                annotated += "/P"
        elif tag == "B":  # bound noun
            annotated += "/B"
    return annotated
############## Postprocessing ##############
def compose(letters):
    '''Assemble a sequence of Hangul jamo into precomposed syllables.

    Inserts the silent placeholder onset before any vowel that lacks a
    leading consonant, then folds onset+vowel+coda triples and
    onset+vowel pairs into single precomposed Hangul characters.
    '''
    # Vowel with no preceding onset gets the placeholder consonant.
    text = re.sub("(^|[^\u1100-\u1112])([\u1161-\u1175])", r"\1ᄋ\2", letters)
    # Fold full c+v+c syllables first, then the remaining c+v pairs.
    patterns = (
        "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]",  # onset+vowel+coda
        "[\u1100-\u1112][\u1161-\u1175]",                 # onset+vowel
    )
    for pattern in patterns:
        for syllable in set(re.findall(pattern, text)):
            text = text.replace(syllable, j2h(*syllable))
    return text
def group(inp):
    '''For group_vowels=True
    Contemporarily, Korean speakers don't distinguish some vowels.

    Merges vowel jamo that modern speakers pronounce identically, via
    plain substring replacement.

    NOTE(review): all four replace() calls below have empty-string
    arguments — the vowel jamo pairs they originally merged appear to
    have been stripped from this copy of the file. str.replace("", "")
    is a no-op, so this function currently returns its input unchanged;
    restore the vowel pairs from the upstream package.
    '''
    inp = inp.replace("", "")
    inp = inp.replace("", "")
    inp = inp.replace("", "")
    inp = inp.replace("", "")
    return inp
def _get_examples():
    '''For internal use: extract (input, expected) example pairs from rules.txt.

    Scans the "->" lines of rules.txt for examples of the form
    "input[output]"; an output may list alternatives separated by "/",
    each of which is expanded into its own (input, output) pair.

    Returns:
        list of (input, expected_output) string tuples.
    '''
    # Resolve rules.txt next to this module, consistent with
    # get_rule_id2text, and use a context manager so the file handle is
    # closed deterministically (the original open() was never closed).
    path = os.path.dirname(os.path.abspath(__file__)) + '/rules.txt'
    with open(path, 'r', encoding='utf8') as f:
        text = f.read().splitlines()
    examples = []
    for line in text:
        if line.startswith("->"):
            # Raw string so "\[" is not an invalid escape (SyntaxWarning on 3.12+).
            examples.extend(re.findall(r"([ㄱ-힣][ ㄱ-힣]*)\[([ㄱ-힣][ ㄱ-힣]*)]", line))
    _examples = []
    for inp, gt in examples:
        for each in gt.split("/"):
            _examples.append((inp, each))
    return _examples
############## Utilities ##############
def get_rule_id2text():
    '''for verbose=True

    Loads rules.txt (next to this module) and returns a dict mapping each
    rule id (the first line of a blank-line-separated paragraph) to the
    rest of that paragraph's text.
    '''
    rules_path = os.path.dirname(os.path.abspath(__file__)) + '/rules.txt'
    with open(rules_path, 'r', encoding='utf8') as f:
        paragraphs = f.read().strip().split("\n\n")

    rule_id2text = dict()
    for paragraph in paragraphs:
        lines = paragraph.splitlines()
        # First line is the rule id; the remainder is its description.
        rule_id2text[lines[0].strip()] = "\n".join(lines[1:])
    return rule_id2text
def gloss(verbose, out, inp, rule):
    '''displays the process and relevant information

    Prints "input -> output" (composed into Hangul syllables) plus the
    rule text in red, but only in verbose mode and only when the rule
    actually changed something beyond stripping the /E /J /P /B markers.
    '''
    if not verbose:
        return
    # Skip no-op steps: identical strings, or differing only by annotations.
    if out == inp or out == re.sub("/[EJPB]", "", inp):
        return
    print(compose(inp), "->", compose(out))
    print("\033[1;31m", rule, "\033[0m")