ai-content-maker/.venv/Lib/site-packages/g2pkk/utils.py

import re
from jamo import h2j, j2h
import os

############## English ##############
def adjust(arpabets):
    '''Modify arpabets so that it fits our processes'''
    string = " " + " ".join(arpabets) + " $"
    string = re.sub("\d", "", string)
    string = string.replace(" T S ", " TS ")
    string = string.replace(" D Z ", " DZ ")
    string = string.replace(" AW ER ", " AWER ")
    string = string.replace(" IH R $", " IH ER ")
    string = string.replace(" EH R $", " EH ER ")
    string = string.replace(" $", "")

    return string.strip("$ ").split()


def to_choseong(arpabet):
    '''Arpabet to choseong or onset'''
    d = \
        {'B': 'ᄇ',
         'CH': 'ᄎ',
         'D': 'ᄃ',
         'DH': 'ᄃ',
         'DZ': 'ᄌ',
         'F': 'ᄑ',
         'G': 'ᄀ',
         'HH': 'ᄒ',
         'JH': 'ᄌ',
         'K': 'ᄏ',
         'L': 'ᄅ',
         'M': 'ᄆ',
         'N': 'ᄂ',
         'NG': 'ᄋ',
         'P': 'ᄑ',
         'R': 'ᄅ',
         'S': 'ᄉ',
         'SH': 'ᄉ',
         'T': 'ᄐ',
         'TH': 'ᄉ',
         'TS': 'ᄎ',
         'V': 'ᄇ',
         'W': 'W',
         'Y': 'Y',
         'Z': 'ᄌ',
         'ZH': 'ᄌ'}

    return d.get(arpabet, arpabet)

def to_jungseong(arpabet):
    '''Arpabet to jungseong or vowel'''
    d = \
        {'AA': 'ᅡ',
         'AE': 'ᅢ',
         'AH': 'ᅥ',
         'AO': 'ᅩ',
         'AW': 'ᅡ우',
         'AWER': "ᅡ워",
         'AY': 'ᅡ이',
         'EH': 'ᅦ',
         'ER': 'ᅥ',
         'EY': 'ᅦ이',
         'IH': 'ᅵ',
         'IY': 'ᅵ',
         'OW': 'ᅩ',
         'OY': 'ᅩ이',
         'UH': 'ᅮ',
         'UW': 'ᅮ'}
    return d.get(arpabet, arpabet)

def to_jongseong(arpabet):
    '''Arpabet to jongseong or coda'''
    d = \
        {'B': 'ᆸ',
         'CH': 'ᆾ',
         'D': 'ᆮ',
         'DH': 'ᆮ',
         'F': 'ᇁ',
         'G': 'ᆨ',
         'HH': 'ᇂ',
         'JH': 'ᆽ',
         'K': 'ᆨ',
         'L': 'ᆯ',
         'M': 'ᆷ',
         'N': 'ᆫ',
         'NG': 'ᆼ',
         'P': 'ᆸ',
         'R': 'ᆯ',
         'S': 'ᆺ',
         'SH': 'ᆺ',
         'T': 'ᆺ',
         'TH': 'ᆺ',
         'V': 'ᆸ',
         'W': 'ᆼ',
         'Y': 'ᆼ',
         'Z': 'ᆽ',
         'ZH': 'ᆽ'}

    return d.get(arpabet, arpabet)


def reconstruct(string):
    '''Some postprocessing rules'''
    pairs = [("그W", "ᄀW"),
             ("흐W", "ᄒW"),
             ("크W", "ᄏW"),
             ("ᄂYᅥ", "니어"),
             ("ᄃYᅥ", "디어"),
             ("ᄅYᅥ", "리어"),
             ("Yᅵ", "ᅵ"),
             ("Yᅡ", "ᅣ"),
             ("Yᅢ", "ᅤ"),
             ("Yᅥ", "ᅧ"),
             ("Yᅦ", "ᅨ"),
             ("Yᅩ", "ᅭ"),
             ("Yᅮ", "ᅲ"),
             ("Wᅡ", "ᅪ"),
             ("Wᅢ", "ᅫ"),
             ("Wᅥ", "ᅯ"),
             ("Wᅩ", "ᅯ"),
             ("Wᅮ", "ᅮ"),
             ("Wᅦ", "ᅰ"),
             ("Wᅵ", "ᅱ"),
             ("ᅳᅵ", "ᅴ"),
             ("Y", "ᅵ"),
             ("W", "ᅮ")
             ]
    for str1, str2 in pairs:
        string = string.replace(str1, str2)
    return string


############## Hangul ##############
def parse_table():
    '''Parse the main rule table'''
    with open(os.path.dirname(os.path.abspath(__file__)) + '/table.csv', 'r', encoding='utf8') as f:
        lines = f.read().splitlines()
        onsets = lines[0].split(",")
        table = []
        for line in lines[1:]:
            cols = line.split(",")
            coda = cols[0]
            for i, onset in enumerate(onsets):
                cell = cols[i]
                if len(cell) == 0: continue
                if i == 0:
                    continue
                else:
                    str1 = f"{coda}{onset}"
                    if "(" in cell:
                        str2 = cell.split("(")[0]
                        rule_ids = cell.split("(")[1][:-1].split("/")
                    else:
                        str2 = cell
                        rule_ids = []

                    table.append((str1, str2, rule_ids))

    return table


############## Preprocessing ##############
def annotate(string, mecab):
    tokens = mecab.pos(string)
    if string.replace(" ", "") != "".join(token for token, _ in tokens):
        return string
    blanks = [i for i, char in enumerate(string) if char == " "]

    tag_seq = []
    for token, tag in tokens:
        tag = tag.split("+")[-1]
        if tag == "NNBC":  # bound noun
            tag = "B"
        else:
            tag = tag[0]
        tag_seq.append("_" * (len(token) - 1) + tag)
    tag_seq = "".join(tag_seq)

    for i in blanks:
        tag_seq = tag_seq[:i] + " " + tag_seq[i:]

    annotated = ""
    for char, tag in zip(string, tag_seq):
        annotated += char
        if char == "의" and tag == "J":
            annotated += "/J"
        elif tag == "E":
            if h2j(char)[-1] in "ᆯ":
                annotated += "/E"
        elif tag == "V":
            if h2j(char)[-1] in "ᆫᆬᆷᆱᆰᆲᆴ":
                annotated += "/P"
        elif tag == "B":  # bound noun
            annotated += "/B"

    return annotated


############## Postprocessing ##############
def compose(letters):
    # insert placeholder
    letters = re.sub("(^|[^\u1100-\u1112])([\u1161-\u1175])", r"\1ᄋ\2", letters)

    string = letters # assembled characters
    # c+v+c
    syls = set(re.findall("[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]", string))
    for syl in syls:
        string = string.replace(syl, j2h(*syl))

    # c+v
    syls = set(re.findall("[\u1100-\u1112][\u1161-\u1175]", string))
    for syl in syls:
        string = string.replace(syl, j2h(*syl))

    return string


def group(inp):
    '''For group_vowels=True
    Contemporarily, Korean speakers don't distinguish some vowels.
    '''
    inp = inp.replace("ᅢ", "ᅦ")
    inp = inp.replace("ᅤ", "ᅨ")
    inp = inp.replace("ᅫ", "ᅬ")
    inp = inp.replace("ᅰ", "ᅬ")

    return inp


def _get_examples():
    '''For internal use'''
    text = open('rules.txt', 'r', encoding='utf8').read().splitlines()
    examples = []
    for line in text:
        if line.startswith("->"):
            examples.extend(re.findall("([ㄱ-힣][ ㄱ-힣]*)\[([ㄱ-힣][ ㄱ-힣]*)]", line))
    _examples = []
    for inp, gt in examples:
        for each in gt.split("/"):
            _examples.append((inp, each))

    return _examples


############## Utilities ##############
def get_rule_id2text():
    '''for verbose=True'''
    with open(os.path.dirname(os.path.abspath(__file__)) + '/rules.txt', 'r', encoding='utf8') as f:
        rules = f.read().strip().split("\n\n")
        rule_id2text = dict()
        for rule in rules:
            rule_id, texts = rule.splitlines()[0], rule.splitlines()[1:]
            rule_id2text[rule_id.strip()] = "\n".join(texts)
    return rule_id2text


def gloss(verbose, out, inp, rule):
    '''displays the process and relevant information'''
    if verbose and out != inp and out != re.sub("/[EJPB]", "", inp):
        print(compose(inp), "->", compose(out))
        print("\033[1;31m", rule, "\033[0m")