ai-content-maker/.venv/Lib/site-packages/gruut/lexicon2db.py

113 lines
3.5 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#!/usr/bin/env python3
"""Converts a text lexicon to a gruut sqlite3 database"""
import argparse
import sqlite3
import sys
# -----------------------------------------------------------------------------
def main(argv=None):
    """Convert a text lexicon into a gruut SQLite database.

    Each usable lexicon line has the form ``<WORD> <PHONEME> <PHONEME> ...``
    or, with ``--role``, ``<WORD> <ROLE> <PHONEME> <PHONEME> ...``.  Blank
    lines, lines starting with ``;`` and lines without a space are skipped.
    One ``word_phonemes`` row is written per remaining line; ``pron_order``
    counts the pronunciations already seen for the same (cased) word.

    The ``word_phonemes`` and ``g2p_alignments`` tables are dropped and
    re-created, so previous contents of those tables are lost.

    Args:
        argv: Command-line arguments to parse.  ``None`` (the default) lets
            argparse fall back to ``sys.argv[1:]``.

    Raises:
        OSError: If the lexicon file cannot be opened.  This happens before
            the database is touched.
        Exception: Any error raised while processing a lexicon line is
            re-raised after the line is reported on stderr; the inserts of
            the current run are rolled back.
    """
    parser = argparse.ArgumentParser(prog="lexicon2db.py")
    parser.add_argument(
        "--casing",
        required=True,
        choices=("keep", "lower", "upper"),
        help="Casing to apply to words",
    )
    parser.add_argument(
        "--lexicon",
        required=True,
        help="Text lexicon to read with <WORD> <PHONEME> <PHONEME> ...",
    )
    parser.add_argument("--database", required=True, help="SQLite database to write")
    parser.add_argument(
        "--role", action="store_true", help="Lexicon includes word roles (2nd column)",
    )
    parser.add_argument(
        "--empty-role",
        default="_",
        help="String used to identify empty word role (see --role)",
    )
    args = parser.parse_args(argv)

    # -------------------------------------------------------------------------

    # "keep" has no entry, so word_casing is None and words are stored as-is.
    word_casing = {"lower": str.lower, "upper": str.upper}.get(args.casing)

    # -------------------------------------------------------------------------

    # Open the input first: a bad lexicon path must fail before the existing
    # tables are dropped.  "-" means read from standard input.
    read_stdin = args.lexicon == "-"
    if read_stdin:
        lexicon_file = sys.stdin
    else:
        lexicon_file = open(args.lexicon, "r", encoding="utf-8")

    conn = None
    try:
        conn = sqlite3.connect(args.database)

        # Re-create tables in output
        conn.execute("DROP TABLE IF EXISTS word_phonemes")
        conn.execute("DROP TABLE IF EXISTS g2p_alignments")
        conn.execute(
            "CREATE TABLE IF NOT EXISTS word_phonemes "
            + "(id INTEGER PRIMARY KEY AUTOINCREMENT, word TEXT, pron_order INTEGER, phonemes TEXT, role TEXT);"
        )
        conn.execute(
            "CREATE TABLE IF NOT EXISTS g2p_alignments "
            + "(id INTEGER PRIMARY KEY AUTOINCREMENT, word TEXT, alignment TEXT);"
        )
        conn.commit()

        # word -> pron_order
        pron_orders = {}

        # "with conn" wraps all inserts in one transaction: committed on
        # success, rolled back on error.  It does NOT close the connection.
        with conn:
            for line_number, line in enumerate(lexicon_file, start=1):
                try:
                    line = line.strip()
                    if (not line) or line.startswith(";") or (" " not in line):
                        # Skip blank lines and comments.
                        # Also skip lines without a pronunciation.
                        continue

                    role = ""
                    if args.role:
                        # With role
                        word, role, phonemes_str = line.split(maxsplit=2)
                        if role == args.empty_role:
                            role = ""
                        elif ":" not in role:
                            # Roles without a namespace get the gruut prefix
                            role = f"gruut:{role}"
                    else:
                        # Without part of speech
                        word, phonemes_str = line.split(maxsplit=1)

                    if word_casing:
                        word = word_casing(word)

                    pron_order = pron_orders.get(word, 0)

                    # Don't commit on every word, or it will be terribly slow
                    conn.execute(
                        "INSERT into word_phonemes (word, pron_order, phonemes, role) VALUES (?, ?, ?, ?)",
                        (word, pron_order, phonemes_str, role),
                    )

                    pron_orders[word] = pron_order + 1
                except Exception:
                    # Report the offending line on stderr, then re-raise with
                    # the original traceback intact.
                    print("Error on line", line_number, "-", line, file=sys.stderr)
                    raise
    finally:
        if conn is not None:
            conn.close()
        # Never close the caller's stdin; only close what was opened here.
        if not read_stdin:
            lexicon_file.close()
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    main()