73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Converts a Phonetisaurus G2P corpus to an sqlite database"""
|
|
import argparse
|
|
import sqlite3
|
|
|
|
# -----------------------------------------------------------------------------
|
|
|
|
|
|
def main():
|
|
"""Main entry point"""
|
|
parser = argparse.ArgumentParser(prog="corpus2db.py")
|
|
parser.add_argument(
|
|
"--corpus", required=True, help="Phonetisaurus G2P corpus file to read"
|
|
)
|
|
parser.add_argument("--database", required=True, help="SQLite database to write")
|
|
parser.add_argument(
|
|
"--no-drop",
|
|
action="store_true",
|
|
help="Don't drop existing g2p_alignments table",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
# -------------------------------------------------------------------------
|
|
|
|
conn = sqlite3.connect(args.database)
|
|
|
|
g2p_alignments = {}
|
|
with open(args.corpus, "r", encoding="utf-8") as corpus_file:
|
|
for line in corpus_file:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
|
|
word = ""
|
|
|
|
# Parse line
|
|
parts = line.split()
|
|
for part in parts:
|
|
# Assume default delimiters:
|
|
# } separates input/output
|
|
# | separates input/output tokens
|
|
# _ indicates empty output
|
|
part_in, _part_out = part.split("}")
|
|
part_ins = part_in.split("|")
|
|
word += "".join(part_ins)
|
|
|
|
if word and (word not in g2p_alignments):
|
|
g2p_alignments[word] = line
|
|
|
|
# Add to database
|
|
with conn:
|
|
if not args.no_drop:
|
|
conn.execute("DROP TABLE IF EXISTS g2p_alignments")
|
|
|
|
conn.execute(
|
|
"CREATE TABLE IF NOT EXISTS g2p_alignments "
|
|
+ "(id INTEGER PRIMARY KEY AUTOINCREMENT, word TEXT, alignment TEXT);"
|
|
)
|
|
|
|
for word, alignment in g2p_alignments.items():
|
|
conn.execute(
|
|
"INSERT INTO g2p_alignments (word, alignment) VALUES (?, ?)",
|
|
(word, alignment),
|
|
)
|
|
|
|
print("Added", len(g2p_alignments), "alignments to", args.database)
|
|
|
|
|
|
# -----------------------------------------------------------------------------
|
|
|
|
if __name__ == "__main__":
|
|
main()
|