# Source: ai-content-maker/.venv/Lib/site-packages/language_data/names.py
#
# Note: this file contains non-ASCII Unicode characters that may be confused
# with similar-looking ASCII characters. Review string literals with escapes
# revealed before editing them.

import marisa_trie
import warnings
from language_data.util import data_filename
# Cache of loaded tries, keyed by "<language>/name_to_<category>", so that a
# trie that has already been loaded is not read from disk again.
TRIES = {}

# This is something we could hypothetically discover from XML files, but
# we end up learning that most languages separate things with commas, with
# a few exceptions. We'll just put those exceptions here.
#
# Non-ASCII separators are written as escapes so that they cannot be silently
# dropped or mistaken for look-alike ASCII punctuation.
DISPLAY_SEPARATORS = {
    'am': '\u1363',   # ETHIOPIC COMMA
    'ar': '\u060c ',  # ARABIC COMMA followed by a space
    'brx': ',',
    'fa': '\u060c ',  # ARABIC COMMA followed by a space
    'ja': '\u3001',   # IDEOGRAPHIC COMMA
    'my': '\u104a ',  # MYANMAR SIGN LITTLE SECTION followed by a space
    'ug': '\u060c ',  # ARABIC COMMA followed by a space
    'und': ', ',
    'ur': '\u060c ',  # ARABIC COMMA followed by a space
    'yue': '\uff0c',  # FULLWIDTH COMMA
    'zh': '\uff0c',   # FULLWIDTH COMMA
}
# Translation table used by normalize_name: unify apostrophes, turn hyphens
# into spaces, and drop parentheses and commas, all in a single pass.
_NAME_PUNCTUATION = str.maketrans({
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK -> ASCII apostrophe
    "-": " ",
    "(": None,
    ")": None,
    ",": None,
})


def normalize_name(name):
    """
    Normalize a name so that lookups ignore case and certain punctuation.

    When looking up a language-code component by name, we would rather ignore
    distinctions of case and certain punctuation. "Chinese (Traditional)"
    should be matched by "Chinese Traditional" and "chinese traditional".

    The name is case-folded, typographic apostrophes (U+2019) become ASCII
    apostrophes, hyphens become spaces, parentheses and commas are removed,
    and surrounding whitespace is stripped.

    Returns the normalized string.
    """
    # The apostrophe is spelled as an escape on purpose: an empty search
    # string in str.replace would insert "'" between every character.
    return name.casefold().translate(_NAME_PUNCTUATION).strip()
def load_trie(filename):
    """
    Read the marisa_trie file at `filename` into a new BytesTrie and return it.
    """
    loaded = marisa_trie.BytesTrie()
    with warnings.catch_warnings():
        # marisa_trie emits warnings here that make no sense, so every
        # warning is silenced for the duration of the load only.
        warnings.simplefilter("ignore")
        loaded.load(filename)
    return loaded
def get_trie_value(trie, key):
    """
    Look up `key` in a BytesTrie and return its first stored value as text.

    The trie maps each key to a sequence of byte strings; the first one is
    decoded as UTF-8 and returned. A KeyError propagates from the lookup when
    the key has no value.
    """
    stored_values = trie[key]
    first_value = stored_values[0]
    return first_value.decode("utf-8")
def name_to_code(category, name, language: str = "und"):
    """
    Get a language, script, or territory by its name in some language.

    The language here must be a string representing a language subtag only.
    The `Language.find` method can handle other representations of a language
    and normalize them to this form.

    The default language, "und", will allow matching names in any language,
    so you can get the code 'fr' by looking up "French", "Français", or
    "francés".

    A small amount of fuzzy matching is supported: if the name can be
    shortened or lengthened to match a single language name, you get that
    language. This allows, for example, "Hakka Chinese" to match "Hakka".

    Occasionally, names are ambiguous in a way that can be resolved by
    specifying what name the language is supposed to be in. For example,
    there is a language named 'Malayo' in English, but it's different from
    the language named 'Malayo' in Spanish (which is Malay). Specifying the
    language will look up the name in a trie that is only in that language.

    Returns the matching code as a string, or None if neither the normalized
    name nor a prefix of it (at least 4 characters long) is in the trie.

    Raises AssertionError if `language` contains "/" or "-".
    """
    # Explicit raises instead of `assert` statements: asserts are removed
    # under `python -O`, and `language` becomes part of a file path below.
    # AssertionError is kept so existing callers see the same exception type.
    if "/" in language:
        raise AssertionError("Language codes cannot contain slashes")
    if "-" in language:
        raise AssertionError("This code should be reduced to a language subtag only")

    # Load each trie lazily and keep it in the module-level cache.
    trie_name = "{}/name_to_{}".format(language, category)
    if trie_name not in TRIES:
        TRIES[trie_name] = load_trie(data_filename("trie/{}.marisa".format(trie_name)))
    trie = TRIES[trie_name]

    lookup = normalize_name(name)
    if lookup in trie:
        return get_trie_value(trie, lookup)

    # Is this a language name plus extra verbiage? Maybe it has "...isch",
    # "... language", or "... Chinese" attached to it, for example. Look
    # for a matching prefix of the desired name with at least 4 characters.
    # The last entry is used as the match (presumably the longest prefix).
    prefixes = trie.prefixes(lookup)
    if prefixes and len(prefixes[-1]) >= 4:
        return get_trie_value(trie, prefixes[-1])
    return None
def code_to_names(code):
    """
    Return the dictionary of names recorded for a language, script, or
    territory code, as found in `CODE_TO_NAMES`.

    An empty dictionary is returned when the code has no entry.
    """
    # The name data is imported only when this function is called, to save
    # memory when possible.
    from language_data.name_data import CODE_TO_NAMES

    names_for_code = CODE_TO_NAMES.get(code, {})
    return names_for_code