ai-content-maker/.venv/Lib/site-packages/langcodes/language_distance.py

189 lines
5.6 KiB
Python

from .data_dicts import LANGUAGE_DISTANCES
from typing import Dict, Tuple
TagTriple = Tuple[str, str, str]
_DISTANCE_CACHE: Dict[Tuple[TagTriple, TagTriple], int] = {}
DEFAULT_LANGUAGE_DISTANCE = LANGUAGE_DISTANCES["*"]["*"]
DEFAULT_SCRIPT_DISTANCE = LANGUAGE_DISTANCES["*_*"]["*_*"]
DEFAULT_TERRITORY_DISTANCE = 4
# Territory clusters used in territory matching:
# Maghreb (the western Arab world)
MAGHREB = {"MA", "DZ", "TN", "LY", "MR", "EH"}
# United States and its territories
US = {"AS", "GU", "MH", "MP", "PR", "UM", "US", "VI"}
# Special Autonomous Regions of China
CNSAR = {"HK", "MO"}
LATIN_AMERICA = {
"419",
# Central America
"013",
"BZ",
"CR",
"SV",
"GT",
"HN",
"MX",
"NI",
"PA",
# South America
"005",
"AR",
"BO",
"BR",
"CL",
"CO",
"EC",
"FK",
"GF",
"GY",
"PY",
"PE",
"SR",
"UY",
"VE",
}
# North and South America
AMERICAS = {
"019",
# Caribbean
"029",
"AI",
"AG",
"AW",
"BS",
"BB",
"VG",
"BQ",
"KY",
"CU",
"CW",
"DM",
"DO",
"GD",
"GP",
"HT",
"JM",
"MQ",
"MS",
"PR",
"SX",
"BL",
"KN",
"LC",
"MF",
"VC",
"TT",
"TC",
"VI",
# Northern America
"021",
"BM",
"CA",
"GL",
"PM",
"US",
# North America as a whole
"003",
} | LATIN_AMERICA
def tuple_distance_cached(desired: TagTriple, supported: TagTriple) -> int:
"""
Takes in triples of (language, script, territory), which can be derived by
'maximizing' a language tag. Returns a number from 0 to 135 indicating the
'distance' between these for the purposes of language matching.
"""
# First of all, if these are identical, return quickly:
if supported == desired:
return 0
# If we've already figured it out, return the cached distance.
if (desired, supported) in _DISTANCE_CACHE:
return _DISTANCE_CACHE[desired, supported]
else:
result = _tuple_distance(desired, supported)
_DISTANCE_CACHE[desired, supported] = result
return result
def _get2(dictionary: dict, key1: str, key2: str, default):
return dictionary.get(key1, {}).get(key2, default)
def _tuple_distance(desired: TagTriple, supported: TagTriple) -> int:
desired_language, desired_script, desired_territory = desired
supported_language, supported_script, supported_territory = supported
distance = 0
if desired_language != supported_language:
distance += _get2(
LANGUAGE_DISTANCES,
desired_language,
supported_language,
DEFAULT_LANGUAGE_DISTANCE,
)
desired_script_pair = f"{desired_language}_{desired_script}"
supported_script_pair = f"{supported_language}_{supported_script}"
if desired_script != supported_script:
# Scripts can match other scripts, but only when paired with a
# language. For example, there is no reason to assume someone who can
# read 'Latn' can read 'Cyrl', but there is plenty of reason to believe
# someone who can read 'sr-Latn' can read 'sr-Cyrl' because Serbian is
# a language written in two scripts.
distance += _get2(
LANGUAGE_DISTANCES,
desired_script_pair,
supported_script_pair,
DEFAULT_SCRIPT_DISTANCE,
)
if desired_territory != supported_territory:
# The rules for matching territories are too weird to implement the
# general case efficiently. Instead of implementing all the possible
# match rules the XML could define, instead we just reimplement the
# rules of CLDR 36.1 here in code.
tdist = DEFAULT_TERRITORY_DISTANCE
if desired_script_pair == supported_script_pair:
if desired_language == "ar":
if (desired_territory in MAGHREB) != (supported_territory in MAGHREB):
tdist = 5
elif desired_language == "en":
if (desired_territory == "GB") and (supported_territory not in US):
tdist = 3
elif (desired_territory not in US) and (supported_territory == "GB"):
tdist = 3
elif (desired_territory in US) != (supported_territory in US):
tdist = 5
# This is not a rule that's spelled out in CLDR, but is implied by things
# about territory containment mentioned in other standards. Numeric values
# for territories, like '003', represent broad regions that contain more
# specific territories.
#
# 419 is the numeric value most often seen in language codes, particularly
# 'es-419' for Latin American Spanish. If you have a language code that
# differs only in that its territory is more specific, like 'es-PY', it should
# be closer to a supported 'es-419' than anything with a territory difference.
#
# We can implement this for 419 without becoming responsible for keeping up
# with which countries/territories/regions contain others in the general case.
elif desired_territory in LATIN_AMERICA and supported_territory == "419":
tdist = 1
elif desired_language == "es" or desired_language == "pt":
if (desired_territory in AMERICAS) != (supported_territory in AMERICAS):
tdist = 5
elif desired_script_pair == "zh_Hant":
if (desired_territory in CNSAR) != (supported_territory in CNSAR):
tdist = 5
distance += tdist
return distance