# build_data.py — builds the generated name and population data tables.
import marisa_trie
|
|
import json
|
|
import xml.etree.ElementTree as ET
|
|
import os
|
|
from pathlib import Path
|
|
from collections import defaultdict, Counter
|
|
|
|
from language_data.names import normalize_name
|
|
from language_data.util import data_filename
|
|
from language_data.registry_parser import parse_registry
|
|
|
|
# Naming things is hard, especially languages
|
|
# ===========================================
|
|
#
|
|
# CLDR is supposed to avoid ambiguous language names, particularly among its
|
|
# core languages. But it seems that languages are incompletely disambiguated.
|
|
#
|
|
# It's convenient to be able to get a language by its name, without having to
|
|
# also refer to the language that the name is in. In most cases, we can do this
|
|
# unambiguously. With the disambiguations and overrides here, this will work
|
|
# in a lot of cases. However, some names such as 'Dongo', 'Fala', 'Malayo', and
|
|
# 'Tonga' are ambiguous in ways that can only be disambiguated by specifying
|
|
# the language the name is in.
|
|
#
|
|
# Ambiguous names can arise from:
|
|
#
|
|
# - Ambiguities in the scope of a name. These tend to span languages, and the
|
|
# data files mask the fact that these names are generally ambiguous *within*
|
|
# a language. This is why we have codes.
|
|
#
|
|
# - Names that just happen to be ambiguous between different things with
|
|
# different etymologies.
|
|
#
|
|
# Most doubly-claimed language names have standard ways to disambiguate
|
|
# them in CLDR, but names such as 'Tonga' and 'Fala' have complex
|
|
# inter-language ambiguities.
|
|
#
|
|
# Our approach is:
|
|
#
|
|
# - Fix conflicts that seem to arise simply from errors in the data, by
|
|
# overriding the data.
|
|
#
|
|
# - Fix ambiguities in scope by preferring one scope over another. For example,
|
|
# "North America" could refer to a territory that includes Central America or
|
|
# a territory that doesn't. In any such conflict, we choose to include Central
|
|
# America.
|
|
#
|
|
# - Avoid ambiguities between different sources of data, by using an order
|
|
# of precedence. CLDR data takes priority over IANA data, which takes priority
|
|
# over Wiktionary data.
|
|
#
|
|
# - When ambiguity remains, that name is not resolvable to a language code.
|
|
# Resolving the name might require a more specific name, or specifying the
|
|
# language that the name is in.
|
|
|
|
|
|
# Maps a preferred code to the set of codes it may take precedence over when
# one name could refer to several codes. In resolve_name, the preferred code
# wins a conflict only when *every* competing candidate is in its set.
AMBIGUOUS_PREFERENCES = {
    # Prefer 'Micronesia' to refer to the Federated States of Micronesia -
    # this seems to be poorly disambiguated in many languages, but we can't
    # do much with a code for the general region of Micronesia
    'FM': {'057'},
    # Prefer the country of South Africa over the general region of southern
    # Africa, in languages that don't distinguish them
    'ZA': {'018'},
    # Prefer territory 003 for 'North America', which includes Central America
    # and the Caribbean, over territory 021, which excludes them
    '003': {'021'},
    # Prefer territory 005 for 'Lulli-Amerihkká' (South America), over territory
    # 419, which includes Central America
    '005': {'419'},
    # If a name like "Amerika" is ambiguous between the Americas and the United
    # States of America, choose the Americas
    '019': {'US'},
    # Prefer 'Swiss German' to be a specific language
    'gsw': {'de-CH'},
    # Of the two countries named 'Congo', prefer the one with Kinshasa
    'CD': {'CG'},
    # Prefer Han script to not include bopomofo
    'Hani': {'Hanb'},
    # Prefer the specific language Tagalog over standard Filipino, because
    # the ambiguous name was probably some form of 'Tagalog'
    'tl': {'fil'},
    # Confusion between Ilokano and Hiligaynon
    'ilo': {'hil'},
    # Prefer Central Atlas Tamazight over Standard Moroccan Tamazight
    'tzm': {'zgh'},
    # Prefer the specific definition of Low Saxon
    'nds-NL': {'nds'},
    # Prefer the specific definition of Mandarin Chinese
    'cmn': {'zh'},
    # Prefer the territorially-specific definition of Dari
    'fa-AF': {'prs', 'fa', 'gbz'},
    # Ambiguity in the scope of Korean script (whether to include Han characters)
    'Kore': {'Hang'},
    # This ambiguity is kind of our fault, for adding an autonym for 'zsm'.
    # "Bahasa Malaysia" should still resolve to the more expected 'ms'.
    'ms': {'zsm'},
    # I think that the CLDR data for Mazanderani confuses Latvia and Lithuania,
    # and Wikipedia tells me it means Latvia. I should make this a CLDR issue
    'lv': {'lt'},
    'LV': {'LT'},
}
|
|
|
|
# Manual corrections to the source data. Keys are (language the name is in,
# code being named); values are the name to use instead of the one in the
# data, or None to drop the entry entirely (see read_cldr_name_file).
OVERRIDES = {
    # When I ask Wiktionary, it tells me that "Breatnais" is Scots Gaelic for
    # Welsh, not Breton, which is "Breatannais". This may be one of those
    # things that's not as standardized as it sounds, but let's at least agree
    # with Wiktionary and avoid a name conflict.
    ("gd", "br"): "Breatannais",
    # 'tagaloga' should be 'tl', not 'fil'
    ("eu", "tl"): "Tagaloga",
    ("eu", "fil"): "Filipinera",
    # 'Dakota' should be 'dak', not 'dar', which is "Dargwa"
    ("af", "dar"): "Dargwa",
    ("af-NA", "dar"): "Dargwa",
    # 'интерлингве' should be 'ie', not 'ia', which is 'интерлингва'
    ("az-Cyrl", "ia"): "интерлингва",
    # Don't confuse Samaritan Hebrew with Samaritan Aramaic
    ("en", "smp"): "Samaritan Hebrew",
    # Don't confuse the Mongol language of New Guinea with Mongolian
    ("en", "mgt"): "Mongol (New Guinea)",
    # Don't confuse Romang with Romani over the name 'Roma'
    ("en", "rmm"): "Romang",
    # 'Tai' is a large language family, and it should not refer exclusively and
    # unrelatedly to a language spoken by 900 people in New Guinea
    ("en", "taw"): "Kalam-Tai",
    # The code for Ladin -- the language that's almost certainly being named in
    # Friulian here -- is "lld". The given code of "lad" seems to be an error,
    # pointing to the Judeo-Spanish language Ladino, which would be less likely
    # to be what you mean when speaking Friulian.
    ("fur", "lad"): None,
    # The Amharic data in v39 appears to have switched the words for 'Western'
    # and 'Eastern'.
    ("am", "011"): "ምዕራባዊ አፍሪካ", # Western Africa
    ("am", "014"): "ምስራቃዊ አፍሪካ", # Eastern Africa
    ("am", "155"): "ምዕራባዊ አውሮፓ", # Western Europe
    ("am", "151"): "ምስራቃዊ አውሮፓ", # Eastern Europe
}
|
|
|
|
|
|
def resolve_name(key, vals, debug=False):
    """
    Given a name, and a number of (voter, value, priority) triples it could
    resolve to, find the single value it should resolve to, in the following
    way:

    - Apply the priority order
    - If names with the highest priority all agree, use that name
    - If there is disagreement that can be resolved by AMBIGUOUS_PREFERENCES,
      use that
    - Otherwise, don't resolve the name (and possibly show a debugging message
      when building the data)
    """
    max_priority = max(val[2] for val in vals)
    # Tally only the values proposed at the highest priority level
    val_count = Counter(val[1] for val in vals if val[2] == max_priority)
    if len(val_count) == 1:
        # The top-priority sources agree unanimously
        return val_count.most_common(1)[0][0]

    # Check whether one candidate is explicitly preferred over all of the
    # others it conflicts with
    for pkey in val_count:
        if pkey in AMBIGUOUS_PREFERENCES:
            others = set(val_count)
            others.remove(pkey)
            # pkey wins only if every competitor is one it's allowed to beat
            if others <= AMBIGUOUS_PREFERENCES[pkey]:
                if debug:
                    print("Resolved: {} -> {}".format(key, pkey))
                return pkey

    # In debug mode, show which languages vote for which name
    if debug and max_priority >= 0:
        votes = defaultdict(list)
        for voter, val, prio in vals:
            if prio == max_priority:
                votes[val].append(voter)

        print("{}:".format(key))
        for val, voters in sorted(votes.items()):
            print("\t{}: {}".format(val, ' '.join(voters)))

    # Don't use names that remain ambiguous
    return None
|
|
|
|
|
|
def resolve_names(name_dict, debug=False):
    """
    Run resolve_name over every entry of `name_dict`, keeping only the
    names that resolve unambiguously to a single value.
    """
    result = {}
    for name_key, candidates in sorted(name_dict.items()):
        winner = resolve_name(name_key, candidates, debug=debug)
        if winner is not None:
            result[name_key] = winner
    return result
|
|
|
|
|
|
def read_cldr_names(language, category):
    """
    Read CLDR's names for things in a particular language.

    `language` is the language the names are in; `category` is the kind of
    thing being named, such as 'languages', 'scripts', or 'territories'.
    """
    filename = data_filename(
        'cldr-json/cldr-json/cldr-localenames-full/main/{}/{}.json'.format(language, category)
    )
    # Use a context manager so the file handle is closed promptly, instead
    # of leaking until garbage collection
    with open(filename, encoding='utf-8') as file:
        fulldata = json.load(file)
    return fulldata['main'][language]['localeDisplayNames'][category]
|
|
|
|
|
def read_cldr_name_file(langcode, category):
    """
    Read one CLDR name file and return its contents as a list of
    (name_language, subtag, name, priority) tuples.
    """
    quads = []
    for subtag, name in sorted(read_cldr_names(langcode, category).items()):
        # Apply manual corrections; an override of None discards the entry
        if (langcode, subtag) in OVERRIDES:
            name = OVERRIDES[langcode, subtag]
            if name is None:
                continue

        # Skip default entries that merely map a language code to itself,
        # which an inattentive annotator just left there
        if subtag == name:
            continue

        priority = 3
        # The -alt-menu entries are supposed to do things like alphabetize
        # "Mandarin Chinese" under "Chinese, Mandarin". A few languages
        # just put the string "mandarin" there, which seems wrong and
        # messes up our name lookups.
        if subtag.endswith('-alt-menu') and name == 'mandarin':
            continue

        # CLDR assigns multiple names to one code by adding -alt-* to
        # the end of the code. For example, the English name of 'az' is
        # Azerbaijani, but the English name of 'az-alt-short' is Azeri.
        # Strip the suffix and lower the priority of the alternate name.
        if '-alt-' in subtag:
            subtag, _variant = subtag.split('-alt-', 1)
            priority = 1

        # Giving the name "zh (Hans)" to "zh-Hans" is still lazy
        if normalize_name(name) == normalize_name(subtag):
            continue

        quads.append((langcode, subtag, name, priority))
    return quads
|
|
|
|
|
|
def read_iana_registry_names():
    """
    Read names of languages, scripts, and regions from the IANA registry,
    returning three lists of (name_language, subtag, name, priority) tuples
    in that order. All IANA names are in English.
    """
    # IANA's terminology is 'region' where CLDR's is 'territory'
    buckets = {'language': [], 'script': [], 'region': []}
    for entry in parse_registry():
        target = buckets.get(entry['Type'])
        if target is None:
            continue
        subtag = entry['Subtag']
        # Deprecated entries get the lowest priority
        priority = 0 if 'Deprecated' in entry else 2
        if ('en', subtag) in OVERRIDES:
            target.append(('en', subtag, OVERRIDES['en', subtag], priority))
        else:
            for desc in entry['Description']:
                target.append(('en', subtag, desc, priority))
    return buckets['language'], buckets['script'], buckets['region']
|
|
|
|
|
|
def read_iana_registry_macrolanguages():
    """Map each IANA language subtag to its macrolanguage, where one exists."""
    return {
        entry['Subtag']: entry['Macrolanguage']
        for entry in parse_registry()
        if entry['Type'] == 'language' and 'Macrolanguage' in entry
    }
|
|
|
|
|
|
def read_iana_registry_replacements():
    """
    Collect IANA's preferred replacements, mapping each deprecated language
    subtag or (lowercased) full tag to its Preferred-Value.
    """
    replacements = {}
    for entry in parse_registry():
        if 'Preferred-Value' not in entry:
            continue
        if entry['Type'] == 'language':
            # Replacements for language codes
            replacements[entry['Subtag']] = entry['Preferred-Value']
        elif 'Tag' in entry:
            # Replacements for entire tags
            replacements[entry['Tag'].lower()] = entry['Preferred-Value']
    return replacements
|
|
|
|
|
|
def read_csv_names(filename):
    """
    Read names from a comma-separated file, returning a list of tuples of
    the form (name_language, referent, name, True).

    Each line is split on at most 3 commas, so the name itself may contain
    commas.
    """
    quads = []
    # Use a context manager so the file is closed when we're done reading
    with open(filename, encoding='utf-8') as data:
        for line in data:
            quad = line.rstrip().split(',', 3) + [True]
            quads.append(tuple(quad))
    return quads
|
|
|
|
|
|
def read_wiktionary_names(filename, language):
    """
    Read language names from a tab-separated Wiktionary export, returning a
    list of (name_language, code, name, priority) tuples.

    The name in column 2 is recorded at priority -1. Column 5, when present,
    holds comma-separated alternate names, recorded at priority -2; when it
    is absent, the column-2 name is recorded again at -2.
    """
    quads = []
    # Use a context manager so the file is closed when we're done reading
    with open(filename, encoding='utf-8') as data:
        for line in data:
            parts = line.rstrip().split('\t')
            code = parts[0]
            quads.append((language, code, parts[1], -1))
            names = [parts[1]]
            if len(parts) > 4 and parts[4]:
                names = parts[4].split(', ')
            for name in names:
                quads.append((language, code, name, -2))
    return quads
|
|
|
|
|
|
def update_names(names_fwd, names_rev, name_quads):
    """
    Fold a list of (name_language, referent, name, priority) quads into the
    forward and reverse name dictionaries, mutating both in place.

    Reverse entries are filed under both the bare language code and 'und'
    (any language). Forward entries keep the first name seen for each
    (referent, name_language) pair.
    """
    for name_language, referent, name, priority in name_quads:
        # Get just the language from name_language, not the territory or script.
        short_language = name_language.split('-')[0]

        normalized = normalize_name(name)
        for rev_key in ('und', short_language):
            rev_dict = names_rev.setdefault(rev_key, {})
            rev_dict.setdefault(normalized, []).append(
                (name_language, referent, priority)
            )

        # Only the first name seen for this (referent, name_language) sticks
        names_fwd.setdefault(referent, {}).setdefault(name_language, name)
|
|
|
|
|
|
def save_trie(mapping, filename):
    """Store a mapping of strings to strings as a marisa BytesTrie on disk."""
    entries = sorted(mapping.items())
    trie = marisa_trie.BytesTrie(
        (key, value.encode('utf-8')) for key, value in entries
    )
    trie.save(filename)
|
|
|
|
|
|
def save_reverse_name_tables(category, rev_dict):
    """
    Resolve each language's reverse name dictionary and save it as a trie
    file under trie/{language}/.
    """
    for language, lang_dict in rev_dict.items():
        # Make sure the per-language output directory exists
        os.makedirs(data_filename(f"trie/{language}"), exist_ok=True)
        resolved = resolve_names(lang_dict, debug=True)
        out_path = data_filename(f"trie/{language}/name_to_{category}.marisa")
        save_trie(resolved, out_path)
|
|
|
|
|
|
def get_name_languages():
    """
    Get the list of CLDR locales that have a languages.json name file,
    excluding 'root' and any locale code that doesn't end with a lowercase
    ASCII letter (which drops codes ending in digits, such as 'es-419').
    """
    cldr_main_path = Path(data_filename("cldr-json/cldr-json/cldr-localenames-full/main"))
    with_name_files = [
        entry.name
        for entry in sorted(cldr_main_path.iterdir())
        if entry.name != 'root' and (entry / 'languages.json').exists()
    ]
    return [code for code in with_name_files if 'a' <= code[-1] <= 'z']
|
|
|
|
|
|
def get_population_data():
    """
    Estimate speaking and writing populations per language from CLDR's
    supplemental territory data.

    Returns (language_population, language_writing_population): two
    dictionaries mapping language code strings -- at several levels of
    specificity regarding script and territory -- to estimated numbers of
    people.
    """
    # Imported locally, presumably to avoid a circular dependency with
    # langcodes -- TODO confirm
    import langcodes

    filename = data_filename("supplementalData.xml")
    # ET.parse reads and closes the file itself; the previous
    # ET.fromstring(open(filename).read()) leaked the file handle
    root = ET.parse(filename).getroot()
    territories = root.findall("./territoryInfo/territory")

    language_population = defaultdict(int)
    language_writing_population = defaultdict(int)

    for territory in territories:
        t_code = territory.attrib['type']
        t_population = float(territory.attrib['population'])
        t_literacy_rate = float(territory.attrib['literacyPercent']) / 100

        for language in territory:
            attrs = language.attrib
            # CLDR uses '_' in codes where we use '-'
            l_code = attrs['type'].replace('_', '-')
            l_proportion = float(attrs.get('populationPercent', 0)) / 100
            # Use the most specific writing-rate figure available, falling
            # back on the territory's overall literacy rate
            if 'writingPercent' in attrs:
                writing_prop = float(attrs['writingPercent']) / 100
            elif 'literacyPercent' in attrs:
                writing_prop = float(attrs['literacyPercent']) / 100
            else:
                writing_prop = t_literacy_rate

            l_population = t_population * l_proportion
            l_writing = t_population * l_proportion * writing_prop

            # Distinguish data in different territories, and also in different
            # scripts when necessary, while also accumulating more general data

            # We need to use maximize() on the bare language code, not just
            # assume_script(), because assumed defaults like 'zh-Hans' are unwritten
            # in the data. We need this if we want to count the relative use of
            # Simplified vs. Traditional Chinese, for example.
            written_ls = (
                langcodes.get(l_code).maximize()._filter_attributes(['language', 'script'])
            )
            written_lst = written_ls.update_dict({'territory': t_code})

            spoken_lt = written_lst._filter_attributes(['language', 'territory'])
            spoken_l = written_lst._filter_attributes(['language'])

            written_lt = written_lst._filter_attributes(['language', 'territory'])
            written_l = written_lst._filter_attributes(['language'])

            # Use sets so that a population isn't double-counted when the
            # filtered forms coincide
            for lang in set([spoken_lt, spoken_l]):
                language_population[str(lang)] += int(round(l_population))

            for lang in set([written_lst, written_lt, written_ls, written_l]):
                language_writing_population[str(lang)] += int(round(l_writing))

    return language_population, language_writing_population
|
|
|
|
|
|
def write_python_dict(outfile, name, d):
    """
    Write Python code that initializes a given dictionary, with one value on
    each line, in sorted key order.
    """
    lines = [f"{name} = {{"]
    lines.extend(f"    {key!r}: {value!r}," for key, value in sorted(d.items()))
    lines.append("}")
    for line in lines:
        print(line, file=outfile)
|
|
|
|
|
|
def write_python_set(outfile, name, s):
    """
    Write Python code that initializes a set, one element per line, with
    duplicates removed and elements sorted.
    """
    body = "".join(f"    {element!r},\n" for element in sorted(set(s)))
    outfile.write(f"{name} = {{\n{body}}}\n")
|
|
|
|
|
|
GENERATED_HEADER = "# This file is generated by build_data.py."
|
|
|
|
|
|
def build_data():
    """
    Build all the generated data: the per-language name tries, name_data.py,
    and population_data.py.
    """
    # Reverse mappings (normalized name -> candidate codes), one per category
    language_names_rev = {}
    territory_names_rev = {}
    script_names_rev = {}
    # Forward mapping: code -> {language the name is in: name}
    names_fwd = {}

    # Our own override names come first
    override_language_data = read_csv_names(data_filename('override_language_names.csv'))
    update_names(names_fwd, language_names_rev, override_language_data)

    # Then CLDR names, in every locale that has them
    for langcode in get_name_languages():
        update_names(
            names_fwd, language_names_rev, read_cldr_name_file(langcode, 'languages')
        )

        # Not every locale has script or territory name files
        for category, rev_dict in (
            ('scripts', script_names_rev),
            ('territories', territory_names_rev),
        ):
            try:
                update_names(names_fwd, rev_dict, read_cldr_name_file(langcode, category))
            except FileNotFoundError:
                pass

    # Then the IANA registry's English names
    iana_languages, iana_scripts, iana_territories = read_iana_registry_names()
    update_names(names_fwd, language_names_rev, iana_languages)
    update_names(names_fwd, script_names_rev, iana_scripts)
    update_names(names_fwd, territory_names_rev, iana_territories)

    # Then names from Wiktionary
    wiktionary_data = read_wiktionary_names(data_filename('wiktionary/codes-en.csv'), 'en')
    update_names(names_fwd, language_names_rev, wiktionary_data)

    # And finally our extra names
    extra_language_data = read_csv_names(data_filename('extra_language_names.csv'))
    update_names(names_fwd, language_names_rev, extra_language_data)

    for category, rev_dict in (
        ('language', language_names_rev),
        ('script', script_names_rev),
        ('territory', territory_names_rev),
    ):
        save_reverse_name_tables(category, rev_dict)

    # Get the list of languages where we have any name data. These are base
    # language codes (without scripts or territories) which contain a name for
    # themselves.
    name_languages = [
        langcode
        for langcode in get_name_languages()
        if '-' not in langcode and langcode in names_fwd and langcode in names_fwd[langcode]
    ]

    # Add the languages that have autonyms in extra_language_data, perhaps because
    # we specifically put them there to get their autonyms right
    name_languages += [
        lang1 for (lang1, lang2, _, _) in extra_language_data if lang1 == lang2
    ]

    # Write the contents of name_data.py.
    with open('name_data.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_set(outfile, 'LANGUAGES_WITH_NAME_DATA', name_languages)
        print(file=outfile)
        write_python_dict(outfile, 'CODE_TO_NAMES', names_fwd)

    # Write the contents of population_data.py.
    language_population, language_writing_population = get_population_data()
    with open('population_data.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        write_python_dict(outfile, 'LANGUAGE_SPEAKING_POPULATION', language_population)
        write_python_dict(outfile, 'LANGUAGE_WRITING_POPULATION', language_writing_population)
|
|
|
|
|
|
# Rebuild all the data when this module is run as a script.
if __name__ == '__main__':
    build_data()