# Source: ai-content-maker/.venv/Lib/site-packages/dateparser_scripts/order_languages.py
# (195 lines, 7.5 KiB, Python)

import json
import os
from collections import OrderedDict
import regex as re
from dateparser_scripts.utils import get_raw_data
from parsel import Selector
import requests
# Switch the working directory to this script's own directory so that the
# relative "../..." paths used by the functions below resolve regardless of
# where the script is launched from. NOTE: this runs at import time.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Languages with insufficient translation data are excluded: every code listed
# here is dropped from the language -> locales mapping built by
# _get_language_locale_dict().
avoid_languages = {'cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo'}
def _get_language_locale_dict():
    """Map each CLDR language name to the locale names derived from it.

    The entries of ``../raw_data/cldr_dates_full/main/`` are split in two
    groups: names that do not end in ``-<UPPERCASE/DIGITS>`` are treated as
    languages (``root`` is discarded), and a name of the form
    ``<language>-<UPPERCASE/DIGITS>`` is recorded as a locale of that language.
    Languages listed in ``avoid_languages`` are removed from the result.

    Returns:
        dict: language name -> list of locale names, in directory-listing
        order.

    Raises:
        ValueError: if the directory listing contains no ``root`` entry.
    """
    locale_dir = "../raw_data/cldr_dates_full/main/"
    all_names = os.listdir(locale_dir)

    # A trailing "-XX" / "-001" style suffix marks a territory-specific locale;
    # everything else is a plain language.
    language_names = []
    for name in all_names:
        if re.search(r'-[A-Z0-9]+$', name) is None:
            language_names.append(name)
    language_names.remove('root')

    locales_by_language = {
        language: [
            candidate for candidate in all_names
            if re.match(language + '-[A-Z0-9]+$', candidate)
        ]
        for language in language_names
    }

    for excluded in avoid_languages:
        locales_by_language.pop(excluded, None)
    return locales_by_language
def _get_language_order(language_locale_dict):
    """Return all available language codes, most widely used first.

    The order is built from three sources, in decreasing priority:

    1. the web-content language ranking published by w3techs (or a stored
       snapshot of it when the site cannot be fetched or parsed);
    2. the languages of CLDR ``territoryInfo.json``, sorted by the summed
       ``_populationPercent * _population`` over all territories (a ranking
       weight, not an absolute speaker count);
    3. every remaining available language: a variant such as ``xx-Yyyy`` is
       placed right after its base language ``xx`` when that base is already
       ranked, everything else is appended alphabetically.

    Only languages that exist either as a key of ``language_locale_dict`` or
    as a file in the supplementary date translation directory are kept.

    Args:
        language_locale_dict: mapping whose keys are the CLDR language codes.

    Returns:
        list[str]: the ordered language codes, without duplicates.
    """

    def get_most_common_locales():
        """Return the w3techs language ranking, or the stored snapshot on failure."""
        # Order from https://w3techs.com/technologies/overview/content_language
        # Last updated on 03.10.2022
        old_common_locales = [
            'en', 'ru', 'es', 'de', 'tr', 'fr', 'fa', 'ja', 'zh', 'vi',
            'it', 'nl', 'pt', 'ar', 'pl', 'id', 'ko', 'uk', 'th', 'he',
            'cs', 'sv', 'ro', 'el', 'da', 'hu', 'fi', 'sr', 'sk', 'bg',
            'nb', 'hr', 'lt', 'no', 'hi', 'sl', 'ca', 'et',
        ]
        try:
            # A timeout keeps the script from hanging forever on a stalled
            # connection; any network failure falls back to the snapshot.
            response = requests.get(
                'https://w3techs.com/technologies/overview/content_language',
                timeout=30,
            )
        except requests.RequestException as e:
            print(e)
            print("Could not fetch the language ranking, using the stored order")
            return old_common_locales

        if not response.ok:
            return old_common_locales

        try:
            sel = Selector(text=response.text)
            bars = sel.xpath("//table[@class='bars']//a/@href").getall()
            if not bars:
                raise ValueError("No bars found")
            new_most_common_locales = [
                i.replace('https://w3techs.com/technologies/details/cl', '').strip('-')
                for i in bars
            ]
            # Sanity check: a ranking that does not start with English most
            # likely means the page layout changed.
            if new_most_common_locales[0] != 'en':
                raise ValueError("English is not the first language")
        except Exception as e:
            print(e)
            print("The website could have changed, please update the code")
            return old_common_locales
        return new_most_common_locales

    territory_info_file = "../raw_data/cldr_core/supplemental/territoryInfo.json"
    with open(territory_info_file, encoding="utf-8") as f:
        territory_content = json.load(f)
    territory_info_data = territory_content["supplemental"]["territoryInfo"]

    # Accumulate a per-language weight over all territories.
    language_population_dict = {}
    for territory_data in territory_info_data.values():
        population = int(territory_data["_population"])
        try:
            lang_dict = territory_data["languagePopulation"]
        except KeyError:
            # Territory without any language breakdown.
            continue
        for language, language_data in lang_dict.items():
            try:
                percent = float(language_data["_populationPercent"])
            except (KeyError, TypeError, ValueError):
                # Best effort: ignore entries without a usable percentage.
                continue
            language_population_dict[language] = (
                language_population_dict.get(language, 0) + percent * population
            )

    most_common_locales = get_most_common_locales()
    language_order_with_duplicates = (
        most_common_locales
        + sorted(
            language_population_dict,
            key=lambda x: (language_population_dict[x], x), reverse=True
        )
    )
    # Drop duplicates while keeping the first (highest-priority) occurrence.
    language_order = list(OrderedDict.fromkeys(language_order_with_duplicates))
    # territoryInfo uses "_" as separator (e.g. "zh_Hant"); normalise to "-".
    language_order = [code.replace('_', '-') for code in language_order]

    supplementary_date_directory = "../dateparser_data/supplementary_language_data/date_translation_data"
    # File names are "<language>" plus a five-character extension
    # (presumably ".yaml" -- verify against the directory contents).
    supplementary_languages = [x[:-5] for x in os.listdir(supplementary_date_directory)]
    available_languages = set(language_locale_dict).union(supplementary_languages)
    language_order = [shortname for shortname in language_order if shortname in available_languages]

    absent_languages = available_languages - set(language_order)
    remaining_languages = []
    # Iterate in reverse alphabetical order: each variant is inserted directly
    # after its base language, so siblings end up alphabetically ascending and
    # the result no longer depends on set iteration order.
    for language in sorted(absent_languages, reverse=True):
        parent_language = re.sub(r'-\w+', '', language)
        if parent_language in language_order:
            language_order.insert(language_order.index(parent_language) + 1, language)
        else:
            remaining_languages.append(language)

    return language_order + sorted(remaining_languages)
def generate_language_map(language_order):
    """Group language codes by their base language.

    Codes are processed in sorted order. A code without ``-`` starts its own
    group containing itself; a code such as ``sr-Cyrl`` is appended to the
    group of the part before the first ``-`` (``sr``).

    A variant whose base language is not itself part of ``language_order``
    gets a group holding only the variants, instead of raising ``KeyError``.

    Args:
        language_order: iterable of language codes.

    Returns:
        dict: base language code -> list of codes belonging to it.
    """
    data = {}
    for lang in sorted(language_order):
        if '-' not in lang:
            data[lang] = [lang]
        else:
            # setdefault covers variants listed without their base language.
            data.setdefault(lang.split('-')[0], []).append(lang)
    return data
def main():
    """Regenerate ``../dateparser/data/languages_info.py``.

    Calls ``get_raw_data()`` first (presumably to fetch/refresh the raw CLDR
    data -- defined in ``dateparser_scripts.utils``), then computes the
    language order and writes a Python module with three assignments:
    ``language_order``, ``language_map`` and ``language_locale_dict``.
    The output directory is created when it does not exist yet.
    """
    get_raw_data()
    locales_by_language = _get_language_locale_dict()
    ordered_languages = _get_language_order(locales_by_language)

    output_dir = "../dateparser/data/"
    output_path = "../dateparser/data/languages_info.py"
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    def as_json(value):
        # Shared formatting for every value written to the generated module.
        return json.dumps(value, separators=(',', ': '), indent=4)

    # Every ordered language gets an entry; those without CLDR locales
    # (e.g. supplementary-only languages) map to an empty list.
    ordered_locales = OrderedDict(
        (
            code,
            sorted(locales_by_language[code]) if code in locales_by_language else [],
        )
        for code in ordered_languages
    )

    sections = [
        'language_order = ' + as_json(ordered_languages),
        'language_map = ' + as_json(generate_language_map(ordered_languages)),
        'language_locale_dict = ' + as_json(ordered_locales),
    ]
    with open(output_path, 'w') as f:
        f.write('\n\n'.join(sections) + '\n')
# Regenerate the languages info file only when executed as a script.
if __name__ == '__main__':
    main()