# ai-content-maker/.venv/Lib/site-packages/langcodes/build_data.py

import json
import xml.etree.ElementTree as ET
from langcodes.util import data_filename
from langcodes.registry_parser import parse_registry


def read_cldr_supplemental(dataname):
    """Load one CLDR "supplemental" JSON file and return its payload.

    Args:
        dataname: Base name (without ``.json``) of a file in the
            ``cldr-core/supplemental`` directory, e.g. ``'aliases'`` or
            ``'likelySubtags'``.

    Returns:
        For ``'aliases'``, the object at ``supplemental.metadata.alias``;
        for any other name, the object at ``supplemental.<dataname>``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the expected keys are missing from the JSON document.
    """
    cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')
    # NOTE(review): data_filename is applied a second time to an already
    # resolved path, exactly as before; presumably it leaves such paths
    # untouched -- confirm against langcodes.util.
    filename = data_filename(f'{cldr_supp_path}/{dataname}.json')

    # Use a context manager so the handle is closed even if parsing fails.
    with open(filename, encoding='utf-8') as infile:
        fulldata = json.load(infile)

    supplemental = fulldata['supplemental']
    if dataname == 'aliases':
        # The alias table is nested one level deeper than the other files.
        return supplemental['metadata']['alias']
    return supplemental[dataname]


def read_iana_registry_suppress_scripts():
    """Map language subtags to the value of their 'Suppress-Script' field.

    Only registry entries whose 'Type' is 'language' and that carry a
    'Suppress-Script' field contribute to the result.
    """
    return {
        record['Subtag']: record['Suppress-Script']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Suppress-Script' in record
    }


def read_iana_registry_scripts():
    """Return the set of subtags of every registry entry of type 'script'."""
    return {
        record['Subtag']
        for record in parse_registry()
        if record['Type'] == 'script'
    }


def read_iana_registry_macrolanguages():
    """Map language subtags to the value of their 'Macrolanguage' field.

    Only registry entries whose 'Type' is 'language' and that carry a
    'Macrolanguage' field contribute to the result.
    """
    return {
        record['Subtag']: record['Macrolanguage']
        for record in parse_registry()
        if record['Type'] == 'language' and 'Macrolanguage' in record
    }


def read_iana_registry_replacements():
    """Collect the 'Preferred-Value' replacements listed in the registry.

    Returns:
        A dict with two kinds of keys: language subtags (kept as written in
        the registry) and whole tags (lowercased). Each maps to the entry's
        'Preferred-Value'.
    """
    preferred = {}
    for record in parse_registry():
        is_language = record['Type'] == 'language'
        if 'Preferred-Value' not in record:
            # Nothing to replace this entry with.
            continue
        if is_language:
            # Replacement for a single language code.
            preferred[record['Subtag']] = record['Preferred-Value']
        elif 'Tag' in record:
            # Replacement for an entire tag.
            preferred[record['Tag'].lower()] = record['Preferred-Value']
    return preferred


def write_python_dict(outfile, name, d):
    """Write ``d`` to ``outfile`` as a Python dict literal assigned to ``name``.

    Entries are emitted one per line, in sorted key order, using ``repr``
    for both keys and values.
    """
    print(f"{name} = {{", file=outfile)
    rows = "".join(f"    {key!r}: {d[key]!r},\n" for key in sorted(d))
    print(rows, end="", file=outfile)
    print("}", file=outfile)


def write_python_set(outfile, name, s):
    """Write ``s`` to ``outfile`` as a Python set literal assigned to ``name``.

    Duplicates are dropped and the remaining members are emitted one per
    line, in sorted order, using ``repr``.
    """
    print(f"{name} = {{", file=outfile)
    members = sorted(set(s))
    rows = "".join(f"    {member!r},\n" for member in members)
    print(rows, end="", file=outfile)
    print("}", file=outfile)


# Comment line that build_data() writes at the top of the generated
# data_dicts.py, marking that file as generated output.
GENERATED_HEADER = "# This file is generated by build_data.py."


def read_validity_regex():
    """Build a regex source string matching every code in the CLDR validity files.

    Reads the ``language``, ``region``, ``script`` and ``variant`` validity
    XML files and collects each whitespace-separated item found in their
    ``idValidity/id`` elements. An item containing ``~`` is a single-character
    range such as ``ab~d``; it is converted to a regex character class
    (``a[b-d]``). All options are joined into one anchored alternation.

    Returns:
        A string of the form ``'^(opt1|opt2|...)$'``.

    Raises:
        ValueError: If an item contains ``~`` but is not of the form
            ``<prefix><start>~<end>``.
    """
    validity_options = []
    for codetype in ('language', 'region', 'script', 'variant'):
        validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')
        # ET.parse opens and closes the file itself and decodes it according
        # to the XML declaration instead of the platform's default encoding.
        root = ET.parse(validity_path).getroot()
        for match in root.findall('./idValidity/id'):
            # An empty <id/> element has text None; treat it as no items.
            for item in (match.text or '').split():
                if '~' not in item:
                    validity_options.append(item)
                    continue
                # Explicit check rather than `assert`, which is stripped
                # when Python runs with -O.
                if len(item) < 3 or item[-2] != '~':
                    raise ValueError(
                        f"Unexpected range syntax {item!r} in {validity_path}"
                    )
                prefix = item[:-3]
                range_start = item[-3]
                range_end = item[-1]
                validity_options.append(f"{prefix}[{range_start}-{range_end}]")
    options = '|'.join(validity_options)
    return f'^({options})$'


def read_language_distances():
    """Read language-match distances from CLDR's ``languageInfo.xml``.

    Only ``languageMatch`` elements inside the ``written_new`` match set are
    used, and only those whose ``desired`` attribute has fewer than three
    ``_``-separated parts. Unless an element is marked ``oneway="true"``,
    its distance is recorded in both directions.

    Returns:
        A nested dict ``{desired: {supported: distance}}`` with integer
        distances.
    """
    language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')
    # ET.parse opens and closes the file itself and decodes it according to
    # the XML declaration instead of the platform's default encoding.
    root = ET.parse(language_info_path).getroot()
    matches = root.findall(
        './languageMatching/languageMatches[@type="written_new"]/languageMatch'
    )
    tag_distances = {}
    for match in matches:
        attribs = match.attrib
        n_parts = attribs['desired'].count('_') + 1
        if n_parts >= 3:
            continue

        if attribs.get('oneway') == 'true':
            pairs = [(attribs['desired'], attribs['supported'])]
        else:
            pairs = [
                (attribs['desired'], attribs['supported']),
                (attribs['supported'], attribs['desired']),
            ]
        distance = int(attribs['distance'])

        for desired, supported in pairs:
            tag_distances.setdefault(desired, {})[supported] = distance

            # The 'languageInfo' data file contains distances for the
            # unnormalized tag 'sh', but we work mostly with normalized tags,
            # and the data doesn't describe how to cope with this.
            #
            # 'sh' normalizes to 'sr-Latn', and when we're matching languages
            # we aren't matching scripts yet, so when 'sh' appears we add a
            # corresponding match for 'sr'.
            #
            # Because this rule is our own invention, add 1 to the distance so
            # it is a worse match than ones that are explicitly defined in
            # languageInfo.
            if 'sh' in (desired, supported):
                norm_desired = 'sr' if desired == 'sh' else desired
                norm_supported = 'sr' if supported == 'sh' else supported
                # Don't define a non-zero distance for sr <=> sr.
                if norm_desired != norm_supported:
                    tag_distances.setdefault(norm_desired, {})[norm_supported] = (
                        distance + 1
                    )

    return tag_distances


def build_data():
    """Regenerate ``data_dicts.py`` from the IANA registry and CLDR data.

    Reads the registry-derived tables, the CLDR alias and likely-subtag
    data, the language distances and the validity regex, then writes them
    as Python literals to ``data_dicts.py`` (a relative path, so the file
    lands in the current working directory).

    Raises:
        ValueError: If an 'overlong' language alias points at a code that
            already has an alpha3 code recorded.
    """
    lang_scripts = read_iana_registry_suppress_scripts()
    all_scripts = read_iana_registry_scripts()
    macrolanguages = read_iana_registry_macrolanguages()
    iana_replacements = read_iana_registry_replacements()
    language_distances = read_language_distances()

    alias_data = read_cldr_supplemental('aliases')
    likely_subtags = read_cldr_supplemental('likelySubtags')

    # Aliased alpha2 codes can still have alpha3 codes, and no single source
    # lists them: it depends on whether the alias is older or newer than
    # ISO 639-2. Seed the alpha3 codes for aliased alpha2 codes by hand.
    alpha3_mapping = {
        'tl': 'tgl',  # even though it normalizes to 'fil'
        'in': 'ind',
        'iw': 'heb',
        'ji': 'yid',
        'jw': 'jav',
        'sh': 'hbs',
    }

    # CLDR oversimplifies some codes when assigning aliases. For example,
    # 'nor' is the ISO alpha3 code for 'no', but CLDR prefers 'nb' over 'no'
    # and so makes 'nor' an alias of 'nb' -- which already has the alpha3
    # code 'nob'. These overrides undo that so the alpha2 <-> alpha3 mapping
    # stays canonical.
    alpha3_overrides = {'nor': 'no', 'mol': 'mo', 'twi': 'tw', 'bih': 'bh'}

    alpha3_biblio = {}
    norm_macrolanguages = {}
    replacements = {}

    for alias_type in ('languageAlias', 'scriptAlias', 'territoryAlias'):
        aliases = alias_data[alias_type]
        is_language = alias_type == 'languageAlias'
        if is_language:
            # Language aliases start from the IANA replacements.
            table = iana_replacements
            table['root'] = 'und'
        else:
            table = {}
        replacements[alias_type] = table

        for raw_code, info in aliases.items():
            # Lowercase every key so lookups can be case-insensitive.
            code = raw_code.lower()

            # When several replacements are listed, keep only the first. For
            # example, the Soviet Union (SU) simply becomes Russia (RU)
            # rather than some context-sensitive choice of successor state.
            replacement = info['_replacement'].split()[0]
            reason = info['_reason']

            if reason == 'macrolanguage':
                norm_macrolanguages[code] = replacement
                continue

            replacement = alpha3_overrides.get(code, replacement)
            table[code] = replacement
            if not is_language:
                continue

            if reason == 'overlong':
                if replacement in alpha3_mapping:
                    existing = alpha3_mapping[replacement]
                    raise ValueError(
                        f"{code!r} is an alpha3 for {replacement!r}, which"
                        f" already has an alpha3: {existing!r}"
                    )
                alpha3_mapping[replacement] = code
            elif reason == 'bibliographic':
                alpha3_biblio[replacement] = code

    validity_regex = read_validity_regex()

    # Tables of data_dicts.py, in the order they are written.
    sections = [
        (write_python_dict, 'DEFAULT_SCRIPTS', lang_scripts),
        (write_python_dict, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']),
        (write_python_dict, 'LANGUAGE_ALPHA3', alpha3_mapping),
        (write_python_dict, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio),
        (write_python_dict, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias']),
        (write_python_set, 'ALL_SCRIPTS', all_scripts),
        (write_python_dict, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']),
        (write_python_dict, 'MACROLANGUAGES', macrolanguages),
        (write_python_dict, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages),
        (write_python_dict, 'LIKELY_SUBTAGS', likely_subtags),
        (write_python_dict, 'LANGUAGE_DISTANCES', language_distances),
    ]

    # Write the contents of data_dicts.py.
    with open('data_dicts.py', 'w', encoding='utf-8') as outfile:
        print(GENERATED_HEADER, file=outfile)
        print("import re\n", file=outfile)
        for writer, table_name, table_data in sections:
            writer(outfile, table_name, table_data)
        print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)


# Regenerate data_dicts.py when this file is run as a script.
if __name__ == '__main__':
    build_data()
first commit 2024-05-03 04:18:51 +03:00			`import json`
			`import xml.etree.ElementTree as ET`
			`from langcodes.util import data_filename`
			`from langcodes.registry_parser import parse_registry`


			`def read_cldr_supplemental(dataname):`
			`cldr_supp_path = data_filename('cldr-json/cldr-json/cldr-core/supplemental')`
			`filename = data_filename(f'{cldr_supp_path}/{dataname}.json')`
			`fulldata = json.load(open(filename, encoding='utf-8'))`
			`if dataname == 'aliases':`
			`data = fulldata['supplemental']['metadata']['alias']`
			`else:`
			`data = fulldata['supplemental'][dataname]`
			`return data`


			`def read_iana_registry_suppress_scripts():`
			`scripts = {}`
			`for entry in parse_registry():`
			`if entry['Type'] == 'language' and 'Suppress-Script' in entry:`
			`scripts[entry['Subtag']] = entry['Suppress-Script']`
			`return scripts`


			`def read_iana_registry_scripts():`
			`scripts = set()`
			`for entry in parse_registry():`
			`if entry['Type'] == 'script':`
			`scripts.add(entry['Subtag'])`
			`return scripts`


			`def read_iana_registry_macrolanguages():`
			`macros = {}`
			`for entry in parse_registry():`
			`if entry['Type'] == 'language' and 'Macrolanguage' in entry:`
			`macros[entry['Subtag']] = entry['Macrolanguage']`
			`return macros`


			`def read_iana_registry_replacements():`
			`replacements = {}`
			`for entry in parse_registry():`
			`if entry['Type'] == 'language' and 'Preferred-Value' in entry:`
			`# Replacements for language codes`
			`replacements[entry['Subtag']] = entry['Preferred-Value']`
			`elif 'Tag' in entry and 'Preferred-Value' in entry:`
			`# Replacements for entire tags`
			`replacements[entry['Tag'].lower()] = entry['Preferred-Value']`
			`return replacements`


			`def write_python_dict(outfile, name, d):`
			`print(f"{name} = {{", file=outfile)`
			`for key in sorted(d):`
			`value = d[key]`
			`print(f" {key!r}: {value!r},", file=outfile)`
			`print("}", file=outfile)`


			`def write_python_set(outfile, name, s):`
			`print(f"{name} = {{", file=outfile)`
			`for key in sorted(set(s)):`
			`print(f" {key!r},", file=outfile)`
			`print("}", file=outfile)`


			`GENERATED_HEADER = "# This file is generated by build_data.py."`


			`def read_validity_regex():`
			`validity_options = []`
			`for codetype in ('language', 'region', 'script', 'variant'):`
			`validity_path = data_filename(f'cldr/common/validity/{codetype}.xml')`
			`root = ET.fromstring(open(validity_path).read())`
			`matches = root.findall('./idValidity/id')`
			`for match in matches:`
			`for item in match.text.strip().split():`
			`if '~' in item:`
			`assert item[-2] == '~'`
			`prefix = item[:-3]`
			`range_start = item[-3]`
			`range_end = item[-1]`
			`option = f"{prefix}[{range_start}-{range_end}]"`
			`validity_options.append(option)`
			`else:`
			`validity_options.append(item)`
			`options = '|'.join(validity_options)`
			`return f'^({options})$'`


			`def read_language_distances():`
			`language_info_path = data_filename('cldr/common/supplemental/languageInfo.xml')`
			`root = ET.fromstring(open(language_info_path).read())`
			`matches = root.findall(`
			`'./languageMatching/languageMatches[@type="written_new"]/languageMatch'`
			`)`
			`tag_distances = {}`
			`for match in matches:`
			`attribs = match.attrib`
			`n_parts = attribs['desired'].count('_') + 1`
			`if n_parts < 3:`
			`if attribs.get('oneway') == 'true':`
			`pairs = [(attribs['desired'], attribs['supported'])]`
			`else:`
			`pairs = [`
			`(attribs['desired'], attribs['supported']),`
			`(attribs['supported'], attribs['desired']),`
			`]`
			`for (desired, supported) in pairs:`
			`desired_distance = tag_distances.setdefault(desired, {})`
			`desired_distance[supported] = int(attribs['distance'])`

			`# The 'languageInfo' data file contains distances for the unnormalized`
			`# tag 'sh', but we work mostly with normalized tags, and they don't`
			`# describe at all how to cope with this.`
			`#`
			`# 'sh' normalizes to 'sr-Latn', and when we're matching languages we`
			`# aren't matching scripts yet, so when 'sh' appears we'll add a`
			`# corresponding match for 'sr'.`
			`#`
			`# Then because we're kind of making this plan up, add 1 to the distance`
			`# so it's a worse match than ones that are actually clearly defined`
			`# in languageInfo.`
			`if desired == 'sh' or supported == 'sh':`
			`if desired == 'sh':`
			`desired = 'sr'`
			`if supported == 'sh':`
			`supported = 'sr'`
			`if desired != supported:`
			`# don't try to define a non-zero distance for sr <=> sr`
			`desired_distance = tag_distances.setdefault(desired, {})`
			`desired_distance[supported] = int(attribs['distance']) + 1`

			`return tag_distances`


			`def build_data():`
			`lang_scripts = read_iana_registry_suppress_scripts()`
			`all_scripts = read_iana_registry_scripts()`
			`macrolanguages = read_iana_registry_macrolanguages()`
			`iana_replacements = read_iana_registry_replacements()`
			`language_distances = read_language_distances()`

			`alias_data = read_cldr_supplemental('aliases')`
			`likely_subtags = read_cldr_supplemental('likelySubtags')`
			`replacements = {}`

			`# Aliased codes can still have alpha3 codes, and there's no unified source`
			`# about what they are. It depends on whether the alias predates or postdates`
			`# ISO 639-2, which nobody should have to care about. So let's set all the`
			`# alpha3 codes for aliased alpha2 codes here.`
			`alpha3_mapping = {`
			`'tl': 'tgl', # even though it normalizes to 'fil'`
			`'in': 'ind',`
			`'iw': 'heb',`
			`'ji': 'yid',`
			`'jw': 'jav',`
			`'sh': 'hbs',`
			`}`
			`alpha3_biblio = {}`
			`norm_macrolanguages = {}`
			`for alias_type in ['languageAlias', 'scriptAlias', 'territoryAlias']:`
			`aliases = alias_data[alias_type]`
			`# Initially populate 'languageAlias' with the aliases from the IANA file`
			`if alias_type == 'languageAlias':`
			`replacements[alias_type] = iana_replacements`
			`replacements[alias_type]['root'] = 'und'`
			`else:`
			`replacements[alias_type] = {}`
			`for code, value in aliases.items():`
			`# Make all keys lowercase so they can be looked up`
			`# case-insensitively`
			`code = code.lower()`

			`# If there are multiple replacements, take the first one. For example,`
			`# we just replace the Soviet Union (SU) with Russia (RU), instead of`
			`# trying to do something context-sensitive and poorly standardized`
			`# that selects one of the successor countries to the Soviet Union.`
			`replacement = value['_replacement'].split()[0]`
			`if value['_reason'] == 'macrolanguage':`
			`norm_macrolanguages[code] = replacement`
			`else:`
			`# CLDR tries to oversimplify some codes as it assigns aliases.`
			`# For example, 'nor' is the ISO alpha3 code for 'no', but CLDR`
			`# would prefer you use 'nb' over 'no', so it makes 'nor' an`
			`# alias of 'nb'. But 'nb' already has an alpha3 code, 'nob'.`
			`#`
			`# We undo this oversimplification so that we can get a`
			`# canonical mapping between alpha2 and alpha3 codes.`
			`if code == 'nor':`
			`replacement = 'no'`
			`elif code == 'mol':`
			`replacement = 'mo'`
			`elif code == 'twi':`
			`replacement = 'tw'`
			`elif code == 'bih':`
			`replacement = 'bh'`

			`replacements[alias_type][code] = replacement`
			`if alias_type == 'languageAlias':`
			`if value['_reason'] == 'overlong':`
			`if replacement in alpha3_mapping:`
			`raise ValueError(`
			`"{code!r} is an alpha3 for {replacement!r}, which"`
			`" already has an alpha3: {orig!r}".format(`
			`code=code,`
			`replacement=replacement,`
			`orig=alpha3_mapping[replacement],`
			`)`
			`)`
			`alpha3_mapping[replacement] = code`
			`elif value['_reason'] == 'bibliographic':`
			`alpha3_biblio[replacement] = code`

			`validity_regex = read_validity_regex()`

			`# Write the contents of data_dicts.py.`
			`with open('data_dicts.py', 'w', encoding='utf-8') as outfile:`
			`print(GENERATED_HEADER, file=outfile)`
			`print("import re\n", file=outfile)`
			`write_python_dict(outfile, 'DEFAULT_SCRIPTS', lang_scripts)`
			`write_python_dict(`
			`outfile, 'LANGUAGE_REPLACEMENTS', replacements['languageAlias']`
			`)`
			`write_python_dict(outfile, 'LANGUAGE_ALPHA3', alpha3_mapping)`
			`write_python_dict(outfile, 'LANGUAGE_ALPHA3_BIBLIOGRAPHIC', alpha3_biblio)`
			`write_python_dict(outfile, 'SCRIPT_REPLACEMENTS', replacements['scriptAlias'])`
			`write_python_set(outfile, 'ALL_SCRIPTS', all_scripts)`
			`write_python_dict(`
			`outfile, 'TERRITORY_REPLACEMENTS', replacements['territoryAlias']`
			`)`
			`write_python_dict(outfile, 'MACROLANGUAGES', macrolanguages)`
			`write_python_dict(outfile, 'NORMALIZED_MACROLANGUAGES', norm_macrolanguages)`
			`write_python_dict(outfile, 'LIKELY_SUBTAGS', likely_subtags)`
			`write_python_dict(outfile, 'LANGUAGE_DISTANCES', language_distances)`
			`print(f"VALIDITY = re.compile({validity_regex!r})", file=outfile)`


			`if __name__ == '__main__':`
			`build_data()`