ai-content-maker/.venv/Lib/site-packages/bnunicodenormalizer/langs.py

#-*- coding: utf-8 -*-
"""
@author:Bengali.AI
indic generall ref: https://r12a.github.io/scripts/
"""
from __future__ import print_function
#--------------------------------------------------------------------------------------------------------------------------------------------
# language classes
#--------------------------------------------------------------------------------------------------------------------------------------------
class english:
    lower                  =    ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
    upper                  =    ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"]
    punctuations           =    ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?',
                                 '@', '[', '\\', ']', '^', '_', '`','{', '|', '}', '~']
    numbers                =    ["0","1","2","3","4","5","6","7","8","9"]

    valid                  =    sorted(lower+upper+numbers+punctuations)
#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class bangla:
    '''
        * vowel and consonant division according to : http://bn.wikipedia.org/wiki/%E0%A7%8E
        * consonant conjuncts according to: http://bn.wiktionary.org/wiki/উইকিঅভিধান:বাংলা_যুক্তবর্ণের_তালিকা
        * punctuations according to: https://bn.wikipedia.org/wiki/যতিচিহ্ন
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =    "bangla"
    nukta                  =   '়'
    vowels                 =   ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ']
    consonants             =   ['ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ','জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন',
                                'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ','ড়', 'ঢ়', 'য়','ৎ']
    vowel_diacritics       =   ['া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ']
    consonant_diacritics   =   ['ঁ', 'ং', 'ঃ']
    numbers                =   ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
    punctuations           =   ['!', '"', "'", '(', ')', ',', '-', '.', '...', ':', ':-', ';', '<', '=', '>', '?', '[', ']', '{', '}', '।', '৷', '–', '—', '”', '√']
    symbols                =   ['৳']
    connector              =   '্'

    # based on unicode range : \u0980-\u09FF
    non_gylph_unicodes     =   ['\u0984', '\u098d','\u098e','\u0991','\u0992','\u09a9','\u09b1','\u09b3','\u09b4','\u09b5',
                                '\u09ba','\u09bb', '\u09c5','\u09c6','\u09c9','\u09ca','\u09cf','\u09d0','\u09d1','\u09d2',
                                '\u09d3','\u09d4','\u09d5','\u09d6', '\u09d8','\u09d9','\u09da','\u09db','\u09de', '\u09e4',
                                '\u09e5', 'ৼ','৽','৾','\u09ff']
    legacy_symbols         =   ['৺','৻','ঀ','ঌ','ৡ','ঽ','ৠ','৲','৴','৵','৶','৷','৸','৹']

    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   ['এ্য','অ্য','ক্ক','ক্ট','ক্ট্য','ক্ট্র','ক্ত','ক্ত্র','ক্ব','ক্ম','ক্য','ক্র','ক্র্য','ক্ল','ক্ল্য','ক্ষ','ক্ষ্ণ','ক্ষ্ব','ক্ষ্ম','ক্ষ্ম্য','ক্ষ্য',
                                'ক্স','ক্স্য','খ্য','খ্র','গ্ণ','গ্ধ','গ্ধ্য','গ্ধ্র','গ্ন','গ্ন্য','গ্ব','গ্ম','গ্য','গ্র','গ্র্য','গ্ল','গ্ল্য','ঘ্ন','ঘ্য','ঘ্র',
                                'ঙ্ক','ঙ্ক্ত','ঙ্ক্য','ঙ্ক্ষ','ঙ্খ','ঙ্খ্য','ঙ্গ','ঙ্গ্য','ঙ্ঘ','ঙ্ঘ্য','ঙ্ঘ্র','ঙ্ম','চ্চ','চ্ছ','চ্ছ্ব','চ্ছ্র','চ্ঞ','চ্ব','চ্য','জ্জ',
                                'জ্জ্ব','জ্ঝ','জ্ঞ','জ্ব','জ্য','জ্র','ঞ্চ','ঞ্ছ','ঞ্জ','ঞ্ঝ','ট্ট','ট্ব','ট্ম','ট্য','ট্র','ট্র্য','ড্ড','ড্ব','ড্য','ড্র',
                                'ড্র্য','ঢ্য','ঢ্র','ণ্ট','ণ্ঠ','ণ্ঠ্য','ণ্ড','ণ্ড্য','ণ্ড্র','ণ্ঢ','ণ্ণ','ণ্ব','ণ্ম','ণ্য','ত্ত','ত্ত্ব','ত্ত্য','ত্থ','ত্ন','ত্ব',
                                'ত্ম','ত্ম্য','ত্য','ত্র','ত্র্য','থ্ব','থ্য','থ্র','থ্র্য','দ্গ','দ্ঘ','দ্দ','দ্দ্ব','দ্ধ','দ্ব','দ্ভ','দ্ভ্র','দ্ম','দ্য','দ্র',
                                'দ্র্য','ধ্ন','ধ্ব','ধ্ম','ধ্য','ধ্র','ন্ক','ন্ট','ন্ট্য','ন্ট্র','ন্ট্র্য','ন্ঠ','ন্ড','ন্ড্ব','ন্ড্য','ন্ড্র','ন্ত','ন্ত্ব','ন্ত্য','ন্ত্র',
                                'ন্ত্র্য','ন্থ','ন্থ্য','ন্থ্র','ন্দ','ন্দ্ব','ন্দ্য','ন্দ্র','ন্ধ','ন্ধ্য','ন্ধ্র','ন্ন','ন্ব','ন্ম','ন্য','ন্শ্য','ন্স','ন্স্য','প্ট','প্ট্য',
                                'প্ত','প্ন','প্প','প্য','প্র','প্র্য','প্ল','প্ল্য','প্স','ফ্য','ফ্র','ফ্র্য','ফ্ল','ফ্ল্য','ব্জ','ব্দ','ব্ধ','ব্ব','ব্য','ব্র',
                                'ব্র্য','ব্ল','ভ্ব','ভ্য','ভ্র','ম্ন','ম্ন্য','ম্প','ম্প্য','ম্প্র','ম্ফ','ম্ব','ম্ব্র','ম্ভ','ম্ভ্র','ম্ম','ম্য','ম্র','ম্ল','য্য',
                                'র্ক','র্ক্ট','র্ক্য','র্খ','র্গ','র্গ্য','র্গ্র','র্ঘ','র্ঘ্য','র্চ','র্চ্য','র্ছ','র্জ','র্জ্ঞ','র্জ্য','র্ঝ','র্ট','র্ট্য','র্ট্র','র্ড',
                                'র্ড্র','র্ঢ্য','র্ণ','র্ণ্য','র্ত','র্ত্ম','র্ত্য','র্ত্র','র্থ','র্থ্য','র্দ','র্দ্ব','র্দ্র','র্ধ','র্ধ্ব','র্ন','র্ন্ড','র্প','র্প্ট','র্প্ল',
                                'র্ফ','র্ব','র্ব্য','র্ভ','র্ম','র্ম্থ','র্ম্প','র্ম্য','র্য','র্ল','র্ল্ড','র্ল্য','র্শ','র্শ্ব','র্শ্য','র্ষ','র্ষ্য','র্স','র্স','র্স্ট',
                                'র্স্ম','র্স্য','র্হ','র্হ্য','র্হ্য','র‍্য','ল্ক','ল্ক্য','ল্গ','ল্চ','ল্ট','ল্ট্য','ল্ট্র','ল্ড','ল্ড্য','ল্ড্র','ল্প','ল্ফ','ল্ব','ল্ব্য',
                                'ল্ভ','ল্ম','ল্য','ল্ল','শ্চ','শ্ছ','শ্ন','শ্ব','শ্ম','শ্য','শ্র','শ্র্য','শ্ল','ষ্ক','ষ্ক্র','ষ্ট','ষ্ট্য','ষ্ট্র','ষ্ঠ','ষ্ঠ্য',
                                'ষ্ণ','ষ্প','ষ্প্র','ষ্ফ','ষ্ব','ষ্ম','ষ্য','স্ক','স্ক্য','স্ক্র','স্ক্র্য','স্খ','স্চ','স্ট','স্ট্য','স্ট্র','স্ট্র্য','স্ত','স্ত্ব','স্ত্য',
                                'স্ত্র','স্থ','স্থ্য','স্ন','স্ন্য','স্প','স্প্য','স্প্র','স্প্র্য','স্প্ল','স্প্ল্য','স্ফ','স্ব','স্ম','স্ম্য','স্য','স্র','স্ল','স্ল্য','হ্ণ',
                                'হ্ন','হ্ব','হ্ম','হ্য','হ্র','হ্ল','য়্য','ব্ল্য','র্ন্ত','ঠ্য','ভ্ল']

    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps             =   {'ঀ':'৭',
                                'ঌ':'৯',
                                'ৡ':'৯',
                                '৵':'৯',
                                '৻':'ৎ',
                                'ৠ':'ঋ',
                                'ঽ':'ই'}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------
    nukta_map              =   {'য':'য়',
                                'ব':'র',
                                'ড':'ড়',
                                'ঢ':'ঢ়'}
    diacritic_map           =   {'ো':'ো',
                                'ৌ':'ৌ',
                                'অা':'আ',
                                'ৄ':'ৃ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]
            * khondo to

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+['ৎ']+numbers+punctuations)

class devanagari:
    '''
        * vowel and consonant division according to :https://unicode-table.com/en/blocks/devanagari/
        * consonant conjuncts according to: https://en.wikipedia.org/wiki/Devanagari_conjuncts
        * punctuations according to: https://www.learnsanskrit.org/guide/devanagari/numerals-and-punctuation/
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "devanagari"
    nukta                  =   '़'
    vowels                 =   ['ऄ', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ऒ', 'ओ', 'औ','ॠ', 'ॡ', 'ॢ', 'ॣ','ॲ','ॳ', 'ॴ', 'ॵ','ॶ', 'ॷ']
    consonants             =   ['क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'ऩ', 'प', 'फ', 'ब',
                                'भ', 'म', 'य', 'र', 'ऱ', 'ल', 'ळ', 'ऴ', 'व', 'श', 'ष', 'स', 'ह','क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़','ॸ', 'ॹ', 'ॺ']

    vowel_diacritics       =   ['ऺ', 'ऻ','ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ','ॎ', 'ॏ','ॕ','ॖ', 'ॗ']


    consonant_diacritics   =   ['ऀ', 'ँ', 'ं', 'ः']

    numbers                =   ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
    punctuations           =   ['।', '॥', ':', ';', '!', '—', '?', 'ऽ']
    symbols                =   []
    connector              =   '्'


    non_gylph_unicodes     =   []

    legacy_symbols         =   []
    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols

    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []

    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------

    nukta_map              =   {'क':'क़',
                                'ख':'ख़',
                                'ग':'ग़',
                                'ज':'ज़',
                                'ड':'ड़',
                                'ढ':'ढ़',
                                'फ':'फ़',
                                'य':'य़',
                                'ळ':'ऴ',
                                'न':'ऩ'}

    diacritic_map          =   {'ाे':'ो',
                                'ाै':'ौ',
                                'अा':'आ',
                                'अो':'ओ',
                                'अौ': 'औ',
                                'एे': 'ऐ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)

#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class gujarati:
    '''
        * vowel and consonant division according to : https://unicode-table.com/en/blocks/gujarati/
        * consonant conjuncts according to: https://en.wikipedia.org/wiki/Gujarati_script
        * punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "gujarati"
    nukta                  =   '઼'
    vowels                 =   ['અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'ઌ', 'ઍ', 'એ', 'ઐ', 'ઑ', 'ઓ', 'ઔ','ૠ', 'ૡ', 'ૢ', 'ૣ']
    consonants             =   ['ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ', 'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન', 'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'ળ', 'વ', 'શ', 'ષ', 'સ', 'હ','ૹ']
    vowel_diacritics       =   ['ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ૄ', 'ૅ', 'ે', 'ૈ', 'ૉ', 'ો', 'ૌ']
    consonant_diacritics   =   ['ઁ', 'ં', 'ઃ']
    numbers                =   ['૦', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯']
    punctuations           =   ['ઽ',',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
    symbols                =   ['૱']
    connector              =   '્'


    # based on unicode range : \u0980-\u09FF
    non_gylph_unicodes     =   ['\u0a80', '\u0a84', '\u0aa9', '\u0ab1', '\u0ab4', '\u0aba', '\u0abb', '\u0ac6', '\u0aca', '\u0ace', '\u0acf',
                                '\u0ad1', '\u0ad2', '\u0ad3', '\u0ad4', '\u0ad5', '\u0ad6', '\u0ad7', '\u0ad8', '\u0ad9', '\u0ada', '\u0adb',
                                '\u0adc', '\u0add', '\u0ade', '\u0adf', '\u0ae4', '\u0ae5', '\u0af1', '\u0af2', '\u0af3', '\u0af4', '\u0af5',
                                '\u0af6', '\u0af7', '\u0af8']

    legacy_symbols         =   []

    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []

    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------

    nukta_map              =   {} # NONE

    diacritic_map          =   {'ાે': 'ો',
                                'ાૅ': 'ૉ',
                                'ાૈ': 'ૌ',
                                'અા': 'આ',
                                'અે': 'એ',
                                'અો': 'ઓ',
                                'અૅ': 'ઍ',
                                'અૉ': 'ઑ',
                                'અૈ': 'ઐ',
                                'અૌ': 'ઔ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)

#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class odiya:
    '''
        * vowel and consonant division according to : https://unicode-table.com/en/blocks/oriya/
        * consonant conjuncts according to: https://en.wikipedia.org/wiki/Odia_script
        * punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "odiya"
    nukta                  =   '଼'
    vowels                 =   ['ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଌ', 'ଏ', 'ଐ', 'ଓ', 'ଔ','ୠ', 'ୡ']
    consonants             =   ['କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ',
                                'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ', 'ଵ',
                                'ଶ', 'ଷ', 'ସ', 'ହ','ଡ଼', 'ଢ଼', 'ୟ','ୱ']
    vowel_diacritics       =   ['ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'ୄ', 'େ', 'ୈ','ୋ', 'ୌ','ୢ', 'ୣ']
    consonant_diacritics   =   ['ଁ', 'ଂ', 'ଃ']
    numbers                =   ['୦', '୧', '୨', '୩', '୪', '୫', '୬', '୭', '୮', '୯']
    punctuations           =   [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
    symbols                =   ['୲', '୳', '୴', '୵', '୶', '୷']
    connector              =   '୍'

    # based on unicode range : \u0980-\u09FF
    non_gylph_unicodes     =   ['\u0b00', '\u0b04', '\u0b0d', '\u0b0e', '\u0b11', '\u0b12', '\u0b29', '\u0b31', '\u0b34', '\u0b3a', '\u0b3b',
                                '\u0b45', '\u0b46', '\u0b49', '\u0b4a', '\u0b4e', '\u0b4f', '\u0b50', '\u0b51', '\u0b52', '\u0b53', '\u0b54',
                                '\u0b58', '\u0b59', '\u0b5a', '\u0b5b', '\u0b5e', '\u0b64', '\u0b65', '\u0b78', '\u0b79', '\u0b7a', '\u0b7b',
                                '\u0b7c', '\u0b7d', '\u0b7e', '\u0b7f']
    legacy_symbols         =   []

    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []

    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------
    nukta_map              =   {'ଡ':'ଡ଼',
                                'ଢ':'ଢ଼'}

    diacritic_map          =   {'ୋ':'ୋ',
                                'ୈା':'ୌ',
                                'ଓୗ':'ଔ',
                                'ଏୗ':'ଐ',
                                'ଅା':'ଆ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)


#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------

class tamil:
    '''
        * vowel and consonant division according to : https://unicode-table.com/en/blocks/tamil/
        * consonant conjuncts according to: https://en.wikipedia.org/wiki/Tamil_script
        * punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "tamil"
    nukta                  =   ''
    vowels                 =   ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ']
    consonants             =   ['க', 'ங', 'ச', 'ஜ', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ற', 'ல', 'ள', 'ழ', 'வ', 'ஶ', 'ஷ', 'ஸ', 'ஹ']
    vowel_diacritics       =   ['ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை','ொ', 'ோ', 'ௌ']
    consonant_diacritics   =   ['ஂ', 'ஃ']
    numbers                =   ['௦', '௧', '௨', '௩', '௪', '௫', '௬', '௭', '௮', '௯','௰', '௱', '௲']
    punctuations           =   [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
    symbols                =   ['௳', '௴', '௵','௹','௶', '௷', '௸','௺']
    connector              =   '்'

    # based on unicode range : \u0980-\u09FF
    non_gylph_unicodes     =   ['\u0b80', '\u0b81', '\u0b84', '\u0b8b', '\u0b8c', '\u0b8d', '\u0b91', '\u0b96', '\u0b97', '\u0b98', '\u0b9b',
                                '\u0b9d', '\u0ba0', '\u0ba1', '\u0ba2', '\u0ba5', '\u0ba6', '\u0ba7', '\u0bab', '\u0bac', '\u0bad', '\u0bba',
                                '\u0bbb', '\u0bbc', '\u0bbd', '\u0bc3', '\u0bc4', '\u0bc5', '\u0bc9', '\u0bce', '\u0bcf', '\u0bd8', '\u0bd9',
                                '\u0bda', '\u0bdb', '\u0bdc', '\u0bdd', '\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0bd1',
                                '\u0bd2', '\u0bd3', '\u0bd4', '\u0bd5', '\u0bd6', '\u0bd8', '\u0bd9', '\u0bda', '\u0bdb', '\u0bdc', '\u0bdd',
                                '\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0be4', '\u0be5', '\u0bfb', '\u0bfc', '\u0bfd',
                                '\u0bfe', '\u0bff']
    legacy_symbols         =   []

    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []
    # this is a customizeable map : this map is purely based on visual similiarity

    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------

    nukta_map              =   {} # NONE

    diacritic_map          =   {'ொ':'ொ',
                                "ோ":'ோ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)


#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class panjabi:
    '''
        * vowel and consonant division according to : https://unicode-table.com/en/blocks/gurmukhi/
        * consonant conjuncts according to: NOT USED
        * punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "panjabi"
    nukta                  =   '਼'
    vowels                 =   ['ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', 'ਏ', 'ਐ', 'ਓ', 'ਔ','ੲ', 'ੳ']
    consonants             =   ['ਕ', 'ਖ', 'ਗ', 'ਘ', 'ਙ', 'ਚ', 'ਛ', 'ਜ', 'ਝ', 'ਞ', 'ਟ', 'ਠ', 'ਡ', 'ਢ', 'ਣ', 'ਤ', 'ਥ',
                                'ਦ', 'ਧ', 'ਨ', 'ਪ', 'ਫ', 'ਬ', 'ਭ', 'ਮ', 'ਯ', 'ਰ', 'ਲ', 'ਲ਼', 'ਵ', 'ਸ਼', 'ਸ', 'ਹ','ਖ਼',
                                'ਗ਼', 'ਜ਼', 'ੜ', 'ਫ਼']
    vowel_diacritics       =   ['ਾ', 'ਿ', 'ੀ', 'ੁ', 'ੂ', 'ੇ', 'ੈ', 'ੋ', 'ੌ']
    consonant_diacritics   =   ['ਁ', 'ਂ', 'ਃ'] # Not found!
    numbers                =   ['੦', '੧', '੨', '੩', '੪', '੫', '੬', '੭', '੮', '੯']
    punctuations           =   [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
    symbols                =   []
    connector              =   '੍'

    # based on unicode range : \u0980-\u09FF
    non_gylph_unicodes     =   ['\u0a00', '\u0a04', '\u0a0b', '\u0a0c', '\u0a0d', '\u0a0e', '\u0a11', '\u0a12', '\u0a29', '\u0a31',
                                '\u0a34', '\u0a37', '\u0a3a', '\u0a3b', '\u0a3d', '\u0a43', '\u0a44', '\u0a45', '\u0a46', '\u0a49',
                                '\u0a4a', '\u0a4e', '\u0a4f', '\u0a50', '\u0a52', '\u0a53', '\u0a54', '\u0a55', '\u0a56', '\u0a57',
                                '\u0a58', '\u0a5d', '\u0a5f', '\u0a60', '\u0a61', '\u0a62', '\u0a63', '\u0a64', '\u0a65', '\u0a76',
                                '\u0a77', '\u0a78', '\u0a79', '\u0a7a', '\u0a7b', '\u0a7c', '\u0a7d', '\u0a7e', '\u0a7f']
    legacy_symbols         =   []

    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []

    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------
    nukta_map              =   {'ਖ':'ਖ਼',
                                'ਗ':'ਗ਼',
                                'ਜ':'ਜ਼',
                                'ਲ':'ਲ਼',
                                'ਸ':'ਸ਼',
                                'ਫ':'ਫ਼'
                                }
    diacritic_map          =   {'ੇੋ': 'ੌ',
                                'ਾੇ': 'ੀ',
                                'ਾੋ': 'ੀ',
                                'ਅੈ': 'ਐ',
                                'ਅੌ': 'ਔ',
                                'ਅਾ': 'ਆ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)


#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
class malayalam:
    '''
        * vowel and consonant division according to: https://unicode-table.com/en/blocks/malayalam/
        * consonant conjuncts according to: Self Generated
        * punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
    '''

    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "malayalam"
    nukta                  =   '়'
    vowels                 =   ['അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ഌ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ','ൠ', 'ൡ']
    consonants             =   ['ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ',
                                'ദ', 'ധ', 'ന', 'ഩ', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'റ', 'ല', 'ള', 'ഴ', 'വ', 'ശ', 'ഷ',
                                'സ', 'ഹ', 'ഺ']
    vowel_diacritics       =   ['ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ',     'െ', 'േ', 'ൈ','ൊ', 'ോ', 'ൌ','ൗ','ൢ', 'ൣ']
    consonant_diacritics   =   ['ഀ', 'ഁ', 'ം', 'ഃ', 'ഄ']
    numbers                =   ['൦', '൧', '൨', '൩', '൪', '൫', '൬', '൭', '൮', '൯','൘', '൙', '൚', '൛', '൜', '൝', '൞','൰', '൱', '൲','൳', '൴', '൵', '൶', '൷', '൸']
    punctuations           =    [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
    symbols                =   ['ഽ', '൏', '൹']


    connector              =   '്' # There are two other this shit '഻', '഼'

    # based on unicode range : \u0980-\u09FF
    non_gylph_unicodes     =   ['\u0d64', '\u0d65', '\u0d50', '\u0d51', '\u0d52', '\u0d53', '\u0d49', '\u0d45', '\u0d11', '\u0d0d']
    legacy_symbols         =   ['ൟ']

    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------

    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []

    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------

    #---------------------------------------------------normalization maps---------------------------------------------------------
    nukta_map              =   {}
    diacritic_map          =   {} #NONE--<>
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]

    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)


class sylhetinagri:
    '''
        * according to asif sushmit
    '''
    #-----------------------------------------------------basic-------------------------------------------------------------------
    iden                   =   "sylhetinagri"
    nukta                  =   '' #done
    vowels                 =   ['ꠀ', 'ꠁ', 'ꠃ', 'ꠄ', 'ꠅ'] #done
    consonants             =   ['ꠇ', 'ꠈ', 'ꠉ', 'ꠊ',
                                 'ꠌ', 'ꠍ', 'ꠎ', 'ꠏ',
                                 'ꠐ', 'ꠑ', 'ꠒ', 'ꠓ',
                                 'ꠔ', 'ꠕ', 'ꠖ', 'ꠗ', 'ꠘ',
                                 'ꠙ', 'ꠚ', 'ꠛ', 'ꠜ', 'ꠝ',
                                 'ꠞ', 'ꠟ', 'ꠠ', 'ꠡ', 'ꠢ'] #done
    vowel_diacritics       =   ['ꠣ', 'ꠤ', 'ꠥ', 'ꠦ', 'ꠧ'] #done
    consonant_diacritics   =   ['ꠋ', 'ꠂ', '꠬'] #done
    numbers                =   ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
    punctuations           =   ['।', '॥', ':', ';', '!', '—', '?', 'ऽ', '.', ',']
    symbols                =   ['꠨', '꠩', '꠪', '꠫']
    connector              =   '꠆' #done
    non_gylph_unicodes     =   []
    legacy_symbols         =   []
    non_chars              =   numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
    #-----------------------------------------------------basic-------------------------------------------------------------------
    #---------------------------------------------------changeables----------------------------------------------------------------
    conjuncts              =   []
    # this is a customizeable map : this map is purely based on visual similiarity
    legacy_maps            =   {}
    #---------------------------------------------------changeables----------------------------------------------------------------
    #---------------------------------------------------normalization maps---------------------------------------------------------
    nukta_map              =   {}
    diacritic_map           =   {'ꠦꠣ':'ꠧ',
                                'ꠣꠦ':'ꠧ'}
    #---------------------------------------------------normalization maps---------------------------------------------------------
    diacritics             =   sorted(vowel_diacritics+consonant_diacritics)
    used                   =   sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
    valid                  =   sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
    complex_roots          =   sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
    # these unicodes can not start a word
    invalid_starts         =   sorted(diacritics+[connector])
    # invalid connector cases
    '''
        a connector can not be sorrounded by/ can not come after or before:
            * the vowels
            * the diacritics
            * another connector [double consecutive hosonto]
    '''
    invalid_connectors     =    sorted(invalid_starts+vowels+numbers+punctuations)

#--------------------------------------------------------------------------------------------------------------------------------------------
#############################################################################################################################################
#--------------------------------------------------------------------------------------------------------------------------------------------
languages={}
languages["english"]    =english
languages["bangla"]     =bangla
languages["devanagari"] =devanagari
languages["gujarati"]   =gujarati
languages["odiya"]      =odiya
languages["tamil"]      =tamil
languages["panjabi"]    =panjabi
languages["malayalam"]  =malayalam
languages["sylhetinagri"]=sylhetinagri