590 lines
42 KiB
590 lines
42 KiB
#-*- coding: utf-8 -*-
# General reference for Indic scripts: https://r12a.github.io/scripts/
from __future__ import print_function
# language classes
class english:
    """Character inventory for English (basic Latin) text.

    All attributes are class-level lists of single-character strings:

    * ``lower`` / ``upper``: the 26 ASCII letters in each case.
    * ``punctuations``: the 32 ASCII punctuation marks.
    * ``numbers``: the ASCII digits 0-9.
    * ``valid``: every character above, sorted by code point.
    """
    lower = list("abcdefghijklmnopqrstuvwxyz")
    upper = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    punctuations = list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
    numbers = list("0123456789")
    # Sorted union of all accepted characters.
    valid = sorted(lower + upper + numbers + punctuations)
class bangla:
# Character inventory and normalization tables for the Bangla script (see iden below).
# NOTE(review): in this view the class body is not indented and the "* ..." lines are bare
# text (they read like docstring content whose quotes are not visible) - confirm against the full file.
* vowel and consonant division according to : http://bn.wikipedia.org/wiki/%E0%A7%8E
* consonant conjuncts according to: http://bn.wiktionary.org/wiki/উইকিঅভিধান:বাংলা_যুক্তবর্ণের_তালিকা
* punctuations according to: https://bn.wikipedia.org/wiki/যতিচিহ্ন
# identifier string for this script
iden = "bangla"
# combining nukta sign
nukta = '়'
# independent vowels
vowels = ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ']
# consonant letters (the last four entries are the nukta forms and khondo to)
consonants = ['ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ','জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন',
'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ','ড়', 'ঢ়', 'য়','ৎ']
# dependent vowel signs
vowel_diacritics = ['া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ']
# signs listed as consonant diacritics
consonant_diacritics = ['ঁ', 'ং', 'ঃ']
# Bangla digits 0-9
numbers = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
# accepted punctuation; note that some entries are multi-character strings ('...', ':-')
punctuations = ['!', '"', "'", '(', ')', ',', '-', '.', '...', ':', ':-', ';', '<', '=', '>', '?', '[', ']', '{', '}', '।', '৷', '–', '—', '”', '√']
symbols = ['৳']
# the sign used to join consonants (called "hosonto" in the notes further down)
connector = '্'
# based on unicode range : \u0980-\u09FF
non_gylph_unicodes = ['\u0984', '\u098d','\u098e','\u0991','\u0992','\u09a9','\u09b1','\u09b3','\u09b4','\u09b5',
'\u09ba','\u09bb', '\u09c5','\u09c6','\u09c9','\u09ca','\u09cf','\u09d0','\u09d1','\u09d2',
'\u09d3','\u09d4','\u09d5','\u09d6', '\u09d8','\u09d9','\u09da','\u09db','\u09de', '\u09e4',
'\u09e5', 'ৼ','৽','৾','\u09ff']
legacy_symbols = ['৺','৻','ঀ','ঌ','ৡ','ঽ','ৠ','৲','৴','৵','৶','৷','৸','৹']
# everything that is neither a letter nor a diacritic
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
# NOTE(review): the conjuncts list is opened here but its closing bracket is not visible in
# this view; the three maps below are likewise cut after their first entry - confirm the full literals.
conjuncts = ['এ্য','অ্য','ক্ক','ক্ট','ক্ট্য','ক্ট্র','ক্ত','ক্ত্র','ক্ব','ক্ম','ক্য','ক্র','ক্র্য','ক্ল','ক্ল্য','ক্ষ','ক্ষ্ণ','ক্ষ্ব','ক্ষ্ম','ক্ষ্ম্য','ক্ষ্য',
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {'ঀ':'৭',
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'য':'য়',
diacritic_map = {'ো':'ো',
#---------------------------------------------------normalization maps---------------------------------------------------------
# all diacritics, sorted by code point
diacritics = sorted(vowel_diacritics+consonant_diacritics)
# letters, diacritics and digits
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
# used + space, punctuation, the connector and the zero-width joiner / non-joiner
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
# space, letters, digits, punctuation, symbols and conjuncts (no diacritics)
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
a connector can not be sorrounded by/ can not come after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
* khondo to
invalid_connectors = sorted(invalid_starts+vowels+['ৎ']+numbers+punctuations)
class devanagari:
# Character inventory and normalization tables for the Devanagari script (see iden below).
# NOTE(review): in this view the class body is not indented and the "* ..." lines are bare
# text (they read like docstring content whose quotes are not visible) - confirm against the full file.
* vowel and consonant division according to :https://unicode-table.com/en/blocks/devanagari/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Devanagari_conjuncts
* punctuations according to: https://www.learnsanskrit.org/guide/devanagari/numerals-and-punctuation/
# identifier string for this script
iden = "devanagari"
# combining nukta sign
nukta = '़'
# independent vowels
vowels = ['ऄ', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ऒ', 'ओ', 'औ','ॠ', 'ॡ', 'ॢ', 'ॣ','ॲ','ॳ', 'ॴ', 'ॵ','ॶ', 'ॷ']
# consonant letters, including nukta forms
consonants = ['क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'ऩ', 'प', 'फ', 'ब',
'भ', 'म', 'य', 'र', 'ऱ', 'ल', 'ळ', 'ऴ', 'व', 'श', 'ष', 'स', 'ह','क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़','ॸ', 'ॹ', 'ॺ']
# dependent vowel signs
vowel_diacritics = ['ऺ', 'ऻ','ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ','ॎ', 'ॏ','ॕ','ॖ', 'ॗ']
# signs listed as consonant diacritics
consonant_diacritics = ['ऀ', 'ँ', 'ं', 'ः']
# Devanagari digits 0-9
numbers = ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
punctuations = ['।', '॥', ':', ';', '!', '—', '?', 'ऽ']
symbols = []
# the sign used to join consonants
connector = '्'
non_gylph_unicodes = []
legacy_symbols = []
# everything that is neither a letter nor a diacritic
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
# NOTE(review): nukta_map is cut after its first entry in this view - confirm the full literal.
nukta_map = {'क':'क़',
# maps a two-code-point sequence to the single code point it should be replaced with
diacritic_map = {'ाे':'ो',
'अौ': 'औ',
'एे': 'ऐ'}
#---------------------------------------------------normalization maps---------------------------------------------------------
# all diacritics, sorted by code point
diacritics = sorted(vowel_diacritics+consonant_diacritics)
# letters, diacritics and digits
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
# used + space, punctuation, the connector and the zero-width joiner / non-joiner
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
# space, letters, digits, punctuation, symbols and conjuncts (no diacritics)
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
a connector can not be sorrounded by/ can not come after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class gujarati:
    """Character inventory and normalization tables for the Gujarati script.

    Sources:
        * vowel and consonant division according to:
          https://unicode-table.com/en/blocks/gujarati/
        * consonant conjuncts according to:
          https://en.wikipedia.org/wiki/Gujarati_script
        * punctuations according to:
          https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py

    Every attribute is a class-level constant. The literal lists come first;
    the derived, sorted lists (``diacritics``, ``used``, ``valid``,
    ``complex_roots``, ``invalid_starts``, ``invalid_connectors``) are built
    from them at class-creation time.
    """
    iden = "gujarati"
    nukta = '઼'
    vowels = ['અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'ઌ', 'ઍ', 'એ', 'ઐ', 'ઑ', 'ઓ', 'ઔ', 'ૠ', 'ૡ', 'ૢ', 'ૣ']
    consonants = ['ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ', 'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ',
                  'ધ', 'ન', 'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'ળ', 'વ', 'શ', 'ષ', 'સ', 'હ', 'ૹ']
    vowel_diacritics = ['ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ૄ', 'ૅ', 'ે', 'ૈ', 'ૉ', 'ો', 'ૌ']
    consonant_diacritics = ['ઁ', 'ં', 'ઃ']
    numbers = ['૦', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯']
    # Some entries are multi-character strings (':-', '...').
    punctuations = ['ઽ', ',', ';', '।', '?', '!', ':', '—', ':-', "'", '”', '(', ')', '{', '}', '[', ']',
                    '√', '<', '>', '=', '...', '.', '-']
    symbols = ['૱']
    connector = '્'
    # Code points without a glyph, based on the unicode range \u0A80-\u0AFF.
    non_gylph_unicodes = ['\u0a80', '\u0a84', '\u0aa9', '\u0ab1', '\u0ab4', '\u0aba', '\u0abb', '\u0ac6', '\u0aca', '\u0ace', '\u0acf',
                          '\u0ad1', '\u0ad2', '\u0ad3', '\u0ad4', '\u0ad5', '\u0ad6', '\u0ad7', '\u0ad8', '\u0ad9', '\u0ada', '\u0adb',
                          '\u0adc', '\u0add', '\u0ade', '\u0adf', '\u0ae4', '\u0ae5', '\u0af1', '\u0af2', '\u0af3', '\u0af4', '\u0af5',
                          '\u0af6', '\u0af7', '\u0af8']
    legacy_symbols = []
    # Everything that is neither a letter nor a diacritic.
    non_chars = numbers + punctuations + symbols + non_gylph_unicodes + legacy_symbols
    conjuncts = []
    # This is a customizable map: it is purely based on visual similarity.
    legacy_maps = {}
    # ------------------------------ normalization maps ------------------------------
    nukta_map = {}  # NONE
    # Maps a two-code-point sequence to the single code point that replaces it.
    diacritic_map = {
        'ાે': 'ો',
        'ાૅ': 'ૉ',
        'ાૈ': 'ૌ',
        'અા': 'આ',
        'અે': 'એ',
        'અો': 'ઓ',
        'અૅ': 'ઍ',
        'અૉ': 'ઑ',
        'અૈ': 'ઐ',
        'અૌ': 'ઔ',
    }
    # ------------------------------ normalization maps ------------------------------
    diacritics = sorted(vowel_diacritics + consonant_diacritics)
    used = sorted(vowels + consonants + vowel_diacritics + consonant_diacritics + numbers)
    # used + space, punctuation, the connector and the zero-width joiner / non-joiner.
    valid = sorted([' '] + used + punctuations + [connector] + ["\u200d", "\u200c"])
    complex_roots = sorted([' '] + vowels + consonants + numbers + punctuations + symbols + conjuncts)
    # These unicodes can not start a word.
    invalid_starts = sorted(diacritics + [connector])
    # Invalid connector cases: a connector can not be surrounded by / can not
    # come after or before:
    #   * the vowels
    #   * the diacritics
    #   * another connector [double consecutive hosonto]
    invalid_connectors = sorted(invalid_starts + vowels + numbers + punctuations)
class odiya:
# Character inventory and normalization tables for the Odia script (see iden below).
# NOTE(review): in this view the class body is not indented and the "* ..." lines are bare
# text (they read like docstring content whose quotes are not visible) - confirm against the full file.
* vowel and consonant division according to : https://unicode-table.com/en/blocks/oriya/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Odia_script
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
# identifier string for this script
iden = "odiya"
# combining nukta sign
nukta = '଼'
# independent vowels
vowels = ['ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଌ', 'ଏ', 'ଐ', 'ଓ', 'ଔ','ୠ', 'ୡ']
# consonant letters, including nukta forms
consonants = ['କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ',
'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ', 'ଵ',
'ଶ', 'ଷ', 'ସ', 'ହ','ଡ଼', 'ଢ଼', 'ୟ','ୱ']
# dependent vowel signs
vowel_diacritics = ['ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'ୄ', 'େ', 'ୈ','ୋ', 'ୌ','ୢ', 'ୣ']
# signs listed as consonant diacritics
consonant_diacritics = ['ଁ', 'ଂ', 'ଃ']
# Odia digits 0-9
numbers = ['୦', '୧', '୨', '୩', '୪', '୫', '୬', '୭', '୮', '୯']
# accepted punctuation; note that some entries are multi-character strings (':-', '...')
punctuations = [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
symbols = ['୲', '୳', '୴', '୵', '୶', '୷']
# the sign used to join consonants
connector = '୍'
# based on unicode range : \u0B00-\u0B7F
non_gylph_unicodes = ['\u0b00', '\u0b04', '\u0b0d', '\u0b0e', '\u0b11', '\u0b12', '\u0b29', '\u0b31', '\u0b34', '\u0b3a', '\u0b3b',
'\u0b45', '\u0b46', '\u0b49', '\u0b4a', '\u0b4e', '\u0b4f', '\u0b50', '\u0b51', '\u0b52', '\u0b53', '\u0b54',
'\u0b58', '\u0b59', '\u0b5a', '\u0b5b', '\u0b5e', '\u0b64', '\u0b65', '\u0b78', '\u0b79', '\u0b7a', '\u0b7b',
'\u0b7c', '\u0b7d', '\u0b7e', '\u0b7f']
legacy_symbols = []
# everything that is neither a letter nor a diacritic
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
# NOTE(review): both maps below are cut after their first entry in this view - confirm the full literals.
nukta_map = {'ଡ':'ଡ଼',
diacritic_map = {'ୋ':'ୋ',
#---------------------------------------------------normalization maps---------------------------------------------------------
# all diacritics, sorted by code point
diacritics = sorted(vowel_diacritics+consonant_diacritics)
# letters, diacritics and digits
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
# used + space, punctuation, the connector and the zero-width joiner / non-joiner
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
# space, letters, digits, punctuation, symbols and conjuncts (no diacritics)
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
a connector can not be sorrounded by/ can not come after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class tamil:
# Character inventory and normalization tables for the Tamil script (see iden below).
# NOTE(review): in this view the class body is not indented and the "* ..." lines are bare
# text (they read like docstring content whose quotes are not visible) - confirm against the full file.
* vowel and consonant division according to : https://unicode-table.com/en/blocks/tamil/
* consonant conjuncts according to: https://en.wikipedia.org/wiki/Tamil_script
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
# identifier string for this script
iden = "tamil"
# no nukta sign is defined for this script (empty string)
nukta = ''
# independent vowels
vowels = ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ']
# consonant letters
consonants = ['க', 'ங', 'ச', 'ஜ', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ற', 'ல', 'ள', 'ழ', 'வ', 'ஶ', 'ஷ', 'ஸ', 'ஹ']
# dependent vowel signs
vowel_diacritics = ['ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை','ொ', 'ோ', 'ௌ']
# signs listed as consonant diacritics
consonant_diacritics = ['ஂ', 'ஃ']
# Tamil digits 0-9 followed by three further number signs
numbers = ['௦', '௧', '௨', '௩', '௪', '௫', '௬', '௭', '௮', '௯','௰', '௱', '௲']
# accepted punctuation; note that some entries are multi-character strings (':-', '...')
punctuations = [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
symbols = ['௳', '௴', '௵','௹','௶', '௷', '௸','௺']
# the sign used to join consonants
connector = '்'
# based on unicode range : \u0B80-\u0BFF
# NOTE(review): '\u0bd8' through '\u0be3' are listed twice below; harmless for membership tests,
# but the duplicates carry over into non_chars.
non_gylph_unicodes = ['\u0b80', '\u0b81', '\u0b84', '\u0b8b', '\u0b8c', '\u0b8d', '\u0b91', '\u0b96', '\u0b97', '\u0b98', '\u0b9b',
'\u0b9d', '\u0ba0', '\u0ba1', '\u0ba2', '\u0ba5', '\u0ba6', '\u0ba7', '\u0bab', '\u0bac', '\u0bad', '\u0bba',
'\u0bbb', '\u0bbc', '\u0bbd', '\u0bc3', '\u0bc4', '\u0bc5', '\u0bc9', '\u0bce', '\u0bcf', '\u0bd8', '\u0bd9',
'\u0bda', '\u0bdb', '\u0bdc', '\u0bdd', '\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0bd1',
'\u0bd2', '\u0bd3', '\u0bd4', '\u0bd5', '\u0bd6', '\u0bd8', '\u0bd9', '\u0bda', '\u0bdb', '\u0bdc', '\u0bdd',
'\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0be4', '\u0be5', '\u0bfb', '\u0bfc', '\u0bfd',
'\u0bfe', '\u0bff']
legacy_symbols = []
# everything that is neither a letter nor a diacritic
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {} # NONE
# NOTE(review): diacritic_map is cut after its first entry in this view - confirm the full literal.
diacritic_map = {'ொ':'ொ',
#---------------------------------------------------normalization maps---------------------------------------------------------
# all diacritics, sorted by code point
diacritics = sorted(vowel_diacritics+consonant_diacritics)
# letters, diacritics and digits
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
# used + space, punctuation, the connector and the zero-width joiner / non-joiner
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
# space, letters, digits, punctuation, symbols and conjuncts (no diacritics)
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
a connector can not be sorrounded by/ can not come after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class panjabi:
# Character inventory and normalization tables for the Gurmukhi (Panjabi) script (see iden below).
# NOTE(review): in this view the class body is not indented and the "* ..." lines are bare
# text (they read like docstring content whose quotes are not visible) - confirm against the full file.
* vowel and consonant division according to : https://unicode-table.com/en/blocks/gurmukhi/
* consonant conjuncts according to: NOT USED
* punctuations according to: https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py
# identifier string for this script
iden = "panjabi"
# combining nukta sign
nukta = '਼'
# independent vowels and vowel bearers
vowels = ['ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', 'ਏ', 'ਐ', 'ਓ', 'ਔ','ੲ', 'ੳ']
# consonant letters, including nukta forms
consonants = ['ਕ', 'ਖ', 'ਗ', 'ਘ', 'ਙ', 'ਚ', 'ਛ', 'ਜ', 'ਝ', 'ਞ', 'ਟ', 'ਠ', 'ਡ', 'ਢ', 'ਣ', 'ਤ', 'ਥ',
'ਦ', 'ਧ', 'ਨ', 'ਪ', 'ਫ', 'ਬ', 'ਭ', 'ਮ', 'ਯ', 'ਰ', 'ਲ', 'ਲ਼', 'ਵ', 'ਸ਼', 'ਸ', 'ਹ','ਖ਼',
'ਗ਼', 'ਜ਼', 'ੜ', 'ਫ਼']
# dependent vowel signs
vowel_diacritics = ['ਾ', 'ਿ', 'ੀ', 'ੁ', 'ੂ', 'ੇ', 'ੈ', 'ੋ', 'ੌ']
consonant_diacritics = ['ਁ', 'ਂ', 'ਃ'] # Not found!
# Gurmukhi digits 0-9
numbers = ['੦', '੧', '੨', '੩', '੪', '੫', '੬', '੭', '੮', '੯']
# accepted punctuation; note that some entries are multi-character strings (':-', '...')
punctuations = [',',';','।','?','!',':','—',':-',"'",'”','(', ')','{', '}','[',']','√','<','>','=','...','.','-']
symbols = []
# the sign used to join consonants
connector = '੍'
# based on unicode range : \u0A00-\u0A7F
non_gylph_unicodes = ['\u0a00', '\u0a04', '\u0a0b', '\u0a0c', '\u0a0d', '\u0a0e', '\u0a11', '\u0a12', '\u0a29', '\u0a31',
'\u0a34', '\u0a37', '\u0a3a', '\u0a3b', '\u0a3d', '\u0a43', '\u0a44', '\u0a45', '\u0a46', '\u0a49',
'\u0a4a', '\u0a4e', '\u0a4f', '\u0a50', '\u0a52', '\u0a53', '\u0a54', '\u0a55', '\u0a56', '\u0a57',
'\u0a58', '\u0a5d', '\u0a5f', '\u0a60', '\u0a61', '\u0a62', '\u0a63', '\u0a64', '\u0a65', '\u0a76',
'\u0a77', '\u0a78', '\u0a79', '\u0a7a', '\u0a7b', '\u0a7c', '\u0a7d', '\u0a7e', '\u0a7f']
legacy_symbols = []
# everything that is neither a letter nor a diacritic
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
# NOTE(review): nukta_map is cut after its first entry in this view - confirm the full literal.
nukta_map = {'ਖ':'ਖ਼',
# maps a two-code-point sequence to the single code point it should be replaced with
diacritic_map = {'ੇੋ': 'ੌ',
'ਾੇ': 'ੀ',
'ਾੋ': 'ੀ',
'ਅੈ': 'ਐ',
'ਅੌ': 'ਔ',
'ਅਾ': 'ਆ'}
#---------------------------------------------------normalization maps---------------------------------------------------------
# all diacritics, sorted by code point
diacritics = sorted(vowel_diacritics+consonant_diacritics)
# letters, diacritics and digits
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
# used + space, punctuation, the connector and the zero-width joiner / non-joiner
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
# space, letters, digits, punctuation, symbols and conjuncts (no diacritics)
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
a connector can not be sorrounded by/ can not come after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class malayalam:
    """Character inventory and normalization tables for the Malayalam script.

    Sources:
        * vowel and consonant division according to:
          https://unicode-table.com/en/blocks/malayalam/
        * consonant conjuncts according to: Self Generated
        * punctuations according to:
          https://github.com/BengaliAI/syntheticWords/blob/main/coreLib/languages.py

    Every attribute is a class-level constant. The literal lists come first;
    the derived, sorted lists (``diacritics``, ``used``, ``valid``,
    ``complex_roots``, ``invalid_starts``, ``invalid_connectors``) are built
    from them at class-creation time.
    """
    iden = "malayalam"
    # NOTE(review): this value looks like the Bengali nukta sign rather than a
    # code point from the Malayalam block - confirm whether '' was intended.
    nukta = '়'
    vowels = ['അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ഌ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ൠ', 'ൡ']
    consonants = ['ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ',
                  'ദ', 'ധ', 'ന', 'ഩ', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'റ', 'ല', 'ള', 'ഴ', 'വ', 'ശ', 'ഷ',
                  'സ', 'ഹ', 'ഺ']
    vowel_diacritics = ['ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', 'ൗ', 'ൢ', 'ൣ']
    consonant_diacritics = ['ഀ', 'ഁ', 'ം', 'ഃ', 'ഄ']
    # Digits 0-9 followed by the additional number and fraction signs.
    numbers = ['൦', '൧', '൨', '൩', '൪', '൫', '൬', '൭', '൮', '൯', '൘', '൙', '൚', '൛', '൜', '൝', '൞',
               '൰', '൱', '൲', '൳', '൴', '൵', '൶', '൷', '൸']
    # Some entries are multi-character strings (':-', '...').
    punctuations = [',', ';', '।', '?', '!', ':', '—', ':-', "'", '”', '(', ')', '{', '}', '[', ']',
                    '√', '<', '>', '=', '...', '.', '-']
    symbols = ['ഽ', '൏', '൹']
    # Two other connector-like signs exist but are not used here: '഻', '഼'.
    connector = '്'
    # Code points without a glyph, based on the unicode range \u0D00-\u0D7F.
    non_gylph_unicodes = ['\u0d64', '\u0d65', '\u0d50', '\u0d51', '\u0d52', '\u0d53', '\u0d49', '\u0d45', '\u0d11', '\u0d0d']
    legacy_symbols = ['ൟ']
    # Everything that is neither a letter nor a diacritic.
    non_chars = numbers + punctuations + symbols + non_gylph_unicodes + legacy_symbols
    conjuncts = []
    # This is a customizable map: it is purely based on visual similarity.
    legacy_maps = {}
    # ------------------------------ normalization maps ------------------------------
    nukta_map = {}
    diacritic_map = {}  # NONE
    # ------------------------------ normalization maps ------------------------------
    diacritics = sorted(vowel_diacritics + consonant_diacritics)
    used = sorted(vowels + consonants + vowel_diacritics + consonant_diacritics + numbers)
    # used + space, punctuation, the connector and the zero-width joiner / non-joiner.
    valid = sorted([' '] + used + punctuations + [connector] + ["\u200d", "\u200c"])
    complex_roots = sorted([' '] + vowels + consonants + numbers + punctuations + symbols + conjuncts)
    # These unicodes can not start a word.
    invalid_starts = sorted(diacritics + [connector])
    # Invalid connector cases: a connector can not be surrounded by / can not
    # come after or before:
    #   * the vowels
    #   * the diacritics
    #   * another connector [double consecutive hosonto]
    invalid_connectors = sorted(invalid_starts + vowels + numbers + punctuations)
class sylhetinagri:
# Character inventory and normalization tables for the Sylheti Nagri script (see iden below).
# NOTE(review): in this view the class body is not indented and the "* ..." line is bare
# text (it reads like docstring content whose quotes are not visible) - confirm against the full file.
* according to asif sushmit
# identifier string for this script
iden = "sylhetinagri"
nukta = '' #done
vowels = ['ꠀ', 'ꠁ', 'ꠃ', 'ꠄ', 'ꠅ'] #done
consonants = ['ꠇ', 'ꠈ', 'ꠉ', 'ꠊ',
'ꠌ', 'ꠍ', 'ꠎ', 'ꠏ',
'ꠐ', 'ꠑ', 'ꠒ', 'ꠓ',
'ꠔ', 'ꠕ', 'ꠖ', 'ꠗ', 'ꠘ',
'ꠙ', 'ꠚ', 'ꠛ', 'ꠜ', 'ꠝ',
'ꠞ', 'ꠟ', 'ꠠ', 'ꠡ', 'ꠢ'] #done
vowel_diacritics = ['ꠣ', 'ꠤ', 'ꠥ', 'ꠦ', 'ꠧ'] #done
consonant_diacritics = ['ꠋ', 'ꠂ', '꠬'] #done
# three digit sets are accepted, in this order: Bangla, ASCII, Devanagari
numbers = ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९']
punctuations = ['।', '॥', ':', ';', '!', '—', '?', 'ऽ', '.', ',']
symbols = ['꠨', '꠩', '꠪', '꠫']
# the sign used to join consonants
connector = '꠆' #done
non_gylph_unicodes = []
legacy_symbols = []
# everything that is neither a letter nor a diacritic
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map : this map is purely based on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {}
# NOTE(review): diacritic_map is cut after its first entry in this view - confirm the full literal.
diacritic_map = {'ꠦꠣ':'ꠧ',
#---------------------------------------------------normalization maps---------------------------------------------------------
# all diacritics, sorted by code point
diacritics = sorted(vowel_diacritics+consonant_diacritics)
# letters, diacritics and digits
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
# used + space, punctuation, the connector and the zero-width joiner / non-joiner
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
# space, letters, digits, punctuation, symbols and conjuncts (no diacritics)
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
a connector can not be sorrounded by/ can not come after or before:
* the vowels
* the diacritics
* another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
# Register each script class in the `languages` lookup under its name.
# NOTE(review): `languages` itself is created outside the lines visible here.
for _lang_key, _lang_cls in (
    ("english", english),
    ("bangla", bangla),
    ("devanagari", devanagari),
    ("gujarati", gujarati),
    ("odiya", odiya),
    ("tamil", tamil),
    ("panjabi", panjabi),
    ("malayalam", malayalam),
):
    languages[_lang_key] = _lang_cls
# Do not leave the loop variables behind in the module namespace.
del _lang_key, _lang_cls