
590 lines
42 KiB
Raw Normal View History

2024-05-03 04:18:51 +03:00
#-*- coding: utf-8 -*-
# indic general ref:
from __future__ import print_function
# language classes
class english:
    """Character inventory for English (basic Latin) text.

    Every attribute except ``iden`` is a list of one-character strings.
    ``valid`` is the sorted concatenation of the other four lists.
    """
    # identifier string, mirroring the ``iden`` attribute of the other language classes
    iden = "english"
    # the 26 lowercase letters a-z
    lower = list("abcdefghijklmnopqrstuvwxyz")
    # the 26 uppercase letters A-Z
    upper = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    # the 32 ASCII punctuation marks
    punctuations = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
        ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~',
    ]
    # the decimal digits 0-9
    numbers = list("0123456789")
    # every accepted character, sorted by code point
    valid = sorted(lower + upper + numbers + punctuations)
class bangla:
* vowel and consonant division according to :
* consonant conjuncts according to:উইকিঅভি:_যতবরর_তি
* punctuations according to:যতিি
iden = "bangla"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '']
consonants = ['', '', '', '', '', '', '','', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '','', '', '','']
vowel_diacritics = ['', 'ি', '', '', '', '', '', '', '', '']
consonant_diacritics = ['', '', '']
numbers = ['', '', '', '', '', '', '', '', '', '']
punctuations = ['!', '"', "'", '(', ')', ',', '-', '.', '...', ':', ':-', ';', '<', '=', '>', '?', '[', ']', '{', '}', '', '', '', '', '', '']
symbols = ['']
connector = ''
# based on unicode range : \u0980-\u09FF
non_gylph_unicodes = ['\u0984', '\u098d','\u098e','\u0991','\u0992','\u09a9','\u09b1','\u09b3','\u09b4','\u09b5',
'\u09ba','\u09bb', '\u09c5','\u09c6','\u09c9','\u09ca','\u09cf','\u09d0','\u09d1','\u09d2',
'\u09d3','\u09d4','\u09d5','\u09d6', '\u09d8','\u09d9','\u09da','\u09db','\u09de', '\u09e4',
'\u09e5', '','','','\u09ff']
legacy_symbols = ['','','','','','','','','','','','','','']
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = ['এ্য','অ্য','ক্ক','ক্ট','ক্ট্য','ক্ট্র','ক্ত','ক্ত্র','ক্ব','ক্ম','ক্য','ক্র','ক্র্য','ক্ল','ক্ল্য','ক্ষ','ক্ষ্ণ','ক্ষ্ব','ক্ষ্ম','ক্ষ্ম্য','ক্ষ্য',
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {'':'',
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
diacritic_map = {'ো':'',
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
# * khondo to
invalid_connectors = sorted(invalid_starts+vowels+['']+numbers+punctuations)
class devanagari:
* vowel and consonant division according to :
* consonant conjuncts according to:
* punctuations according to:
iden = "devanagari"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '','', '', '', '','','', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '','', '', '', '', '', '', '', '','', '', '']
vowel_diacritics = ['', '','', 'ि', '', '', '', '', '', '', '', '', '', '', '', '', '','', '','','', '']
consonant_diacritics = ['', '', '', '']
numbers = ['', '', '', '', '', '', '', '', '', '']
punctuations = ['', '', ':', ';', '!', '', '?', '']
symbols = []
connector = ''
non_gylph_unicodes = []
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
diacritic_map = {'ाे':'',
'अौ': '',
'एे': ''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class gujarati:
* vowel and consonant division according to :
* consonant conjuncts according to:
* punctuations according to:
iden = "gujarati"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '', '', '','', '', '', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '','']
vowel_diacritics = ['', 'િ', '', '', '', '', '', '', '', '', '', '', '']
consonant_diacritics = ['', '', '']
numbers = ['', '', '', '', '', '', '', '', '', '']
punctuations = ['',',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['']
connector = ''
# based on unicode range : \u0A80-\u0AFF
non_gylph_unicodes = ['\u0a80', '\u0a84', '\u0aa9', '\u0ab1', '\u0ab4', '\u0aba', '\u0abb', '\u0ac6', '\u0aca', '\u0ace', '\u0acf',
'\u0ad1', '\u0ad2', '\u0ad3', '\u0ad4', '\u0ad5', '\u0ad6', '\u0ad7', '\u0ad8', '\u0ad9', '\u0ada', '\u0adb',
'\u0adc', '\u0add', '\u0ade', '\u0adf', '\u0ae4', '\u0ae5', '\u0af1', '\u0af2', '\u0af3', '\u0af4', '\u0af5',
'\u0af6', '\u0af7', '\u0af8']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {} # NONE
diacritic_map = {'ાે': '',
'ાૅ': '',
'ાૈ': '',
'અા': '',
'અે': '',
'અો': '',
'અૅ': '',
'અૉ': '',
'અૈ': '',
'અૌ': ''}
#---------------------------------------------------normalization maps---------------------------------------------------------
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class odiya:
* vowel and consonant division according to :
* consonant conjuncts according to:
* punctuations according to:
iden = "odiya"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '','', '', '','']
vowel_diacritics = ['', 'ି', '', '', '', '', '', '', '','', '','', '']
consonant_diacritics = ['', '', '']
numbers = ['', '', '', '', '', '', '', '', '', '']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['', '', '', '', '', '']
connector = ''
# based on unicode range : \u0B00-\u0B7F
non_gylph_unicodes = ['\u0b00', '\u0b04', '\u0b0d', '\u0b0e', '\u0b11', '\u0b12', '\u0b29', '\u0b31', '\u0b34', '\u0b3a', '\u0b3b',
'\u0b45', '\u0b46', '\u0b49', '\u0b4a', '\u0b4e', '\u0b4f', '\u0b50', '\u0b51', '\u0b52', '\u0b53', '\u0b54',
'\u0b58', '\u0b59', '\u0b5a', '\u0b5b', '\u0b5e', '\u0b64', '\u0b65', '\u0b78', '\u0b79', '\u0b7a', '\u0b7b',
'\u0b7c', '\u0b7d', '\u0b7e', '\u0b7f']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
diacritic_map = {'ୋ':'',
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class tamil:
* vowel and consonant division according to :
* consonant conjuncts according to:
* punctuations according to:
iden = "tamil"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
vowel_diacritics = ['', 'ி', '', '', '', '', '', '','', '', '']
consonant_diacritics = ['', '']
numbers = ['', '', '', '', '', '', '', '', '', '','', '', '']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['', '', '','','', '', '','']
connector = ''
# based on unicode range : \u0B80-\u0BFF
non_gylph_unicodes = ['\u0b80', '\u0b81', '\u0b84', '\u0b8b', '\u0b8c', '\u0b8d', '\u0b91', '\u0b96', '\u0b97', '\u0b98', '\u0b9b',
'\u0b9d', '\u0ba0', '\u0ba1', '\u0ba2', '\u0ba5', '\u0ba6', '\u0ba7', '\u0bab', '\u0bac', '\u0bad', '\u0bba',
'\u0bbb', '\u0bbc', '\u0bbd', '\u0bc3', '\u0bc4', '\u0bc5', '\u0bc9', '\u0bce', '\u0bcf', '\u0bd8', '\u0bd9',
'\u0bda', '\u0bdb', '\u0bdc', '\u0bdd', '\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0bd1',
'\u0bd2', '\u0bd3', '\u0bd4', '\u0bd5', '\u0bd6', '\u0bd8', '\u0bd9', '\u0bda', '\u0bdb', '\u0bdc', '\u0bdd',
'\u0bde', '\u0bdf', '\u0be0', '\u0be1', '\u0be2', '\u0be3', '\u0be4', '\u0be5', '\u0bfb', '\u0bfc', '\u0bfd',
'\u0bfe', '\u0bff']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {} # NONE
diacritic_map = {'ொ':'',
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class panjabi:
* vowel and consonant division according to :
* consonant conjuncts according to: NOT USED
* punctuations according to:
iden = "panjabi"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '','',
'', '', '', '']
vowel_diacritics = ['', 'ਿ', '', '', '', '', '', '', '']
consonant_diacritics = ['', '', ''] # Not found!
numbers = ['', '', '', '', '', '', '', '', '', '']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = []
connector = ''
# based on unicode range : \u0A00-\u0A7F
non_gylph_unicodes = ['\u0a00', '\u0a04', '\u0a0b', '\u0a0c', '\u0a0d', '\u0a0e', '\u0a11', '\u0a12', '\u0a29', '\u0a31',
'\u0a34', '\u0a37', '\u0a3a', '\u0a3b', '\u0a3d', '\u0a43', '\u0a44', '\u0a45', '\u0a46', '\u0a49',
'\u0a4a', '\u0a4e', '\u0a4f', '\u0a50', '\u0a52', '\u0a53', '\u0a54', '\u0a55', '\u0a56', '\u0a57',
'\u0a58', '\u0a5d', '\u0a5f', '\u0a60', '\u0a61', '\u0a62', '\u0a63', '\u0a64', '\u0a65', '\u0a76',
'\u0a77', '\u0a78', '\u0a79', '\u0a7a', '\u0a7b', '\u0a7c', '\u0a7d', '\u0a7e', '\u0a7f']
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {'':'',
diacritic_map = {'ੇੋ': '',
'ਾੇ': '',
'ਾੋ': '',
'ਅੈ': '',
'ਅੌ': '',
'ਅਾ': ''}
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class malayalam:
* vowel and consonant division according to:
* consonant conjuncts according to: Self Generated
* punctuations according to:
iden = "malayalam"
nukta = ''
vowels = ['', '', '', '', '', '', '', '', '', '', '', '', '', '','', '']
consonants = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '']
vowel_diacritics = ['', 'ി', '', '', '', '', '', '', '', '','', '', '','','', '']
consonant_diacritics = ['', '', '', '', '']
numbers = ['', '', '', '', '', '', '', '', '', '','', '', '', '', '', '', '','', '', '','', '', '', '', '', '']
punctuations = [',',';','','?','!',':','',':-',"'",'','(', ')','{', '}','[',']','','<','>','=','...','.','-']
symbols = ['', '', '']
connector = '' # There are two other this shit '഻', '഼'
# based on unicode range : \u0D00-\u0D7F
non_gylph_unicodes = ['\u0d64', '\u0d65', '\u0d50', '\u0d51', '\u0d52', '\u0d53', '\u0d49', '\u0d45', '\u0d11', '\u0d0d']
legacy_symbols = ['']
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {}
diacritic_map = {} #NONE--<>
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
class sylhetinagri:
* according to asif sushmit
iden = "sylhetinagri"
nukta = '' #done
vowels = ['', '', '', '', ''] #done
consonants = ['', '', '', '',
'', '', '', '',
'', '', '', '',
'', '', '', '', '',
'', '', '', '', '',
'', '', '', '', ''] #done
vowel_diacritics = ['', '', '', '', ''] #done
consonant_diacritics = ['', '', ''] #done
numbers = ['', '', '', '', '', '', '', '', '', '', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '', '', '', '', '', '', '', '', '', '']
punctuations = ['', '', ':', ';', '!', '', '?', '', '.', ',']
symbols = ['', '', '', '']
connector = '' #done
non_gylph_unicodes = []
legacy_symbols = []
non_chars = numbers+punctuations+symbols+non_gylph_unicodes+legacy_symbols
conjuncts = []
# this is a customizable map: it is based purely on visual similarity
legacy_maps = {}
#---------------------------------------------------normalization maps---------------------------------------------------------
nukta_map = {}
diacritic_map = {'ꠦꠣ':'',
#---------------------------------------------------normalization maps---------------------------------------------------------
diacritics = sorted(vowel_diacritics+consonant_diacritics)
used = sorted(vowels+consonants+vowel_diacritics+consonant_diacritics+numbers)
valid = sorted([' ']+used+punctuations+[connector]+["\u200d","\u200c"])
complex_roots = sorted([' ']+vowels+consonants+numbers+punctuations+symbols+conjuncts)
# these unicodes can not start a word
invalid_starts = sorted(diacritics+[connector])
# invalid connector cases
# a connector cannot be surrounded by / cannot come after or before:
# * the vowels
# * the diacritics
# * another connector [double consecutive hosonto]
invalid_connectors = sorted(invalid_starts+vowels+numbers+punctuations)
# Register each language class under its lookup key.
# NOTE(review): `languages` is not defined in the visible part of the file;
# it is assumed to be a mapping created elsewhere — confirm.
for _key, _lang_cls in (
    ("english", english),
    ("bangla", bangla),
    ("devanagari", devanagari),
    ("gujarati", gujarati),
    ("odiya", odiya),
    ("tamil", tamil),
    ("panjabi", panjabi),
    ("malayalam", malayalam),
):
    languages[_key] = _lang_cls
# drop the loop temporaries so the module namespace is unchanged
del _key, _lang_cls