46 lines
1.5 KiB
Python
46 lines
1.5 KiB
Python
import os
|
|
|
|
import fasttext
|
|
|
|
from dateparser_cli.fasttext_manager import fasttext_downloader
|
|
from dateparser_cli.utils import dateparser_model_home, create_data_model_home
|
|
from dateparser_cli.exceptions import FastTextModelNotFoundException
|
|
|
|
|
|
# Model name handed to fasttext_downloader() when no supported model file
# is found in the dateparser model directory.
_DEFAULT_MODEL = "small"

# File names that _load_fasttext_model() recognises as usable fastText
# models inside the dateparser model directory.
_supported_models = ["large.bin", "small.bin"]
|
|
|
|
|
|
class _FastTextCache:
    """Module-private holder for the loaded fastText model.

    The model is stored on the class itself (never on an instance), so every
    reader of ``_FastTextCache.model`` shares the same object.
    """

    # None until _load_fasttext_model() stores the loaded model here.
    model = None
|
|
|
|
|
|
def _find_downloaded_model():
    """Return the file name of a supported model in the model home, or None.

    ``os.listdir`` returns entries in arbitrary order, so candidates are
    checked in ``_supported_models`` order to make the choice deterministic
    when more than one supported model is present.
    """
    present = set(os.listdir(dateparser_model_home))
    for model_name in _supported_models:
        if model_name in present:
            return model_name
    return None


def _load_fasttext_model():
    """Return the shared fastText model, loading it on first use.

    The loaded model is cached on ``_FastTextCache.model`` and reused by
    later calls. When the model directory contains no supported model file,
    the default model is requested through ``fasttext_downloader`` and the
    directory is scanned once more.

    Returns:
        The object returned by ``fasttext.load_model``.

    Raises:
        FastTextModelNotFoundException: if no supported model file exists
            after the download attempt, or the selected entry is not a
            regular file.
    """
    # Compare against None explicitly so the cache check does not depend on
    # how the model object defines truthiness.
    if _FastTextCache.model is not None:
        return _FastTextCache.model

    create_data_model_home()

    model_name = _find_downloaded_model()
    if model_name is None:
        fasttext_downloader(_DEFAULT_MODEL)
        # Scan exactly once more instead of recursing: if the download left
        # no supported file behind, fail with a clear error rather than
        # retrying without bound.
        model_name = _find_downloaded_model()
        if model_name is None:
            raise FastTextModelNotFoundException('Fasttext model file not found')

    model_path = os.path.join(dateparser_model_home, model_name)
    # A directory entry with a supported name that is not a regular file
    # (e.g. a directory) cannot be loaded.
    if not os.path.isfile(model_path):
        raise FastTextModelNotFoundException('Fasttext model file not found')

    _FastTextCache.model = fasttext.load_model(model_path)
    return _FastTextCache.model
|
|
|
|
|
|
def detect_languages(text, confidence_threshold):
    """Return the language codes fastText predicts for *text*.

    Args:
        text: String to classify. Line feeds are replaced by spaces and
            carriage returns are removed before prediction.
        confidence_threshold: Only predictions whose probability is strictly
            greater than this value are kept.

    Returns:
        A list of predicted labels with the ``__label__`` prefix removed, in
        the order the model returned them. The list is empty when no
        prediction exceeds the threshold.
    """
    model = _load_fasttext_model()

    # NOTE(review): presumably flattened to one line because fastText's
    # predict() handles a single line at a time -- confirm against its docs.
    single_line = text.replace('\n', ' ').replace('\r', '')

    prediction = model.predict(single_line)
    # Index 0 holds the labels, index 1 the matching probabilities.
    labels, probabilities = prediction[0], prediction[1]

    return [
        label.replace("__label__", "")
        for label, probability in zip(labels, probabilities)
        if probability > confidence_threshold
    ]
|