ai-content-maker/.venv/Lib/site-packages/dateparser/search/search.py

244 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from collections.abc import Set
from dateparser.languages.loader import LocaleDataLoader
from dateparser.conf import apply_settings, check_settings, Settings
from dateparser.date import DateDataParser
from dateparser.search.text_detection import FullTextLanguageDetector
from dateparser.custom_language_detection.language_mapping import map_languages
import regex as re
# Keywords whose presence marks an (English-translated) date expression as relative.
# The pattern is unanchored, so it also matches inside longer words.
RELATIVE_REG = re.compile("(ago|in|from now|tomorrow|today|yesterday)")


def date_is_relative(translation):
    """Return True when ``translation`` contains any relative-date keyword.

    :param translation: Date expression text to inspect.
    :type translation: str
    :return: True if ``RELATIVE_REG`` matches anywhere in the text, else False.
    :rtype: bool
    """
    match = RELATIVE_REG.search(translation)
    return match is not None
class _ExactLanguageSearch:
def __init__(self, loader):
self.loader = loader
self.language = None
def get_current_language(self, shortname):
if self.language is None or self.language.shortname != shortname:
self.language = self.loader.get_locale(shortname)
def search(self, shortname, text, settings):
self.get_current_language(shortname)
result = self.language.translate_search(text, settings=settings)
return result
@staticmethod
def set_relative_base(substring, already_parsed):
if len(already_parsed) == 0:
return substring, None
i = len(already_parsed) - 1
while already_parsed[i][1]:
i -= 1
if i == -1:
return substring, None
relative_base = already_parsed[i][0]['date_obj']
return substring, relative_base
def choose_best_split(self, possible_parsed_splits, possible_substrings_splits):
rating = []
for i in range(len(possible_parsed_splits)):
num_substrings = len(possible_substrings_splits[i])
num_substrings_without_digits = 0
not_parsed = 0
for j, item in enumerate(possible_parsed_splits[i]):
if item[0]['date_obj'] is None:
not_parsed += 1
if not any(char.isdigit() for char in possible_substrings_splits[i][j]):
num_substrings_without_digits += 1
rating.append([
num_substrings,
0 if not_parsed == 0 else (float(not_parsed) / float(num_substrings)),
0 if num_substrings_without_digits == 0 else (
float(num_substrings_without_digits) / float(num_substrings))])
best_index, best_rating = min(enumerate(rating), key=lambda p: (p[1][1], p[1][0], p[1][2]))
return possible_parsed_splits[best_index], possible_substrings_splits[best_index]
def split_by(self, item, original, splitter):
if item.count(splitter) <= 2:
return [[item.split(splitter), original.split(splitter)]]
item_all_split = item.split(splitter)
original_all_split = original.split(splitter)
all_possible_splits = [[item_all_split, original_all_split]]
for i in range(2, 4):
item_partially_split = []
original_partially_split = []
for j in range(0, len(item_all_split), i):
item_join = splitter.join(item_all_split[j:j + i])
original_join = splitter.join(original_all_split[j:j + i])
item_partially_split.append(item_join)
original_partially_split.append(original_join)
all_possible_splits.append([item_partially_split, original_partially_split])
return all_possible_splits
def split_if_not_parsed(self, item, original):
splitters = [',', '،', '——', '', '', '.', ' ']
possible_splits = []
for splitter in splitters:
if splitter in item and item.count(splitter) == original.count(splitter):
possible_splits.extend(self.split_by(item, original, splitter))
return possible_splits
def parse_item(self, parser, item, translated_item, parsed, need_relative_base):
relative_base = None
item = item.replace('ngày', '')
item = item.replace('am', '')
parsed_item = parser.get_date_data(item)
is_relative = date_is_relative(translated_item)
if need_relative_base:
item, relative_base = self.set_relative_base(item, parsed)
if relative_base:
parser._settings.RELATIVE_BASE = relative_base
parsed_item = parser.get_date_data(item)
return parsed_item, is_relative
def parse_found_objects(self, parser, to_parse, original, translated, settings):
parsed = []
substrings = []
need_relative_base = True
if settings.RELATIVE_BASE:
need_relative_base = False
for i, item in enumerate(to_parse):
if len(item) <= 2:
continue
parsed_item, is_relative = self.parse_item(parser, item, translated[i], parsed, need_relative_base)
if parsed_item['date_obj']:
parsed.append((parsed_item, is_relative))
substrings.append(original[i].strip(" .,:()[]-'"))
continue
possible_splits = self.split_if_not_parsed(item, original[i])
if not possible_splits:
continue
possible_parsed = []
possible_substrings = []
for split_translated, split_original in possible_splits:
current_parsed = []
current_substrings = []
if split_translated:
for j, jtem in enumerate(split_translated):
if len(jtem) <= 2:
continue
parsed_jtem, is_relative_jtem = self.parse_item(
parser, jtem, split_translated[j], current_parsed, need_relative_base)
current_parsed.append((parsed_jtem, is_relative_jtem))
current_substrings.append(split_original[j].strip(' .,:()[]-'))
possible_parsed.append(current_parsed)
possible_substrings.append(current_substrings)
parsed_best, substrings_best = self.choose_best_split(possible_parsed, possible_substrings)
for k in range(len(parsed_best)):
if parsed_best[k][0]['date_obj']:
parsed.append(parsed_best[k])
substrings.append(substrings_best[k])
return parsed, substrings
def search_parse(self, shortname, text, settings):
translated, original = self.search(shortname, text, settings)
bad_translate_with_search = ['vi', 'hu'] # splitting done by spaces and some dictionary items contain spaces
if shortname not in bad_translate_with_search:
languages = ['en']
to_parse = translated
else:
languages = [shortname]
to_parse = original
parser = DateDataParser(languages=languages, settings=settings)
parsed, substrings = self.parse_found_objects(parser=parser, to_parse=to_parse,
original=original, translated=translated, settings=settings)
parser._settings = Settings()
return list(zip(substrings, [i[0]['date_obj'] for i in parsed]))
class DateSearchWithDetection:
    """
    Detects the language of a natural-language string, then searches it for
    substrings that represent a date and/or time and parses those substrings.
    """

    def __init__(self):
        self.loader = LocaleDataLoader()
        self.available_language_map = self.loader.get_locale_map()
        self.search = _ExactLanguageSearch(self.loader)

    @apply_settings
    def detect_language(self, text, languages, settings=None, detect_languages_function=None):
        """
        Decide which language ``text`` should be searched in.

        :param text: The string to inspect.
        :param languages: Language codes to choose from (list, tuple or set), or None.
        :param settings: Settings object supplying the default languages and the
            detection confidence threshold.
        :param detect_languages_function: Optional callable taking ``text`` and
            ``confidence_threshold``; consulted only when ``languages`` is empty.
        :return: The chosen language, or None when nothing is detected and no
            default language is configured.
        :raises ValueError: If ``languages`` contains a code missing from the locale map.
        :raises TypeError: If ``languages`` is neither None nor a list, tuple or set.
        """
        # A custom detector is used only when the caller gave no languages.
        if detect_languages_function and not languages:
            candidates = detect_languages_function(
                text, confidence_threshold=settings.LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD
            )
            candidates = map_languages(candidates) or settings.DEFAULT_LANGUAGES
            if candidates:
                return candidates[0]
            return None

        if isinstance(languages, (list, tuple, Set)):
            if not all(language in self.available_language_map for language in languages):
                unsupported_languages = set(languages) - set(self.available_language_map.keys())
                raise ValueError("Unknown language(s): %s" % ', '.join(map(repr, unsupported_languages)))
            # Swap each code for its entry in the locale map.
            languages = [self.available_language_map[language] for language in languages]
        elif languages is not None:
            raise TypeError("languages argument must be a list (%r given)" % type(languages))

        # With no usable languages, detection runs over every available locale.
        self.language_detector = (
            FullTextLanguageDetector(languages=languages)
            if languages
            else FullTextLanguageDetector(list(self.available_language_map.values()))
        )

        best_match = self.language_detector._best_language(text)
        if best_match:
            return best_match
        return settings.DEFAULT_LANGUAGES[0] if settings.DEFAULT_LANGUAGES else None

    @apply_settings
    def search_dates(self, text, languages=None, settings=None, detect_languages_function=None):
        """
        Find all substrings of the given string which represent date and/or time and parse them.

        :param text:
            A string in a natural language which may contain date and/or time expressions.
        :type text: str
        :param languages:
            A list of two-letter language codes, e.g. ['en', 'es']. If languages are given,
            detection is restricted to them.
        :type languages: list
        :param settings:
            Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
        :type settings: dict
        :param detect_languages_function:
            A function for language detection that takes as input a `text` and a
            `confidence_threshold`, and returns a list of detected language codes.
        :type detect_languages_function: function
        :return: a dict with the keys 'Language' (the language code) and 'Dates' (a list of tuples
            pairing each substring that represents a date expression with the corresponding
            :mod:`datetime.datetime` object).
            For example:
            {'Language': 'en', 'Dates': [('on 4 October 1957', datetime.datetime(1957, 10, 4, 0, 0))]}
            If the language of the string isn't recognised, returns:
            {'Language': None, 'Dates': None}
        :raises: ValueError - Unknown Language
        """
        check_settings(settings)

        language_shortname = self.detect_language(
            text=text,
            languages=languages,
            settings=settings,
            detect_languages_function=detect_languages_function,
        )
        if language_shortname:
            found_dates = self.search.search_parse(language_shortname, text, settings=settings)
            return {'Language': language_shortname, 'Dates': found_dates}
        return {'Language': None, 'Dates': None}