from itertools import chain import regex as re from collections import OrderedDict from dateutil import parser from dateparser.timezone_parser import pop_tz_offset_from_string, word_is_tz from dateparser.utils import normalize_unicode, combine_dicts from .dictionary import Dictionary, NormalizedDictionary, ALWAYS_KEEP_TOKENS NUMERAL_PATTERN = re.compile(r'(\d+)', re.U) class Locale: """ Class that deals with applicability and translation from a locale. :param shortname: A locale code, e.g. 'fr-PF', 'qu-EC', 'af-NA'. :type shortname: str :param language_info: Language info (translation data) of the language the locale belongs to. :type language_info: dict :return: A Locale instance """ _dictionary = None _normalized_dictionary = None _simplifications = None _normalized_simplifications = None _splitters = None _wordchars = None _relative_translations = None _normalized_relative_translations = None _abbreviations = None _split_dictionary = None _wordchars_for_detection = None def __init__(self, shortname, language_info): self.shortname = shortname locale_specific_info = language_info.get("locale_specific", {}).get(shortname, {}) self.info = combine_dicts(language_info, locale_specific_info) self.info.pop("locale_specific", None) def is_applicable(self, date_string, strip_timezone=False, settings=None): """ Check if the locale is applicable to translate date string. :param date_string: A string representing date and/or time in a recognizably valid format. :type date_string: str :param strip_timezone: If True, timezone is stripped from date string. :type strip_timezone: bool :return: boolean value representing if the locale is applicable for the date string or not. """ if strip_timezone: date_string, _ = pop_tz_offset_from_string(date_string, as_offset=False) date_string = self._translate_numerals(date_string) if settings.NORMALIZE: date_string = normalize_unicode(date_string) date_string = self._simplify(date_string, settings=settings) dictionary = self._get_dictionary(settings) date_tokens = dictionary.split(date_string) return dictionary.are_tokens_valid(date_tokens) def count_applicability(self, text, strip_timezone=False, settings=None): if strip_timezone: text, _ = pop_tz_offset_from_string(text, as_offset=False) text = self._simplify(text, settings=settings) sentences = self._sentence_split(text, settings=settings) tokens = [] for sent in sentences: tokens.extend(self._split(sent, keep_formatting=False, settings=settings)) return self._count_words_present_in_the_dictionary(tokens, settings) def _count_words_present_in_the_dictionary(self, words, settings=None): dictionary = self.clean_dictionary(self._get_split_dictionary(settings=settings)) dict_cnt = 0 skip_cnt = 0 for word in set(words): if word in dictionary: if dictionary[word]: dict_cnt += 1 else: skip_cnt += 1 elif word.isdigit(): skip_cnt += 1 return [dict_cnt, skip_cnt] @staticmethod def clean_dictionary(dictionary, threshold=2): del_keys = [] for key in dictionary: if len(key) < threshold: del_keys.append(key) for del_key in del_keys: del dictionary[del_key] return dictionary def translate(self, date_string, keep_formatting=False, settings=None): """ Translate the date string to its English equivalent. :param date_string: A string representing date and/or time in a recognizably valid format. :type date_string: str :param keep_formatting: If True, retain formatting of the date string after translation. :type keep_formatting: bool :return: translated date string. """ date_string = self._translate_numerals(date_string) if settings.NORMALIZE: date_string = normalize_unicode(date_string) date_string = self._simplify(date_string, settings=settings) dictionary = self._get_dictionary(settings) date_string_tokens = dictionary.split(date_string, keep_formatting) relative_translations = self._get_relative_translations(settings=settings) for i, word in enumerate(date_string_tokens): word = word.lower() for pattern, replacement in relative_translations.items(): if pattern.match(word): date_string_tokens[i] = pattern.sub(replacement, word) break else: if word in dictionary: fallback = word if keep_formatting and not word.isalpha() else '' date_string_tokens[i] = dictionary[word] or fallback if "in" in date_string_tokens: date_string_tokens = self._clear_future_words(date_string_tokens) return self._join(list(filter(bool, date_string_tokens)), separator="" if keep_formatting else " ", settings=settings) def _translate_numerals(self, date_string): date_string_tokens = NUMERAL_PATTERN.split(date_string) for i, token in enumerate(date_string_tokens): if token.isdecimal(): date_string_tokens[i] = str(int(token)).zfill(len(token)) return ''.join(date_string_tokens) def _get_relative_translations(self, settings=None): if settings.NORMALIZE: if self._normalized_relative_translations is None: self._normalized_relative_translations = ( self._generate_relative_translations(normalize=True)) return self._normalized_relative_translations else: if self._relative_translations is None: self._relative_translations = self._generate_relative_translations(normalize=False) return self._relative_translations def _generate_relative_translations(self, normalize=False): relative_translations = self.info.get('relative-type-regex', {}) relative_dictionary = OrderedDict() for key, value in relative_translations.items(): if normalize: value = list(map(normalize_unicode, value)) pattern = '|'.join(sorted(value, key=len, reverse=True)) pattern = pattern.replace(r'(\d+', r'(?P\d+') pattern = re.compile(r'^(?:{})$'.format(pattern), re.UNICODE | re.IGNORECASE) relative_dictionary[pattern] = key return relative_dictionary def translate_search(self, search_string, settings=None): dashes = ['-', '——', '—', '~'] word_joint_unsupported_languages = ["zh", "ja"] sentences = self._sentence_split(search_string, settings=settings) dictionary = self._get_dictionary(settings=settings) translated = [] original = [] for sentence in sentences: original_tokens, simplified_tokens = self._simplify_split_align(sentence, settings=settings) translated_chunk = [] original_chunk = [] last_token_index = len(simplified_tokens) - 1 skip_next_token = False for i, word in enumerate(simplified_tokens): next_word = simplified_tokens[i + 1] if i < last_token_index else "" current_and_next_joined = self._join_chunk([word, next_word], settings=settings) if skip_next_token: skip_next_token = False continue if word == '' or word == ' ': translated_chunk.append(word) original_chunk.append(original_tokens[i]) elif ( current_and_next_joined in dictionary and word not in dashes and self.shortname not in word_joint_unsupported_languages ): translated_chunk.append(dictionary[current_and_next_joined]) original_chunk.append( self._join_chunk([original_tokens[i], original_tokens[i + 1]], settings=settings) ) skip_next_token = True elif word in dictionary and word not in dashes: translated_chunk.append(dictionary[word]) original_chunk.append(original_tokens[i]) elif word.strip('()\"\'{}[],.،') in dictionary and word not in dashes: punct = word[len(word.strip('()\"\'{}[],.،')):] if punct and dictionary[word.strip('()\"\'{}[],.،')]: translated_chunk.append(dictionary[word.strip('()\"\'{}[],.،')] + punct) else: translated_chunk.append(dictionary[word.strip('()\"\'{}[],.،')]) original_chunk.append(original_tokens[i]) elif self._token_with_digits_is_ok(word): translated_chunk.append(word) original_chunk.append(original_tokens[i]) # Use original token because word_is_tz is case sensitive elif translated_chunk and word_is_tz(original_tokens[i]): translated_chunk.append(word) original_chunk.append(original_tokens[i]) else: if translated_chunk: translated.append(translated_chunk) translated_chunk = [] original.append(original_chunk) original_chunk = [] if translated_chunk: translated.append(translated_chunk) original.append(original_chunk) for i in range(len(translated)): if "in" in translated[i]: translated[i] = self._clear_future_words(translated[i]) translated[i] = self._join_chunk(list(filter(bool, translated[i])), settings=settings) original[i] = self._join_chunk(list(filter(bool, original[i])), settings=settings) return translated, original def _get_abbreviations(self, settings): dictionary = self._get_dictionary(settings=settings) abbreviations = [] if self._abbreviations is None: for item in dictionary: if item.endswith('.') and len(item) > 1: abbreviations.append(item) self._abbreviations = abbreviations return self._abbreviations def _sentence_split(self, string, settings): abbreviations = self._get_abbreviations(settings=settings) digit_abbreviations = ['[0-9]'] # numeric date with full stop abbreviation_string = '' for abbreviation in abbreviations: abbreviation_string += '(? len(simplified_tokens): original_tokens.remove('') else: simplified_tokens.remove('') return original_tokens, simplified_tokens def _get_split_dictionary(self, settings): if self._split_dictionary is None: settings.NORMALIZE = True dictionary = self._get_dictionary(settings=settings) self._split_dictionary = self._split_dict(dictionary) return self._split_dictionary def _split_dict(self, dictionary): newdict = {} for item in dictionary: if ' ' in item: items = item.split() for i in items: newdict[i] = dictionary[item] else: newdict[item] = dictionary[item] return newdict def _word_split(self, string, settings): if 'no_word_spacing' in self.info: return self._split(string, keep_formatting=True, settings=settings) else: return string.split() def _split(self, date_string, keep_formatting, settings=None): tokens = [date_string] tokens = list(self._split_tokens_with_regex(tokens, r"(\d+)")) tokens = list( self._split_tokens_by_known_words(tokens, keep_formatting, settings=settings)) return tokens def _split_tokens_with_regex(self, tokens, regex): tokens = tokens[:] for i, token in enumerate(tokens): tokens[i] = re.split(regex, token) return filter(bool, chain.from_iterable(tokens)) def _split_tokens_by_known_words(self, tokens, keep_formatting, settings=None): dictionary = self._get_dictionary(settings) for i, token in enumerate(tokens): tokens[i] = dictionary.split(token, keep_formatting) return list(chain.from_iterable(tokens)) def _join_chunk(self, chunk, settings): if 'no_word_spacing' in self.info: return self._join(chunk, separator="", settings=settings) else: return re.sub(r'\s{2,}', ' ', " ".join(chunk)) def _token_with_digits_is_ok(self, token): if 'no_word_spacing' in self.info: if re.search(r'[\d\.:\-/]+', token) is not None: return True else: return False else: if re.search(r'\d+', token) is not None: return True else: return False def _simplify(self, date_string, settings=None): date_string = date_string.lower() simplifications = self._get_simplifications(settings=settings) for simplification in simplifications: pattern, replacement = list(simplification.items())[0] date_string = pattern.sub(replacement, date_string).lower() return date_string def _get_simplifications(self, settings=None): no_word_spacing = eval(self.info.get('no_word_spacing', 'False')) if settings.NORMALIZE: if self._normalized_simplifications is None: self._normalized_simplifications = [] simplifications = self._generate_simplifications(normalize=True) for simplification in simplifications: pattern, replacement = list(simplification.items())[0] if not no_word_spacing: pattern = r'(?<=\A|\W|_)%s(?=\Z|\W|_)' % pattern pattern = re.compile(pattern, flags=re.I | re.U) self._normalized_simplifications.append({pattern: replacement}) return self._normalized_simplifications else: if self._simplifications is None: self._simplifications = [] simplifications = self._generate_simplifications(normalize=False) for simplification in simplifications: pattern, replacement = list(simplification.items())[0] if not no_word_spacing: pattern = r'(?<=\A|\W|_)%s(?=\Z|\W|_)' % pattern pattern = re.compile(pattern, flags=re.I | re.U) self._simplifications.append({pattern: replacement}) return self._simplifications def _generate_simplifications(self, normalize=False): simplifications = [] for simplification in self.info.get('simplifications', []): c_simplification = {} key, value = list(simplification.items())[0] if normalize: key = normalize_unicode(key) if isinstance(value, int): c_simplification[key] = str(value) else: c_simplification[key] = normalize_unicode(value) if normalize else value simplifications.append(c_simplification) return simplifications def _clear_future_words(self, words): freshness_words = {'day', 'week', 'month', 'year', 'hour', 'minute', 'second'} if set(words).isdisjoint(freshness_words): words.remove("in") return words def _join(self, tokens, separator=" ", settings=None): if not tokens: return "" capturing_splitters = self._get_splitters(settings)['capturing'] joined = tokens[0] for i in range(1, len(tokens)): left, right = tokens[i - 1], tokens[i] if left not in capturing_splitters and right not in capturing_splitters: joined += separator joined += right return joined def _get_dictionary(self, settings=None): if not settings.NORMALIZE: if self._dictionary is None: self._generate_dictionary() self._dictionary._settings = settings return self._dictionary else: if self._normalized_dictionary is None: self._generate_normalized_dictionary() self._normalized_dictionary._settings = settings return self._normalized_dictionary def _get_wordchars(self, settings=None): if self._wordchars is None: self._set_wordchars(settings) return self._wordchars def _get_splitters(self, settings=None): if self._splitters is None: self._set_splitters(settings) return self._splitters def _set_splitters(self, settings=None): splitters = { # The ones that split string only if they are not surrounded by letters from both sides: 'wordchars': set(), # The ones that are not filtered out from tokens after split: 'capturing': set(), } splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS) wordchars = self._get_wordchars(settings) skip = set(self.info.get('skip', [])) | splitters['capturing'] for token in skip: if not re.match(r'^\W+$', token, re.UNICODE): continue if token in wordchars: splitters['wordchars'].add(token) self._splitters = splitters def _set_wordchars(self, settings=None): wordchars = set() for word in self._get_dictionary(settings): if re.match(r'^[\W\d_]+$', word, re.UNICODE): continue for char in word: wordchars.add(char.lower()) self._wordchars = wordchars - {" "} | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"} def get_wordchars_for_detection(self, settings): if self._wordchars_for_detection is None: wordchars = set() for word in self._get_dictionary(settings): if re.match(r'^[\W\d_]+$', word, re.UNICODE): continue for char in word: wordchars.add(char.lower()) self._wordchars_for_detection = wordchars - {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", "(", ")", "'", "q", "a", "m", "p", " "} return self._wordchars_for_detection def _generate_dictionary(self, settings=None): self._dictionary = Dictionary(self.info, settings=settings) def _generate_normalized_dictionary(self, settings=None): self._normalized_dictionary = NormalizedDictionary(self.info, settings=settings) def to_parserinfo(self, base_cls=parser.parserinfo): attributes = { 'JUMP': self.info.get('skip', []), 'PERTAIN': self.info.get('pertain', []), 'WEEKDAYS': [self.info['monday'], self.info['tuesday'], self.info['wednesday'], self.info['thursday'], self.info['friday'], self.info['saturday'], self.info['sunday']], 'MONTHS': [self.info['january'], self.info['february'], self.info['march'], self.info['april'], self.info['may'], self.info['june'], self.info['july'], self.info['august'], self.info['september'], self.info['october'], self.info['november'], self.info['december']], 'HMS': [self.info['hour'], self.info['minute'], self.info['second']], } name = '{language}ParserInfo'.format(language=self.info['name']) return type(name, bases=[base_cls], dict=attributes)