# ai-content-maker/.venv/Lib/site-packages/dateparser/parser.py
#
# 649 lines
# 23 KiB
# Python

import calendar
import regex as re
import pytz
from io import StringIO
from collections import OrderedDict
from datetime import datetime
from datetime import timedelta
from dateparser.utils import set_correct_day_from_settings, \
get_last_day_of_month, get_previous_leap_year, get_next_leap_year, \
_get_missing_parts, get_timezone_from_tz_string
from dateparser.utils.strptime import strptime
# Matches a run of one or more non-digit characters.
NSP_COMPATIBLE = re.compile(r'\D+')
# Matches a lowercase meridian marker: 'am' or 'pm'.
MERIDIAN = re.compile(r'am|pm')
# Matches a run of one to six digits (used below for the fractional-second token).
MICROSECOND = re.compile(r'\d{1,6}')
# Matches a string that consists of exactly eight digits.
EIGHT_DIGIT = re.compile(r'^\d{8}$')
# Matches 'H:MM' / 'HH:MM' with hours 0-23 and minutes 00-59.
HOUR_MINUTE_REGEX = re.compile(r'^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$')
def no_space_parser_eligibile(datestring):
    """Tell whether *datestring* may be handed to the no-spaces parser.

    The string qualifies when it contains no non-digit characters at all, or
    when its first run of non-digit characters is exactly a single ':'.

    :param datestring: the candidate date string.
    :return: ``True`` if eligible, ``False`` otherwise.
    """
    first_non_digit_run = NSP_COMPATIBLE.search(datestring)
    return not first_non_digit_run or first_non_digit_run.group() == ':'
def get_unresolved_attrs(parser_object):
    """Split the date components of *parser_object* into set and unset ones.

    A component ('year', 'month', 'day', checked in that order) counts as set
    when the attribute exists on the object and is not ``None``.

    :param parser_object: any object that may carry ``year``/``month``/``day``.
    :return: tuple ``(seen, unseen)`` of two lists of component names.
    """
    resolved, unresolved = [], []
    for name in ('year', 'month', 'day'):
        is_missing = getattr(parser_object, name, None) is None
        bucket = unresolved if is_missing else resolved
        bucket.append(name)
    return resolved, unresolved
# Maps a three-letter date-order code to the matching compact strptime format.
date_order_chart = {
    'DMY': '%d%m%y',
    'DYM': '%d%y%m',
    'MDY': '%m%d%y',
    'MYD': '%m%y%d',
    'YDM': '%y%d%m',
    'YMD': '%y%m%d',
}


def resolve_date_order(order, lst=None):
    """Translate a date-order code such as ``'MDY'``.

    :param order: one of 'DMY', 'DYM', 'MDY', 'MYD', 'YDM', 'YMD'.
    :param lst: when truthy, return the component names as a new list
        (e.g. ``['month', 'day', 'year']``); otherwise return the format
        string from ``date_order_chart`` (e.g. ``'%m%d%y'``).
    :raises KeyError: if *order* is not one of the six known codes.
    """
    if not lst:
        return date_order_chart[order]

    letter_to_component = {'D': 'day', 'M': 'month', 'Y': 'year'}
    known_orders = ('DMY', 'DYM', 'MDY', 'MYD', 'YDM', 'YMD')
    components_by_order = {
        code: [letter_to_component[letter] for letter in code]
        for code in known_orders
    }
    return components_by_order[order]
def _parse_absolute(datestring, settings, tz=None):
    """Parse *datestring* with the token-based ``_parser``.

    Thin module-level wrapper around ``_parser.parse``; all three arguments
    are forwarded unchanged and its ``(dateobj, period)`` result is returned.
    """
    return _parser.parse(datestring, settings, tz=tz)
def _parse_nospaces(datestring, settings, tz=None):
    """Parse *datestring* with the digits-only ``_no_spaces_parser``.

    Thin module-level wrapper around ``_no_spaces_parser.parse``. The ``tz``
    parameter is accepted so the signature matches ``_parse_absolute`` but it
    is not used.
    """
    return _no_spaces_parser.parse(datestring, settings=settings)
class _time_parser:
    """Callable that turns a time string into a ``time`` object.

    The formats in ``time_directives`` are tried in order and the first one
    that ``strptime`` accepts wins.
    """

    time_directives = [
        '%H:%M:%S',
        '%I:%M:%S %p',
        '%H:%M',
        '%I:%M %p',
        '%I %p',
        '%H:%M:%S.%f',
        '%I:%M:%S.%f %p',
        '%H:%M %p'
    ]

    def __call__(self, timestring):
        """Parse *timestring* (surrounding whitespace is ignored).

        :return: the ``.time()`` of the first successful ``strptime`` result.
        :raises ValueError: if no directive matches; the message quotes the
            string exactly as it was passed in.
        """
        candidate = timestring.strip()
        for directive in self.time_directives:
            try:
                parsed = strptime(candidate, directive)
            except ValueError:
                continue
            return parsed.time()
        raise ValueError('%s does not seem to be a valid time string' % timestring)


# Shared module-level instance used as a plain function.
time_parser = _time_parser()
class _no_spaces_parser:
    """Parser for date strings written without separators (e.g. ``20201224``).

    Candidate strptime formats are tried one after another; which ones are
    tried first depends on the configured date order.
    """

    # Date-only formats: every ordering of day/month/year, with 4- and 2-digit years.
    _dateformats = [
        '%Y%m%d', '%Y%d%m', '%m%Y%d',
        '%m%d%Y', '%d%Y%m', '%d%m%Y',
        '%y%m%d', '%y%d%m', '%m%y%d',
        '%m%d%y', '%d%y%m', '%d%m%y'
    ]
    # Tried before everything else when the order is '%m%d%y'.
    _preferred_formats = ['%Y%m%d%H%M', '%Y%m%d%H%M%S', '%Y%m%d%H%M%S.%f']
    # Formats tried, in this order, for exactly-eight-digit input when no
    # DATE_ORDER is configured.
    _preferred_formats_ordered_8_digit = ['%m%d%Y', '%d%m%Y', '%Y%m%d', '%Y%d%m', '%m%Y%d', '%d%Y%m']
    _timeformats = ['%H%M%S.%f', '%H%M%S', '%H%M', '%H']
    # Directives whose presence in a format determines the reported period.
    period = {
        'day': ['%d', '%H', '%M', '%S'],
        'month': ['%m']
    }
    _default_order = resolve_date_order('MDY')

    def __init__(self, *args, **kwargs):
        """Build the per-order lists of candidate formats.

        Positional and keyword arguments are accepted but ignored.
        """
        self._all = (
            self._dateformats
            + [x + y for x in self._dateformats for y in self._timeformats]
            + self._timeformats
        )

        def prioritised(prefix):
            # Stable sort: formats whose lowercase form starts with `prefix`
            # come first, everything otherwise keeps its relative order.
            return sorted(self._all, key=lambda x: x.lower().startswith(prefix), reverse=True)

        self.date_formats = {
            '%m%d%y': self._preferred_formats + prioritised('%m%d%y'),
            '%m%y%d': prioritised('%m%y%d'),
            '%y%m%d': prioritised('%y%m%d'),
            '%y%d%m': prioritised('%y%d%m'),
            '%d%m%y': prioritised('%d%m%y'),
            '%d%y%m': prioritised('%d%y%m'),
        }

    @classmethod
    def _get_period(cls, format_string):
        """Return the period implied by *format_string*.

        Period names are checked in alphabetical order ('day' before 'month');
        the first one that has any of its directives in the format is
        returned. 'year' is returned when none matches.
        """
        for pname, pdrv in sorted(cls.period.items(), key=lambda x: x[0]):
            for drv in pdrv:
                if drv in format_string:
                    return pname
        return 'year'

    @classmethod
    def _find_best_matching_date(cls, datestring):
        """Try the preferred eight-digit formats against *datestring*.

        :return: ``(datetime, period)`` for the first format that parses and
            yields a four-digit year, or ``None`` if there is none.
        """
        for fmt in cls._preferred_formats_ordered_8_digit:
            try:
                dt = strptime(datestring, fmt), cls._get_period(fmt)
            except Exception:
                # A format that does not fit is simply skipped. `Exception`
                # (not a bare except) so KeyboardInterrupt/SystemExit propagate.
                continue
            if len(str(dt[0].year)) == 4:
                return dt
        return None

    @classmethod
    def parse(cls, datestring, settings):
        """Parse a separator-free date string.

        :param datestring: digits, optionally containing ':' (colons are removed).
        :param settings: object providing ``DATE_ORDER`` plus the attributes
            read by ``_check_strict_parsing``.
        :return: tuple ``(datetime, period)``.
        :raises ValueError: if the string is not eligible, is empty after
            removing colons, or matches no candidate format.
        """
        if not no_space_parser_eligibile(datestring):
            raise ValueError('Unable to parse date from: %s' % datestring)

        datestring = datestring.replace(':', '')
        if not datestring:
            raise ValueError("Empty string")

        tokens = tokenizer(datestring)
        if settings.DATE_ORDER:
            order = resolve_date_order(settings.DATE_ORDER)
        else:
            order = cls._default_order
            # Without an explicit order, eight-digit input gets a dedicated
            # pass over the preferred formats first.
            if EIGHT_DIGIT.match(datestring):
                dt = cls._find_best_matching_date(datestring)
                if dt is not None:
                    return dt

        nsp = cls()
        ambiguous_date = None
        for token, _ in tokens.tokenize():
            for fmt in nsp.date_formats[order]:
                try:
                    dt = strptime(token, fmt), cls._get_period(fmt)
                    if len(str(dt[0].year)) < 4:
                        # Year shorter than four digits: keep as a fallback
                        # (the last such match wins) and keep looking.
                        ambiguous_date = dt
                        continue
                    missing = _get_missing_parts(fmt)
                    _check_strict_parsing(missing, settings)
                    return dt
                except Exception:
                    # Covers both "format does not fit" and a strict-parsing
                    # rejection of this format; the next format is tried.
                    # `Exception` (not a bare except) so that
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    pass

            # Every format was tried for this token: the outcome is decided
            # here, so tokens after the first one are never examined.
            if ambiguous_date:
                return ambiguous_date
            raise ValueError('Unable to parse date from: %s' % datestring)
def _get_missing_error(missing):
return 'Fields missing from the date string: {}'.format(', '.join(missing))
def _check_strict_parsing(missing, settings):
    """Raise if the missing date fields violate the parsing settings.

    With ``settings.STRICT_PARSING`` any missing field is an error. Otherwise,
    with ``settings.REQUIRE_PARTS``, only the missing fields that are listed
    there are an error.

    :param missing: collection of field names absent from the date string.
    :param settings: object providing ``STRICT_PARSING`` and ``REQUIRE_PARTS``.
    :raises ValueError: naming the offending fields.
    """
    if settings.STRICT_PARSING and missing:
        offending = missing
    elif settings.REQUIRE_PARTS and missing:
        offending = [part for part in settings.REQUIRE_PARTS if part in missing]
    else:
        offending = None

    if offending:
        raise ValueError(_get_missing_error(offending))
class _parser:
    """Token-based parser for absolute date strings.

    An instance consumes the (text, type) tokens of one date string in its
    constructor and stores the recognised ``day``, ``month``, ``year`` and
    ``time``. ``parse`` is the public entry point.
    """

    # Directives tried for alphabetic tokens, in this order.
    alpha_directives = OrderedDict([
        ('weekday', ['%A', '%a']),
        ('month', ['%B', '%b']),
    ])

    # Directives tried for numeric tokens; the order in which the components
    # are tried is taken from settings.DATE_ORDER in __init__.
    num_directives = {
        'month': ['%m'],
        'day': ['%d'],
        'year': ['%y', '%Y'],
    }

    def __init__(self, tokens, settings):
        """Walk over *tokens* and fill in the date/time components.

        :param tokens: iterable of ``(text, type)`` pairs; ``parse`` passes
            the output of ``tokenizer.tokenize()`` (type 0: digits and ':',
            1: letters, 2: anything else).
        :param settings: settings object; ``DATE_ORDER`` is read here.
        :raises ValueError: from ``_parse`` when a token fits no component.
        """
        self.settings = settings
        self.tokens = [(t[0].strip(), t[1]) for t in list(tokens)]
        # Only digit/letter tokens (type <= 1), each with its index in self.tokens.
        self.filtered_tokens = [(t[0], t[1], i) for i, t in enumerate(self.tokens) if t[1] <= 1]

        # Numeric tokens displaced from a component by parse_number.
        self.unset_tokens = []

        self.day = None
        self.month = None
        self.year = None
        # Becomes a zero-argument callable once a time token is found;
        # it is invoked in _results().
        self.time = None

        # Components in the order they were assigned from numeric tokens.
        self.auto_order = []

        self._token_day = None
        self._token_month = None
        self._token_year = None
        self._token_time = None

        self.ordered_num_directives = OrderedDict(
            (k, self.num_directives[k])
            for k in (resolve_date_order(settings.DATE_ORDER, lst=True))
        )

        # Indices (into filtered_tokens) already consumed as part of the time.
        skip_index = []
        skip_component = None
        skip_tokens = ["t", "year", "hour", "minute"]

        for index, token_type_original_index in enumerate(self.filtered_tokens):

            if index in skip_index:
                continue

            token, type, original_index = token_type_original_index

            if token in skip_tokens:
                continue

            if self.time is None:
                meridian_index = index + 1

                try:
                    # try case where hours and minutes are separated by a period. Example: 13.20.
                    _is_before_period = self.tokens[original_index + 1][0] == '.'
                    _is_after_period = original_index != 0 and self.tokens[original_index - 1][0] == '.'

                    if _is_before_period and not _is_after_period:
                        index_next_token = index + 1
                        next_token = self.filtered_tokens[index_next_token][0]
                        index_in_tokens_for_next_token = self.filtered_tokens[index_next_token][2]

                        next_token_is_last = index_next_token == len(self.filtered_tokens) - 1
                        if next_token_is_last or self.tokens[index_in_tokens_for_next_token + 1][0] != '.':
                            new_token = token + ':' + next_token
                            if re.match(HOUR_MINUTE_REGEX, new_token):
                                token = new_token
                                skip_index.append(index + 1)
                                meridian_index += 1
                except Exception:
                    # Any failure (e.g. an index past the end) means the
                    # "H.MM" form does not apply to this token.
                    pass

                try:
                    microsecond = MICROSECOND.search(self.filtered_tokens[index + 1][0]).group()
                    # Is after time token? raise ValueError if ':' can't be found:
                    token.index(":")
                    # Is after period? raise ValueError if '.' can't be found:
                    self.tokens[self.tokens.index((token, 0)) + 1][0].index('.')
                except:
                    # Any failure above means there is no microsecond part.
                    microsecond = None

                if microsecond:
                    # The meridian, if present, follows the microsecond token.
                    meridian_index += 1

                try:
                    meridian = MERIDIAN.search(self.filtered_tokens[meridian_index][0]).group()
                except:
                    meridian = None

                if any([':' in token, meridian, microsecond]):
                    # The token is a time: assemble the string for time_parser
                    # and mark the tokens that were folded into it.
                    if meridian and not microsecond:
                        self._token_time = '%s %s' % (token, meridian)
                        skip_index.append(meridian_index)
                    elif microsecond and not meridian:
                        self._token_time = '%s.%s' % (token, microsecond)
                        skip_index.append(index + 1)
                    elif meridian and microsecond:
                        self._token_time = '%s.%s %s' % (token, microsecond, meridian)
                        skip_index.append(index + 1)
                        skip_index.append(meridian_index)
                    else:
                        self._token_time = token
                    self.time = lambda: time_parser(self._token_time)
                    continue

            results = self._parse(type, token, skip_component=skip_component)
            for res in results:
                if len(token) == 4 and res[0] == 'year':
                    # A four-character token taken as the year: later tokens
                    # are no longer tried as a year.
                    skip_component = 'year'
                setattr(self, *res)

        known, unknown = get_unresolved_attrs(self)
        # NOTE(review): `params` is filled below but not read again in this method.
        params = {}
        for attr in known:
            params.update({attr: getattr(self, attr)})
        # Give each still-unset component the numeric tokens that were
        # displaced earlier (the last such token wins).
        for attr in unknown:
            for token, type, _ in self.unset_tokens:
                if type == 0:
                    params.update({attr: int(token)})
                    setattr(self, '_token_%s' % attr, token)
                    setattr(self, attr, int(token))

    def _get_period(self):
        """Return the period name for the parsed components.

        'time' when ``RETURN_TIME_AS_PERIOD`` is set and a time was found;
        'day' when a time or a day was found; otherwise 'month' or 'year' if
        that component was found; otherwise 'day' if ``_results()`` is truthy.
        """
        if self.settings.RETURN_TIME_AS_PERIOD:
            if getattr(self, 'time', None):
                return 'time'

        for period in ['time', 'day']:
            if getattr(self, period, None):
                return 'day'

        for period in ['month', 'year']:
            if getattr(self, period, None):
                return period

        if self._results():
            return 'day'

    def _get_datetime_obj(self, **params):
        """Build ``datetime(**params)``, repairing an out-of-range day when possible.

        :raises ValueError: the original error when no repair applies.
        """
        try:
            return datetime(**params)
        except ValueError as e:
            error_text = e.__str__()
            error_msgs = ['day is out of range', 'day must be in']
            if error_msgs[0] in error_text or error_msgs[1] in error_text:
                if not(self._token_day or hasattr(self, '_token_weekday')):
                    # if day is not available put last day of the month
                    params['day'] = get_last_day_of_month(params['year'], params['month'])
                    return datetime(**params)
                elif not self._token_year and params['day'] == 29 and params['month'] == 2 and \
                        not calendar.isleap(params['year']):
                    # fix the year when year is not present and it is 29 of February
                    params['year'] = self._get_correct_leap_year(self.settings.PREFER_DATES_FROM, params['year'])
                    return datetime(**params)
            raise e

    def _get_correct_leap_year(self, prefer_dates_from, current_year):
        """Pick a leap year relative to *current_year*.

        'future' gives the next leap year, 'past' the previous one; any other
        value gives whichever of the two is closer (the previous one on a tie).
        """
        if prefer_dates_from == 'future':
            return get_next_leap_year(current_year)
        if prefer_dates_from == 'past':
            return get_previous_leap_year(current_year)

        # Default case ('current_period'): return closer leap year
        next_leap_year = get_next_leap_year(current_year)
        previous_leap_year = get_previous_leap_year(current_year)
        next_leap_year_is_closer = next_leap_year - current_year < current_year - previous_leap_year
        return next_leap_year if next_leap_year_is_closer else previous_leap_year

    def _set_relative_base(self):
        """Set ``self.now`` to ``settings.RELATIVE_BASE``, or to the current UTC time if that is falsy."""
        self.now = self.settings.RELATIVE_BASE
        if not self.now:
            self.now = datetime.utcnow()

    def _get_datetime_obj_params(self):
        """Return ``datetime`` keyword arguments: parsed components, with gaps filled from ``self.now``; time fields zeroed."""
        # NOTE(review): `self.now` is not assigned in __init__; within this class
        # this method is only reached after _results() has called _set_relative_base().
        if not self.now:
            self._set_relative_base()

        params = {
            'day': self.day or self.now.day,
            'month': self.month or self.now.month,
            'year': self.year or self.now.year,
            'hour': 0, 'minute': 0, 'second': 0, 'microsecond': 0,
        }
        return params

    def _get_date_obj(self, token, directive):
        """Parse a single *token* with a single strptime *directive*."""
        return strptime(token, directive)

    def _results(self):
        """Combine the parsed components into a ``datetime``.

        Checks the strict-parsing settings against the components that are
        still falsy, (re)sets ``self.now``, evaluates the deferred time parser
        and builds the datetime.

        :raises ValueError: from the strict-parsing check, from the time
            parser, or from datetime construction.
        """
        missing = [
            field for field in ('day', 'month', 'year')
            if not getattr(self, field)
        ]
        _check_strict_parsing(missing, self.settings)
        self._set_relative_base()

        time = self.time() if self.time is not None else None

        params = self._get_datetime_obj_params()

        if time:
            params.update(dict(hour=time.hour,
                               minute=time.minute,
                               second=time.second,
                               microsecond=time.microsecond))

        return self._get_datetime_obj(**params)

    def _correct_for_time_frame(self, dateobj, tz):
        """Shift *dateobj* according to ``settings.PREFER_DATES_FROM``.

        Handles, in turn: a weekday given without any date token, a month
        given without a year, a two-character year token, and a time given
        without any date or weekday token.

        :param dateobj: datetime produced by ``_results()``.
        :param tz: optional tzinfo; when falsy, the time-only branch falls back
            to ``settings.TIMEZONE``.
        :return: the adjusted datetime, carrying the tzinfo *dateobj* had after
            the weekday adjustment.
        """
        days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

        token_weekday, _ = getattr(self, '_token_weekday', (None, None))

        if token_weekday and not(self._token_year or self._token_month or self._token_day):
            day_index = calendar.weekday(dateobj.year, dateobj.month, dateobj.day)
            day = token_weekday[:3].lower()
            steps = 0
            if 'future' in self.settings.PREFER_DATES_FROM:
                if days[day_index] == day:
                    steps = 7
                else:
                    # Count days forward until the requested weekday is reached.
                    while days[day_index] != day:
                        day_index = (day_index + 1) % 7
                        steps += 1
                delta = timedelta(days=steps)
            else:
                if days[day_index] == day:
                    if self.settings.PREFER_DATES_FROM == 'past':
                        steps = 7
                    else:
                        steps = 0
                else:
                    # Count days backward; a negative index wraps around
                    # through Python's negative list indexing.
                    while days[day_index] != day:
                        day_index -= 1
                        steps += 1
                delta = timedelta(days=-steps)

            dateobj = dateobj + delta

        # NOTE: If this assert fires, self.now needs to be made offset-aware in a similar
        # way that dateobj is temporarily made offset-aware.
        assert not (self.now.tzinfo is None and dateobj.tzinfo is not None),\
            "`self.now` doesn't have `tzinfo`. Review comment in code for details."

        # Store the original dateobj values so that upon subsequent parsing everything is not
        # treated as offset-aware if offset awareness is changed.
        original_dateobj = dateobj

        # Since date comparisons must be either offset-naive or offset-aware, normalize dateobj
        # to be offset-aware if one or the other is already offset-aware.
        if self.now.tzinfo is not None and dateobj.tzinfo is None:
            dateobj = pytz.utc.localize(dateobj)

        if self.month and not self.year:
            try:
                if self.now < dateobj:
                    if self.settings.PREFER_DATES_FROM == 'past':
                        dateobj = dateobj.replace(year=dateobj.year - 1)
                else:
                    if self.settings.PREFER_DATES_FROM == 'future':
                        dateobj = dateobj.replace(year=dateobj.year + 1)
            except ValueError as e:
                # replace() failed; for 29 February pick a leap year instead.
                if dateobj.day == 29 and dateobj.month == 2:
                    valid_year = self._get_correct_leap_year(
                        self.settings.PREFER_DATES_FROM, dateobj.year)
                    dateobj = dateobj.replace(year=valid_year)
                else:
                    raise e

        if self._token_year and len(self._token_year[0]) == 2:
            # Two-character year token: move by a century if it landed on the
            # wrong side of `now` for the configured preference.
            if self.now < dateobj:
                if 'past' in self.settings.PREFER_DATES_FROM:
                    dateobj = dateobj.replace(year=dateobj.year - 100)
            else:
                if 'future' in self.settings.PREFER_DATES_FROM:
                    dateobj = dateobj.replace(year=dateobj.year + 100)

        if self._token_time and not any([self._token_year,
                                         self._token_month,
                                         self._token_day,
                                         hasattr(self, '_token_weekday')]):
            # Convert dateobj to utc time to compare with self.now
            try:
                tz = tz or get_timezone_from_tz_string(self.settings.TIMEZONE)
            except pytz.UnknownTimeZoneError:
                tz = None

            if tz:
                dateobj_time = (dateobj - tz.utcoffset(dateobj)).time()
            else:
                dateobj_time = dateobj.time()
            if 'past' in self.settings.PREFER_DATES_FROM:
                if self.now.time() < dateobj_time:
                    dateobj = dateobj + timedelta(days=-1)
            if 'future' in self.settings.PREFER_DATES_FROM:
                if self.now.time() > dateobj_time:
                    dateobj = dateobj + timedelta(days=1)

        # Reset dateobj to the original value, thus removing any offset awareness that may
        # have been set earlier.
        dateobj = dateobj.replace(tzinfo=original_dateobj.tzinfo)

        return dateobj

    def _correct_for_day(self, dateobj):
        """Apply the day preference from settings when no day, weekday or time token was parsed.

        Otherwise *dateobj* is returned unchanged.
        """
        if (
            getattr(self, '_token_day', None)
            or getattr(self, '_token_weekday', None)
            or getattr(self, '_token_time', None)
        ):
            return dateobj

        dateobj = set_correct_day_from_settings(
            dateobj, self.settings, current_day=self.now.day
        )
        return dateobj

    @classmethod
    def parse(cls, datestring, settings, tz=None):
        """Tokenize and parse *datestring*.

        :return: tuple ``(dateobj, period)``.
        :raises ValueError: when the tokens cannot be interpreted as a date.
        """
        tokens = tokenizer(datestring)
        po = cls(tokens.tokenize(), settings)
        dateobj = po._results()

        # correction for past, future if applicable
        dateobj = po._correct_for_time_frame(dateobj, tz)

        # correction for preference of day: beginning, current, end
        dateobj = po._correct_for_day(dateobj)
        period = po._get_period()

        return dateobj, period

    def _parse(self, type, token, skip_component=None):
        """Interpret one token as a date component.

        :param type: token type; 0 dispatches to the numeric handler, 1 to the
            alphabetic one.
        :param token: the token text.
        :param skip_component: component name that must not be tried.
        :return: list of ``(component, value)`` pairs to set on the instance.
        :raises ValueError: when the token fits no component.
        """

        def set_and_return(token, type, component, dateobj, skip_date_order=False):
            # Remember which token produced the component and return its value.
            if not skip_date_order:
                self.auto_order.append(component)
            setattr(self, '_token_%s' % component, (token, type))
            return [(component, getattr(dateobj, component))]

        def parse_number(token, skip_component=None):
            type = 0

            for component, directives in self.ordered_num_directives.items():
                if skip_component == component:
                    continue
                for directive in directives:
                    try:
                        do = self._get_date_obj(token, directive)
                        prev_value = getattr(self, component, None)
                        if not prev_value:
                            return set_and_return(token, type, component, do)
                        else:
                            # The component is already taken: re-check the
                            # earlier numeric token against this directive.
                            # If it no longer fits, park it in unset_tokens
                            # and give the component to the current token.
                            try:
                                prev_token, prev_type = getattr(self, '_token_%s' % component)
                                if prev_type == type:
                                    do = self._get_date_obj(prev_token, directive)
                            except ValueError:
                                self.unset_tokens.append((prev_token, prev_type, component))
                                return set_and_return(token, type, component, do)
                    except ValueError:
                        pass
            else:
                # The loop has no `break`, so this runs whenever nothing returned above.
                raise ValueError('Unable to parse: %s' % token)

        def parse_alpha(token, skip_component=None):
            type = 1

            for component, directives in self.alpha_directives.items():
                if skip_component == component:
                    continue
                for directive in directives:
                    try:
                        do = self._get_date_obj(token, directive)
                        prev_value = getattr(self, component, None)
                        if not prev_value:
                            return set_and_return(token, type, component, do, skip_date_order=True)
                        elif component == 'month':
                            # A month name arrives after a month was already
                            # set: the earlier value is re-labelled as the day.
                            index = self.auto_order.index('month')
                            self.auto_order[index] = 'day'
                            setattr(self, '_token_day', self._token_month)
                            setattr(self, '_token_month', (token, type))
                            return [(component, getattr(do, component)), ('day', prev_value)]
                    except:
                        # Any failure just moves on to the next directive.
                        pass
            else:
                # The loop has no `break`, so this runs whenever nothing returned above.
                raise ValueError('Unable to parse: %s' % token)

        handlers = {0: parse_number, 1: parse_alpha}
        return handlers[type](token, skip_component)
class tokenizer:
    """Split a date string into runs of same-kind characters.

    Token types: 0 for digits (':' counts as a digit), 1 for ASCII letters,
    2 for everything else.
    """

    digits = '0123456789:'
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def _isletter(self, tkn):
        """Return whether *tkn* occurs in ``letters``."""
        return tkn in self.letters

    def _isdigit(self, tkn):
        """Return whether *tkn* occurs in ``digits``."""
        return tkn in self.digits

    def __init__(self, ds):
        """Wrap the date string *ds* in a character stream."""
        self.instream = StringIO(ds)

    def _switch(self, chara, charb):
        """Classify *chara* and tell whether *charb* starts a new token.

        :return: ``(type_of_chara, boundary)``. For digit and letter tokens a
            boundary is any character of a different kind; for type-2 tokens
            a boundary is a digit or a letter.
        """
        if self._isdigit(chara):
            kind = 0
            boundary = not self._isdigit(charb)
        elif self._isletter(chara):
            kind = 1
            boundary = not self._isletter(charb)
        else:
            kind = 2
            boundary = self._isdigit(charb) or self._isletter(charb)
        return kind, boundary

    def tokenize(self):
        """Yield ``(token, type)`` pairs read from the stream.

        The stream is consumed one character at a time; the final token is
        emitted once the stream is exhausted.
        """
        current = ''
        # read(1) returns '' at end of stream, which stops the iteration.
        for char in iter(lambda: self.instream.read(1), ''):
            if not current:
                current = char
                continue
            kind, boundary = self._switch(current[-1], char)
            if boundary:
                yield current, kind
                current = char
            else:
                current += char

        # End of stream: classify and emit whatever has been collected.
        kind, _ = self._switch(current[-1], '')
        yield current, kind