161 lines
5.1 KiB
Python
161 lines
5.1 KiB
Python
|
import regex as re
|
||
|
from datetime import datetime
|
||
|
from datetime import time
|
||
|
from tzlocal import get_localzone
|
||
|
|
||
|
from dateutil.relativedelta import relativedelta
|
||
|
|
||
|
from dateparser.utils import apply_timezone, localize_timezone, strip_braces
|
||
|
from .parser import time_parser
|
||
|
from .timezone_parser import pop_tz_offset_from_string
|
||
|
|
||
|
|
||
|
_UNITS = r'decade|year|month|week|day|hour|minute|second'
|
||
|
PATTERN = re.compile(r'(\d+[.,]?\d*)\s*(%s)\b' % _UNITS, re.I | re.S | re.U)
|
||
|
|
||
|
|
||
|
class FreshnessDateDataParser:
|
||
|
""" Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """
|
||
|
|
||
|
def _are_all_words_units(self, date_string):
|
||
|
skip = [_UNITS,
|
||
|
r'ago|in|\d+',
|
||
|
r':|[ap]m']
|
||
|
|
||
|
date_string = re.sub(r'\s+', ' ', date_string.strip())
|
||
|
|
||
|
words = [x for x in re.split(r'\W', date_string) if x]
|
||
|
words = [x for x in words if not re.match(r'%s' % '|'.join(skip), x)]
|
||
|
return not words
|
||
|
|
||
|
def _parse_time(self, date_string, settings):
|
||
|
"""Attempts to parse time part of date strings like '1 day ago, 2 PM' """
|
||
|
date_string = PATTERN.sub('', date_string)
|
||
|
date_string = re.sub(r'\b(?:ago|in)\b', '', date_string)
|
||
|
try:
|
||
|
return time_parser(date_string)
|
||
|
except:
|
||
|
pass
|
||
|
|
||
|
def get_local_tz(self):
|
||
|
return get_localzone()
|
||
|
|
||
|
def parse(self, date_string, settings):
|
||
|
date_string = strip_braces(date_string)
|
||
|
date_string, ptz = pop_tz_offset_from_string(date_string)
|
||
|
_time = self._parse_time(date_string, settings)
|
||
|
|
||
|
_settings_tz = settings.TIMEZONE.lower()
|
||
|
|
||
|
def apply_time(dateobj, timeobj):
|
||
|
if not isinstance(_time, time):
|
||
|
return dateobj
|
||
|
|
||
|
return dateobj.replace(
|
||
|
hour=timeobj.hour, minute=timeobj.minute,
|
||
|
second=timeobj.second, microsecond=timeobj.microsecond
|
||
|
)
|
||
|
|
||
|
if settings.RELATIVE_BASE:
|
||
|
now = settings.RELATIVE_BASE
|
||
|
|
||
|
if 'local' not in _settings_tz:
|
||
|
now = localize_timezone(now, settings.TIMEZONE)
|
||
|
|
||
|
if ptz:
|
||
|
if now.tzinfo:
|
||
|
now = now.astimezone(ptz)
|
||
|
else:
|
||
|
if hasattr(ptz, 'localize'):
|
||
|
now = ptz.localize(now)
|
||
|
else:
|
||
|
now = now.replace(tzinfo=ptz)
|
||
|
|
||
|
if not now.tzinfo:
|
||
|
if hasattr(self.get_local_tz(), 'localize'):
|
||
|
now = self.get_local_tz().localize(now)
|
||
|
else:
|
||
|
now = now.replace(tzinfo=self.get_local_tz())
|
||
|
|
||
|
elif ptz:
|
||
|
localized_now = datetime.now(ptz)
|
||
|
|
||
|
if 'local' in _settings_tz:
|
||
|
now = localized_now
|
||
|
else:
|
||
|
now = apply_timezone(localized_now, settings.TIMEZONE)
|
||
|
|
||
|
else:
|
||
|
if 'local' not in _settings_tz:
|
||
|
utc_dt = datetime.utcnow()
|
||
|
now = apply_timezone(utc_dt, settings.TIMEZONE)
|
||
|
else:
|
||
|
now = datetime.now(self.get_local_tz())
|
||
|
|
||
|
date, period = self._parse_date(date_string, now, settings.PREFER_DATES_FROM)
|
||
|
|
||
|
if date:
|
||
|
old_date = date
|
||
|
date = apply_time(date, _time)
|
||
|
if settings.RETURN_TIME_AS_PERIOD and old_date != date:
|
||
|
period = 'time'
|
||
|
|
||
|
if settings.TO_TIMEZONE:
|
||
|
date = apply_timezone(date, settings.TO_TIMEZONE)
|
||
|
|
||
|
if (
|
||
|
not settings.RETURN_AS_TIMEZONE_AWARE
|
||
|
or (settings.RETURN_AS_TIMEZONE_AWARE
|
||
|
and 'default' == settings.RETURN_AS_TIMEZONE_AWARE and not ptz)
|
||
|
):
|
||
|
date = date.replace(tzinfo=None)
|
||
|
|
||
|
return date, period
|
||
|
|
||
|
def _parse_date(self, date_string, now, prefer_dates_from):
|
||
|
if not self._are_all_words_units(date_string):
|
||
|
return None, None
|
||
|
|
||
|
kwargs = self.get_kwargs(date_string)
|
||
|
if not kwargs:
|
||
|
return None, None
|
||
|
period = 'day'
|
||
|
if 'days' not in kwargs:
|
||
|
for k in ['weeks', 'months', 'years']:
|
||
|
if k in kwargs:
|
||
|
period = k[:-1]
|
||
|
break
|
||
|
td = relativedelta(**kwargs)
|
||
|
|
||
|
if (
|
||
|
re.search(r'\bin\b', date_string)
|
||
|
or re.search(r'\bfuture\b', prefer_dates_from)
|
||
|
and not re.search(r'\bago\b', date_string)
|
||
|
):
|
||
|
date = now + td
|
||
|
else:
|
||
|
date = now - td
|
||
|
return date, period
|
||
|
|
||
|
def get_kwargs(self, date_string):
|
||
|
m = PATTERN.findall(date_string)
|
||
|
if not m:
|
||
|
return {}
|
||
|
|
||
|
kwargs = {}
|
||
|
for num, unit in m:
|
||
|
kwargs[unit + 's'] = float(num.replace(",", "."))
|
||
|
if 'decades' in kwargs:
|
||
|
kwargs['years'] = 10 * kwargs['decades'] + kwargs.get('years', 0)
|
||
|
del kwargs['decades']
|
||
|
return kwargs
|
||
|
|
||
|
def get_date_data(self, date_string, settings=None):
|
||
|
from dateparser.date import DateData
|
||
|
|
||
|
date, period = self.parse(date_string, settings)
|
||
|
return DateData(date_obj=date, period=period)
|
||
|
|
||
|
|
||
|
freshness_date_parser = FreshnessDateDataParser()
|