feat: add caching for timezone offsets, significantly speeds up import
This is different from PR #1181: that PR only makes the import itself faster
but still incurs the full cost on first use, whereas this change leverages an
optional on-disk cache.

Closes #533
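
As a rough illustration of the intended effect (a sketch, not part of the change itself; the environment variable names come from the diff below, and the timing is only illustrative):

    import os
    import time

    # Opt in to the on-disk tz cache before importing dateparser. Only the
    # variable's presence matters; its value is not inspected.
    os.environ["DATEPARSER_TZ_CACHE"] = "1"
    # Optional: choose where the pickle lives instead of the default
    # ".dateparser_tz_cache.pkl" in the working directory.
    os.environ["DATEPARSER_TZ_CACHE_PATH"] = "/tmp/dateparser_tz_cache.pkl"

    start = time.perf_counter()
    import dateparser  # first run builds and writes the cache; later runs load the pickle

    print(f"import + tz setup took {time.perf_counter() - start:.3f}s")
    print(dateparser.parse("Jan 30 2025 10:00 PST"))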
tobymao committed Jan 30, 2025
1 parent 47acb88 commit 0b3a522
Showing 5 changed files with 86 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -52,3 +52,4 @@ docs/_build

# Other
raw_data
*.pkl
3 changes: 1 addition & 2 deletions dateparser/data/__init__.py
@@ -1,3 +1,2 @@
from dateparser.data import date_translation_data

from .languages_info import language_locale_dict, language_order
from .languages_info import language_order, language_locale_dict
48 changes: 45 additions & 3 deletions dateparser/timezone_parser.py
@@ -1,6 +1,9 @@
from datetime import datetime, timedelta, timezone, tzinfo

import os
import pickle
import regex as re
from pathlib import Path

from .timezones import timezone_info_list

@@ -85,7 +88,46 @@ def get_local_tz_offset():


_search_regex_parts = []
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
_search_regex = re.compile("|".join(_search_regex_parts))
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
local_tz_offset = get_local_tz_offset()

DEFAULT_CACHE_PATH = ".dateparser_tz_cache.pkl"

_tz_offsets = None
_search_regex = None
_search_regex_ignorecase = None


def _load_offsets(cache=False):
    from dateparser import __version__

    global _tz_offsets, _search_regex, _search_regex_ignorecase

    if cache:
        path = Path(os.environ.get("DATEPARSER_TZ_CACHE_PATH", DEFAULT_CACHE_PATH))
        path.parents[0].mkdir(parents=True, exist_ok=True)

        try:
            with open(path, mode="rb") as file:
                version, _tz_offsets, _search_regex, _search_regex_ignorecase = pickle.load(file)

                if version == __version__:
                    return
        except FileNotFoundError:
            pass
        except (ValueError, TypeError) as ex:
            from .utils import get_logger
            get_logger().error("Error loading tz cache: %s", ex)

    _tz_offsets = list(build_tz_offsets(_search_regex_parts))
    _search_regex = re.compile("|".join(_search_regex_parts))
    _search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)

    if cache:
        with open(path, mode="wb") as file:
            pickle.dump(
                (__version__, _tz_offsets, _search_regex, _search_regex_ignorecase),
                file,
            )


_load_offsets("DATEPARSER_TZ_CACHE" in os.environ)
7 changes: 7 additions & 0 deletions docs/settings.rst
@@ -225,3 +225,10 @@ Dateparser in the future. For example, to ignore relative times:

``CACHE_SIZE_LIMIT``: limits the size of the caches that store data for already processed dates.
Defaults to ``1000``; set it to ``0`` to turn off the limit.
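
For example, to turn the limit off (a minimal sketch; it assumes the usual ``settings`` argument of ``dateparser.parse``)::

    import dateparser

    # 0 removes the size limit on the internal caches of processed dates.
    dateparser.parse("12th June 2020", settings={"CACHE_SIZE_LIMIT": 0})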


Environment variables
+++++++++++++++++++++

``DATEPARSER_TZ_CACHE``: whether to cache tz offsets and the related search regexes. Enabling it speeds up the initialization time of dateparser. Defaults to ``False``.

``DATEPARSER_TZ_CACHE_PATH``: the path to use for the tz cache file. Defaults to ``.dateparser_tz_cache.pkl``.
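
For instance, a minimal sketch of enabling the cache from Python before dateparser is imported (only the variable's presence is checked, and the cached data is keyed to the installed dateparser version, so it is rebuilt automatically after an upgrade)::

    import os

    os.environ["DATEPARSER_TZ_CACHE"] = "1"
    os.environ["DATEPARSER_TZ_CACHE_PATH"] = "/tmp/dateparser_tz_cache.pkl"

    import dateparser  # tz offsets and regexes are now read from (or written to) the cache file
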
32 changes: 32 additions & 0 deletions tests/test_timezone_parser.py
@@ -1,5 +1,7 @@
import datetime as dt
import pickle
from datetime import datetime, timedelta
from pathlib import Path
from unittest import SkipTest
from unittest.mock import Mock, patch

@@ -240,3 +242,33 @@ def when_date_is_localized(self, given_date):
    def then_localized_date_is(self, expected_date, expected_tzname):
        self.assertEqual(self.localized_date.date(), expected_date.date())
        self.assertEqual(self.localized_date.tzname(), expected_tzname)


class TestOffsetCaching(BaseTestCase):
    def setUp(self):
        super().setUp()

        self.cache_file = Path(dateparser.timezone_parser.DEFAULT_CACHE_PATH)
        self.cache_file.unlink(missing_ok=True)

    def test_no_cache(self):
        dateparser.timezone_parser._load_offsets()
        self.assertFalse(self.cache_file.exists())

    def test_cache(self):
        dateparser.timezone_parser._tz_offsets = None
        dateparser.timezone_parser._load_offsets(True)
        self.assertTrue(self.cache_file.exists())
        self.assertTrue(dateparser.timezone_parser._tz_offsets)

        dateparser.timezone_parser._tz_offsets = None
        dateparser.timezone_parser._load_offsets(True)
        self.assertTrue(dateparser.timezone_parser._tz_offsets)

    def test_cache_error(self):
        with open(self.cache_file, "wb") as file:
            pickle.dump(1, file)
        self.assertTrue(self.cache_file.exists())
        dateparser.timezone_parser._tz_offsets = None
        dateparser.timezone_parser._load_offsets(True)
        self.assertTrue(dateparser.timezone_parser._tz_offsets)
