Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add caching for timezone offsets, significantly speeds up import #1250

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ docs/_build

# Other
raw_data
*.pkl
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst
include dateparser_data/dateparser_tz_cache.pkl
include dateparser_data/settings.py
include requirements.txt

Expand Down
42 changes: 12 additions & 30 deletions dateparser/timezone_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pickle

Check warning on line 1 in dateparser/timezone_parser.py

View check run for this annotation

Codecov / codecov/patch

dateparser/timezone_parser.py#L1

Added line #L1 was not covered by tests
from datetime import datetime, timedelta, timezone, tzinfo
from pathlib import Path

Check warning on line 3 in dateparser/timezone_parser.py

View check run for this annotation

Codecov / codecov/patch

dateparser/timezone_parser.py#L3

Added line #L3 was not covered by tests

import regex as re

from .timezones import timezone_info_list


class StaticTzInfo(tzinfo):
def __init__(self, name, offset):
Expand Down Expand Up @@ -54,38 +54,20 @@
return datetime_obj - datetime_tz_offset + local_tz_offset


def build_tz_offsets(search_regex_parts):
    """Generate ``(name, info)`` pairs for every entry in ``timezone_info_list``.

    ``info`` is a dict with a case-insensitive compiled ``"regex"`` (the
    pattern template filled in with the timezone name) and an ``"offset"``
    timedelta built from the entry's offset in seconds.

    Side effect: each timezone name, and each alternate spelling derived from
    the ``"replace"`` pairs, is appended to ``search_regex_parts`` right
    before the corresponding pair is yielded.
    """

    def make_entry(tz_obj, template, old="", new=""):
        # With the default empty ``old``/``new`` the substitution leaves the
        # filled-in template unchanged.
        source = re.sub(old, new, template % tz_obj[0])
        return (
            tz_obj[0],
            {
                "regex": re.compile(source, re.IGNORECASE),
                "offset": timedelta(seconds=tz_obj[1]),
            },
        )

    # NOTE(review): nesting reconstructed from whitespace-stripped text;
    # confirm the alternates loop sits inside the per-timezone loop.
    for tz_info in timezone_info_list:
        for template in tz_info["regex_patterns"]:
            for tz_obj in tz_info["timezones"]:
                search_regex_parts.append(tz_obj[0])
                yield make_entry(tz_obj, template)

                # Alternate spellings; the yielded name stays the original.
                for old, new in tz_info.get("replace", []):
                    alias = re.sub(old, new, tz_obj[0])
                    search_regex_parts.append(alias)
                    yield make_entry(tz_obj, template, old=old, new=new)


def get_local_tz_offset():
    """Return the difference between local time and UTC as a timedelta.

    The two clock reads happen a few microseconds apart, so the seconds
    component is rounded to the nearest ten and microseconds are dropped.
    """
    local_now = datetime.now()
    utc_now = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    drift = local_now - utc_now
    return timedelta(days=drift.days, seconds=round(drift.seconds, -1))


# Import-time setup of the module-level timezone lookup tables.
# build_tz_offsets() appends timezone name fragments to _search_regex_parts
# while yielding the per-timezone entries collected into _tz_offsets.
_search_regex_parts = []
_tz_offsets = list(build_tz_offsets(_search_regex_parts))
# One alternation over all collected names, compiled twice: case-sensitive
# and case-insensitive.
_search_regex = re.compile("|".join(_search_regex_parts))
_search_regex_ignorecase = re.compile("|".join(_search_regex_parts), re.IGNORECASE)
# Local UTC offset, computed once when the module is imported.
local_tz_offset = get_local_tz_offset()

# Load _tz_offsets, _search_regex and _search_regex_ignorecase from the pickle
# cache "dateparser_data/dateparser_tz_cache.pkl", resolved against the
# directory that contains this module's package directory.
# NOTE(review): pickle.load can execute arbitrary code from a tampered file;
# this presumably relies on the cache being a trusted artifact shipped with
# the package -- confirm.
with open(

Check warning on line 65 in dateparser/timezone_parser.py

View check run for this annotation

Codecov / codecov/patch

dateparser/timezone_parser.py#L65

Added line #L65 was not covered by tests
Path(__file__).parent.parent.joinpath("dateparser_data", "dateparser_tz_cache.pkl"),
mode="rb",
) as file:
(

Check warning on line 69 in dateparser/timezone_parser.py

View check run for this annotation

Codecov / codecov/patch

dateparser/timezone_parser.py#L69

Added line #L69 was not covered by tests
_tz_offsets,
_search_regex,
_search_regex_ignorecase,
) = pickle.load(file)
46 changes: 46 additions & 0 deletions dateparser/timezones.py → dateparser_scripts/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
# As well as http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
# As well as https://github.com/scrapinghub/dateparser/pull/4
# As well as http://en.wikipedia.org/wiki/List_of_UTC_time_offsets
import pickle
import re
import sys
from datetime import timedelta
from pathlib import Path

timezone_info_list = [
{
Expand Down Expand Up @@ -467,3 +472,44 @@
],
},
]


def build_tz_offsets(search_regex_parts):
    """Yield one ``(timezone_name, details)`` pair per timezone/pattern combo.

    ``details`` maps ``"regex"`` to a case-insensitive compiled pattern and
    ``"offset"`` to a timedelta built from the timezone's offset in seconds.
    Timezone names (and their ``"replace"``-derived variants) are appended to
    ``search_regex_parts`` as the generator advances.
    """

    def entry_for(tz_obj, pattern_template, find="", substitute=""):
        tz_name = tz_obj[0]
        # Empty ``find``/``substitute`` (the defaults) keep the text as is.
        expanded = re.sub(find, substitute, pattern_template % tz_name)
        details = {
            "regex": re.compile(expanded, re.IGNORECASE),
            "offset": timedelta(seconds=tz_obj[1]),
        }
        return tz_name, details

    # NOTE(review): nesting reconstructed from whitespace-stripped text;
    # confirm the alternates loop sits inside the per-timezone loop.
    for tz_info in timezone_info_list:
        for pattern_template in tz_info["regex_patterns"]:
            for tz_obj in tz_info["timezones"]:
                search_regex_parts.append(tz_obj[0])
                yield entry_for(tz_obj, pattern_template)

                # Variant spellings produced by the "replace" pairs.
                for find, substitute in tz_info.get("replace", []):
                    variant = re.sub(find, substitute, tz_obj[0])
                    search_regex_parts.append(variant)
                    yield entry_for(
                        tz_obj, pattern_template, repl=find, replw=substitute
                    ) if False else entry_for(
                        tz_obj, pattern_template, find=find, substitute=substitute
                    )



def main():
    """Build the timezone lookup tables and pickle them into the cache file.

    The pickle holds a 3-tuple: the list of entries produced by
    ``build_tz_offsets``, the case-sensitive search regex, and the
    case-insensitive search regex.
    """
    search_regex_parts = []
    tz_offsets = list(build_tz_offsets(search_regex_parts))
    alternation = "|".join(search_regex_parts)
    search_regex = re.compile(alternation)
    search_regex_ignorecase = re.compile(alternation, re.IGNORECASE)

    # Anchor the output to this script's location rather than the current
    # working directory, so the cache lands in <project>/dateparser_data no
    # matter where the script is launched from.
    cache_path = (
        Path(__file__).resolve().parent.parent
        / "dateparser_data"
        / "dateparser_tz_cache.pkl"
    )
    with open(cache_path, mode="wb") as file:
        pickle.dump(
            (tz_offsets, search_regex, search_regex_ignorecase),
            file,
        )


if __name__ == "__main__":
    main()
16 changes: 16 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
import re
import subprocess

from setuptools import find_packages, setup
from setuptools.command import develop, install

# Read the package version out of dateparser/__init__.py. The context manager
# closes the file handle deterministically instead of leaking it.
with open("dateparser/__init__.py") as init_file:
    __version__ = re.search(
        r"__version__.*\s*=\s*[\"]([^\"]+)[\"]", init_file.read()
    ).group(1)


class PostDevelop(develop.develop):
    """``develop`` command that generates the timezone cache before running."""

    def run(self):
        """Run the cache generator script, then the stock ``develop`` step.

        Raises:
            subprocess.CalledProcessError: if the generator script exits with
                a non-zero status.
        """
        import sys  # local import: ``sys`` is not among this file's imports

        # Use the interpreter that is executing setup.py and pass an argument
        # list (no shell). A shell string such as "python 3 <script>" hands
        # "3" to the interpreter as the script name, so the cache is never
        # built. check_call makes a failed build abort instead of being
        # silently ignored.
        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
        develop.develop.run(self)


class PostInstall(install.install):
    """``install`` command that generates the timezone cache before running."""

    def run(self):
        """Run the cache generator script, then the stock ``install`` step.

        Raises:
            subprocess.CalledProcessError: if the generator script exits with
                a non-zero status.
        """
        import sys  # local import: ``sys`` is not among this file's imports

        # Use the interpreter that is executing setup.py rather than whatever
        # "python3" resolves to on PATH (it may be absent, or belong to a
        # different environment). An argument list avoids the shell, and
        # check_call makes a failed build abort instead of being ignored.
        subprocess.check_call([sys.executable, "dateparser_scripts/timezones.py"])
        install.install.run(self)


introduction = re.sub(
r":members:.+|..\sautomodule::.+|:class:|:func:|:ref:",
"",
Expand Down Expand Up @@ -45,6 +60,7 @@
"fasttext": ["fasttext"],
"langdetect": ["langdetect"],
},
cmdclass={"develop": PostDevelop, "install": PostInstall},
license="BSD",
zip_safe=False,
keywords="dateparser",
Expand Down
Loading