From f8bbf7a38687bcb3f2889cb4a82c385818da61aa Mon Sep 17 00:00:00 2001
From: Ilia <30232564+umnovI@users.noreply.github.com>
Date: Sat, 27 Jan 2024 09:44:14 +0300
Subject: [PATCH] Make `FileStorage` check the staleness of all cache files at a set interval. (#169)

* add timer between `AsyncFileStorage` staleness checks.

* make `AsyncFileStorage` always check the staleness of the current file.

* Update hishel/_async/_storages.py

Review.

* PR review fixes.

* Changelog

* docs

---
 CHANGELOG.md                  |  1 +
 docs/advanced/storages.md     | 12 ++++++++++++
 hishel/_async/_storages.py    | 20 +++++++++++++++++---
 hishel/_sync/_storages.py     | 20 +++++++++++++++++---
 tests/_async/test_storages.py | 28 +++++++++++++++++++++++++++-
 tests/_sync/test_storages.py  | 28 +++++++++++++++++++++++++++-
 6 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17625681..eac7b00b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Make `FileStorage` check the staleness of all cache files at a set interval. (#169)
 - Support AWS S3 storages. (#164)
 - Move `typing_extensions` from requirements.txt to pyproject.toml. (#161)
 
diff --git a/docs/advanced/storages.md b/docs/advanced/storages.md
index 7804cac4..ab6a269a 100644
--- a/docs/advanced/storages.md
+++ b/docs/advanced/storages.md
@@ -71,6 +71,18 @@ storage = hishel.FileStorage(ttl=3600)
 
 If you do this, `Hishel` will delete any stored responses whose ttl has expired. In this example, the stored responses were limited to 1 hour.
 
+#### Check ttl every
+
+To keep the cache from growing without bound, `Hishel` periodically removes stale responses, that is, responses whose ttl has expired and that should be deleted from the cache.
+By default it sweeps the cache every 60 seconds, but you can change the interval with the `check_ttl_every` argument.
+
+Example:
+
+```python
+import hishel
+
+storage = hishel.FileStorage(check_ttl_every=600)  # check every 600s (10m)
+```
 
 ### :material-memory: In-memory storage
 
diff --git a/hishel/_async/_storages.py b/hishel/_async/_storages.py
index 88a42447..7aae02fe 100644
--- a/hishel/_async/_storages.py
+++ b/hishel/_async/_storages.py
@@ -67,6 +67,9 @@ class AsyncFileStorage(AsyncBaseStorage):
     :type base_path: tp.Optional[Path], optional
     :param ttl: Specifies the maximum number of seconds that the response can be cached, defaults to None
     :type ttl: tp.Optional[tp.Union[int, float]], optional
+    :param check_ttl_every: How often (in seconds) to check the staleness of **all** cache files.
+        Takes effect only when `ttl` is set; defaults to 60
+    :type check_ttl_every: tp.Union[int, float]
     """
 
     def __init__(
@@ -74,6 +77,7 @@ def __init__(
         serializer: tp.Optional[BaseSerializer] = None,
         base_path: tp.Optional[Path] = None,
         ttl: tp.Optional[tp.Union[int, float]] = None,
+        check_ttl_every: tp.Union[int, float] = 60,
     ) -> None:
         super().__init__(serializer, ttl)
 
@@ -84,6 +88,8 @@ def __init__(
 
         self._file_manager = AsyncFileManager(is_binary=self._serializer.is_binary)
         self._lock = AsyncLock()
+        self._check_ttl_every = check_ttl_every
+        self._last_cleaned = time.monotonic()
 
     async def store(self, key: str, response: Response, request: Request, metadata: Metadata) -> None:
         """
@@ -105,7 +111,7 @@ async def store(self, key: str, response: Response, request: Request, metadata:
             str(response_path),
             self._serializer.dumps(response=response, request=request, metadata=metadata),
         )
-        await self._remove_expired_caches()
+        await self._remove_expired_caches(response_path)
 
     async def retrieve(self, key: str) -> tp.Optional[StoredResponse]:
         """
@@ -119,7 +125,7 @@ async def retrieve(self, key: str) -> tp.Optional[StoredResponse]:
 
         response_path = self._base_path / key
 
-        await self._remove_expired_caches()
+        await self._remove_expired_caches(response_path)
         async with self._lock:
             if response_path.exists():
                 return self._serializer.loads(await self._file_manager.read_from(str(response_path)))
@@ -128,10 +134,18 @@ async def retrieve(self, key: str) -> tp.Optional[StoredResponse]:
     async def aclose(self) -> None:  # pragma: no cover
         return
 
-    async def _remove_expired_caches(self) -> None:
+    async def _remove_expired_caches(self, response_path: Path) -> None:
         if self._ttl is None:
             return
 
+        if time.monotonic() - self._last_cleaned < self._check_ttl_every:
+            if response_path.is_file():
+                age = time.time() - response_path.stat().st_mtime
+                if age > self._ttl:
+                    response_path.unlink()
+            return
+
+        self._last_cleaned = time.monotonic()
         async with self._lock:
             for file in self._base_path.iterdir():
                 if file.is_file():
diff --git a/hishel/_sync/_storages.py b/hishel/_sync/_storages.py
index 8e02dec2..77fa56d9 100644
--- a/hishel/_sync/_storages.py
+++ b/hishel/_sync/_storages.py
@@ -67,6 +67,9 @@ class FileStorage(BaseStorage):
     :type base_path: tp.Optional[Path], optional
     :param ttl: Specifies the maximum number of seconds that the response can be cached, defaults to None
     :type ttl: tp.Optional[tp.Union[int, float]], optional
+    :param check_ttl_every: How often (in seconds) to check the staleness of **all** cache files.
+        Takes effect only when `ttl` is set; defaults to 60
+    :type check_ttl_every: tp.Union[int, float]
     """
 
     def __init__(
@@ -74,6 +77,7 @@ def __init__(
         serializer: tp.Optional[BaseSerializer] = None,
         base_path: tp.Optional[Path] = None,
         ttl: tp.Optional[tp.Union[int, float]] = None,
+        check_ttl_every: tp.Union[int, float] = 60,
     ) -> None:
         super().__init__(serializer, ttl)
 
@@ -84,6 +88,8 @@ def __init__(
 
         self._file_manager = FileManager(is_binary=self._serializer.is_binary)
         self._lock = Lock()
+        self._check_ttl_every = check_ttl_every
+        self._last_cleaned = time.monotonic()
 
     def store(self, key: str, response: Response, request: Request, metadata: Metadata) -> None:
         """
@@ -105,7 +111,7 @@ def store(self, key: str, response: Response, request: Request, metadata: Metada
             str(response_path),
             self._serializer.dumps(response=response, request=request, metadata=metadata),
         )
-        self._remove_expired_caches()
+        self._remove_expired_caches(response_path)
 
     def retrieve(self, key: str) -> tp.Optional[StoredResponse]:
         """
@@ -119,7 +125,7 @@ def retrieve(self, key: str) -> tp.Optional[StoredResponse]:
 
         response_path = self._base_path / key
 
-        self._remove_expired_caches()
+        self._remove_expired_caches(response_path)
         with self._lock:
             if response_path.exists():
                 return self._serializer.loads(self._file_manager.read_from(str(response_path)))
@@ -128,10 +134,18 @@ def retrieve(self, key: str) -> tp.Optional[StoredResponse]:
     def close(self) -> None:  # pragma: no cover
         return
 
-    def _remove_expired_caches(self) -> None:
+    def _remove_expired_caches(self, response_path: Path) -> None:
         if self._ttl is None:
             return
 
+        if time.monotonic() - self._last_cleaned < self._check_ttl_every:
+            if response_path.is_file():
+                age = time.time() - response_path.stat().st_mtime
+                if age > self._ttl:
+                    response_path.unlink()
+            return
+
+        self._last_cleaned = time.monotonic()
         with self._lock:
             for file in self._base_path.iterdir():
                 if file.is_file():
diff --git a/tests/_async/test_storages.py b/tests/_async/test_storages.py
index 716f315e..12a98777 100644
--- a/tests/_async/test_storages.py
+++ b/tests/_async/test_storages.py
@@ -117,7 +117,7 @@ async def test_inmemorystorage():
 
 @pytest.mark.asyncio
 async def test_filestorage_expired(use_temp_dir):
-    storage = AsyncFileStorage(ttl=0.1)
+    storage = AsyncFileStorage(ttl=0.2, check_ttl_every=0.1)
 
     first_request = Request(b"GET", "https://example.com")
     second_request = Request(b"GET", "https://anotherexample.com")
@@ -136,6 +136,32 @@ async def test_filestorage_expired(use_temp_dir):
     assert await storage.retrieve(first_key) is None
 
 
+@pytest.mark.asyncio
+async def test_filestorage_timer(use_temp_dir):
+    storage = AsyncFileStorage(ttl=0.2, check_ttl_every=0.2)
+
+    first_request = Request(b"GET", "https://example.com")
+    second_request = Request(b"GET", "https://anotherexample.com")
+
+    first_key = generate_key(first_request)
+    second_key = generate_key(second_request)
+
+    response = Response(200, headers=[], content=b"test")
+    await response.aread()
+
+    await storage.store(first_key, response=response, request=first_request, metadata=dummy_metadata)
+    assert await storage.retrieve(first_key) is not None
+    await asleep(0.1)
+    assert await storage.retrieve(first_key) is not None
+    await storage.store(second_key, response=response, request=second_request, metadata=dummy_metadata)
+    assert await storage.retrieve(second_key) is not None
+    await asleep(0.1)
+    assert await storage.retrieve(first_key) is None
+    assert await storage.retrieve(second_key) is not None
+    await asleep(0.1)
+    assert await storage.retrieve(second_key) is None
+
+
 @pytest.mark.asyncio
 async def test_redisstorage_expired():
     if await is_redis_down():  # pragma: no cover
diff --git a/tests/_sync/test_storages.py b/tests/_sync/test_storages.py
index 71e545bf..38ec5c36 100644
--- a/tests/_sync/test_storages.py
+++ b/tests/_sync/test_storages.py
@@ -117,7 +117,7 @@ def test_inmemorystorage():
 
 
 def test_filestorage_expired(use_temp_dir):
-    storage = FileStorage(ttl=0.1)
+    storage = FileStorage(ttl=0.2, check_ttl_every=0.1)
 
     first_request = Request(b"GET", "https://example.com")
     second_request = Request(b"GET", "https://anotherexample.com")
@@ -137,6 +137,32 @@ def test_filestorage_expired(use_temp_dir):
 
 
 
+def test_filestorage_timer(use_temp_dir):
+    storage = FileStorage(ttl=0.2, check_ttl_every=0.2)
+
+    first_request = Request(b"GET", "https://example.com")
+    second_request = Request(b"GET", "https://anotherexample.com")
+
+    first_key = generate_key(first_request)
+    second_key = generate_key(second_request)
+
+    response = Response(200, headers=[], content=b"test")
+    response.read()
+
+    storage.store(first_key, response=response, request=first_request, metadata=dummy_metadata)
+    assert storage.retrieve(first_key) is not None
+    sleep(0.1)
+    assert storage.retrieve(first_key) is not None
+    storage.store(second_key, response=response, request=second_request, metadata=dummy_metadata)
+    assert storage.retrieve(second_key) is not None
+    sleep(0.1)
+    assert storage.retrieve(first_key) is None
+    assert storage.retrieve(second_key) is not None
+    sleep(0.1)
+    assert storage.retrieve(second_key) is None
+
+
 def test_redisstorage_expired():
     if is_redis_down():  # pragma: no cover
         pytest.fail("Redis server was not found")
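
For context, here is a minimal usage sketch of the behavior this patch introduces. The `hishel.FileStorage` arguments come straight from the diff above; pairing the storage with `hishel.CacheClient` follows the project's documented pattern and is an assumption of this sketch, not something the patch itself shows:

```python
import hishel

# ttl: stored responses become stale after one hour.
# check_ttl_every: a full sweep over *all* cache files runs at most once
# every 10 minutes; between sweeps, store()/retrieve() only check the
# staleness of the single file they touch (see _remove_expired_caches).
storage = hishel.FileStorage(ttl=3600, check_ttl_every=600)

# Assumed pairing with hishel's httpx-based cache client.
client = hishel.CacheClient(storage=storage)

client.get("https://example.com")  # first call: response is written to disk
client.get("https://example.com")  # within the ttl: may be served from the file cache
```

The trade-off the patch encodes: a full sweep touches every file under `base_path`, so rate-limiting it with `check_ttl_every` keeps per-request work constant, while the added per-file check guarantees the file being accessed is never served past its ttl.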