From 86d737fabb685b81fc4d851bde195f486fd24b0c Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 24 Apr 2024 17:29:52 +0100 Subject: [PATCH 1/3] Modify getsize to return total size, not just the top level --- docs/release.rst | 2 ++ zarr/storage.py | 70 ++++++++++++++++++++------------------ zarr/tests/test_core.py | 13 ++++--- zarr/tests/test_storage.py | 15 ++------ 4 files changed, 51 insertions(+), 49 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index a62d6a653c..0a86e3a153 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -28,6 +28,8 @@ Enhancements Maintenance ~~~~~~~~~~~ +* ``getsize`` now returns the total size of all nested arrays. + By :user:`Ben Jeffery ` :issue:`253`. Deprecations ~~~~~~~~~~~~ diff --git a/zarr/storage.py b/zarr/storage.py index f412870f75..e1a6a4b79e 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -30,7 +30,6 @@ from collections import OrderedDict from collections.abc import MutableMapping from functools import lru_cache -from os import scandir from pickle import PicklingError from threading import Lock, RLock from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any @@ -270,9 +269,15 @@ def _getsize(store: BaseStore, path: Path = None) -> int: # also include zarr.json? # members += ['zarr.json'] else: - members = listdir(store, path) - prefix = _path_to_prefix(path) - members = [prefix + k for k in members] + to_visit = [path] + members = [] + while to_visit: + print(to_visit) + current_path = to_visit.pop() + current_members = listdir(store, current_path) + prefix = _path_to_prefix(current_path) + members.extend([prefix + k for k in current_members]) + to_visit.extend([prefix + k for k in current_members]) for k in members: try: v = store[k] @@ -976,8 +981,12 @@ def getsize(self, path: Path = None): elif isinstance(value, self.cls): # total size for directory size = 0 - for v in value.values(): - if not isinstance(v, self.cls): + to_visit = list(value.values()) + while to_visit: + v = to_visit.pop() + if isinstance(v, self.cls): + to_visit.extend(v.values()) + else: size += buffer_size(v) return size @@ -1274,9 +1283,13 @@ def getsize(self, path=None): return os.path.getsize(fs_path) elif os.path.isdir(fs_path): size = 0 - for child in scandir(fs_path): - if child.is_file(): - size += child.stat().st_size + for root, _, files in os.walk(fs_path): + # Include the size of the directory itself, as this can be substantial + # for directories with many files. + size += os.path.getsize(root) + for file in files: + file_path = os.path.join(root, file) + size += os.path.getsize(file_path) return size else: return 0 @@ -1921,29 +1934,19 @@ def listdir(self, path=None): def getsize(self, path=None): path = normalize_storage_path(path) with self.mutex: - children = self.listdir(path) - if children: - size = 0 - for child in children: - if path: - name = path + "/" + child - else: - name = child - try: - info = self.zf.getinfo(name) - except KeyError: - pass - else: - size += info.compress_size - return size - elif path: + to_visit = [path] if path else self.listdir(path) + total_size = 0 + while to_visit: + current_path = to_visit.pop() try: - info = self.zf.getinfo(path) - return info.compress_size + info = self.zf.getinfo(current_path) + total_size += info.compress_size except KeyError: - return 0 - else: - return 0 + children = self.listdir(current_path) + for child in children: + full_path = current_path + "/" + child if current_path else child + to_visit.append(full_path) + return total_size def clear(self): if self.mode == "r": @@ -2527,6 +2530,8 @@ def listdir(self, path: Path = None): return listing def getsize(self, path=None) -> int: + print("WYF") + print(self._store, path) return getsize(self._store, path=path) def _pop_value(self): @@ -2795,10 +2800,9 @@ def getsize(self, path=None): size = self.cursor.execute( """ SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - WHERE k LIKE (? || "%") AND - 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + WHERE k LIKE (? || "%") """, - (path, path), + (path,), ) for (s,) in size: return s diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 4729dc01b6..bf9e706bf1 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3,6 +3,7 @@ import sys import pickle import shutil +import tempfile from typing import Any, Literal, Optional, Tuple, Union, Sequence import unittest from itertools import zip_longest @@ -100,6 +101,7 @@ class TestArray: write_empty_chunks = True read_only = False storage_transformers: Tuple[Any, ...] = () + group_size = 0 def create_store(self) -> BaseStore: return KVStore(dict()) @@ -229,7 +231,7 @@ def test_nbytes_stored(self): buffer_size(v) for k, v in z.store.items() if k != "zarr.json" ) else: - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored z[:] = 42 if self.version == 3: @@ -237,7 +239,7 @@ def test_nbytes_stored(self): buffer_size(v) for k, v in z.store.items() if k != "zarr.json" ) else: - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored # mess with store @@ -1677,6 +1679,8 @@ def test_nbytes_stored(self): class TestArrayWithDirectoryStore(TestArray): + group_size = 4096 + def create_store(self): path = mkdtemp() atexit.register(shutil.rmtree, path) @@ -1686,10 +1690,10 @@ def create_store(self): def test_nbytes_stored(self): # dict as store z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored @@ -2028,6 +2032,7 @@ def expected(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestArrayWithN5FSStore(TestArrayWithN5Store): + group_size = 0 def create_store(self): path = mkdtemp() atexit.register(shutil.rmtree, path) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index da690f5959..b6bcbdf3d7 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -366,19 +366,10 @@ def test_hierarchy(self): # test getsize (optional) if hasattr(store, "getsize"): - # TODO: proper behavior of getsize? - # v3 returns size of all nested arrays, not just the - # size of the arrays in the current folder. - if self.version == 2: - assert 6 == store.getsize() - else: - assert 15 == store.getsize() + assert 15 == store.getsize() assert 3 == store.getsize("a") assert 3 == store.getsize("b") - if self.version == 2: - assert 3 == store.getsize("c") - else: - assert 9 == store.getsize("c") + assert 3 == store.getsize("c") assert 3 == store.getsize("c/d") assert 6 == store.getsize("c/e") assert 3 == store.getsize("c/e/f") @@ -2256,7 +2247,7 @@ def test_getsize(): store["foo"] = b"aaa" store["bar"] = b"bbbb" store["baz/quux"] = b"ccccc" - assert 7 == getsize(store) + assert 12 == getsize(store) assert 5 == getsize(store, "baz") store = KVStore(dict()) From 44e12b5eb1ffb6bbe3dd1506a5bbc7733c4ed240 Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 18 Oct 2024 17:51:58 +0100 Subject: [PATCH 2/3] Pre-commit fixes --- zarr/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index bf9e706bf1..4e72d11006 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3,7 +3,6 @@ import sys import pickle import shutil -import tempfile from typing import Any, Literal, Optional, Tuple, Union, Sequence import unittest from itertools import zip_longest @@ -2033,6 +2032,7 @@ def expected(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestArrayWithN5FSStore(TestArrayWithN5Store): group_size = 0 + def create_store(self): path = mkdtemp() atexit.register(shutil.rmtree, path) From 7b01c57d07f9d87e7648448b83e9f55bf083b52e Mon Sep 17 00:00:00 2001 From: David Stansby Date: Fri, 18 Oct 2024 17:59:04 +0100 Subject: [PATCH 3/3] Fix test --- zarr/tests/test_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index b6bcbdf3d7..31a92db8e2 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -369,7 +369,7 @@ def test_hierarchy(self): assert 15 == store.getsize() assert 3 == store.getsize("a") assert 3 == store.getsize("b") - assert 3 == store.getsize("c") + assert 9 == store.getsize("c") assert 3 == store.getsize("c/d") assert 6 == store.getsize("c/e") assert 3 == store.getsize("c/e/f")