diff --git a/docs/release.rst b/docs/release.rst index 74207d92d1..18e1fd60dc 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -29,6 +29,8 @@ Enhancements Maintenance ~~~~~~~~~~~ +* ``getsize`` now returns the total size of all nested arrays. + By :user:`Ben Jeffery ` :issue:`253`. * Dropped support for Python 3.10. By :user:`David Stansby ` (:issue:`2344`). * Removed testing for compatibility with the ``bsddb3`` package. diff --git a/zarr/storage.py b/zarr/storage.py index 7e5e966bc1..f7da0210a3 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -30,7 +30,6 @@ from collections import OrderedDict from collections.abc import MutableMapping from functools import lru_cache -from os import scandir from pickle import PicklingError from threading import Lock, RLock from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any @@ -270,9 +269,15 @@ def _getsize(store: BaseStore, path: Path = None) -> int: # also include zarr.json? # members += ['zarr.json'] else: - members = listdir(store, path) - prefix = _path_to_prefix(path) - members = [prefix + k for k in members] + to_visit = [path] + members = [] + while to_visit: + print(to_visit) + current_path = to_visit.pop() + current_members = listdir(store, current_path) + prefix = _path_to_prefix(current_path) + members.extend([prefix + k for k in current_members]) + to_visit.extend([prefix + k for k in current_members]) for k in members: try: v = store[k] @@ -976,8 +981,12 @@ def getsize(self, path: Path = None): elif isinstance(value, self.cls): # total size for directory size = 0 - for v in value.values(): - if not isinstance(v, self.cls): + to_visit = list(value.values()) + while to_visit: + v = to_visit.pop() + if isinstance(v, self.cls): + to_visit.extend(v.values()) + else: size += buffer_size(v) return size @@ -1274,9 +1283,13 @@ def getsize(self, path=None): return os.path.getsize(fs_path) elif os.path.isdir(fs_path): size = 0 - for child in scandir(fs_path): - if child.is_file(): - size += child.stat().st_size + for root, _, files in os.walk(fs_path): + # Include the size of the directory itself, as this can be substantial + # for directories with many files. + size += os.path.getsize(root) + for file in files: + file_path = os.path.join(root, file) + size += os.path.getsize(file_path) return size else: return 0 @@ -1920,29 +1933,19 @@ def listdir(self, path=None): def getsize(self, path=None): path = normalize_storage_path(path) with self.mutex: - children = self.listdir(path) - if children: - size = 0 - for child in children: - if path: - name = path + "/" + child - else: - name = child - try: - info = self.zf.getinfo(name) - except KeyError: - pass - else: - size += info.compress_size - return size - elif path: + to_visit = [path] if path else self.listdir(path) + total_size = 0 + while to_visit: + current_path = to_visit.pop() try: - info = self.zf.getinfo(path) - return info.compress_size + info = self.zf.getinfo(current_path) + total_size += info.compress_size except KeyError: - return 0 - else: - return 0 + children = self.listdir(current_path) + for child in children: + full_path = current_path + "/" + child if current_path else child + to_visit.append(full_path) + return total_size def clear(self): if self.mode == "r": @@ -2528,6 +2531,8 @@ def listdir(self, path: Path = None): return listing def getsize(self, path=None) -> int: + print("WYF") + print(self._store, path) return getsize(self._store, path=path) def _pop_value(self): @@ -2796,10 +2801,9 @@ def getsize(self, path=None): size = self.cursor.execute( """ SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr - WHERE k LIKE (? || "%") AND - 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + WHERE k LIKE (? || "%") """, - (path, path), + (path,), ) for (s,) in size: return s diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 4729dc01b6..4e72d11006 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -100,6 +100,7 @@ class TestArray: write_empty_chunks = True read_only = False storage_transformers: Tuple[Any, ...] = () + group_size = 0 def create_store(self) -> BaseStore: return KVStore(dict()) @@ -229,7 +230,7 @@ def test_nbytes_stored(self): buffer_size(v) for k, v in z.store.items() if k != "zarr.json" ) else: - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored z[:] = 42 if self.version == 3: @@ -237,7 +238,7 @@ def test_nbytes_stored(self): buffer_size(v) for k, v in z.store.items() if k != "zarr.json" ) else: - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored # mess with store @@ -1677,6 +1678,8 @@ def test_nbytes_stored(self): class TestArrayWithDirectoryStore(TestArray): + group_size = 4096 + def create_store(self): path = mkdtemp() atexit.register(shutil.rmtree, path) @@ -1686,10 +1689,10 @@ def create_store(self): def test_nbytes_stored(self): # dict as store z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + self.group_size assert expect_nbytes_stored == z.nbytes_stored @@ -2028,6 +2031,8 @@ def expected(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestArrayWithN5FSStore(TestArrayWithN5Store): + group_size = 0 + def create_store(self): path = mkdtemp() atexit.register(shutil.rmtree, path) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index da690f5959..31a92db8e2 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -366,19 +366,10 @@ def test_hierarchy(self): # test getsize (optional) if hasattr(store, "getsize"): - # TODO: proper behavior of getsize? - # v3 returns size of all nested arrays, not just the - # size of the arrays in the current folder. - if self.version == 2: - assert 6 == store.getsize() - else: - assert 15 == store.getsize() + assert 15 == store.getsize() assert 3 == store.getsize("a") assert 3 == store.getsize("b") - if self.version == 2: - assert 3 == store.getsize("c") - else: - assert 9 == store.getsize("c") + assert 9 == store.getsize("c") assert 3 == store.getsize("c/d") assert 6 == store.getsize("c/e") assert 3 == store.getsize("c/e/f") @@ -2256,7 +2247,7 @@ def test_getsize(): store["foo"] = b"aaa" store["bar"] = b"bbbb" store["baz/quux"] = b"ccccc" - assert 7 == getsize(store) + assert 12 == getsize(store) assert 5 == getsize(store, "baz") store = KVStore(dict())