diff --git a/bio2zarr/core.py b/bio2zarr/core.py
index 8b07ac5..79381ab 100644
--- a/bio2zarr/core.py
+++ b/bio2zarr/core.py
@@ -3,6 +3,8 @@
 import dataclasses
 import logging
 import multiprocessing
+import os
+import os.path
 import threading
 import time
 
@@ -45,6 +47,26 @@ def chunk_aligned_slices(z, n, max_chunks=None):
     return slices
 
 
+def du(path):
+    """
+    Return the total bytes stored at this path.
+
+    This is the apparent size, in the spirit of ``du -sb``: the size of
+    ``path`` itself plus the size of every directory entry and file found
+    beneath it. Symbolic links inside the tree are counted as links (they
+    are neither followed nor descended into), so a dangling link does not
+    raise an error.
+    """
+    total = os.path.getsize(path)
+    # pathlib walk method doesn't exist until 3.12 :(
+    for root, dirs, files in os.walk(path):
+        for name in dirs + files:
+            # lstat so that symlinks are sized, not dereferenced
+            total += os.lstat(os.path.join(root, name)).st_size
+    logger.debug(f"du({path}) = {total}")
+    return total
+
+
 class SynchronousExecutor(cf.Executor):
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
index e7288f8..902aa67 100644
--- a/bio2zarr/vcf.py
+++ b/bio2zarr/vcf.py
@@ -6,6 +6,7 @@
 import logging
 import math
 import os
+import os.path
 import pathlib
 import pickle
 import shutil
@@ -1509,14 +1510,12 @@ class VcfZarr:
     def __init__(self, path):
         if not (path / ".zmetadata").exists():
             raise ValueError("Not in VcfZarr format")  # NEEDS TEST
+        self.path = path
         self.root = zarr.open(path, mode="r")
 
-    def __repr__(self):
-        return repr(self.root)  # NEEDS TEST
-
     def summary_table(self):
         data = []
-        arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()]
+        arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
         arrays.sort(key=lambda x: x[0])
         for stored, array in reversed(arrays):
             d = {
diff --git a/tests/test_core.py b/tests/test_core.py
index 21731c0..bdb1e9f 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -179,3 +179,19 @@ def test_5_chunk_1(self, n, expected):
         z = zarr.array(np.arange(5), chunks=1, dtype=int)
         result = core.chunk_aligned_slices(z, n)
         assert result == expected
+
+
+@pytest.mark.parametrize(
+    ("path", "expected"),
+    [
+        # NOTE: this data was generated using du -sb on a Linux system.
+        # It *might* work in CI, but it may well not either, as it's
+        # probably dependent on a whole bunch of things. Expect to fail
+        # at some point.
+        ("tests/data", 4630726),
+        ("tests/data/vcf", 4618589),
+        ("tests/data/vcf/sample.vcf.gz", 1089),
+    ],
+)
+def test_du(path, expected):
+    assert core.du(path) == expected