-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DAS-1699 rechunk the zarr store (#39)
- Loading branch information
1 parent
7cd1232
commit e230636
Showing
12 changed files
with
347 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
"""Code that will rechunk an existing zarr store.""" | ||
from __future__ import annotations | ||
|
||
from harmony_netcdf_to_zarr.convert import compute_chunksize | ||
|
||
from fsspec.mapping import FSMap | ||
from time import time | ||
from rechunker import rechunk | ||
from typing import List, Dict, TYPE_CHECKING | ||
if TYPE_CHECKING: | ||
from harmony_netcdf_to_zarr.adapter import NetCDFToZarrAdapter | ||
from zarr import open_consolidated, consolidate_metadata, Group as zarrGroup | ||
import xarray as xr | ||
|
||
|
||
def rechunk_zarr(zarr_root: str, chunked_root: str, adapter: NetCDFToZarrAdapter) -> str: | ||
"""Rechunks the zarr store found at zarr_root location. | ||
Rechunks the store found at zarr_root, outputing a new rechunked store into | ||
chunked root. Finally deleting the input zarr_root store. | ||
""" | ||
temp_root = zarr_root.replace('.zarr', '_tmp.zarr') | ||
|
||
zarr_store = adapter.s3.get_mapper(root=zarr_root, | ||
check=False, | ||
create=False) | ||
|
||
zarr_temp = adapter.s3.get_mapper(root=temp_root, check=False, create=True) | ||
zarr_target = adapter.s3.get_mapper(root=chunked_root, | ||
check=False, | ||
create=True) | ||
|
||
try: | ||
adapter.s3.rm(temp_root, recursive=True) | ||
except FileNotFoundError: | ||
adapter.logger.info(f'Nothing to clean in {temp_root}') | ||
|
||
try: | ||
adapter.s3.rm(chunked_root, recursive=True) | ||
except FileNotFoundError: | ||
adapter.logger.info(f'Nothing to clean in {chunked_root}') | ||
|
||
t1 = time() | ||
rechunk_zarr_store(zarr_store, zarr_target, zarr_temp) | ||
t2 = time() | ||
adapter.logger.info(f'Function rechunk_zarr_store executed in {(t2-t1):.4f}s') | ||
|
||
adapter.s3.rm(zarr_root, recursive=True) | ||
adapter.s3.rm(temp_root, recursive=True) | ||
|
||
|
||
def rechunk_zarr_store(zarr_store: FSMap, zarr_target: FSMap, | ||
zarr_temp: FSMap) -> str: | ||
"""Rechunks a zarr store that was created by the mosaic_to_zarr processes. | ||
This is specific to tuning output zarr store variables to the chunksizes | ||
given by compute_chunksize. | ||
""" | ||
target_chunks = get_target_chunks(zarr_store) | ||
opened_zarr_store = open_consolidated(zarr_store, mode='r') | ||
# This is a best guess on trial and error with an 8Gi Memory container | ||
max_memory = '1GB' | ||
array_plan = rechunk(opened_zarr_store, | ||
target_chunks, | ||
max_memory, | ||
zarr_target, | ||
temp_store=zarr_temp) | ||
array_plan.execute() | ||
consolidate_metadata(zarr_target) | ||
|
||
|
||
def get_target_chunks(zarr_store: FSMap) -> Dict: | ||
"""Determine the chuncking strategy for the input zarr store's variables. | ||
Iterate through the zarr store, computing new chunksizes for all variables | ||
that are not coordinates or coordinate bounds. Return a dictionary of the | ||
variable and new chunksizes to be used in the rechunker. | ||
""" | ||
zarr_groups = _groups_from_zarr(zarr_store) | ||
|
||
target_chunks = {} | ||
# open with xr for each group? | ||
for group in zarr_groups: | ||
group_dataset = xr.open_dataset(zarr_store, | ||
group=group, | ||
mode='r', | ||
engine='zarr') | ||
for variable, varinfo in group_dataset.data_vars.items(): | ||
if not _bounds(variable): | ||
target_chunks[f'{group}/{variable}'] = compute_chunksize( | ||
varinfo.shape, varinfo.dtype) | ||
else: | ||
target_chunks[f'{group}/{variable}'] = None | ||
|
||
for variable in group_dataset.coords.keys(): | ||
target_chunks[f'{group}/{variable}'] = None | ||
|
||
return target_chunks | ||
|
||
|
||
def _bounds(variable: str) -> bool: | ||
"""Is a variable a bounds type variable. | ||
Coordinates and coordinate bounds are not chunked in a zarr store, this is | ||
just a convenience function to determine the coordinate bounds variables by | ||
name. | ||
""" | ||
return variable.endswith(('_bnds', '_bounds')) | ||
|
||
|
||
def _groups_from_zarr(zarr_root: str) -> List[str]: | ||
"""Get the name of all groups in the zarr_store.""" | ||
original_zarr = open_consolidated(zarr_root, mode='r') | ||
groups = [''] | ||
|
||
def is_group(name: str) -> None: | ||
"""Create function to test if the item is a group or not.""" | ||
if isinstance(original_zarr.get(name), zarrGroup): | ||
groups.append(name) | ||
|
||
original_zarr.visit(is_group) | ||
|
||
return groups |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
boto3 ~= 1.24 | ||
dask == 2023.1.1 | ||
harmony-service-lib ~= 1.0.22 | ||
netCDF4 ~= 1.6.1 | ||
numpy ~= 1.23.3 | ||
numpy ~= 1.24 | ||
python-dateutil ~= 2.8.2 | ||
rechunker == 0.5.0 | ||
s3fs ~= 0.4.0 | ||
zarr ~= 2.13.3 | ||
xarray == 2022.9.0 | ||
zarr == 2.13.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
autopep8 ~= 1.5 | ||
coverage ~= 6.3.1 | ||
flake8 ~= 3.8 | ||
ipython ~= 7.17 | ||
autopep8 ~= 2.0.1 | ||
coverage ~= 7.1.0 | ||
flake8 ~= 6.0 | ||
ipython ~= 7.32 | ||
isort ~= 5.5 | ||
moto ~= 1.3 | ||
pytest ~= 5.4 | ||
python-dotenv ~=0.12 | ||
safety ~= 1.8 | ||
pytest ~= 7.2.1 | ||
python-dotenv ~=0.21 | ||
safety ~= 1.10 |
Oops, something went wrong.