From 9f817a2115a870d1e1d5fbc4afd7bc9e5755be2c Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 17 Jan 2025 15:19:25 +0000 Subject: [PATCH] Logging improvements --- bio2zarr/core.py | 7 ++++++- bio2zarr/vcf2zarr/vcz.py | 8 ++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bio2zarr/core.py b/bio2zarr/core.py index f783f18..b53cbe0 100644 --- a/bio2zarr/core.py +++ b/bio2zarr/core.py @@ -179,7 +179,12 @@ def flush(self): f"{self.array_offset}:{self.array_offset + self.buffer_row}" f"{self.buff.nbytes / 2**20: .2f}Mb" ) - self.max_buff_size = max(self.max_buff_size, sys.getsizeof(self.buff)) + # Note this is inaccurate for string data as we're just reporting the + # size of the container. When we switch to the numpy 2 StringDtype this + # should improve and we can get more visibility on how memory + # is being used. + # https://github.com/sgkit-dev/bio2zarr/issues/30 + self.max_buff_size = max(self.max_buff_size, self.buff.nbytes) self.array_offset += self.variants_chunk_size self.buffer_row = 0 diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index b718f91..566f6bc 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -862,13 +862,9 @@ def init_partition_array(self, partition_index, name): def finalise_partition_array(self, partition_index, buffered_array): buffered_array.flush() - # field_map = self.schema.field_map() - # array_spec = field_map[buffered_array.name] - # ba = buffered_array - # print(array_spec.name, "ba.max_buff_size", ba.max_buff_size, - # array_spec.variant_chunk_nbytes) logger.info( - f"Completed partition {partition_index} array {buffered_array.name}" + f"Completed partition {partition_index} array {buffered_array.name} " + f"max_memory={core.display_size(buffered_array.max_buff_size)}" ) def encode_array_partition(self, array_spec, partition_index):