From 818ced8070c778efada01a0602da624bc3c8f944 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 24 Feb 2025 11:52:32 -0500 Subject: [PATCH] Packaging and field sorting adding packing dependency and allowing header INFO fields to be in different orders between multiple VCFs --- bio2zarr/vcf2zarr/icf.py | 12 +++- pyproject.toml | 1 + tests/data/vcf/out_of_order_fields/input1.bcf | Bin 0 -> 563 bytes .../vcf/out_of_order_fields/input1.bcf.csi | Bin 0 -> 96 bytes tests/data/vcf/out_of_order_fields/input2.bcf | Bin 0 -> 553 bytes .../vcf/out_of_order_fields/input2.bcf.csi | Bin 0 -> 97 bytes tests/test_core.py | 4 +- tests/test_vcf_examples.py | 55 ++++++++++++++++++ 8 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 tests/data/vcf/out_of_order_fields/input1.bcf create mode 100644 tests/data/vcf/out_of_order_fields/input1.bcf.csi create mode 100644 tests/data/vcf/out_of_order_fields/input2.bcf create mode 100644 tests/data/vcf/out_of_order_fields/input2.bcf.csi diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index dd0bbc9..0f3f856 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -41,7 +41,7 @@ def fromdict(d): return VcfFieldSummary(**d) -@dataclasses.dataclass +@dataclasses.dataclass(order=True) class VcfField: category: str name: str @@ -192,6 +192,16 @@ def fromdict(d): d["contigs"] = [Contig(**cd) for cd in d["contigs"]] return IcfMetadata(**d) + def __eq__(self, other): + if not isinstance(other, IcfMetadata): + return NotImplemented + return ( + self.samples == other.samples + and self.contigs == other.contigs + and self.filters == other.filters + and sorted(self.fields) == sorted(other.fields) + ) + def fixed_vcf_field_definitions(): def make_field_def(name, vcf_type, vcf_number): diff --git a/pyproject.toml b/pyproject.toml index 0b72ca1..ecb3f60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ # colouredlogs pulls in humanfriendly", "cyvcf2", "bed_reader", + "packaging", ] requires-python = ">=3.9" classifiers = [ diff --git a/tests/data/vcf/out_of_order_fields/input1.bcf b/tests/data/vcf/out_of_order_fields/input1.bcf new file mode 100644 index 0000000000000000000000000000000000000000..5d0f42de7dec25bd9c67a4316564d064a3fd1545 GIT binary patch literal 563 zcmb2|=3rp}f&Xj_PR>jWOBj1624=r9kU4sE`!CI=+C8DSbXM=GHa@YnZ2N+++xIqZ zzgV`C?a~sJix0)`e_oNdVJC0RZ{c%43;oXB&%E(=Ux~Hs?R65%r-qc~efVo~XYs=8 zo8KDDO5e=#zmsW?Qsyg`km+5fZJv$4TZ#h=^HyA1aB}N1|5|k}xee8~H@^+yI&9{i zzBBYfsfDQY=A(8_9F_lGxG(*_&gI{Uxjx6j-Ire1TAV5&U;0`#^30s4>-1I}nm1Wi zsw;T$j2kt6JMIORGAOOj{BP|wYodzvVV6S}cth_OZaF#CvN3_1>v4+XCS|RN*e1@5 zpQp+H$`@Mce`+)Kmx`sP zE_w2+@cfI47^EBc-7@eXRIvT8ZIzEv^_+ryt?Y7F`}D5^+*z%L$v!*Z!Zp zV`aMe*44F_^CP_<9sHPa=k*>YlawpD|CtyV^?&UefbLaXLn{t|b4zYU4_B82q>6~d=)6}!fqpCjf>TA7s8YdJ^{D=)O ruw1@nhsxFy7mV&P@Z3>YX289Pi9ts$^JfzjW&J4wT-%_3=CnO{!HZZ8E++sfyd5S~nAj_1q2Nd|GeH2P+ fU{=%lTd7rrZ*sR(Uitq#2mOMuG?cHq96+ literal 0 HcmV?d00001 diff --git a/tests/data/vcf/out_of_order_fields/input2.bcf b/tests/data/vcf/out_of_order_fields/input2.bcf new file mode 100644 index 0000000000000000000000000000000000000000..ee8c4a327871ce98b67b4edc77beacf5063d67e9 GIT binary patch literal 553 zcmb2|=3rp}f&Xj_PR>jWiy3<-`RCs;J}`-==*mQ#Ew1^dB2;{){KQ zIo3U5H*F5yU&OH9bCwn3)uluV`EGE_Eu5x+_}DlLyUG)wqJe1u-6ClQW^md75daAm_u>Em literal 0 HcmV?d00001 diff --git a/tests/data/vcf/out_of_order_fields/input2.bcf.csi b/tests/data/vcf/out_of_order_fields/input2.bcf.csi new file mode 100644 index 0000000000000000000000000000000000000000..51503336a6e4f45dfbfc962bf6ef2fb95ea81382 GIT binary patch literal 97 zcmb2|=3rp}f&Xj_PR>jWE)2zd-%_3=CnO{!HZZ7Zd}9>k@s@B@nbELGQljFZQJfP` g!GRXnc@r7U85s7pK4~if>Xk<`M4EvaY$k{R0P3n1#{d8T literal 0 HcmV?d00001 diff --git a/tests/test_core.py b/tests/test_core.py index 3607578..62f76b8 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -237,8 +237,8 @@ def test_examples(self, chunk_size, size, start, stop): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4976329), - ("tests/data/vcf", 4964192), + ("tests/data", 4981734), + ("tests/data/vcf", 4969597), ("tests/data/vcf/sample.vcf.gz", 1089), ], ) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 64e3b26..f560915 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -1100,3 +1100,58 @@ def test_missing_filter(tmp_path): zarr_path = tmp_path / "zarr" with pytest.raises(ValueError, match="Filter 'q10' was not defined in the header"): vcf2zarr.convert([path], zarr_path) + + +class TestOutOfOrderFields: + # Mixing on purpose + data_path1 = "tests/data/vcf/out_of_order_fields/input2.bcf" + data_path2 = "tests/data/vcf/out_of_order_fields/input1.bcf" + + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "ooo_example.vcf.zarr" + vcf2zarr.convert([self.data_path1, self.data_path2], out) + return sg.load_dataset(out) + + def test_filters(self, ds): + nt.assert_array_equal(ds["filter_id"], ["PASS", "FAIL"]) + nt.assert_array_equal( + ds["variant_filter"], + [ + [True, False], + [False, True], + [True, False], + ], + ) + + def test_source(self, ds): + assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}" + + def test_contigs(self, ds): + nt.assert_array_equal(ds["contig_id"], ["chr20", "chr21"]) + nt.assert_array_equal(ds["contig_length"], [64444167.0, 46709983.0]) + nt.assert_array_equal(ds["variant_contig"], [0, 1, 1]) + + def test_position(self, ds): + nt.assert_array_equal(ds["variant_position"], [63971, 64506, 64507]) + + def test_length(self, ds): + nt.assert_array_equal(ds["variant_length"], [11, 1, 1]) + + def test_info_fields(self, ds): + nt.assert_array_equal( + ds["variant_QNAME"], + ["cluster19_000000F", ".", "cluster19_000000F"], + ) + nt.assert_array_equal(ds["variant_QSTART"], [25698928, 25698928, -1]) + + def test_allele(self, ds): + nt.assert_array_equal( + ds["variant_allele"].values.tolist(), + [["TTCCATTCCAC", "T"], ["C", "CTCCAT"], ["G", "A"]], + ) + assert ds["variant_allele"].dtype == "O" + + def test_call_DPs(self, ds): + nt.assert_array_equal(ds["call_DP"], [[5], [-1], [5]]) + nt.assert_array_equal(ds["call_DP2"], [[1], [1], [-1]])