Skip to content

Commit

Permalink
Refactor to use vcf2zarr module
Browse files Browse the repository at this point in the history
Closes #40
  • Loading branch information
jeromekelleher committed May 14, 2024
1 parent e42cc38 commit 80d5bed
Show file tree
Hide file tree
Showing 12 changed files with 293 additions and 243 deletions.
2 changes: 1 addition & 1 deletion bio2zarr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def bio2zarr():
# install individual commands as console scripts. However, this
# is handy for development and for those whose PATHs aren't set
# up in the right way.
bio2zarr.add_command(cli.vcf2zarr)
bio2zarr.add_command(cli.vcf2zarr_main)
bio2zarr.add_command(cli.plink2zarr)
bio2zarr.add_command(cli.vcfpartition)

Expand Down
54 changes: 27 additions & 27 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import numcodecs
import tabulate

from . import icf, plink, provenance, vcf, vcf_utils
from . import plink, provenance, vcf2zarr, vcf_utils
from .vcf2zarr import icf as icf_mod

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -167,7 +168,7 @@ def check_overwrite_dir(path, force):
def get_compressor(cname):
if cname is None:
return None
config = icf.ICF_DEFAULT_COMPRESSOR.get_config()
config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
config["cname"] = cname
return numcodecs.get_codec(config)

Expand Down Expand Up @@ -198,7 +199,7 @@ def explode(
"""
setup_logging(verbose)
check_overwrite_dir(icf_path, force)
icf.explode(
vcf2zarr.explode(
icf_path,
vcfs,
worker_processes=worker_processes,
Expand Down Expand Up @@ -235,7 +236,7 @@ def dexplode_init(
"""
setup_logging(verbose)
check_overwrite_dir(icf_path, force)
work_summary = icf.explode_init(
work_summary = vcf2zarr.explode_init(
icf_path,
vcfs,
target_num_partitions=num_partitions,
Expand Down Expand Up @@ -263,7 +264,7 @@ def dexplode_partition(icf_path, partition, verbose, one_based):
setup_logging(verbose)
if one_based:
partition -= 1
icf.explode_partition(icf_path, partition)
vcf2zarr.explode_partition(icf_path, partition)


@click.command
Expand All @@ -274,7 +275,7 @@ def dexplode_finalise(icf_path, verbose):
Final step for distributed conversion of VCF(s) to intermediate columnar format.
"""
setup_logging(verbose)
icf.explode_finalise(icf_path)
vcf2zarr.explode_finalise(icf_path)


@click.command
Expand All @@ -285,7 +286,7 @@ def inspect(path, verbose):
Inspect an intermediate columnar format or Zarr path.
"""
setup_logging(verbose)
data = vcf.inspect(path)
data = vcf2zarr.inspect(path)
click.echo(tabulate.tabulate(data, headers="keys"))


Expand All @@ -296,7 +297,7 @@ def mkschema(icf_path):
Generate a schema for zarr encoding
"""
stream = click.get_text_stream("stdout")
vcf.mkschema(icf_path, stream)
vcf2zarr.mkschema(icf_path, stream)


@click.command
Expand Down Expand Up @@ -327,7 +328,7 @@ def encode(
"""
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
vcf.encode(
vcf2zarr.encode(
icf_path,
zarr_path,
schema_path=schema,
Expand Down Expand Up @@ -378,7 +379,7 @@ def dencode_init(
"""
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
work_summary = vcf.encode_init(
work_summary = vcf2zarr.encode_init(
icf_path,
zarr_path,
target_num_partitions=num_partitions,
Expand Down Expand Up @@ -406,7 +407,7 @@ def dencode_partition(zarr_path, partition, verbose, one_based):
setup_logging(verbose)
if one_based:
partition -= 1
vcf.encode_partition(zarr_path, partition)
vcf2zarr.encode_partition(zarr_path, partition)


@click.command
Expand All @@ -417,7 +418,7 @@ def dencode_finalise(zarr_path, verbose):
Final step for distributed conversion of ICF to VCF Zarr.
"""
setup_logging(verbose)
vcf.encode_finalise(zarr_path, show_progress=True)
vcf2zarr.encode_finalise(zarr_path, show_progress=True)


@click.command(name="convert")
Expand All @@ -442,7 +443,7 @@ def convert_vcf(
"""
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
vcf.convert(
vcf2zarr.convert(
vcfs,
zarr_path,
variants_chunk_size=variants_chunk_size,
Expand All @@ -453,8 +454,8 @@ def convert_vcf(


@version
@click.group(cls=NaturalOrderGroup)
def vcf2zarr():
@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
def vcf2zarr_main():
"""
Convert VCF file(s) to the vcfzarr format.
Expand Down Expand Up @@ -506,18 +507,17 @@ def vcf2zarr():
"""


# TODO figure out how to get click to list these in the given order.
vcf2zarr.add_command(convert_vcf)
vcf2zarr.add_command(inspect)
vcf2zarr.add_command(explode)
vcf2zarr.add_command(mkschema)
vcf2zarr.add_command(encode)
vcf2zarr.add_command(dexplode_init)
vcf2zarr.add_command(dexplode_partition)
vcf2zarr.add_command(dexplode_finalise)
vcf2zarr.add_command(dencode_init)
vcf2zarr.add_command(dencode_partition)
vcf2zarr.add_command(dencode_finalise)
vcf2zarr_main.add_command(convert_vcf)
vcf2zarr_main.add_command(inspect)
vcf2zarr_main.add_command(explode)
vcf2zarr_main.add_command(mkschema)
vcf2zarr_main.add_command(encode)
vcf2zarr_main.add_command(dexplode_init)
vcf2zarr_main.add_command(dexplode_partition)
vcf2zarr_main.add_command(dexplode_finalise)
vcf2zarr_main.add_command(dencode_init)
vcf2zarr_main.add_command(dencode_partition)
vcf2zarr_main.add_command(dencode_finalise)


@click.command(name="convert")
Expand Down
38 changes: 38 additions & 0 deletions bio2zarr/vcf2zarr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from .icf import (
IntermediateColumnarFormat,
explode,
explode_finalise,
explode_init,
explode_partition,
)
from .vcz import (
VcfZarrSchema,
convert,
encode,
encode_finalise,
encode_init,
encode_partition,
inspect,
mkschema,
)
from .verification import verify

# NOTE: some of these (like IntermediateColumnarFormat) aren't intended
# to be part of the external interface, but they are included in the
# list to keep the lint nagging under control.
__all__ = [
"IntermediateColumnarFormat",
"explode",
"explode_finalise",
"explode_init",
"explode_partition",
"VcfZarrSchema",
"convert",
"encode",
"encode_finalise",
"encode_init",
"encode_partition",
"inspect",
"mkschema",
"verify",
]
2 changes: 1 addition & 1 deletion bio2zarr/icf.py → bio2zarr/vcf2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import numcodecs
import numpy as np

from . import constants, core, provenance, vcf_utils
from .. import constants, core, provenance, vcf_utils

logger = logging.getLogger(__name__)

Expand Down
3 changes: 2 additions & 1 deletion bio2zarr/vcf.py → bio2zarr/vcf2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import numpy as np
import zarr

from . import constants, core, icf, provenance
from .. import constants, core, provenance
from . import icf

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import tqdm
import zarr

from . import constants
from .. import constants


def assert_all_missing_float(a):
Expand Down Expand Up @@ -146,8 +146,7 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
nt.assert_equal(vcf_val, zarr_val)


# TODO rename to "verify"
def validate(vcf_path, zarr_path, show_progress=False):
def verify(vcf_path, zarr_path, show_progress=False):
store = zarr.DirectoryStore(zarr_path)

root = zarr.group(store=store)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr"
documentation = "https://sgkit-dev.github.io/bio2zarr/"

[project.scripts]
vcf2zarr = "bio2zarr.cli:vcf2zarr"
vcf2zarr = "bio2zarr.cli:vcf2zarr_main"
vcfpartition = "bio2zarr.cli:vcfpartition"

[project.optional-dependencies]
Expand Down
Loading

0 comments on commit 80d5bed

Please sign in to comment.