Skip to content

Commit

Permalink
Merge pull request #9 from IGVF/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
emattei authored Jan 22, 2025
2 parents 8dbdfc1 + 2e2991d commit 737037f
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 120 deletions.
Empty file added .gitattributes
Empty file.
17 changes: 12 additions & 5 deletions .github/workflows/build-docker-kallisto-bustools.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@

name: IGVF kallisto-bustools CI

on:
push:
branches: [ "dev", "main" ]
paths:
- 'modules/igvf-kallisto-bustools/**'
- 'modules/igvf-kallisto-bustools/**'
pull_request:
branches: [ "dev", "main" ]
paths:
- 'modules/igvf-kallisto-bustools/**'
- 'modules/igvf-kallisto-bustools/**'
workflow_dispatch:
inputs:
image_tag:
Expand All @@ -26,7 +25,7 @@ env:

jobs:

build-for-dockerhub:
build-and-test:

runs-on: ubuntu-latest
defaults:
Expand All @@ -35,7 +34,15 @@ jobs:

steps:
- uses: actions/checkout@v4
- name: Check out the repository
run: |
echo "Checking out the repository"
pwd
ls -lht
- name: Build the Docker image
run: docker build . --file docker_builder.dockerfile --tag ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_PATH }}:${{ env.TAG }}
- name: Run tests in Docker container
run: |
docker run --init --rm -v ${{ github.workspace }}/test_data:/software/test_data ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_PATH }}:${{ env.TAG }} pytest /software/tests/test_run_kallisto.py
- name: Push image
run: "docker push ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_PATH }}:${{ env.TAG }}"
run: docker push ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_PATH }}:${{ env.TAG }}
79 changes: 0 additions & 79 deletions modules/igvf-kallisto-bustool/tests/test_run_kallisto.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ run_kallisto quant nac --index_dir <index_dir> --read_format <read_format> --rep
## Logging
The tool uses Python's logging module to log information and errors. Logs are output to stderr.
## License
This project is licensed under the MIT License. See the LICENSE file for details.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Based on Python
############################################################

FROM python@sha256:fd0fa50d997eb56ce560c6e5ca6a1f5cf8fdff87572a16ac07fb1f5ca01eb608
FROM --platform="linux/amd64" python:3.10-slim

LABEL maintainer="Eugenio Mattei"
LABEL software="IGVF single-cell pipeline"
Expand All @@ -18,7 +18,8 @@ LABEL software.description="Run the kallisto-bustools module of the IGVF single-
RUN pip install --upgrade pip

RUN mkdir /software
COPY src/python/run_kallisto.py /software
COPY run_kallisto.py /software
COPY pyproject.toml /software
RUN cd /software && pip install --editable .

# Create and setup new user
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
name = "igvf-kallisto-bustools"
version = "1.0.0"
description = "Align scRNA using kallisto-bustools"
requires-python = "==3.10.11"
requires-python = "==3.10.16"
dependencies = [
"click>=8.1.7",
"kb-python==0.28.2",
"anndata==0.10.1"
"kb-python==0.29.1",
"anndata==0.11.1"
]

[project.scripts]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,34 @@
import click
import gzip
import logging
import shutil
import sys
import subprocess

# Root logger setup: emit records at INFO level and above on stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)


def check_and_unzip(file_path):
    """
    Decompress a ``.gz`` file into the current working directory.

    Gzip detection relies only on the ``.gz`` file-name suffix; the file
    contents are not inspected.

    Parameters:
        file_path (str | os.PathLike): Path to the file to check and unzip.

    Returns:
        str: For a ``.gz`` input, the name of the decompressed copy. The copy
            is written to the current working directory under the input's
            base name with the ``.gz`` suffix removed, overwriting any
            existing file of that name. For any other input, the path is
            returned unchanged (as a string).

    Raises:
        OSError: If the input cannot be read or the output cannot be written
            (``gzip.BadGzipFile`` is raised for invalid gzip data).
    """
    # Local import so this function does not depend on the module's import list.
    import os

    # Normalise str / pathlib.Path inputs to a plain string path.
    file_path = os.fspath(file_path)
    if not file_path.endswith('.gz'):
        return file_path

    # Keep only the base name: the decompressed copy is deliberately written
    # next to the process (cwd), not next to the (possibly read-only) input.
    unzipped_file_path = os.path.basename(file_path)[:-len('.gz')]
    with gzip.open(file_path, 'rb') as f_in, open(unzipped_file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    return unzipped_file_path


@click.group()
@click.version_option(package_name="igvf-kallisto-bustools")
def cli():
Expand All @@ -15,16 +37,18 @@ def cli():
This script runs the kallisto and bustools pipeline.
You can run the quantification step or the index creation step.
"""
pass


# "index" command group: parent for the index-building sub-commands
# registered below with @index.command(...).
# The docstring doubles as the group's --help text, so it is kept verbatim.
@cli.group("index")
def index():
    """Manages the index creation step."""

@index.command("standard")
@click.option('--output_dir', type=click.Path(exists=True), help='Path to the output directory.', required=True)
@click.option('--genome-fasta', type=click.Path(exists=True), help='Path to the genome fasta file.', required=True)
@click.option('--genome_fasta', type=click.Path(exists=True), help='Path to the genome fasta file.', required=True)
@click.option('--gtf', type=click.Path(exists=True), help='Path to the GTF file.', required=True)
def index_standard(output_dir, genome_fasta, gtf):
"""
Expand All @@ -39,6 +63,8 @@ def index_standard(output_dir, genome_fasta, gtf):
{output_dir}.tar.gz: A tarball of the output directory containing all the indexes.
"""
logging.info(f"Creating standard kallisto index in {output_dir}.")
genome_fasta = check_and_unzip(genome_fasta)
gtf = check_and_unzip(gtf)
# Create the command line string and run it using subprocess
cmd = f"kb ref -i {output_dir}/index.idx -g {output_dir}/t2g.txt -f1 {output_dir}/transcriptome.fa {genome_fasta} {gtf}"
logging.info(f"Running command: {cmd}")
Expand All @@ -60,7 +86,7 @@ def index_standard(output_dir, genome_fasta, gtf):

@index.command("nac")
@click.option('--output_dir', type=click.Path(exists=True), help='Path to the output directory.', required=True)
@click.option('--genome-fasta', type=click.Path(exists=True), help='Path to the genome fasta file.', required=True)
@click.option('--genome_fasta', type=click.Path(exists=True), help='Path to the genome fasta file.', required=True)
@click.option('--gtf', type=click.Path(exists=True), help='Path to the GTF file.', required=True)
def index_nac(output_dir, genome_fasta, gtf):
"""
Expand All @@ -75,6 +101,8 @@ def index_nac(output_dir, genome_fasta, gtf):
{output_dir}.tar.gz (File): A tarball of the output directory containing all the indexes.
"""
logging.info(f"Creating nac kallisto index in {output_dir}.")
genome_fasta = check_and_unzip(genome_fasta)
gtf = check_and_unzip(gtf)
# Create the command line string and run it using subprocess
cmd = f"kb ref --workflow=nac -i {output_dir}/index.idx -g {output_dir}/t2g.txt -c1 ~{output_dir}/cdna.txt -c2 ~{output_dir}/nascent.txt -f1 ~{output_dir}/cdna.fasta -f2 ~{output_dir}/nascent.fasta ~{genome_fasta} {gtf}"
logging.info(f"Running command: {cmd}")
Expand All @@ -94,36 +122,36 @@ def index_nac(output_dir, genome_fasta, gtf):


# Quantification sub-command
@cli.group("quant")
def quant():
@cli.group("quantify")
def quantify():
"""Manages the quantification step."""
pass


@quant.command("standard")
@quantify.command("standard")
@click.option('--index_dir', type=click.Path(exists=True), help='Path to the index directory.', required=True)
@click.option('--read_format', type=str, help='String indicating the position of umi and barcode.', required=True)
@click.option('--output_dir', type=click.Path(exists=True), help='Path to the output directory.', required=True)
@click.option('--strand', type=str, help='Library strand orientation.', required=True)
@click.option('--threads', default=1, type=int, help='Number of threads to use. Default is 1.')
@click.option('--barcode_onlist', type=click.Path(exists=True), help='Barcode onlist file.', required=True)
@click.option('--replacement_list', type=click.Path(exists=True), help='Replacement list file.')
@click.argument('interleaved_fastqs', nargs=-1, type=click.Path(exists=True), help='FASTQ files to align. The file needs to be supply in interleaved format(Example: pairA_1.fastq pairA_2.fastq pairB_1.fastq pairB_2.fastq).')
def quant_standard(index_dir, read_format, output_dir, strand, threads, barcode_onlist, interleaved_fastqs, replacement_list):
@click.argument('interleaved_fastqs', nargs=-1, type=click.Path(exists=True))
def quantify_standard(index_dir, read_format, output_dir, strand, threads, barcode_onlist, replacement_list, interleaved_fastqs):
"""
Runs the standard quantification pipeline using kallisto and bustools.
Parameters:
index_dir (Path): Directory containing the kallisto index and transcript-to-gene mapping files.
read_format (str): Format of the reads (e.g., '10xv2', '10xv3').
output_dir (Path): Directory where the output files will be saved.
strand (str): Strand specificity (e.g., 'unstranded', 'forward', 'reverse').
threads (int): Number of threads to use for the computation.
barcode_onlist (File): Path to the whitelist of barcodes.
replacement_list (File): Path to the replacement list file.
interleaved_fastqs (File): Path to the interleaved FASTQ files.
index_dir (Path): Directory containing the kallisto index and transcript-to-gene mapping files.
read_format (str): Format of the reads (e.g., '10xv2', '10xv3').
output_dir (Path): Directory where the output files will be saved.
strand (str): Strand specificity (e.g., 'unstranded', 'forward', 'reverse').
threads (int): Number of threads to use for the computation.
barcode_onlist (File): Path to the whitelist of barcodes.
replacement_list (File): Path to the replacement list file.
interleaved_fastqs (File): Path to the interleaved FASTQ files. The files need to be supplied in interleaved format(Example: pairA_1.fastq pairA_2.fastq pairB_1.fastq pairB_2.fastq).
Returns:
Please refer to the kallisto and bustools documentation for the output files.
Please refer to the kallisto and bustools documentation for the output files.
"""
logging.info("Running standard quantification pipeline.")
# Create the command line string and run it using subprocess
Expand All @@ -136,31 +164,31 @@ def quant_standard(index_dir, read_format, output_dir, strand, threads, barcode_
logging.error(f"Command failed with error: {e.stderr}")


@quant.command("nac")
@quantify.command("nac")
@click.option('--index_dir', type=click.Path(exists=True), help='Path to the index directory.', required=True)
@click.option('--read_format', type=str, help='String indicating the position of umi and barcode.', required=True)
@click.option('--output_dir', type=click.Path(exists=True), help='Path to the output directory.', required=True)
@click.option('--strand', type=str, help='Library strand orientation.', required=True)
@click.option('--threads', default=1, type=int, help='Number of threads to use. Default is 1.')
@click.option('--barcode_onlist', type=click.Path(exists=True), help='Barcode onlist file.', required=True)
@click.option('--replacement_list', type=click.Path(exists=True), help='Replacement list file.')
@click.argument('interleaved_fastqs', nargs=-1, type=click.Path(exists=True), help='FASTQ files to align. The file needs to be supply in interleaved format(Example: pairA_1.fastq pairA_2.fastq pairB_1.fastq pairB_2.fastq).')
def quant_nac(index_dir, read_format, replacement_list, barcode_onlist, strand, output_dir, threads, interleaved_fastqs):
@click.argument('interleaved_fastqs', nargs=-1, type=click.Path(exists=True))
def quantify_nac(index_dir, read_format, replacement_list, barcode_onlist, strand, output_dir, threads, interleaved_fastqs):
"""
Runs the nac quantification pipeline using kallisto and bustools.
Parameters:
index_dir (Path): Directory containing the kallisto index and transcript-to-gene mapping files.
read_format (str): Format of the reads (e.g., '10xv2', '10xv3').
output_dir (Path): Directory where the output files will be saved.
strand (str): Strand specificity (e.g., 'unstranded', 'forward', 'reverse').
threads (int): Number of threads to use for the computation.
barcode_onlist (File): Path to the whitelist of barcodes.
replacement_list (File): Path to the replacement list file.
interleaved_fastqs (File): Path to the interleaved FASTQ files.
index_dir (Path): Directory containing the kallisto index and transcript-to-gene mapping files.
read_format (str): Format of the reads (e.g., '10xv2', '10xv3').
output_dir (Path): Directory where the output files will be saved.
strand (str): Strand specificity (e.g., 'unstranded', 'forward', 'reverse').
threads (int): Number of threads to use for the computation.
barcode_onlist (File): Path to the whitelist of barcodes.
replacement_list (File): Path to the replacement list file.
interleaved_fastqs (File): Path to the interleaved FASTQ files. The files need to be supplied in interleaved format(Example: pairA_1.fastq pairA_2.fastq pairB_1.fastq pairB_2.fastq).
Returns:
Please refer to the kallisto and bustools documentation for the output files.
Please refer to the kallisto and bustools documentation for the output files.
"""
logging.info("Running nac quantification pipeline.")
# Create the command line string and run it using subprocess
Expand All @@ -171,5 +199,3 @@ def quant_nac(index_dir, read_format, replacement_list, barcode_onlist, strand,
logging.info(f"Command output: {result.stdout}")
except subprocess.CalledProcessError as e:
logging.error(f"Command failed with error: {e.stderr}")


Loading

0 comments on commit 737037f

Please sign in to comment.