## path to longcallR-dp, e.g. "/path/to/longcallR-dp"
cmd="/path/to/longcallR-dp"
## path to bam file, e.g. "path/to/aln.bam"
bam="demo/hg002_chr22.bam"
## path to reference genome, e.g. "/path/to/GRCh38.fa"
ref="demo/chr22.fa"
## output_path + prefix, e.g. "sample_longcallR_inference/contig"
out="hg002_chr22_feature/contig"
## minimum depth, default="6"
depth="6"
## minimum alternative allele fraction, default="0.1"
alt_frac="0.1"
## minimum base quality, hifi:0, ont:10, default="0"
min_bq="0"
## n_jobs, default="25"
n_jobs="1"
## threads, default="8"
threads="4"
## contigs, default="chr1 chr2 ... chr22, chrX chrY"
CTGS=("chr22")
## run longcallR-dp (Parallel)
parallel -j ${n_jobs} \
"$cmd --mode predict \
--bam-path '${bam}' \
--ref-path '${ref}' \
--min-depth '${depth}' \
--min-alt-freq '${alt_frac}' \
--min-baseq '${min_bq}' \
--threads '${threads}' \
--contigs {1} \
--output '${out}_{1}'" ::: "${CTGS[@]}"
## download the configure files and models
longcallR_nn download --download_dir models
## run longcallR-nn (Serial)
for ctg in ${CTGS[@]}; do
longcallR_nn call \
-config models/pb_masseq_config.yaml \
-model models/pb_masseq_model.chkpt \
-data ${out}_${ctg} \
-ref ${ref} \
-output hg002_${ctg}.vcf \
-max_depth 200 \
-batch_size 500
done
<longcallR_nn> is a variant caller specifically designed for long-read RNA-seq data, utilizing a ResNet50 model. It provides high-precision identification of RNA SNPs and A-to-I RNA editing events. Currently, longcallR_nn is compatible with PacBio MAS-Seq/Iso-Seq and Nanopore cDNA/dRNA data.
- Python>=3.9
- Rust
- Parallel
- Pytorch>=1.3
- Clone the repo:
git clone https://github.com/huangnengCSU/longcallR-nn.git
- Navigate to the project directory:
cd longcallR-nn
- Install the longcallR-nn dependencies:
# create env conda create -n longcallRenv python=3.9 # activate env conda activate longcallRenv # install dependencies pip install -r requirements.txt # install PyTorch gpu version (pytorch>=1.3) conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.4 torchmetrics -c pytorch -c nvidia # or install PyTorch cpu version (pytorch>=1.3) conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 torchmetrics cpuonly -c pytorch
- Install longcallR-nn:
pip install .
- Compile longcallR-dp:
cd longcallR_dp cargo build --release
- Install longcallR-nn:
# create env conda create -n longcallRenv python=3.9 # activate env conda activate longcallRenv # install PyTorch gpu version (pytorch>=1.3) conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.4 -c pytorch -c nvidia # or install PyTorch cpu version (pytorch>=1.3) conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 cpuonly -c pytorch # install longcallR-nn from bioconda conda install -c bioconda -c conda-forge longcallr_nn
- Compile longcallR-dp:
git clone https://github.com/huangnengCSU/longcallR-nn.git cd longcallR-nn/longcallR-dp cargo build --release
You can build an Apptainer/Singularity container using the provided longcallR-nn.def
definition file with the following command:
cd longcallR-nn
apptainer build --fakeroot longcallR-nn.sif longcallR-nn.def
Alternatively, you can pull a pre-built image directly with:
apptainer pull docker://huangnengcsu/longcallr_nn:v0.0.1_gpu
To build a Docker image, run the following command:
cd longcallR-nn
docker build -t longcallr_nn:v0.0.1_gpu .
Alternatively, you can pull the Docker image directly with:
docker pull huangnengcsu/longcallr_nn:v0.0.1_gpu
Template scripts for generating training dataset or inference dataset can be found in the longcallR_dp/job_templates
directory. The script for generating a REDIportal VCF file for training A-to-I RNA-editing events is located at longcallR_dp/scripts/REDIportal_to_vcf.py
.
Usage: longcallR_dp [OPTIONS] --bam-path <BAM_PATH> --ref-path <REF_PATH> --mode <MODE> --output <OUTPUT>
Options:
-b, --bam-path <BAM_PATH>
Path to input bam file
-f, --ref-path <REF_PATH>
Path to reference file
-m, --mode <MODE>
Mode: train or predict [possible values: train, predict]
--truth <TRUTH>
Truth vcf file path (Optional)
--editing <EDITING>
RNA editing dataset (Optional)
--passed-editing <PASSED_EDITING>
Passed RNA editing dataset by comparing with alignment (Optional)
--bed-path <BED_PATH>
Interested region
-o, --output <OUTPUT>
Output bam file path
-r, --region <REGION>
Region to realign (Optional). Format: chr:start-end, left-closed, right-open
-x, --contigs [<CONTIGS>...]
Contigs to be processed. Example: -x chr1 chr2 chr3
-t, --threads <THREADS>
Number of threads, default 1 [default: 1]
--min-mapq <MIN_MAPQ>
Minimum mapping quality for reads [default: 10]
--min-baseq <MIN_BASEQ>
Minimum base quality for allele [default: 0]
--min-alt-freq <MIN_ALT_FREQ>
Minimum allele frequency for candidate SNPs [default: 0.1]
--min-alt-depth <MIN_ALT_DEPTH>
Minimum alternate allele depth for candidate SNPs [default: 2]
--min-depth <MIN_DEPTH>
Minimum depth for candidate SNPs [default: 6]
--max-depth <MAX_DEPTH>
Maximum depth for candidate SNPs [default: 20000]
--min-read-length <MIN_READ_LENGTH>
Minimum read length to filter reads [default: 500]
--flanking-size <FLANKING_SIZE>
Flanking size [default: 20]
--chunk-size <CHUNK_SIZE>
Chunk size [default: 5000]
-h, --help
Print help
-V, --version
Print version
The configuration files and models for ONT and PacBio long-read RNA-seq have been released on Zenodo. You can use the command longcallR_nn download
to automatically retrieve the latest versions of these files.
Platform | config | model |
---|---|---|
PacBio Masseq | hg002_na24385_masseq.yaml | hg002_na24385_mix_nopass_resnet50_sgd.epoch30.chkpt |
PacBio Isoseq | hg002_isoseq.yaml | hg002_baylor_isoseq_nopass_resnet50_sgd.epoch30.chkpt |
ONT cDNA | wtc11_cdna.yaml | cdna_wtc11_nopass_resnet50_sgd.epoch30.chkpt |
ONT dRNA | hg002_drna.yaml | drna_hg002_nopass_resnet50_sgd2.epoch31.chkpt |
Once you have obtained the model files, you can run the following command to call variants:
longcallR_nn call [-h] -config CONFIG -model MODEL -data DATA [-ref REF] -output OUTPUT [-max_depth MAX_DEPTH] [-batch_size BATCH_SIZE] [--no_cuda]
optional arguments:
-h, --help show this help message and exit
-config CONFIG path to config file
-model MODEL path to trained model
-data DATA directory of feature files
-ref REF reference genome file
-output OUTPUT output vcf file
-max_depth MAX_DEPTH max depth threshold
-batch_size BATCH_SIZE batch size
--no_cuda If running on cpu device, set the argument.
The template configuration file to refer to is config/test.yaml
.
longcallR_nn train [-h] -config CONFIG [-log LOG]
optional arguments:
-h, --help show this help message and exit
-config CONFIG path to config file
-log LOG name of log file
MIT License
Copyright (c) 2024 Dana-Farber Cancer Institute.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.