Skip to content

Commit

Permalink
Dev (#1)
Browse files Browse the repository at this point in the history
* add snp-only aln support

* update_readme

* update .gitignore'

'
''

* update workflow

* update gitignore

* .gitignore updated

* update gitignore

* update gitignore

---------

Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-132-182.uniaccess.unimelb.edu.au>
  • Loading branch information
Sudaraka88 and Sudaraka88 authored Oct 17, 2023
1 parent 334fc1b commit 702a8bf
Show file tree
Hide file tree
Showing 14 changed files with 226 additions and 222 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
branches: [main, master, dev]
pull_request:
branches: [main, master]
branches: [main, master, dev]
schedule:
- cron: '1 1 1 * *'

Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
*.so
*.o
.DS_Store
*.Rmd
Makevars
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: LDWeaver
Type: Package
Title:Genomewide Epistasis Analysis on Bacteria
Version: 1.1.1
Version: 1.2.0
Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre"))
Maintainer: Sudaraka Mallawaarachchi <smallawaarachchi@gmail.com>
Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export(estimate_variation_in_CDS)
export(generate_Links_SNPS_fasta)
export(genomewide_LDMap)
export(make_gwes_plots)
export(parse_fasta_SNP_alignment)
export(parse_fasta_alignment)
export(parse_genbank_file)
export(parse_gff_file)
Expand Down
63 changes: 52 additions & 11 deletions R/BacGWES.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#'
#' @param dset name of the dataset, all outputs will be saved to the folder <dset>
#' @param aln_path path to the multi fasta alignment
#' @param aln_has_all_bases specify whether the alignment has all bases in the reference genome (default = T). For example, if it's a SNP only alignment, set to F and provide pos
#' @param pos numeric vector of positions for each base in the alignment (default = NULL). Only required if sites are missing from the alignment (e.g. SNP alignment output from snp-sites or https://github.com/Sudaraka88/FastaR)
#' @param gbk_path path to genbank annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
#' @param gff3_path path to gff3 annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
#' @param ref_fasta_path path to Reference fasta file. The file MUST be in fasta format and contain exactly one sequence! Required for gff3 annotations,
Expand Down Expand Up @@ -42,8 +44,9 @@
#' sr_links_red = LDWeaver(dset = "efcm", aln_path = "<efcm_aln>", gbk_path = "<efcm.gbk>")
#' }
#' @export
LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta_path = NULL, validate_ref_ann_lengths = T,
snp_filt_method = "default", gap_freq = 0.15, maf_freq = 0.01, snpeff_jar_path = NULL, hdw_threshold = 0.1,
LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
ref_fasta_path = NULL, validate_ref_ann_lengths = T, snp_filt_method = "default",
gap_freq = 0.15, maf_freq = 0.01, snpeff_jar_path = NULL, hdw_threshold = 0.1,
perform_SR_analysis_only = F, SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){
Expand All @@ -60,11 +63,8 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta

#TODO: Add the option to provide genbank file without reference sequence
#TODO: Count through blocks and automate the displayed BLOCK NUMBER
#TODO: Add the option to give spydrpick links directly to the pipeline instead of performing LR analysis

# # Welcome message
# timestamp()
# cat(paste("\n\n Performing GWES analysis on:", dset, "\n\n"))
# # Welcome message # #

# Sanity checks
# annotations
Expand All @@ -79,6 +79,16 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
order_links = T # sr_links will be ordered and saved without annotations
}

# alignment (now with SNP-only alignment support 2023/10/12)
if(aln_has_all_bases == F){ # snp-only alignment, POS must be provided
# sanity checks
if(is.null(pos)) stop("A numeric vector of 'positions' <pos> must be provided if aln_has_all_bases = F")
if(!is.numeric(pos)) stop("Provided pos must be numeric!")
if(any(duplicated(pos))) stop("Provided pos contains duplicates!")
} else { # This is a full alignment, pos must be NULL
if(!is.null(pos)) stop("pos cannot be provided for alignments with all bases! Depending on the use case, either set pos = NULL or aln_has_all_bases = T")
}

# multicore
if(multicore == T) { # multicore requested (default)
max_n_cores = parallel::detectCores()
Expand Down Expand Up @@ -131,6 +141,10 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
max_blk_sz = 10000
}

if(aln_has_all_bases == F){ # For snp-only alignments, reference and alignment length checks will fail, stop checking
validate_ref_ann_lengths = F
}

# normalise_input_paths
aln_path = normalizePath(aln_path)
if(!is.null(gbk_path)) gbk_path = normalizePath(gbk_path)
Expand Down Expand Up @@ -207,12 +221,21 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
if(!file.exists(ACGTN_snp_path)) {
t0 = Sys.time()
cat(paste("Parsing Alignment:", aln_path ,"\n"))
snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
if(save_additional_outputs){
cat("Step 5: Savings snp.dat...")
saveRDS(snp.dat, ACGTN_snp_path)

# Adding support for SNP-only alignments
if(aln_has_all_bases == T){
snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
if(save_additional_outputs){
cat("Step 5: Savings snp.dat...")
saveRDS(snp.dat, ACGTN_snp_path)
}
} else {
snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
# Note that snp.dat$g = NULL (we cannot measure this, need to get it from the genbank file)
# we cannot save snp.dat here due to absent snp.dat$g, moving downstream (block 2)
}


cat(paste("BLOCK 1 complete in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s \n"))
}else{
cat("Loading previous snp matrix \n")
Expand All @@ -225,7 +248,9 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
gff = NULL # alternative set to NULL
if(!file.exists(parsed_gbk_path)) {
cat(paste("Reading the GBK file, validate_length_check = ", validate_ref_ann_lengths, "\n"))
gbk = LDWeaver::parse_genbank_file(gbk_path = gbk_path, g = snp.dat$g, length_check = validate_ref_ann_lengths) # will return 1 if fails
gbk_ = LDWeaver::parse_genbank_file(gbk_path = gbk_path, g = snp.dat$g, length_check = validate_ref_ann_lengths) # will return 1 if fails
gbk = gbk_$gbk # gbk is coming in gbk_$gbk now
# For snp-only alignments, g = snp.dat$g = NULL and validate_ref_ann_lengths = F, won't be an issue...
if(save_additional_outputs){
saveRDS(gbk, parsed_gbk_path)
}
Expand All @@ -249,6 +274,22 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
}
}

# This is a patch to add snp.dat$g back in case this is a SNP only alignment
if(is.null(snp.dat$g)){
if(!is.null(gbk_path)){ # input reference is genbank format
snp.dat$g = gbk_$ref_g
cat("Extracted ref genome length", snp.dat$g, "from genbank...")
}
if(!is.null(gff3_path)){ # input reference is gff3 format
snp.dat$g = gff$g
}
if(save_additional_outputs){
cat("saving snp.dat...\n")
saveRDS(snp.dat, ACGTN_snp_path)
#### This saved file will have snp.dat$g = NULL ### TODO: Ensure no issues downstream
}
}

# BLK3
cat("\n\n #################### BLOCK 3 #################### \n\n")
if(!file.exists(cds_var_path)) {
Expand Down
8 changes: 6 additions & 2 deletions R/computePairwiseMI.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path =
if(!perform_SR_analysis_only){
# estimate the number of lr_links to decide on a discard threshold using all or 1% of SNPs
snp_subset = min(snp.dat$nsnp, round(snp.dat$nsnp*0.1))
set.seed(1988) # set a seed, so that the same subset of positions get picked below (not absolutely needed, but good for reproducibility)
lr_link_count = sapply(snp.dat$POS[sample(snp.dat$nsnp, snp_subset)], function(x) sum((0.5*snp.dat$g - abs((x - snp.dat$POS)%%snp.dat$g - 0.5*snp.dat$g))>sr_dist))
lr_links_approx = sum(lr_link_count)/snp_subset*snp.dat$nsnp/2

} else {
lr_links_approx = NULL
}
Expand All @@ -81,7 +83,7 @@ perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path =
cat(paste("Block", i, "of", nblcks, "..."))
from_ = MI_cmp_blks$from_s[i]:MI_cmp_blks$from_e[i]
to_ = MI_cmp_blks$to_s[i]:MI_cmp_blks$to_e[i]

# cat("**debug** snp_subset =",snp_subset, " lr_links_approx=",lr_links_approx," lr_retain_links=",lr_retain_links, " **debug**\n")
sr_links = perform_MI_computation_ACGTN(snp.dat = snp.dat, neff = neff, hsq = hsq, cds_var = cds_var,
lr_save_path = lr_save_path, from = from_, sr_dist = sr_dist,
lr_retain_links = lr_retain_links, to = to_, ncores = ncores, sr_links = sr_links,
Expand Down Expand Up @@ -294,12 +296,14 @@ perform_MI_computation_ACGTN = function(snp.dat, from, to, neff, hsq, cds_var, l
if(lr_present & !perform_SR_analysis_only){ # if only the SR analysis is requested, discard this section
# disc_thresh = stats::quantile(MI_df_lr$MI, lr_retain_level)
n_lr_links = nrow(MI_df_lr)

# Following prob works well for most cases where n_lr_links_total >> lr_retain_links (i.e. save 1M out of 1B),
# set the maximum to 0 (i.e. no negative values), should we set the max to 1 if prob > 1
prob = max(c(0, (1 - ((lr_retain_links * (n_lr_links / lr_links_approx)) / n_lr_links))))
# disc_thresh = Rfast2::Quantile(MI_df_lr$MI, probs = prob)
disc_thresh = stats::quantile(MI_df_lr$MI, probs = prob)

# cat("**debug** n_lr_links =",n_lr_links," prob=",prob, " disc_thresh=",disc_thresh, " **debug**\n")

len_filt = MI_df_lr$MI >= disc_thresh
cat(paste("... Adding ", sum(len_filt) ," LR links with MI>",round(disc_thresh, 3) ," to file ...", sep = "")) # debug
if(!all(len_filt == FALSE)){
Expand Down
File renamed without changes.
121 changes: 105 additions & 16 deletions R/extractSNPs.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' }
#' @export
parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, method = "default"){

# Check inputs
aln_path = normalizePath(aln_path) # C safety (~ character causes crash)
if(!file.exists(aln_path)) stop(paste("Can't locate file", aln_path))
Expand All @@ -33,29 +34,118 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me
filter = 0
}

# t0 = Sys.time() # This timer is not accurate, return function has computations to save memory, time externally instead

# snp.data <- getSNP_Sites(aln_path, filter, gap_freq, maf_freq)
# snp.data <- .getACGTN_Sites(aln_path, filter, gap_freq, maf_freq)
# extract align parameters and filtering...
snp.param <- .extractAlnParam(aln_path, filter, gap_freq, maf_freq)

# g = snp.data$params$seq.length
# nseq = snp.data$params$num.seqs
# g = snp.param$seq.length; snp.param$seq.length = NULL
# nseq = snp.param$num.seqs; snp.param$num.seqs = NULL
# nsnp = snp.param$num.snps; snp.param$num.snps = NULL
# print(paste("Done in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s"))

if(snp.param$seq.length==-1) stop("Error! sequences are of different lengths!")
if(snp.param$num.seqs==0) stop("File does not contain any sequences!")
if(snp.param$num.snps==0) stop("File does not contain any SNPs")

snp.data <- .extractSNPs(aln_path, snp.param$num.seqs, snp.param$num.snps, snp.param$pos)
# seq.names <- gsub("^>","",snp.data$params$seq.names)
seq.names <- gsub("^>","",snp.data$seq.names); snp.data$seq.names = NULL
uqe = apply(snp.data$ACGTN_table>0, 1, function(x) as.numeric(x>0)); snp.data$ACGTN_table = NULL

# nsnp = snp.data$params$num.snps
snp.matrix_A <- Matrix::sparseMatrix(i=snp.data$i_A,
j=snp.data$j_A,
x=as.logical(snp.data$x_A),
dims = c(snp.param$num.seqs, snp.param$num.snps),
dimnames = list(seq.names, snp.param$pos))
snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL

snp.matrix_C <- Matrix::sparseMatrix(i=snp.data$i_C,
j=snp.data$j_C,
x=as.logical(snp.data$x_C),
dims = c(snp.param$num.seqs, snp.param$num.snps),
dimnames = list(seq.names, snp.param$pos))
snp.data$i_C = snp.data$j_C = snp.data$x_C = NULL

snp.matrix_G <- Matrix::sparseMatrix(i=snp.data$i_G,
j=snp.data$j_G,
x=as.logical(snp.data$x_G),
dims = c(snp.param$num.seqs, snp.param$num.snps),
dimnames = list(seq.names, snp.param$pos))
snp.data$i_G = snp.data$j_G = snp.data$x_G = NULL

snp.matrix_T <- Matrix::sparseMatrix(i=snp.data$i_T,
j=snp.data$j_T,
x=as.logical(snp.data$x_T),
dims = c(snp.param$num.seqs, snp.param$num.snps),
dimnames = list(seq.names, snp.param$pos))
snp.data$i_T = snp.data$j_T = snp.data$x_T = NULL

snp.matrix_N <- Matrix::sparseMatrix(i=snp.data$i_N,
j=snp.data$j_N,
x=as.logical(snp.data$x_N),
dims = c(snp.param$num.seqs, snp.param$num.snps),
dimnames = list(seq.names, snp.param$pos))
snp.data = NULL

return(list(snp.matrix_A=Matrix::t(snp.matrix_A), snp.matrix_C=Matrix::t(snp.matrix_C),
snp.matrix_G=Matrix::t(snp.matrix_G), snp.matrix_T=Matrix::t(snp.matrix_T),
snp.matrix_N=Matrix::t(snp.matrix_N), g = snp.param$seq.length, nsnp = snp.param$num.snps,
nseq = snp.param$num.seqs, seq.names = seq.names, r = rowSums(uqe), uqe = uqe, POS = snp.param$pos))
}


#' extractSNPs
#'
#' Function to extract SNPs from a multi-fasta alignment.
#'
#' @importFrom Rcpp sourceCpp
#' @importFrom Matrix sparseMatrix t
#'
#' @param aln_path path to multi fasta SNP alignment
#' @param pos numeric vector containing positions in the original alignment
#' @param gap_freq sites with a gap frequency >gap_greq will be dropped (default = 0.15)
#' @param maf_freq sites with a minor allele frequency <maf_freq will be dropped (default = 0.01)
#' @param method specify the filtering method 'relaxed' or 'default' (default = 'default')
#'
#' @return R list with SNPs in sparse format and additional parameters
#' @useDynLib LDWeaver
#'
#' @examples
#' \dontrun{
#' aln_path <- system.file("extdata", "sample.aln", package = "LDWeaver")
#' snp.dat <- parseFastaAlignment(aln_path, gap_freq = 0.15, maf_freq = 0.01, method = "default")
#' }
#' @export
parse_fasta_SNP_alignment <- function(aln_path, pos, gap_freq = 0.15, maf_freq = 0.01, method = "default"){

## Update to accept a SNP alignment with a positions files (output from snp-sites or https://github.com/sudaraka88/FastaR)
# We also input pos <numeric vector> here 2023/10/12

# Check inputs
aln_path = normalizePath(aln_path) # C safety (~ character causes crash)
if(!file.exists(aln_path)) stop(paste("Can't locate file", aln_path))

if(method == "default") {
filter = 0
} else if(method == "relaxed") {
filter = 1
} else {
warning("Unkown filtering method, using default...")
filter = 0
}

# update .extractAlnParam to say we are working with a SNP alignment
snp.param <- .extractAlnParam(aln_path, filter, gap_freq, maf_freq)

if(snp.param$seq.length==-1) stop("Error! sequences are of different lengths!")
if(snp.param$num.seqs==0) stop("File does not contain any sequences!")
if(snp.param$num.snps==0) stop("File does not contain any SNPs")

# we should do a sanity check here to ensure POS can be extracted
if(length(pos) != snp.param$seq.length) stop("Error! Number of positions do not match the fasta sequence length")

# need to extract SNPs using the snp.param positions
snp.data <- .extractSNPs(aln_path, snp.param$num.seqs, snp.param$num.snps, snp.param$pos)

# now we can update snp.param$pos to real ones
snp.param$pos = as.integer(pos[snp.param$pos]) # to account for possible dropped snps

# seq.names <- gsub("^>","",snp.data$params$seq.names)
seq.names <- gsub("^>","",snp.data$seq.names); snp.data$seq.names = NULL
uqe = apply(snp.data$ACGTN_table>0, 1, function(x) as.numeric(x>0)); snp.data$ACGTN_table = NULL # uqe is not sensitive to SNP positions


snp.matrix_A <- Matrix::sparseMatrix(i=snp.data$i_A,
Expand Down Expand Up @@ -93,11 +183,10 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me
dimnames = list(seq.names, snp.param$pos))
snp.data = NULL


# cat(paste("Done in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s \n"))
# g is wrong, change to NULL

return(list(snp.matrix_A=Matrix::t(snp.matrix_A), snp.matrix_C=Matrix::t(snp.matrix_C),
snp.matrix_G=Matrix::t(snp.matrix_G), snp.matrix_T=Matrix::t(snp.matrix_T),
snp.matrix_N=Matrix::t(snp.matrix_N), g = snp.param$seq.length, nsnp = snp.param$num.snps,
snp.matrix_N=Matrix::t(snp.matrix_N), g = NULL, nsnp = snp.param$num.snps,
nseq = snp.param$num.seqs, seq.names = seq.names, r = rowSums(uqe), uqe = uqe, POS = snp.param$pos))
}
11 changes: 7 additions & 4 deletions R/parseGBK.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,29 +35,32 @@ parse_genbank_file = function(gbk_path, g = NULL, length_check = T){
# gbk = suppressWarnings(genbankr::import(gbk_path))
gbk = suppressWarnings(genbankr::readGenBank(gbk_path))
refseq = genbankr::getSeq(gbk)

if(length(refseq) != 1){
stop("The GBK file should contain the reference sequence!\n")
# return(-1)
}
# the length check is good to check if the alignment matches with the gbk, setting it to F will stop it
ref_g = length(refseq[[1]]) # length of the reference sequence

if(length_check){ # perform the length check
if(length(refseq[[1]]) != g){ # perform check
if(ref_g != g){ # perform check
stop("Genbank reference sequence length mismatches with the fasta alignment!\n")
# return(-1)
}

} else {
if(!is.null(g)){
if(length(refseq[[1]]) != g){
if(ref_g != g){
warning("Fasta length does not match the genbank reference sequence length!\n")
}
} else {
warning("Similarity between the genbank reference and fasta sequences NOT checked!\n")
warning("Similarity between the genbank reference and fasta sequences NOT checked, ignore if <pos> was provided...\n")
}
}
cat(paste("Successfully read gbk file:", gbk_path, "in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s\n"))

# genbankr::seqinfo(gbk)
return(gbk)
return(list(gbk = gbk,
ref_g = ref_g))
}
Loading

0 comments on commit 702a8bf

Please sign in to comment.