Dev (#1)

* add snp-only aln support * update_readme * update .gitignore' ' '' * update workflow * update gitignore * .gitignore updated * update gitignore * update gitignore --------- Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-132-182.uniaccess.unimelb.edu.au>
Sudaraka88 · Oct 17, 2023 · 702a8bf · 702a8bf
1 parent 334fc1b
commit 702a8bf
Show file tree

Hide file tree

Showing 14 changed files with 226 additions and 222 deletions.
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
@@ -2,9 +2,9 @@
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
-    branches: [main, master]
+    branches: [main, master, dev]
   pull_request:
-    branches: [main, master]
+    branches: [main, master, dev]
   schedule:
     - cron:  '1 1 1 * *'
 

diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,5 @@
 *.so
 *.o
 .DS_Store
+*.Rmd
+Makevars
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: LDWeaver
 Type: Package
 Title:Genomewide Epistasis Analysis on Bacteria
-Version: 1.1.1
+Version: 1.2.0
 Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre"))
 Maintainer: Sudaraka Mallawaarachchi <smallawaarachchi@gmail.com>
 Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.

diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ export(estimate_variation_in_CDS)
 export(generate_Links_SNPS_fasta)
 export(genomewide_LDMap)
 export(make_gwes_plots)
+export(parse_fasta_SNP_alignment)
 export(parse_fasta_alignment)
 export(parse_genbank_file)
 export(parse_gff_file)

diff --git a/R/BacGWES.R b/R/BacGWES.R
@@ -7,6 +7,8 @@
 #'
 #' @param dset name of the dataset, all outputs will be saved to the folder <dset>
 #' @param aln_path path to the multi fasta alignment
+#' @param aln_has_all_bases specify whether the alignment has all bases in the reference genome (default = T). For example, if it's a SNP only alignment, set to F and provide pos
+#' @param pos numeric vector of positions for each base in the alignment (default = NULL). Only required if sites are missing from the alignment (e.g. SNP alignment output from snp-sites or https://github.com/Sudaraka88/FastaR)
 #' @param gbk_path path to genbank annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
 #' @param gff3_path path to gff3 annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
 #' @param ref_fasta_path path to Reference fasta file. The file MUST be in fasta format and contain exactly one sequence! Required for gff3 annotations,
@@ -42,8 +44,9 @@
 #' sr_links_red = LDWeaver(dset = "efcm", aln_path = "<efcm_aln>", gbk_path = "<efcm.gbk>")
 #' }
 #' @export
-LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta_path = NULL, validate_ref_ann_lengths = T,
-                    snp_filt_method = "default", gap_freq = 0.15, maf_freq = 0.01, snpeff_jar_path = NULL, hdw_threshold = 0.1,
+LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
+                    ref_fasta_path = NULL, validate_ref_ann_lengths = T, snp_filt_method = "default",
+                    gap_freq = 0.15, maf_freq = 0.01, snpeff_jar_path = NULL, hdw_threshold = 0.1,
                     perform_SR_analysis_only = F, SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
                     max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
                     multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){
@@ -60,11 +63,8 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
 
   #TODO: Add the option to provide genbank file without reference sequence
   #TODO: Count through blocks and automate the displayed BLOCK NUMBER
-  #TODO: Add the option to give spydrpick links directly to the pipeline instead of performing LR analysis
 
-  # # Welcome message
-  # timestamp()
-  # cat(paste("\n\n Performing GWES analysis on:", dset, "\n\n"))
+  # # Welcome message # #
 
   # Sanity checks
   # annotations
@@ -79,6 +79,16 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
     order_links = T # sr_links will be ordered and saved without annotations
   }
 
+  # alignment (now with SNP-only alignment support 2023/10/12)
+  if(aln_has_all_bases == F){ # snp-only alignment, POS must be provided
+    # sanity checks
+    if(is.null(pos)) stop("A numeric vector of 'positions' <pos> must be provided if aln_has_all_bases = F")
+    if(!is.numeric(pos)) stop("Provided pos must be numeric!")
+    if(any(duplicated(pos))) stop("Provided pos contains duplicates!")
+  } else { # This is a full alignment, pos must be NULL
+    if(!is.null(pos)) stop("pos cannot be provided for alignments with all bases! Depending on the use case, either set pos = NULL or aln_has_all_bases = T")
+  }
+
   # multicore
   if(multicore == T) { # multicore requested (default)
     max_n_cores = parallel::detectCores()
@@ -131,6 +141,10 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
     max_blk_sz = 10000
   }
 
+  if(aln_has_all_bases == F){ # For snp-only alignments, reference and alignment length checks will fail, stop checking
+    validate_ref_ann_lengths = F
+  }
+
   # normalise_input_paths
   aln_path = normalizePath(aln_path)
   if(!is.null(gbk_path)) gbk_path = normalizePath(gbk_path)
@@ -207,12 +221,21 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
   if(!file.exists(ACGTN_snp_path)) {
     t0 = Sys.time()
     cat(paste("Parsing Alignment:", aln_path ,"\n"))
-    snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
-    if(save_additional_outputs){
-      cat("Step 5: Savings snp.dat...")
-      saveRDS(snp.dat, ACGTN_snp_path)
+
+    # Adding support for SNP-only alignments
+    if(aln_has_all_bases == T){
+      snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
+      if(save_additional_outputs){
+        cat("Step 5: Savings snp.dat...")
+        saveRDS(snp.dat, ACGTN_snp_path)
+      }
+    } else {
+      snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
+      # Note that snp.dat$g = NULL (we cannot measure this, need to get it from the genbank file)
+      # we cannot save snp.dat here due to absent snp.dat$g, moving downstream (block 2)
     }
 
+
     cat(paste("BLOCK 1 complete in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s \n"))
   }else{
     cat("Loading previous snp matrix \n")
@@ -225,7 +248,9 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
     gff = NULL # alternative set to NULL
     if(!file.exists(parsed_gbk_path)) {
       cat(paste("Reading the GBK file, validate_length_check = ", validate_ref_ann_lengths, "\n"))
-      gbk = LDWeaver::parse_genbank_file(gbk_path = gbk_path, g = snp.dat$g, length_check = validate_ref_ann_lengths) # will return 1 if fails
+      gbk_ = LDWeaver::parse_genbank_file(gbk_path = gbk_path, g = snp.dat$g, length_check = validate_ref_ann_lengths) # will return 1 if fails
+      gbk = gbk_$gbk # gbk is coming in gbk_$gbk now
+      # For snp-only alignments, g = snp.dat$g = NULL and validate_ref_ann_lengths = F, won't be an issue...
       if(save_additional_outputs){
         saveRDS(gbk, parsed_gbk_path)
       }
@@ -249,6 +274,22 @@ LDWeaver = function(dset, aln_path, gbk_path = NULL, gff3_path = NULL, ref_fasta
     }
   }
 
+  # This is a patch to add snp.dat$g back in case this is a SNP only alignment
+  if(is.null(snp.dat$g)){
+    if(!is.null(gbk_path)){ # input reference is genbank format
+      snp.dat$g = gbk_$ref_g
+      cat("Extracted ref genome length", snp.dat$g, "from genbank...")
+    }
+    if(!is.null(gff3_path)){ # input reference is gff3 format
+      snp.dat$g = gff$g
+    }
+    if(save_additional_outputs){
+      cat("saving snp.dat...\n")
+      saveRDS(snp.dat, ACGTN_snp_path)
+      #### This saved file will have snp.dat$g = NULL ### TODO: Ensure no issues downstream
+    }
+  }
+
   # BLK3
   cat("\n\n #################### BLOCK 3 #################### \n\n")
   if(!file.exists(cds_var_path)) {

diff --git a/R/computePairwiseMI.R b/R/computePairwiseMI.R
@@ -70,8 +70,10 @@ perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path =
   if(!perform_SR_analysis_only){
     # estimate the number of lr_links to decide on a discard threshold using  all or 1% of SNPs
     snp_subset = min(snp.dat$nsnp, round(snp.dat$nsnp*0.1))
+    set.seed(1988) # set a seed, so that the same subset of positions get picked below (not absolutely needed, but good for reproducibility)
     lr_link_count = sapply(snp.dat$POS[sample(snp.dat$nsnp, snp_subset)], function(x) sum((0.5*snp.dat$g - abs((x - snp.dat$POS)%%snp.dat$g  - 0.5*snp.dat$g))>sr_dist))
     lr_links_approx = sum(lr_link_count)/snp_subset*snp.dat$nsnp/2
+
   } else {
     lr_links_approx = NULL
   }
@@ -81,7 +83,7 @@ perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path =
     cat(paste("Block", i, "of", nblcks, "..."))
     from_ = MI_cmp_blks$from_s[i]:MI_cmp_blks$from_e[i]
     to_ = MI_cmp_blks$to_s[i]:MI_cmp_blks$to_e[i]
-
+    # cat("**debug** snp_subset =",snp_subset, " lr_links_approx=",lr_links_approx," lr_retain_links=",lr_retain_links, " **debug**\n")
     sr_links = perform_MI_computation_ACGTN(snp.dat = snp.dat, neff = neff, hsq = hsq, cds_var = cds_var,
                                             lr_save_path = lr_save_path, from = from_, sr_dist = sr_dist,
                                             lr_retain_links = lr_retain_links, to = to_, ncores = ncores, sr_links = sr_links,
@@ -294,12 +296,14 @@ perform_MI_computation_ACGTN = function(snp.dat, from, to, neff, hsq, cds_var, l
   if(lr_present & !perform_SR_analysis_only){ # if only the SR analysis is requested, discard this section
     # disc_thresh = stats::quantile(MI_df_lr$MI, lr_retain_level)
     n_lr_links = nrow(MI_df_lr)
-
     # Following prob works well for most cases where n_lr_links_total >> lr_retain_links (i.e. save 1M out of 1B),
     # set the maximum to 0 (i.e. no negative values), should we set the max to 1 if prob > 1
     prob = max(c(0, (1 - ((lr_retain_links * (n_lr_links / lr_links_approx)) / n_lr_links))))
     # disc_thresh = Rfast2::Quantile(MI_df_lr$MI, probs = prob)
     disc_thresh = stats::quantile(MI_df_lr$MI, probs = prob)
+
+    # cat("**debug** n_lr_links =",n_lr_links," prob=",prob, " disc_thresh=",disc_thresh, " **debug**\n")
+
     len_filt = MI_df_lr$MI >= disc_thresh
     cat(paste("... Adding ", sum(len_filt) ," LR links with MI>",round(disc_thresh, 3) ," to file ...", sep = "")) # debug
     if(!all(len_filt == FALSE)){

diff --git a/R/EstimateCDSDiversity.R → R/estimateCDSDiversity.R b/R/EstimateCDSDiversity.R → R/estimateCDSDiversity.R
diff --git a/R/extractSNPs.R b/R/extractSNPs.R
@@ -20,6 +20,7 @@
 #' }
 #' @export
 parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, method = "default"){
+
   # Check inputs
   aln_path = normalizePath(aln_path) # C safety (~ character causes crash)
   if(!file.exists(aln_path)) stop(paste("Can't locate file", aln_path))
@@ -33,29 +34,118 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me
     filter = 0
   }
 
-  # t0 = Sys.time() # This timer is not accurate, return function has computations to save memory, time externally instead
-
-  # snp.data <- getSNP_Sites(aln_path, filter, gap_freq, maf_freq)
-  # snp.data <- .getACGTN_Sites(aln_path, filter, gap_freq, maf_freq)
+  # extract align parameters and filtering...
   snp.param <- .extractAlnParam(aln_path, filter, gap_freq, maf_freq)
 
-  # g = snp.data$params$seq.length
-  # nseq = snp.data$params$num.seqs
-  # g = snp.param$seq.length; snp.param$seq.length = NULL
-  # nseq = snp.param$num.seqs; snp.param$num.seqs = NULL
-  # nsnp = snp.param$num.snps; snp.param$num.snps = NULL
-  # print(paste("Done in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s"))
-
   if(snp.param$seq.length==-1) stop("Error! sequences are of different lengths!")
   if(snp.param$num.seqs==0) stop("File does not contain any sequences!")
   if(snp.param$num.snps==0) stop("File does not contain any SNPs")
 
   snp.data <- .extractSNPs(aln_path, snp.param$num.seqs, snp.param$num.snps, snp.param$pos)
-  # seq.names <-  gsub("^>","",snp.data$params$seq.names)
   seq.names <- gsub("^>","",snp.data$seq.names); snp.data$seq.names = NULL
   uqe = apply(snp.data$ACGTN_table>0, 1, function(x) as.numeric(x>0)); snp.data$ACGTN_table = NULL
 
-  # nsnp = snp.data$params$num.snps
+  snp.matrix_A <- Matrix::sparseMatrix(i=snp.data$i_A,
+                                       j=snp.data$j_A,
+                                       x=as.logical(snp.data$x_A),
+                                       dims = c(snp.param$num.seqs, snp.param$num.snps),
+                                       dimnames = list(seq.names, snp.param$pos))
+  snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL
+
+  snp.matrix_C <- Matrix::sparseMatrix(i=snp.data$i_C,
+                                       j=snp.data$j_C,
+                                       x=as.logical(snp.data$x_C),
+                                       dims = c(snp.param$num.seqs, snp.param$num.snps),
+                                       dimnames = list(seq.names, snp.param$pos))
+  snp.data$i_C = snp.data$j_C = snp.data$x_C = NULL
+
+  snp.matrix_G <- Matrix::sparseMatrix(i=snp.data$i_G,
+                                       j=snp.data$j_G,
+                                       x=as.logical(snp.data$x_G),
+                                       dims = c(snp.param$num.seqs, snp.param$num.snps),
+                                       dimnames = list(seq.names, snp.param$pos))
+  snp.data$i_G = snp.data$j_G = snp.data$x_G = NULL
+
+  snp.matrix_T <- Matrix::sparseMatrix(i=snp.data$i_T,
+                                       j=snp.data$j_T,
+                                       x=as.logical(snp.data$x_T),
+                                       dims = c(snp.param$num.seqs, snp.param$num.snps),
+                                       dimnames = list(seq.names, snp.param$pos))
+  snp.data$i_T = snp.data$j_T = snp.data$x_T = NULL
+
+  snp.matrix_N <- Matrix::sparseMatrix(i=snp.data$i_N,
+                                       j=snp.data$j_N,
+                                       x=as.logical(snp.data$x_N),
+                                       dims = c(snp.param$num.seqs, snp.param$num.snps),
+                                       dimnames = list(seq.names, snp.param$pos))
+  snp.data = NULL
+
+  return(list(snp.matrix_A=Matrix::t(snp.matrix_A), snp.matrix_C=Matrix::t(snp.matrix_C),
+              snp.matrix_G=Matrix::t(snp.matrix_G), snp.matrix_T=Matrix::t(snp.matrix_T),
+              snp.matrix_N=Matrix::t(snp.matrix_N), g = snp.param$seq.length, nsnp = snp.param$num.snps,
+              nseq = snp.param$num.seqs, seq.names = seq.names, r = rowSums(uqe), uqe = uqe, POS = snp.param$pos))
+}
+
+
+#' extractSNPs
+#'
+#' Function to extract SNPs from a multi-fasta alignment.
+#'
+#' @importFrom Rcpp sourceCpp
+#' @importFrom Matrix sparseMatrix t
+#'
+#' @param aln_path path to multi fasta SNP alignment
+#' @param pos numeric vector containing positions in the original alignment
+#' @param gap_freq sites with a gap frequency >gap_greq will be dropped (default = 0.15)
+#' @param maf_freq sites with a minor allele frequency <maf_freq will be dropped (default = 0.01)
+#' @param method specify the filtering method 'relaxed' or 'default' (default = 'default')
+#'
+#' @return R list with SNPs in sparse format and additional parameters
+#' @useDynLib LDWeaver
+#'
+#' @examples
+#' \dontrun{
+#' aln_path <- system.file("extdata", "sample.aln", package = "LDWeaver")
+#' snp.dat <- parseFastaAlignment(aln_path, gap_freq = 0.15, maf_freq = 0.01, method = "default")
+#' }
+#' @export
+parse_fasta_SNP_alignment <- function(aln_path, pos, gap_freq = 0.15, maf_freq = 0.01, method = "default"){
+
+  ## Update to accept a SNP alignment with a positions files (output from snp-sites or https://github.com/sudaraka88/FastaR)
+  # We also input pos <numeric vector> here 2023/10/12
+
+  # Check inputs
+  aln_path = normalizePath(aln_path) # C safety (~ character causes crash)
+  if(!file.exists(aln_path)) stop(paste("Can't locate file", aln_path))
+
+  if(method == "default") {
+    filter = 0
+  } else if(method == "relaxed") {
+    filter = 1
+  } else {
+    warning("Unkown filtering method, using default...")
+    filter = 0
+  }
+
+  # update .extractAlnParam to say we are working with a SNP alignment
+  snp.param <- .extractAlnParam(aln_path, filter, gap_freq, maf_freq)
+
+  if(snp.param$seq.length==-1) stop("Error! sequences are of different lengths!")
+  if(snp.param$num.seqs==0) stop("File does not contain any sequences!")
+  if(snp.param$num.snps==0) stop("File does not contain any SNPs")
+
+  # we should do a sanity check here to ensure POS can be extracted
+  if(length(pos) != snp.param$seq.length) stop("Error! Number of positions do not match the fasta sequence length")
+
+  # need to extract SNPs using the snp.param positions
+  snp.data <- .extractSNPs(aln_path, snp.param$num.seqs, snp.param$num.snps, snp.param$pos)
+
+  # now we can update snp.param$pos to real ones
+  snp.param$pos = as.integer(pos[snp.param$pos]) # to account for possible dropped snps
+
+  # seq.names <-  gsub("^>","",snp.data$params$seq.names)
+  seq.names <- gsub("^>","",snp.data$seq.names); snp.data$seq.names = NULL
+  uqe = apply(snp.data$ACGTN_table>0, 1, function(x) as.numeric(x>0)); snp.data$ACGTN_table = NULL # uqe is not sensitive to SNP positions
 
 
   snp.matrix_A <- Matrix::sparseMatrix(i=snp.data$i_A,
@@ -93,11 +183,10 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me
                                        dimnames = list(seq.names, snp.param$pos))
   snp.data = NULL
 
-
-  # cat(paste("Done in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s \n"))
+  # g is wrong, change to NULL
 
   return(list(snp.matrix_A=Matrix::t(snp.matrix_A), snp.matrix_C=Matrix::t(snp.matrix_C),
               snp.matrix_G=Matrix::t(snp.matrix_G), snp.matrix_T=Matrix::t(snp.matrix_T),
-              snp.matrix_N=Matrix::t(snp.matrix_N), g = snp.param$seq.length, nsnp = snp.param$num.snps,
+              snp.matrix_N=Matrix::t(snp.matrix_N), g = NULL, nsnp = snp.param$num.snps,
               nseq = snp.param$num.seqs, seq.names = seq.names, r = rowSums(uqe), uqe = uqe, POS = snp.param$pos))
 }
diff --git a/R/parseGBK.R b/R/parseGBK.R
@@ -35,29 +35,32 @@ parse_genbank_file = function(gbk_path, g = NULL, length_check = T){
   # gbk = suppressWarnings(genbankr::import(gbk_path))
   gbk = suppressWarnings(genbankr::readGenBank(gbk_path))
   refseq = genbankr::getSeq(gbk)
+
   if(length(refseq) != 1){
     stop("The GBK file should contain the reference sequence!\n")
     # return(-1)
   }
   # the length check is good to check if the alignment matches with the gbk, setting it to F will stop it
+  ref_g = length(refseq[[1]]) # length of the reference sequence
 
   if(length_check){ # perform the length check
-    if(length(refseq[[1]]) != g){ # perform check
+    if(ref_g != g){ # perform check
       stop("Genbank reference sequence length mismatches with the fasta alignment!\n")
       # return(-1)
     }
 
   } else {
     if(!is.null(g)){
-      if(length(refseq[[1]]) != g){
+      if(ref_g != g){
         warning("Fasta length does not match the genbank reference sequence length!\n")
       }
     } else {
-      warning("Similarity between the genbank reference and fasta sequences NOT checked!\n")
+      warning("Similarity between the genbank reference and fasta sequences NOT checked, ignore if <pos> was provided...\n")
     }
   }
   cat(paste("Successfully read gbk file:", gbk_path, "in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s\n"))
 
   # genbankr::seqinfo(gbk)
-  return(gbk)
+  return(list(gbk = gbk,
+              ref_g = ref_g))
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,5 @@ @@
     *.so
     *.o
     .DS_Store
+    *.Rmd
+    Makevars