diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index b1e4b4e..db23b84 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -22,7 +22,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - #- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + # - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel-1'} @@ -49,3 +49,4 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true + error-on: '"error"' diff --git a/DESCRIPTION b/DESCRIPTION index c6ba8d7..f05d12c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: LDWeaver Type: Package Title:Genomewide Epistasis Analysis on Bacteria -Version: 1.3.1 +Version: 1.4 Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre")) Maintainer: Sudaraka Mallawaarachchi Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria. @@ -12,12 +12,13 @@ biocViews: Software Depends: R (>= 4.0.0), Imports: ape, + Biostrings, chromoMap, data.table, dplyr, fitdistrplus, - genbankr, GenomicRanges, + GenomeInfoDb, ggnewscale, ggplot2, ggtree, @@ -26,6 +27,7 @@ Imports: heatmap3, htmlwidgets, igraph, + IRanges, Matrix, MatrixExtra, methods, @@ -35,8 +37,10 @@ Imports: RColorBrewer, Rcpp, RcppArmadillo, + S4Vectors, stats, - utils + utils, + VariantAnnotation LinkingTo: Rcpp, RcppArmadillo RoxygenNote: 7.2.3 URL: https://github.com/Sudaraka88/LDWeaver diff --git a/NAMESPACE b/NAMESPACE index b7f18de..2f5546c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(GBAccession) export(LDWeaver) export(analyse_long_range_links) export(cleanup) @@ -24,9 +25,28 @@ export(read_TopHits) export(snpdat_to_fa) export(view_tree) export(write_output_for_gwes_explorer) +exportClasses(GBAccession) +exportClasses(GBKFile) +exportClasses(GenBankFile) +exportClasses(GenBankRecord) +import(GenomicRanges) +importClassesFrom(Biostrings,XStringSet) +importClassesFrom(GenomicRanges,CompressedGRangesList) +importClassesFrom(GenomicRanges,GRangesList) +importFrom(Biostrings,AAString) +importFrom(Biostrings,AAStringSet) +importFrom(Biostrings,DNAString) +importFrom(Biostrings,extractAt) +importFrom(GenomeInfoDb,Seqinfo) +importFrom(GenomeInfoDb,seqinfo) +importFrom(GenomeInfoDb,seqlevels) +importFrom(GenomeInfoDb,seqnames) importFrom(GenomicRanges,end) importFrom(GenomicRanges,start) importFrom(GenomicRanges,width) +importFrom(IRanges,IRanges) +importFrom(IRanges,heads) +importFrom(IRanges,ranges) importFrom(Matrix,colSums) importFrom(Matrix,rowSums) importFrom(Matrix,sparseMatrix) @@ -38,6 +58,11 @@ importFrom(MatrixExtra,tcrossprod) importFrom(RColorBrewer,brewer.pal) importFrom(Rcpp,sourceCpp) importFrom(RcppArmadillo,fastLm) +importFrom(S4Vectors,DataFrame) +importFrom(S4Vectors,queryHits) +importFrom(S4Vectors,subjectHits) +importFrom(VariantAnnotation,VRanges) +importFrom(VariantAnnotation,makeVRangesFromGRanges) importFrom(ape,read.gff) importFrom(ape,read.tree) importFrom(chromoMap,chromoMap) @@ -49,9 +74,6 @@ importFrom(data.table,setattr) importFrom(dplyr,`%>%`) importFrom(dplyr,summarise) importFrom(fitdistrplus,fitdist) -importFrom(genbankr,cds) -importFrom(genbankr,getSeq) -importFrom(genbankr,readGenBank) importFrom(ggnewscale,new_scale_fill) importFrom(ggplot2,aes) importFrom(ggplot2,facet_wrap) @@ -76,6 +98,8 @@ importFrom(htmlwidgets,saveWidget) 
importFrom(igraph,graph_from_edgelist) importFrom(igraph,set.edge.attribute) importFrom(methods,as) +importFrom(methods,is) +importFrom(methods,new) importFrom(parallel,detectCores) importFrom(phytools,midpoint.root) importFrom(plyr,.) @@ -88,8 +112,10 @@ importFrom(stats,hclust) importFrom(stats,kmeans) importFrom(stats,pbeta) importFrom(stats,quantile) +importFrom(utils,packageVersion) importFrom(utils,read.table) importFrom(utils,setTxtProgressBar) +importFrom(utils,stack) importFrom(utils,timestamp) importFrom(utils,txtProgressBar) importFrom(utils,write.table) diff --git a/R/BacGWES.R b/R/BacGWES.R index 03a42d6..9ffacc8 100644 --- a/R/BacGWES.R +++ b/R/BacGWES.R @@ -3,7 +3,7 @@ #' Function to run the LDWeaver pipeline #' #' @importFrom parallel detectCores -#' @importFrom utils timestamp +#' @importFrom utils timestamp packageVersion #' #' @param dset name of the dataset, all outputs will be saved to the folder #' @param aln_path path to the multi fasta alignment @@ -29,7 +29,8 @@ #' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame. This setting has no impact on the #' modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001), #' and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5 -#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10 +#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. Set NULL to skip tanglegram +#' @param write_gwesExplorer specify whether output for GWESExplorer is required (default = T) #' @param multicore specify whether to use parallel processing (default = T) #' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000 @@ -48,7 +49,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path gap_freq = 0.15, maf_freq = 0.01, hdw_threshold = 0.1, perform_SR_analysis_only = F, SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6, max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5, - multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){ + write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL, + save_additional_outputs = F){ # Build blocks # BLK1: Extract SNPs and create sparse Mx from MSA (fasta) # BLK2: Parse GBK or GFF+REF @@ -62,8 +64,11 @@ # BLK10: GWESExplorer (depends: GWESExplorer) # BLK11: Cleanup + #TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered) #TODO: Add the option to provide genbank file without reference sequence #TODO: Count through blocks and automate the displayed BLOCK NUMBER + #TODO: genbankr is being dropped from the newest bioconductor, add alternative (https://github.com/gmbecker/genbankr) + #TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?
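#EXAMPLE: illustrative call showing the two new arguments added in this patch; the dataset name and file paths below are placeholders, not files shipped with the package:
# LDWeaver(dset = "my_dset", aln_path = "msa.fasta", gbk_path = "ref.gbk",
#          tanglegram_break_segments = NULL, # NULL now skips tanglegram generation
#          write_gwesExplorer = FALSE)       # FALSE skips writing GWESExplorer output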
#NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc. # # Welcome message # # @@ -138,11 +143,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path srp_cutoff = 3 } - if(tanglegram_break_segments < 0 | tanglegram_break_segments > 10) { - warning(paste("Unable to use the provided value for , using", 5)) - tanglegram_break_segments = 5 + if(!is.null(tanglegram_break_segments)){ + if(tanglegram_break_segments < 0 | tanglegram_break_segments > 10) { + warning(paste("Unable to use the provided value for , using", 5)) + tanglegram_break_segments = 5 + } } - if(max_blk_sz < 1000 | max_blk_sz > 100000) { warning(paste("Unable to use the provided value for , using", 10000, "...!If this value is causing the function to crash, consider reducing!...")) max_blk_sz = 10000 @@ -162,6 +168,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path # setup paths if(!file.exists(dset)) dir.create(dset) # save everything in here + + # Save console output as a text file + info_file = file.path(dset, paste("LDW_run_",format(Sys.time(), "%Y%m%d%H%M%S"), ".txt", sep = "")) + suppressWarnings(sink(file= NULL)) + sink(info_file, split = T) + add_path = file.path(dset, "Additional_Outputs") # Additional Outputs if(save_additional_outputs) { if(!file.exists(add_path)) dir.create(file.path(dset, "Additional_Outputs")) @@ -203,7 +215,11 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path cat(paste("All outputs will be saved to:", normalizePath(dset), "\n")) cat(paste("\n *** Input paths *** \n\n")) cat(paste("* Alignment:", aln_path, "\n")) - cat(paste("* GenBank Annotation:", gbk_path, "\n")) + if(!is.null(gbk_path)) { + cat(paste("* GenBank Annotation:", gbk_path, "\n")) + cat(paste("* Parser built using genbankr source (https://github.com/gmbecker/genbankr) \n")) + } + if(!is.null(gff3_path)) cat(paste("* GFF3 Annotation:", gff3_path, "\n")) if(!is.null(snpeff_jar_path)) cat(paste("* SnpEff Annotations will be performed on short-range links. SnpEff path:", snpeff_jar_path, "\n")) cat(paste("\n *** Parameters *** \n\n")) @@ -356,6 +372,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path if(nrow(sr_links) == 0){ + suppressWarnings(sink(file= NULL)) ### output info to text file stop("No potentially important sr_links were identified! 
Cannot continue analysis...") } @@ -383,25 +400,30 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path tophits = LDWeaver::read_TopHits(top_hits_path = tophits_path) } - # Additional paths if annotations are requested - # tanglegram - tanglegram_path = file.path(dset, "SR_Tanglegram") - if(!file.exists(tanglegram_path)) dir.create(tanglegram_path) - # GWESExplorer - gwesexplorer_path = file.path(dset, "SR_GWESExplorer") - if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path) - # NetworkPlot - netplot_path = file.path(dset, "SR_network_plot.png") # BLK8 - cat("\n\n #################### BLOCK 9 #################### \n\n") - LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments) - + if(!is.null(tanglegram_break_segments)){ + # tanglegram + tanglegram_path = file.path(dset, "SR_Tanglegram") + if(!file.exists(tanglegram_path)) dir.create(tanglegram_path) + cat("\n\n #################### BLOCK 9 #################### \n\n") + LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments) + } # BLK9 - cat("\n\n #################### BLOCK 10 #################### \n\n") - LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path) + if(write_gwesExplorer){ + # GWESExplorer + gwesexplorer_path = file.path(dset, "SR_GWESExplorer") + if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path) + cat("\n\n #################### BLOCK 10 #################### \n\n") + LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path) + } + # BLK10 + # Additional paths if annotations are requested + # NetworkPlot + netplot_path = file.path(dset, "SR_network_plot.png") + cat("\n\n #################### BLOCK 11 #################### \n\n") LDWeaver::create_network(tophits = tophits, netplot_path = netplot_path, plot_title = paste("Networks in short-range tophits for", dset)) diff --git a/R/createTanglegram.R b/R/createTanglegram.R index e460f93..f055ea6 100644 --- a/R/createTanglegram.R +++ b/R/createTanglegram.R @@ -4,7 +4,6 @@ #' #' @importFrom htmlwidgets saveWidget #' @importFrom stats cutree hclust dist -#' @importFrom genbankr cds #' @importFrom plyr . ddply #' @importFrom chromoMap chromoMap #' diff --git a/R/estimateCDSDiversity.R b/R/estimateCDSDiversity.R index 1dc68b0..db73094 100644 --- a/R/estimateCDSDiversity.R +++ b/R/estimateCDSDiversity.R @@ -3,7 +3,6 @@ #' Function to estimate the variation within each coding region, the output from this function #' can be used to segment the genome into diversity-based clusters. 
#' -@importFrom genbankr cds getSeq #' @importFrom GenomicRanges start width end #' @importFrom Matrix rowSums colSums #' @importFrom data.table data.table setattr %between% .I @@ -25,6 +24,9 @@ #' } #' @export estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL){ + ## NOTE: genbankr deprecation, removed the following import + # importFrom genbankr cds getSeq + # This method is only approximate, but much MUCH faster and easier on resources # TODO: Include the higher accuracy function t0 = Sys.time() @@ -34,12 +36,14 @@ estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, nu # extract the information we need if(!is.null(gbk)){ - cds_reg = genbankr::cds(gbk) + # cds_reg = genbankr::cds(gbk) # no longer exporting this function after genbankr deprecation + cds_reg = gbk@cds starts = GenomicRanges::start(cds_reg) widths = GenomicRanges::width(cds_reg) ends = GenomicRanges::end(cds_reg) # convert ref to a CharacterVector - ref = unlist(unname(strsplit(as.character(genbankr::getSeq(gbk)), '')))[snp.dat$POS] + # ref = unlist(unname(strsplit(as.character(genbankr::getSeq(gbk)), '')))[snp.dat$POS] # no longer exporting this function after genbankr deprecation + ref = unlist(unname(strsplit(as.character(gbk@sequence), '')))[snp.dat$POS] } else if(!is.null(gff)){ gff_cds = gff$gff[tolower(gff$gff$type) == "cds", ] starts = gff_cds$start diff --git a/R/io_functions.R b/R/io_functions.R index fc814b0..050bd6d 100644 --- a/R/io_functions.R +++ b/R/io_functions.R @@ -250,7 +250,7 @@ cleanup = function(dset, delete_after_moving = F){ mv_success = c(mv_success, idx) } - idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("snp_ACGTN.rds", files)) + idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("parsed_gff3.rds", files), grep("snp_ACGTN.rds", files)) if(length(idx) > 0){ fldr = file.path(dset, "Additional_Outputs") cleanup_support(files = file.path(dset, files[idx]), fldr) @@ -298,7 +298,7 @@ cleanup = function(dset, delete_after_moving = F){ } #### Temp folder #### - idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files)) + idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files), grep("LDW_run_*", files)) if(length(idx) > 0){ fldr = file.path(dset, "Temp") cleanup_support(files = file.path(dset, files[idx]), fldr) diff --git a/R/lr_analyser.R b/R/lr_analyser.R index 3641b6d..90f1546 100644 --- a/R/lr_analyser.R +++ b/R/lr_analyser.R @@ -44,16 +44,14 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli # NOTE: spydrpick does not add clusters, add them from paint (requires cds_var) if(SnpEff_Annotate == T) { - if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided. - To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL - if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations. - To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL - if(is.null(snpeff_jar_path)) stop("You must specify for annotations.
To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") - if(!file.exists(snpeff_jar_path)) stop(paste(" not found at:", snpeff_jar_path, "please check the path provided")) - if(is.null(snpeff_jar_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") + if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL + if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL + # if(is.null(snpeff_jar_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") + # if(!file.exists(snpeff_jar_path)) stop(paste(" not found at:", snpeff_jar_path, "please check the path provided")) + # if(is.null(snpeff_jar_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # if(is.null(gbk_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") - if(is.null(snp.dat)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") - if(is.null(cds_var)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") + if(is.null(snp.dat)) stop("You must provide snp.dat to perform annotations.") + if(is.null(cds_var)) stop("You must provide cds_var to perform annotations.") } # lr_links_path = "~/Desktop/LDWeaver_RUN/maela/lr_links.tsv" diff --git a/R/parseGBK.R b/R/parseGBK.R index e7067ed..5802e3a 100644 --- a/R/parseGBK.R +++ b/R/parseGBK.R @@ -1,9 +1,14 @@ #' parse_genbank_file #' -#' Function to parse the genbank file for the fasta alignment. -#' -#' @importFrom genbankr readGenBank getSeq -#' +#' Function to parse the genbank file for the fasta alignment. Parser built using genbankr source available here: https://github.com/gmbecker/genbankr.
+#' @importFrom methods is new +#' @importFrom GenomeInfoDb seqnames seqlevels seqinfo Seqinfo +#' @importFrom Biostrings DNAString AAString extractAt AAStringSet +#' @importFrom IRanges IRanges ranges heads +#' @importFrom S4Vectors queryHits subjectHits DataFrame +#' @importFrom VariantAnnotation VRanges makeVRangesFromGRanges +#' @importFrom utils stack +#' @import GenomicRanges #' @param gbk_path path to genbank file #' @param g sequence length, available from the LDWeaver::parse_fasta_alignment() output (default = NULL), #' required if @@ -20,6 +25,18 @@ #' } #' @export parse_genbank_file = function(gbk_path, g = NULL, length_check = T){ + + # TODO: Fix later + #### I can't get this function to work by only importing the required parts from GenomicRanges: + # #'importFrom GenomicRanges GRanges mcols findOverlaps GRangesList resize + # #'importMethodsFrom GenomicRanges range + # I'm importing the whole package instead + # Fails at txslst = range(spl, ignore.strand = F), it seems the S4 method range() does not get exported + + # TODO: This parser is almost identical to genbankr and can probably be improved for speed and reliability. + # More imp. we only need to extract CDS start/stop positions and the fasta reference sequence for LDWeaver: should be simplified eventually... + ########################################## + t0 = Sys.time() # Check inputs if(length_check){ # perform the length check @@ -32,9 +49,11 @@ parse_genbank_file = function(gbk_path, g = NULL, length_check = T){ if(!file.exists(gbk_path)) stop(paste("Can't locate file", gbk_path)) - # gbk = suppressWarnings(genbankr::import(gbk_path)) - gbk = suppressWarnings(genbankr::readGenBank(gbk_path)) - refseq = genbankr::getSeq(gbk) + ### INSERT CODE HERE + gbk = suppressWarnings(readGenBank2(gbk_path)) + refseq = gbk@sequence + # refseq = genbankr::getSeq(gbk) + if(length(refseq) != 1){ stop("The GBK file should contain the reference sequence!\n") @@ -63,4 +82,997 @@ parse_genbank_file = function(gbk_path, g = NULL, length_check = T){ # genbankr::seqinfo(gbk) return(list(gbk = gbk, ref_g = ref_g)) + +} + +### readGenBank from genbankr +readGenBank2 = function(file, text = readLines(file), partial = NA) { + + verbose = FALSE + # I am skipping this because we test for file validity separately + # if(missing(text)) { + # if(missing(file)) + # stop("One of text or file must be specified.") + # if(is(file, "GBAccession")) + # text = .getGBfromNuccore(file) + # else if (!file.exists(file)) + # stop("file does not appear to an existing file or a GBAccession object. Valid file or text argument is required.") + # } + + record_brk_re = "^[[:space:]]*//[[:space:]]*$" ## the spaces probably aren't needed here... 
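  # A GenBank flat file terminates every record with a line containing only "//";
  # when more than one terminator is found below, the text is split into one
  # character vector per record before parsing.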
+ + + if(methods::is(text, "character") && + length(recbrks <- grep(record_brk_re, text)) > 1) { + sq = seq_along(text) + indfac = cut(sq, breaks = recbrks[-length(recbrks)]) + text = split(text, indfac) + stopifnot(all(sapply(text, + function(x) identical(substr(x[1], 1, 5), "LOCUS")))) + } + # if(is(text, "list")) + # return(lapply(text, function(txt) readGenBank(text = txt, partial = partial, ret.seq= ret.seq, verbose = verbose))) + ## we always read in sequence because it is required for variant annotations + ## we throw it away after if the user set ret.seq=FALSE + prsed = parseGenBank(text = text, partial = partial, verbose = verbose, + ret.seq = TRUE) + ret = make_gbrecord(rawgbk = prsed, verbose = verbose) + # if(!ret.seq) sequence(ret) = NULL + ret +} + + +parseGenBank = function(file, text = readLines(file), partial = NA, + verbose = FALSE, + ret.anno = TRUE, + ret.seq = TRUE) { + + prime_field_re = "^[[:upper:]]+[[:space:]]+" # changed to local scope + + if(!ret.anno && !ret.seq) + stop("Must return at least one of annotations or sequence.") + bf = proc.time()["elapsed"] + if(missing(text) && !file.exists(file)) + stop("No text provided and file does not exist or was not specified. Either an existing file or text to parse must be provided.") + if(length(text) == 1) + text = fastwriteread(text) + + fldlines = grepl(prime_field_re, text) + fldfac = cumsum(fldlines) + fldnames = gsub("^([[:upper:]]+).*", "\\1", text[fldlines])[fldfac] + + spl = split(text, fldnames) + + resthang = list(LOCUS = readLocus(spl[["LOCUS"]])) + resthang[["FEATURES"]] = readFeatures(spl[["FEATURES"]], + source.only=!ret.anno, + partial = partial) + seqtype = .seqTypeFromLocus(resthang$LOCUS) + resthang$ORIGIN = if(ret.seq) + readOrigin(spl[["ORIGIN"]], + seqtype = seqtype) + else NULL + + if(ret.anno) { + resthang2 = mapply(function(field, lines, verbose) { + switch(field, + DEFINITION = readDefinition(lines), + ACCESSION = readAccession(lines), + VERSION = readVersions(lines), + KEYWORDS = readKeywords(lines), + SOURCE = readSource(lines), + ## don't read FEATURES, ORIGIN, or LOCUS because they are + ## already in resthang from above + NULL) + }, lines = spl, field = names(spl), SIMPLIFY=FALSE, verbose = verbose) + resthang2$FEATURES = resthang2$FEATURES[sapply(resthang2$FEATURES, + function(x) length(x)>0)] + resthang2 = resthang2[!names(resthang2) %in% names(resthang)] + resthang = c(resthang, resthang2) + } + ##DNAString to DNAStringSet + origin = resthang$ORIGIN + if(ret.seq && length(origin) > 0) { + typs = sapply(resthang$FEATURES, function(x) x$type[1]) + srcs = fill_stack_df(resthang$FEATURES[typs == "source"]) + ## dss = DNAStringSet(lapply(GRanges(ranges(srcs), function(x) origin[x]))) + dss = Biostrings::extractAt(origin, IRanges::ranges(srcs)) + names(dss) = as.character(GenomeInfoDb::seqnames(srcs)) + if(!ret.anno) + resthang = dss + else + resthang$ORIGIN = dss + } else if (!ret.anno) { ##implies ret.seq is TRUE + stop("Asked for only sequence (ret.anno=FALSE) from a file with no sequence information") + } + af = proc.time()["elapsed"] + if(verbose) + message("Done Parsing raw GenBank file text. 
[ ", af-bf, " seconds ]") + resthang + } + +make_gbrecord = function(rawgbk, verbose = FALSE) { + bf = proc.time()["elapsed"] + feats = rawgbk$FEATURES + sq = rawgbk$ORIGIN + + typs = sapply(feats, function(x) + if(length(x) > 0) x$type[1] else NA_character_) + empty = is.na(typs) + feats = feats[!empty] + typs = typs[!empty] + featspl = split(feats, typs) + srcs = fill_stack_df(featspl$source) + circ = rep(grepl("circular", rawgbk$LOCUS), times = length(srcs)) + ##grab the versioned accession to use as the "genome" in seqinfo + genom = gn_from_vers(rawgbk$VERSION) + sqinfo = GenomeInfoDb::Seqinfo(GenomeInfoDb::seqlevels(srcs), width(srcs), circ, genom) + if(verbose) + message(Sys.time(), " Starting creation of gene GRanges") + gns = make_genegr(featspl$gene, sqinfo) + + if(verbose) + message(Sys.time(), " Starting creation of CDS GRanges") + if(!is.null(featspl$CDS)) + cdss = make_cdsgr(featspl$CDS, gns, sqinfo) + else + cdss = GenomicRanges::GRanges(seqinfo = sqinfo) + if(verbose) + message(Sys.time(), " Starting creation of exon GRanges") + + + exns = make_exongr(featspl$exon, cdss = cdss, sqinfo) + + if(verbose) + message(Sys.time(), " Starting creation of variant VRanges") + vars = make_varvr(featspl$variation, sq = sq, sqinfo) + + + if(verbose) + message(Sys.time(), " Starting creation of transcript GRanges") + + txs = make_txgr(featspl$mRNA, exons = exns, sqinfo, genes = gns) + + if(verbose) + message(Sys.time(), " Starting creation of misc feature GRanges") + + ofeats = fill_stack_df(feats[!typs %in% c("gene", "exon", "CDS", "variation", + "mRNA", "source")]) + + ofeats$temp_grouping_id = NULL + + if(is.null(ofeats)) + ofeats = GenomicRanges::GRanges() + GenomeInfoDb::seqinfo(ofeats) = sqinfo + res = methods::new("GenBankRecord", genes = gns, cds = cdss, exons = exns, + transcripts = txs, variations = vars, + sources = fill_stack_df(feats[typs == "source"]), + other_features = ofeats, + accession = rawgbk$ACCESSION %||% NA_character_, + version = rawgbk$VERSION %||% NA_character_, + locus = rawgbk$LOCUS, + definition = rawgbk$DEFINITION, + sequence = sq) + af = proc.time()["elapsed"] + if(verbose) + message(Sys.time(), " - Done creating GenBankRecord object [ ", af - bf, " seconds ]") + res +} + +## slightly specialized function to stack granges which may have different +## mcols together, filling with NA_character_ as needed. +## also collapses multiple db_xref notes into single CharacterList column and +## creates an AAStringSet for the translation field +fill_stack_df = function(dflist, cols, fill.logical = TRUE, sqinfo = NULL) { + if(length(dflist) == 0 ) + return(NULL) + + multivalfields = c("db_xref", "EC_number", "gene_synonym", "old_locus_tag") + + dflist = dflist[sapply(dflist, function(x) !is.null(x) && nrow(x) > 0)] + + + allcols = unique(unlist(lapply(dflist, function(x) names(x)))) + basenms = gsub("(.*)(\\.[[:digit:]]+)$", "\\1", allcols) + nmtab = table(basenms) + dupnms = names(nmtab[nmtab>1]) + if(any(!dupnms %in% multivalfields)) + warning("Got unexpected multi-value field(s) [ ", + paste(setdiff(dupnms, multivalfields), collapse = ", "), + " ]. The resulting column(s) will be of class CharacterList, rather than vector(s). 
Please contact the maintainer if multi-valuedness is expected/meaningful for the listed field(s).") + allcols = unique(basenms) + + logcols = unique(unlist(lapply(dflist, function(x) names(x)[sapply(x, is.logical)]))) + charcols = setdiff(allcols, logcols) + + if(missing(cols)) + cols = allcols + + filled = mapply( + function(x, i) { + ## have to deal with arbitrary multiple columns + ## transform them into list columns + for(nm in dupnms) { + locs = grep(nm, names(x)) + if(length(locs)) { + rows = lapply(seq(along = rownames(x)), + function(y) unlist(x[y,locs])) + + x = x[,-locs] + } else { + rows = list(character()) + } + + x[[nm]] = rows + } + + + + ## setdiff is not symmetric + missnm = setdiff(charcols, names(x)) + x[,missnm] = NA_character_ + falsenm = setdiff(logcols, names(x)) + x[,falsenm] = FALSE + x = x[,cols] + x$temp_grouping_id = i + x + }, x = dflist, i = seq(along = dflist), SIMPLIFY=FALSE) + stk = .simple_rbind_dataframe(filled, "temp") + stk[["temp"]] = NULL + + listcols = which(sapply(names(stk), + function(x) methods::is(stk[[x]], "list") || + x %in% multivalfields)) + stk[listcols] = lapply(listcols, function(i) as(stk[[i]], "CharacterList")) + mc = names(stk)[!names(stk) %in% c("seqnames", "start", "end", "strand")] + if(fill.logical) { + logcols = which(sapply(stk, is.logical)) + stk[,logcols] = lapply(logcols, function(i) { + dat = stk[[i]] + dat[is.na(dat)] = FALSE + dat + }) + } + grstk = GenomicRanges::GRanges(seqnames = stk$seqnames, + ranges = IRanges::IRanges(start = stk$start, end = stk$end), + strand = stk$strand ) + + ## this may be slightly slower, but specifying mcols during + ## creation appends mcols. to all the column names, super annoying. + GenomicRanges::mcols(grstk) = stk[,mc] + if("translation" %in% names(GenomicRanges::mcols(grstk))) { + if(anyNA(grstk$translation)) { + message("Translation product seems to be missing for ", + sum(is.na(grstk$translation)), + " of ", length(grstk), " ", + grstk$type[1], " annotations. Setting to ''") + grstk$translation[is.na(grstk$translation)] = "" + } + grstk$translation = Biostrings::AAStringSet(grstk$translation) + } + + if(!is.null(sqinfo)) + GenomeInfoDb::seqinfo(grstk) = sqinfo + grstk +} + +# sec_field_re = "^( {5}|\\t)[[:alnum:]'_-]+[[:space:]]+(complement|join|order|[[:digit:]<,])" + +## Functions to read in the fields of a GenBank file +## utility borrowed from Hadley et al. rlang +`%||%` <- function(a, b) if(is.null(a)) b else a + +strip_fieldname = function(lines) gsub("^[[:space:]]*[[:upper:]]*[[:space:]]+", "", lines) + +readLocus = function(line) { + ## missing strip fieldname? 
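  # Split the raw LOCUS header line on tab runs and return the fields
  # (locus name, sequence length, molecule type, etc.) as a character vector.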
+ spl = strsplit(line, "[\t]+", line)[[1]] + spl +} + +readDefinition = function(lines) { + paste(strip_fieldname(lines), collapse = " ") +} + +readAccession = function(line) strip_fieldname(line) + +readVersions = function(line) { + txt = strip_fieldname(line) + spl = strsplit(txt, "[[:space:]]+")[[1]] + c(accession.version = spl[1], GenInfoID = gsub("GI:", "", spl[2])) +} + +readKeywords = function(lines) { + txt = strip_fieldname(lines) + txt = paste(txt, collapse = " ") + if(identical(txt, ".")) + txt = NA_character_ + txt +} + +readSource = function(lines) { + sec_field_re = "^( {5}|\\t)[[:alnum:]'_-]+[[:space:]]+(complement|join|order|[[:digit:]<,])" + secfieldinds = grep(sec_field_re, lines) + src = strip_fieldname(lines[1]) + lines = lines[-1] #consume SOURCE line + org = strip_fieldname(lines[1]) + lines = lines[-1] # consume ORGANISM line + lineage = strsplit(paste(lines, collapse = " "), split = "; ")[[1]] + list(source= src, organism = org, lineage = lineage) +} + + +chk_outer_complement = function(str) grepl("[^(]*complement\\(", str) +strip_outer_operator = function(str, op = "complement") { + regex = paste0("[^(]*", op, "\\((.*)\\)") + + gsub(regex,"\\1", str) +} + +.do_join_silliness = function(str, chr, ats, partial = NA, strand = NA) { + + # if(grepl("^join", str)) + if(is.na(partial) || !partial) { + part = grepl("[<>]", str) + if(part) { + if(is.na(partial)) + message("Partial range detected in a compound range (join or order) feature. Excluding entire annotation") + return( + data.frame(seqnames = character(), + start = numeric(), + end = numeric(), + strand = character(), + type = character(), + stringsAsFactors=FALSE) + ) + } + } + + sstr = substr(str, 1, 1) + if(sstr == "j") ##join + str = strip_outer_operator(str, "join") + else if(sstr == "o") ## order + str = strip_outer_operator(str, "order") + spl = strsplit(str, ",", fixed=TRUE)[[1]] + grs = lapply(spl, make_feat_gr, chr = chr, ats = ats, + partial = partial, strand = strand) + .simple_rbind_dataframe(grs) + +} + + +make_feat_gr = function(str, chr, ats, partial = NA, strand = NA) { + + if(is.na(strand) && chk_outer_complement(str)) { + strand = "-" + str = strip_outer_operator(str) + } + + + sbstr = substr(str, 1, 4) + if(sbstr == "join" || sbstr == "orde") + return(.do_join_silliness(str = str, chr = chr, + ats = ats, partial = partial, + strand = strand)) + haslt = grepl("<", str, fixed=TRUE) || grepl(">", str, fixed=TRUE) + + + ## control with option. Default to exclude those ranges entirely. + ## if( (haslt || hasgt ) && (is.na(partial) || !partial) ){ + if( haslt && (is.na(partial) || !partial) ){ + if(is.na(partial)) + warning("Incomplete feature annotation detected. 
", + "Omitting feature at ", str) + return(GenomicRanges::GRanges(character(), IRanges::IRanges(numeric(), numeric()))) + } + + ## format is 123, 123..789, or 123^124 + spl = strsplit(str, "[.^]{1,2}")[[1]] + start = as.integer(gsub("<*([[:digit:]]+).*", "\\1", spl[1])) + if(length(spl) == 1) + end = start + else + end = as.integer(gsub(">*([[:digit:]]+).*", "\\1", spl[2])) + if(grepl("^", str, fixed=TRUE )) { + end = end - 1 + ats$loctype = "insert" + } else + ats$loctype = "normal" + + if(is.na(strand)) + strand = "+" + + gr = cheap_unsafe_data.frame(lst = c(seqnames = chr, start = start, end = end, strand = strand, ats)) + gr +} + +read_feat_attr = function(line) { + num = grepl('=[[:digit:]]+(\\.[[:digit:]]+){0,1}$', line) + val = gsub('[[:space:]]*/[^=]+($|="{0,1}([^"]*)"{0,1})', "\\2", line) + mapply(function(val, num) { + if(nchar(val)==0) + TRUE + else if(num) + as.numeric(val) + else + val + }, val = val, num = num, SIMPLIFY=FALSE) +} + +### These two functions are just for API compatibility, replace with actual values later +getCDS = function(gbk){ + return(gbk@cds) +} + +# getSeq = function(gbk){ # replaced +# return(gbk@sequence) +# } + +## XXX is the leading * safe here? whitespace is getting clobbered by line +##combination below I think it's ok +strip_feat_type = function(ln) gsub("^[[:space:]]*[[:alnum:]_'-]+[[:space:]]+((complement\\(|join\\(|order\\(|[[:digit:]<]+).*)", "\\1", ln) + +readFeatures = function(lines, partial = NA, verbose = FALSE, + source.only = FALSE) { + sec_field_re = "^( {5}|\\t)[[:alnum:]'_-]+[[:space:]]+(complement|join|order|[[:digit:]<,])" + + if(substr(lines[1], 1, 8) == "FEATURES") + lines = lines[-1] ## consume FEATURES line + fttypelins = grepl(sec_field_re, lines) + featfactor = cumsum(fttypelins) + + if(source.only) { + srcfeats = which(substr(lines[fttypelins], 6, 11) == "source") + keepinds = featfactor %in% srcfeats + lines = lines[keepinds] + featfactor = featfactor[keepinds] + } + + ##scope bullshittery + chr = "unk" + + totsources = length(grep("[[:space:]]+source[[:space:]]+[<[:digit:]]", lines[which(fttypelins)])) + numsources = 0 + everhadchr = FALSE + + do_readfeat = function(lines, partial = NA) { + + ## before collapse so the leading space is still there + type = gsub("[[:space:]]+([[:alnum:]_'-]+).*", "\\1", lines[1]) + ##feature/location can go across multpiple lines x.x why genbank? whyyyy + attrstrts = cumsum(grepl("^[[:space:]]+/[^[:space:]]+($|=([[:digit:]]|\"))", lines)) + lines = tapply(lines, attrstrts, function(x) { + paste(gsub("^[[:space:]]+", "", x), collapse="") + }, simplify=TRUE) + + rawlocstring = lines[1] + + rngstr = strip_feat_type(rawlocstring) + + ## consume primary feature line + lines = lines[-1] + if(length(lines)) { + attrs = read_feat_attr(lines) + + names(attrs) = gsub("^[[:space:]]*/([^=]+)($|=[^[:space:]].*$)", "\\1", lines) + if(type == "source") { + numsources <<- numsources + 1 + if("chromosome" %in% names(attrs)) { + if(numsources > 1 && !everhadchr) + stop("This file appears to have some source features which specify chromosome names and others that do not. This is not currently supported. Please contact the maintainer if you need this feature.") + everhadchr <<- TRUE + chr <<- attrs$chromosome + } else if(everhadchr) { + stop("This file appears to have some source features which specify chromosome names and others that do not. This is not currently supported. Please contact the maintainer if you need this feature.") + ## this assumes that if one source has strain, they all will. 
+ ## Good assumption? + } else if("strain" %in% names(attrs)) { + chr <<- if(totsources == 1) attrs$strain else paste(attrs$strain, numsources, sep=":") + } else { + chr <<- if(totsources == 1) attrs$organism else paste(attrs$organism, numsources, sep=":") + } + } + } else { + attrs = list() + } + make_feat_gr(str = rngstr, chr = chr, ats = c(type = type, attrs), + partial = partial) + + } + + if(verbose) + message(Sys.time(), " Starting feature parsing") + resgrs = tapply(lines, featfactor, do_readfeat, + simplify=FALSE, partial = partial) + if(verbose) + message(Sys.time(), " Done feature parsing") + + resgrs + +} + + +## seqtype: bp = base pairs (DNA/RNA), aa = amino acids (peptides/protein) + +readOrigin = function(lines, seqtype = "bp") { + ## strip spacing and line numbering + regex = "([[:space:]]+|[[:digit:]]+|//)" + + dnachar = gsub(regex, "", lines[-1]) + chars = paste(dnachar, collapse="") + if(any(nzchar(dnachar))) { + switch(seqtype, + bp = Biostrings::DNAString(chars), + aa = Biostrings::AAString(chars), + stop("Unknown origin sequence type: ", seqtype)) + } else + NULL +} + + +fastwriteread = function(txtline) { + f = file() + on.exit(close(f)) + writeLines(txtline, con = f) + readLines(f) + +} + + + +##LOCUS ABD64816 1353 aa linear INV 12-MAR-2006 +.seqTypeFromLocus = function(locus) { + gsub("^[[:space:]]*LOCUS[[:space:]]+[^[:space:]]+[[:space:]]+[[:digit:]]+[[:space:]]+([^[:space:]]+).*", + "\\1", + locus) +} + + + +allna = function(vec) all(is.na(vec)) + +.getGeneIdLab = function(ranges, verbose, stoponfail = FALSE) { + cols = names(GenomicRanges::mcols(ranges)) + if("gene_id" %in% cols && !allna(ranges$gene_id)) + "gene_id" + else if("locus_tag" %in% cols) + "locus_tag" + else if ("gene" %in% cols) { + if(verbose) + message("Annotations don't have 'locus_tag' label, using 'gene' as gene_id column") + "gene" + } else if (stoponfail) + stop("Unable to determine gene id column on feature GRanges object") + else + NULL +} + +.getGeneIdVec = function(ranges) { + res = .getGeneIdLab(ranges, verbose = TRUE) + if(!is.null(res)) + res = GenomicRanges::mcols(ranges)[[res]] + res +} + +match_cds_genes = function(cds, genes) { + ## XXX do we want "equal" here or within? 
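  # Assign gene ids to CDS ranges whose coordinates exactly match a gene range
  # (findOverlaps with type = "equal"); CDS entries with no matching gene are
  # dropped with a warning.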
+ hits = GenomicRanges::findOverlaps(cds, genes, type = "equal") + cds$gene_id= NA_character_ + cds$gene_id[S4Vectors::queryHits(hits)] = genes$gene_id + if(anyNA(cds$gene_id)) { + warning("unable to determine gene for some CDS annotations.", + " Ignoring these ", sum(is.na(cds$gene_id)), " segments") + cds = cds[!is.na(cds$gene_id)] + } + cds +} + +make_cdsgr = function(rawcdss, gns, sqinfo) { + + ## rawcdss = sanitize_feats(rawcdss, cds_cols) + rawcdss = fill_stack_df(rawcdss, sqinfo = sqinfo) + ## double order gives us something that can be merged directly into what + ## out of tapply + + havegenelabs = FALSE + + rawcdss$gene_id = .getGeneIdVec(rawcdss) + + if(!is.null(rawcdss$gene_id) && !anyNA(rawcdss$gene_id)) { + + o = order(order(rawcdss$gene_id)) + var = "gene_id" + } else { + message("genes not available for all CDS ranges, using internal grouping ids") + var = "temp_grouping_id" + o = order(order(rawcdss$temp_grouping_id)) + } + idnum = unlist(tapply(rawcdss$temp_grouping_id, GenomicRanges::mcols(rawcdss)[[var]], function(x) as.numeric(factor(x)), simplify=FALSE))[o] + newid = paste(GenomicRanges::mcols(rawcdss)[[var]], idnum, sep=".") + if(var == "temp_grouping_id") + newid = paste0(ifelse(is.na(GenomicRanges::mcols(rawcdss)$gene), "unknown_gene_",GenomicRanges::mcols(rawcdss)$gene), newid) + + cdss = rawcdss + cdss$transcript_id = newid + cdss$temp_grouping_id = NULL + cdss +} + + +.gnMappingHlpr = function(gnfeat, txfeatlst, knownlabs, + splcol = .getGeneIdLab(gnfeat, stoponfail=TRUE, + verbose=FALSE), + olaptype = "within", stopondup = TRUE) { + unknown = which(is.na(knownlabs)) + ## is it already split? + if(!methods::is(gnfeat, "GRangesList")) + gnfeat = split(gnfeat, GenomicRanges::mcols(gnfeat)[[splcol]]) + hts = GenomicRanges::findOverlaps(gnfeat, txfeatlst, type=olaptype, select="all") + ## duplicated queryHits is ok b/c gene can have more than one tx, right? + subhits = S4Vectors::subjectHits(hts) + if(anyDuplicated(subhits)) { + duphits = which(duplicated(subhits)) + dupstart = start(IRanges::heads(txfeatlst[duphits], 1)) + + if(stopondup) + stop("mRNA feature(s) starting at [", + paste(as.vector(dupstart), collapse=", "), + "] appear to match more than one gene. If you feel the file is", + " correct please contact the maintainer.") + else { # throw away dup hits so they stay NA + hts = hts[subhits %in% subhits[duphits],] + subhits = S4Vectors::subjectHits(hts) + } + } + ## don't override information that we already know. + ## if efficiency of this routine ever becomes an issue + ## this subsetting should be pushed up before the overlap + ## calcs, but the indexing is simpler here and I don't + ## think it will be too slow. We'll see .... + + keep = subhits %in% unknown + + knownlabs[ subhits[ keep ] ] = names(gnfeat)[ S4Vectors::queryHits(hts)[ keep ] ] + knownlabs +} + +assignTxsToGenes = function(rawtx, exons, genes) { + + txspl = split(rawtx, rawtx$temp_grouping_id) + + gnlabs = rep(NA_character_, times = length(txspl)) + + ## exon mapping is more precise, try that first. + ## exons should fall strictly within transcript sections + ## not identical b/c of padding at ends. AFAIK should + ## be identical for internal exons, but we aren't + ## specifically checking for that here. 
+ + gnlabs = .gnMappingHlpr(exons, txspl, gnlabs) + if(anyNA(gnlabs)) { + + ## damn, now we have to try with genes + gnlabs = .gnMappingHlpr(genes, txspl, gnlabs, + olaptype = "any", + stopondup=FALSE) + + } + + rawtx$gene_id= rep(gnlabs, times = lengths(txspl)) + rawtx + +} + +make_txgr = function(rawtxs, exons, sqinfo, genes) { + if(length(rawtxs) > 0) { + rawtxs = fill_stack_df(rawtxs, sqinfo = sqinfo) + gvec = .getGeneIdVec(rawtxs) + if(is.null(gvec)) { + rawtxs = assignTxsToGenes(rawtxs, exons, genes) + gvec = .getGeneIdVec(rawtxs) + } + + rawtxs$tx_id_base = ifelse(is.na(gvec), paste("unknown_gene", cumsum(is.na(gvec)), sep="_"), gvec) + spltxs = split(rawtxs, rawtxs$tx_id_base) + txsraw = lapply(spltxs, function(grl) { + grl$transcript_id = paste(grl$tx_id_base, (grl$temp_grouping_id - + min(grl$temp_grouping_id) + + 1), + sep=".") + grl$tx_id_base = NULL + grl + }) + txslst = GenomicRanges::GRangesList(txsraw) + txs = unlist(txslst, use.names=FALSE) + txs$gene_id = .getGeneIdVec(txs) + txs$temp_grouping_id=NULL + } else if (length(exons) == 0L) { + txs = GenomicRanges::GRanges(seqinfo=sqinfo) + } else { + message("No transcript features (mRNA) found, using spans of CDSs") + spl = split(exons, exons$transcript_id %||% "unknown") + # cat(summary(spl)) ## debug + txslst = range(spl, ignore.strand = F) + # LIST = list(spl = spl, txslst = txslst) ## debug + # list2env(LIST, envir = .GlobalEnv) ## debug + + mcdf = GenomicRanges::mcols(unlist(IRanges::heads(spl, 1))) + txs = unlist(txslst, use.names=FALSE) + GenomicRanges::mcols(txs) = mcdf + GenomeInfoDb::seqinfo(txs) = sqinfo + } + txs +} + +make_varvr = function(rawvars, sq, sqinfo) { + if(length(rawvars) == 0) + return(VariantAnnotation::VRanges(seqinfo = sqinfo)) + if(is.null(sq)) { + warning("importing variation features when origin sequence is not included in the file is not currently supported. Skipping ", length(rawvars), " variation features.") + return(VariantAnnotation::VRanges(seqinfo = sqinfo)) + } + + vrs = fill_stack_df(rawvars, sqinfo = sqinfo) + vrs$temp_grouping_id = NULL + ## not all variants have /replace so we can't assume that ANY of them + ## in a file have it (though it is very likely at least one will) + ## if none of them do, the column won't exist at all + if(is.null(vrs$replace)) + vrs$replace = NA_character_ + + ## makeVRangesFromGRanges seems to have a bug(?) that requires the + ## columns used dfor the VRanges core info to be the first 6 in the + ## granges mcols + dels = nchar(vrs$replace) == 0L & !is.na(vrs$replace) + if(length(dels)) { + vrs[dels] = GenomicRanges::resize(vrs[dels], width(vrs[dels]) + 1L, fix = "end") + vrs$replace[dels] = as.character(sq[GenomicRanges::resize(vrs[dels], 1L)]) + } + newcols = S4Vectors::DataFrame( ref = as.character(sq[vrs]), + alt = vrs$replace, + totalDepth = NA_integer_, + altDepth = NA_integer_, + refDepth = NA_integer_, + sampleNames = NA_character_) + GenomicRanges::mcols(vrs) = cbind(newcols, GenomicRanges::mcols(vrs)) + res = VariantAnnotation::makeVRangesFromGRanges(vrs) + res +} + + +make_exongr = function(rawex, cdss, sqinfo) { + ##exns = sanitize_feats(rawex, exon_cols) + exns = fill_stack_df(rawex, sqinfo = sqinfo) + if(is.null(exns)) { + + message("No exons read from genbank file. 
Assuming sections of CDS are full exons") + if(length(cdss) > 0) { + exns = cdss + exns$type = "exon" + } else { + return(GenomicRanges::GRanges(seqinfo = sqinfo)) + } + + } else { + if(methods::is(exns, "GRangesList")) + exns = utils::stack(exns) + + hits = GenomicRanges::findOverlaps(exns, cdss, type = "within", select = "all") + qh = S4Vectors::queryHits(hits) + qhtab = table(qh) + dup = as.integer(names(qhtab[qhtab>1])) + havdups = length(dup) > 0 + if(havdups) { + ambig = exns[unique(dup)] + exns = exns[-unique(dup)] + noduphits = hits[-match(qh, dup)] + warning("Some exons specified in genbank file have ambiguous relationship to transcript(s). ") + ambig$transcript_id = paste(ambig$gene_id, + "ambiguous", sep=".") + } else { + noduphits = hits + } + + exns$transcript_id = cdss$transcript_id[S4Vectors::subjectHits(noduphits)] + + } + exns$temp_grouping_id = NULL + exns +} + +make_genegr = function(x, sqinfo) { + res = fill_stack_df(x, sqinfo = sqinfo) + + if(is.null(res)) + res = GenomicRanges::GRanges(seqinfo = sqinfo) + else + res$gene_id = .getGeneIdVec(res) + + res$temp_grouping_id = NULL + res +} + +gn_from_vers = function(x) { + if(is.null(x)) + "unknown" + else + strsplit(x, " ")[[1]][1] +} + +## super fast rbind of data.frame lists from Pete Haverty +.simple_rbind_dataframe <- function(dflist, element.colname) { + numrows = vapply(dflist, nrow, integer(1)) + if (!missing(element.colname)) { + list.name.col = factor(rep(names(dflist), numrows), levels=names(dflist)) + } + dflist = dflist[ numrows > 0 ] # ARGH, if some data.frames have zero rows, factors become integers + # myunlist = function(x) base::unlist(x, recursive=FALSE, use.names=FALSE) + mylapply = base::lapply + cn = names(dflist[[1]]) + inds = structure(1L:length(cn), names=cn) + big <- mylapply(inds, + function(x) { + unlist( + # mylapply(dflist, function(y) { y[[x]] }), + mylapply(dflist, function(y) { .subset2(y, x) }), + recursive=FALSE, use.names=FALSE) + }) + if (!missing(element.colname)) { + big[[element.colname]] = list.name.col + } + class(big) <- "data.frame" + attr(big, "row.names") <- .set_row_names(length(big[[1]])) + return(big) +} + + + +cheap_unsafe_data.frame = function(..., lst = list(...)) { + lens = lengths(lst) + len = max(lens) + if(!all(lens == len)) + lst = lapply(lst, rep, length.out = len) + + if(anyDuplicated.default(names(lst))) + names(lst) =make.unique(names(lst)) + + class(lst) = "data.frame" + attr(lst, "row.names") = .set_row_names(length(lst[[1]])) + lst +} + + +#### Classes +## File from: https://github.com/gmbecker/genbankr +#' @importClassesFrom Biostrings XStringSet +#' @importClassesFrom GenomicRanges GRangesList CompressedGRangesList +setOldClass("NULL") +setClassUnion("XStringSetOrNULL", c("XStringSet", "NULL")) + +##' @title GenBank data objects. 
+##' @description These objects represent GenBank annotations +##' +##' @rdname GenBank-classes +##' @docType methods +##' @exportClass GenBankRecord +setClass("GenBankRecord", slots = list(genes = "GenomicRanges", cds = "GenomicRanges", + exons = "GenomicRanges", + transcripts = "GenomicRanges", + variations = "VRanges", + sources = "GenomicRanges", + other_features = "GenomicRanges", + locus = "character", + definition = "character", + accession = "character", + version = "character", + source = "ANY", + sequence = "XStringSetOrNULL" +) +) + +##' @title GenBank File +##' +##' @description A resource class for use within the rtracklayer framework +##' +##' @rdname gbkfile +##' @docType methods +##' @exportClass GenBankFile +setClass("GenBankFile", contains = "RTLFile") + +##' @rdname gbkfile +##' @docType methods +##' @exportClass GBKFile +##' @aliases GBKFile-class +setClass("GBKFile", contains = "GenBankFile") + +##' @rdname gbkfile +##' @docType methods +##' @exportClass GBKFile +##' @aliases GBFile-class +setClass("GBFile", contains = "GenBankFile") + +##' @title GBAccession ID class +##' +##' @description A class representing the (versioned) GenBank accession +##' +##' @rdname GBAccession +##' @exportClass GBAccession +setClass("GBAccession", contains="character") + +##' @rdname GBAccession +##' @param id A versioned GenBank Accession id +##' @return a \code{GBAccession} object. +##' @export +GBAccession = function(id) { + new("GBAccession", id) +} + + +### I'm leaving the old code below, remove eventually at a cleanup +# parse_genbank_file +# +# Function to parse the genbank file for the fasta alignment. +# +# importFrom genbankr readGenBank getSeq +# +# param gbk_path path to genbank file +# param g sequence length, available from the LDWeaver::parse_fasta_alignment() output (default = NULL), +# required if +# param length_check specify whether to check if fasta and gbk sequence lengths are equal (default = T) +# +# return GenBankRecord object +# +# examples +# dontrun{ +# aln_path <- system.file("extdata", "sample.aln", package = "LDWeaver") +# snp.dat <- parseFastaAlignment(aln_path, gap_freq = 0.15, maf_freq = 0.01, method = "default") +# gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") +# gbk <- parse_genbank_file(gbk_path, snp.dat$g) +# } +# export +# parse_genbank_file_old = function(gbk_path, g = NULL, length_check = T){ +# t0 = Sys.time() +# # Check inputs +# if(length_check){ # perform the length check +# if(is.null(g)){ +# # then g cannot be null +# stop("g must be provided to perform length check!\n") +# # return(-1) +# } +# } + +# if(!file.exists(gbk_path)) stop(paste("Can't locate file", gbk_path)) +# +# # gbk = suppressWarnings(genbankr::import(gbk_path)) +# gbk = suppressWarnings(genbankr::readGenBank(gbk_path)) +# refseq = genbankr::getSeq(gbk) +# +# if(length(refseq) != 1){ +# stop("The GBK file should contain the reference sequence!\n") +# # return(-1) +# } +# # the length check is good to check if the alignment matches with the gbk, setting it to F will stop it +# ref_g = length(refseq[[1]]) # length of the reference sequence +# +# if(length_check){ # perform the length check +# if(ref_g != g){ # perform check +# stop("Genbank reference sequence length mismatches with the fasta alignment!\n") +# # return(-1) +# } +# +# } else { +# if(!is.null(g)){ +# if(ref_g != g){ +# warning("Fasta length does not match the genbank reference sequence length!\n") +# } +# } else { +# warning("Similarity between the genbank reference and fasta sequences NOT 
checked, ignore if was provided...\n") +# } +# } +# cat(paste("Successfully read gbk file:", gbk_path, "in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s\n")) +# +# # genbankr::seqinfo(gbk) +# return(list(gbk = gbk, +# ref_g = ref_g)) +# } diff --git a/man/GBAccession.Rd b/man/GBAccession.Rd new file mode 100644 index 0000000..6394080 --- /dev/null +++ b/man/GBAccession.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parseGBK.R +\docType{class} +\name{GBAccession-class} +\alias{GBAccession-class} +\alias{GBAccession} +\title{GBAccession ID class} +\usage{ +GBAccession(id) +} +\arguments{ +\item{id}{A versioned GenBank Accession id} +} +\value{ +a \code{GBAccession} object. +} +\description{ +A class representing the (versioned) GenBank accession +} diff --git a/man/GenBank-classes.Rd b/man/GenBank-classes.Rd new file mode 100644 index 0000000..c76f443 --- /dev/null +++ b/man/GenBank-classes.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parseGBK.R +\docType{methods} +\name{GenBankRecord-class} +\alias{GenBankRecord-class} +\title{GenBank data objects.} +\description{ +These objects represent GenBank annotations +} diff --git a/man/LDWeaver.Rd b/man/LDWeaver.Rd index 015ee6e..dff05e3 100644 --- a/man/LDWeaver.Rd +++ b/man/LDWeaver.Rd @@ -25,6 +25,7 @@ LDWeaver( num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5, + write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL, @@ -75,7 +76,9 @@ but only the top will be used for visualisation (default = 250), r modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001), and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5} -\item{tanglegram_break_segments}{specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10} +\item{tanglegram_break_segments}{specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. Set NULL to skip tanglegram} + +\item{write_gwesExplorer}{specify whether output for GWESExplorer is required (default = T)} \item{multicore}{specify whether to use parallel processing (default = T)} diff --git a/man/gbkfile.Rd b/man/gbkfile.Rd new file mode 100644 index 0000000..3e45c3d --- /dev/null +++ b/man/gbkfile.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parseGBK.R +\docType{methods} +\name{GenBankFile-class} +\alias{GenBankFile-class} +\alias{GBKFile-class} +\alias{GBFile-class} +\title{GenBank File} +\description{ +A resource class for use within the rtracklayer framework +} diff --git a/man/parse_genbank_file.Rd b/man/parse_genbank_file.Rd index 7632ca4..47d0e4c 100644 --- a/man/parse_genbank_file.Rd +++ b/man/parse_genbank_file.Rd @@ -18,7 +18,7 @@ required if } GenBankRecord object } \description{ -Function to parse the genbank file for the fasta alignment. +Function to parse the genbank file for the fasta alignment. Parser built using genbankr source available here: https://github.com/gmbecker/genbankr. } \examples{ \dontrun{
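Downstream of this change, CDS coordinates and the reference sequence are read from the GenBankRecord slots rather than through the retired genbankr accessors. A minimal sketch of the new usage (paths follow the package's existing roxygen examples and assume the sample files ship with LDWeaver):

library(LDWeaver)
gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
parsed <- parse_genbank_file(gbk_path, length_check = FALSE)  # skip the alignment-length check
cds_reg <- parsed$gbk@cds         # previously genbankr::cds(gbk)
refseq  <- parsed$gbk@sequence    # previously genbankr::getSeq(gbk)
head(GenomicRanges::start(cds_reg))  # CDS start positions, as used in estimate_variation_in_CDS()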