diff --git a/README.md b/README.md index 633172e3..66c0b072 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ ![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg) ![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg) -## _Note: BiG-SCAPE 2.0 is still in beta. Please submit an issue if you find anything wrong with this release!_ +## _Notes:
BiG-SCAPE 2.0 is still in beta. Please submit an issue if you find anything wrong with this release!
BiG-SCAPE 2.0 features several updates to input validation and reference usage. We encourage both experienced BiG-SCAPE 1 users as well as new BiG-SCAPE users to read the updated [documentation](https://github.com/medema-group/BiG-SCAPE/wiki)._ # BiG-SCAPE @@ -21,7 +21,7 @@ For installation instructions, see [here](https://github.com/medema-group/BiG-SC Learn more about BiG-SCAPE in the [wiki](https://github.com/medema-group/BiG-SCAPE/wiki). -![BiG-SCAPE workflow](Figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png) +![BiG-SCAPE workflow](figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png) If you find BiG-SCAPE useful, please cite us: diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py index 5c549278..f1f7b328 100644 --- a/big_scape/data/sqlite.py +++ b/big_scape/data/sqlite.py @@ -14,6 +14,7 @@ Compiled, Select, Insert, + Delete, CursorResult, create_engine, func, @@ -283,7 +284,9 @@ def execute_raw_query(query: str) -> CursorResult: return DB.connection.execute(text(query)) @staticmethod - def execute(query: Compiled | Select[Any] | Insert, commit=True) -> CursorResult: + def execute( + query: Compiled | Select[Any] | Insert | Delete, commit=True + ) -> CursorResult: """Wrapper for SQLAlchemy.connection.execute expecting a Compiled query Arguments: diff --git a/big_scape/distances/query.py b/big_scape/distances/query.py index 4c819f73..62e1ba33 100644 --- a/big_scape/distances/query.py +++ b/big_scape/distances/query.py @@ -67,7 +67,7 @@ def calculate_distances_query( query_nodes = bs_network.get_nodes_from_cc(query_connected_component, query_records) bs_network.remove_connected_component( - query_connected_component, max_cutoff, run["run_id"] + query_connected_component, query_bin.label, max_cutoff, run["run_id"] ) query_bin_connected = bs_comparison.RecordPairGenerator( diff --git a/big_scape/genbank/gbk.py b/big_scape/genbank/gbk.py index a6844fc3..a3f04262 100644 --- a/big_scape/genbank/gbk.py +++ b/big_scape/genbank/gbk.py @@ 
-95,11 +95,11 @@ def batch_hash(gbks: list[GBK], n: int): return temp_table -def create_temp_gbk_id_table(gbks: list[GBK]) -> Table: +def create_temp_gbk_id_table(gbk_ids: list[int]) -> Table: """Create a temporary table with ids of given gbks Args: - gbks (list[GBK]): the gbks to include in the connected component + gbk_ids (list[int]): the ids of the gbks to add to the temporary table Returns: Table: the temporary table @@ -132,12 +132,13 @@ def create_temp_gbk_id_table(gbks: list[GBK]) -> Table: INSERT INTO {temp_table_name} (gbk_id) VALUES (?); """ - def batch_hash(gbks: list[GBK], n: int): - l = len(gbks) + # local function for batching + def batch_hash(gbk_ids: list[int], n: int): + l = len(gbk_ids) for ndx in range(0, l, n): - yield [gbk._db_id for gbk in gbks[ndx : min(ndx + n, l)]] + yield [gbk_id for gbk_id in gbk_ids[ndx : min(ndx + n, l)]] - for hash_batch in batch_hash(gbks, 1000): + for hash_batch in batch_hash(gbk_ids, 1000): cursor.executemany(insert_query, [(x,) for x in hash_batch]) # type: ignore cursor.close() @@ -412,7 +413,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]: # load GBK regions. This will also populate all record levels below region # e.g. 
candidate cluster, protocore if they exist - temp_gbk_id_table = create_temp_gbk_id_table(input_gbks) + temp_gbk_id_table = create_temp_gbk_id_table(list(gbk_dict.keys())) Region.load_all(gbk_dict, temp_gbk_id_table) diff --git a/big_scape/genbank/proto_cluster.py b/big_scape/genbank/proto_cluster.py index 53c6cdfd..0c55f9bd 100644 --- a/big_scape/genbank/proto_cluster.py +++ b/big_scape/genbank/proto_cluster.py @@ -270,7 +270,7 @@ def load_all( # add to dictionary protocluster_dict[result.id] = new_proto_cluster - ProtoCore.load_all(protocluster_dict) + ProtoCore.load_all(protocluster_dict, temp_gbk_id_table) class MergedProtoCluster(ProtoCluster): diff --git a/big_scape/network/families.py b/big_scape/network/families.py index 3f3bd15e..b96507a0 100644 --- a/big_scape/network/families.py +++ b/big_scape/network/families.py @@ -311,7 +311,7 @@ def run_family_assignments( connected_component, bin.source_records ): bs_network.remove_connected_component( - connected_component, cutoff, run["run_id"] + connected_component, bin.label, cutoff, run["run_id"] ) continue diff --git a/big_scape/network/network.py b/big_scape/network/network.py index daa83e74..3df2ec51 100644 --- a/big_scape/network/network.py +++ b/big_scape/network/network.py @@ -548,7 +548,7 @@ def reference_only_connected_component(connected_component, bgc_records) -> bool def get_connected_component_id( - connected_component: list, cutoff: float, run_id: int + connected_component: list, bin_label: str, cutoff: float, run_id: int ) -> int: """Get the connected component id for the given connected component expects all edges to be in one connected component, if thats not the @@ -575,33 +575,37 @@ def get_connected_component_id( .distinct() .where( and_( - cc_table.c.cutoff == cutoff, cc_table.c.record_id == record_id, + cc_table.c.bin_label == bin_label, + cc_table.c.cutoff == cutoff, cc_table.c.run_id == run_id, ) ) .limit(1) ) - cc_ids = DB.execute(select_statement).fetchone() + cc_id = 
DB.execute(select_statement).scalar_one() - return cc_ids[0] + return cc_id def remove_connected_component( - connected_component: list, cutoff: float, run_id: int + connected_component: list, bin_label: str, cutoff: float, run_id: int ) -> None: """Removes a connected component from the cc table in the database""" if DB.metadata is None: raise RuntimeError("DB.metadata is None") - cc_id = get_connected_component_id(connected_component, cutoff, run_id) + cc_id = get_connected_component_id(connected_component, bin_label, cutoff, run_id) cc_table = DB.metadata.tables["connected_component"] delete_statement = delete(cc_table).where( - cc_table.c.id == cc_id, cc_table.c.cutoff == cutoff, cc_table.c.run_id == run_id + cc_table.c.id == cc_id, + cc_table.c.bin_label == bin_label, + cc_table.c.cutoff == cutoff, + cc_table.c.run_id == run_id, ) DB.execute(delete_statement) diff --git a/big_scape/output/__init__.py b/big_scape/output/__init__.py index 1e897146..20b99133 100644 --- a/big_scape/output/__init__.py +++ b/big_scape/output/__init__.py @@ -5,7 +5,7 @@ legacy_generate_bin_output, write_record_annotations_file, write_clustering_file, - write_cutoff_network_file, + write_cutoff_network_files, write_full_network_file, ) @@ -16,6 +16,6 @@ "legacy_generate_bin_output", "write_record_annotations_file", "write_clustering_file", - "write_cutoff_network_file", + "write_cutoff_network_files", "write_full_network_file", ] diff --git a/big_scape/output/html_template/output/html_content/js/bigscape.js b/big_scape/output/html_template/output/html_content/js/bigscape.js index 5b22dee5..abe0c0b1 100755 --- a/big_scape/output/html_template/output/html_content/js/bigscape.js +++ b/big_scape/output/html_template/output/html_content/js/bigscape.js @@ -487,7 +487,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n showSingletons(true); updateDescription(highlighted_nodes); net_ui.find("svg").css("height", "100%").css("width", "100%"); - var countDown = 
options["render_time"] ? options["render_time"] : 5 + parseInt(graph.getLinksCount() / 1000); + var countDown = options["render_time"] ? options["render_time"] : 1 + parseInt(graph.getNodesCount() / 300) + parseInt(graph.getLinksCount() / 1000); var perZoom = 5; var zoomCount = 0; info_ui.append("
Adjusting network layout for... " + countDown + " second(s)
"); diff --git a/big_scape/output/html_template/output/index.html b/big_scape/output/html_template/output/index.html index 21cd750e..230172e9 100644 --- a/big_scape/output/html_template/output/index.html +++ b/big_scape/output/html_template/output/index.html @@ -269,6 +269,8 @@

Network

+ *On larger datasets (upwards of 1000 BGCs), this might take a + minute to load.
@@ -289,7 +291,14 @@

Network

- +
+ +
+ If loading takes longer than a minute, your database is likely too + large to be loaded. Use the output network tsv files to construct your + networks in a networking tool like Cytoscape. +
+

Select a database file to read from:
@@ -358,9 +367,9 @@

Network

if (enable) { $("#loadingWindow").css("display", "block"); if (spinner) { - $("#loadingWindow img").css("display", "inline"); + $("#loadingImg").css("display", "inline"); } else { - $("#loadingWindow img").css("display", "none"); + $("#loadingImg").css("display", "none"); } } else { $("#loadingWindow").css("display", "none"); @@ -453,9 +462,25 @@

Network

function dataLoaded() { // only called after a database is loaded showLoading(false, false); + createTmpTables(); populateRunSelect(); } + function createTmpTables() { + // creates tmp db tables for large query building + window.db.run(`CREATE TABLE rec_ids (rec_id int)`) + window.db.run(`CREATE TABLE gbk_ids (gbk_id int)`) + window.db.run(`CREATE TABLE cds_ids (cds_id int)`) + } + + function populateTmpTable(table_name, data) { + //populate tmp db tables in batches of 10k insertions + window.db.run(`DELETE FROM ${table_name}`) //clear table + for (i = 0; i < data.length; i += 10000) { + window.db.run(`INSERT INTO ${table_name} VALUES ${data.slice(i, i + 10000).map(idx => "(" + idx + ")")}`) + } + } + function populateRunSelect() { runs = window.db.exec(`SELECT * FROM run`)[0] run_data = {} @@ -637,7 +662,8 @@

Network

} var bs_families = bs_ccs.flatMap(cc => cc.families) var bs_records = bs_families.flatMap(fam => fam.members) - var bs_data = generate_bs_data(bs_records, run_data) + populateTmpTable("rec_ids", bs_records) + var bs_data = generate_bs_data(run_data) var bs_alignments = generate_bs_alignments(bs_families, bs_data) var bs_similarities = generate_bs_similarity(bs_records, run_data) var bs = new Bigscape(run_data, bs_data, bs_families, bs_alignments, bs_similarities, "network-container") @@ -647,7 +673,7 @@

Network

// load a single connected component and topolinked ccs if present. // to distinguish between the selected cc and topolinked ones, pass // all topolinked record ids in render_options when creating a Bigscape object - var render_options = { "render_time": 1, "topo_records": [] } + var render_options = { "topo_records": [] } var bs_family = bs_cc["families"] if (topo_ccs.length && $("#topo-toggle").is(":checked")) { var topo_cc_ids = topo_ccs.map(cc => cc.id) @@ -656,7 +682,8 @@

Network

bs_family = bs_family.concat(topo_fams) } var bs_records = bs_family.flatMap(fam => fam.members) - var bs_cc_data = generate_bs_data(bs_records, run_data) + populateTmpTable("rec_ids", bs_records) + var bs_cc_data = generate_bs_data(run_data) var bs_alignment = generate_bs_alignments(bs_family, bs_cc_data) var bs_similarity = generate_bs_similarity(bs_records, run_data) var bs = new Bigscape(run_data, bs_cc_data, bs_family, bs_alignment, bs_similarity, "network-container", render_options) @@ -697,8 +724,9 @@

Network

} function generate_bs_data_domains(cds_ids) { + populateTmpTable("cds_ids", cds_ids) let dom_query = `SELECT hsp.cds_id, hsp.accession, hsp.env_start, hsp.env_stop, hsp.bit_score FROM hsp - WHERE hsp.cds_id IN (${cds_ids})` + WHERE hsp.cds_id IN (SELECT cds_id FROM cds_ids)` var domains = window.db.exec(dom_query)[0].values var dom_data = {} for (i in domains) { @@ -710,9 +738,11 @@

Network

} return dom_data } + function generate_bs_data_orfs(gbk_ids) { + populateTmpTable("gbk_ids", gbk_ids) let cds_query = `SELECT cds.gbk_id, cds.orf_num, cds.strand, cds.nt_start, cds.nt_stop, cds.id FROM cds - WHERE cds.gbk_id IN (${gbk_ids})` + WHERE cds.gbk_id IN (SELECT gbk_id FROM gbk_ids)` var cds = window.db.exec(cds_query)[0].values var dom_data = generate_bs_data_domains(cds.map(c => c[5])) @@ -726,12 +756,13 @@

Network

} return cds_data } - function generate_bs_data(record_ids, run_data) { + + function generate_bs_data(run_data) { var records_query = `SELECT gbk.id, gbk.description, length(gbk.nt_seq), gbk.hash, gbk.path, bgc_record.record_type, bgc_record.nt_start, bgc_record.nt_stop, bgc_record.record_number, bgc_record.id FROM bgc_record INNER JOIN gbk ON bgc_record.gbk_id==gbk.id - WHERE bgc_record.id IN (${record_ids})` + WHERE bgc_record.id IN (SELECT rec_id FROM rec_ids)` var records = window.db.exec(records_query)[0].values cds_data = generate_bs_data_orfs(records.map(rec => rec[0])) @@ -800,8 +831,8 @@

Network

var distances = window.db.exec(`SELECT distance.record_a_id, distance.record_b_id, distance.distance FROM distance INNER JOIN edge_params ON edge_params.id==distance.edge_param_id - WHERE distance.distance<${cutoff} AND distance.record_a_id IN (${record_ids}) - AND distance.record_b_id IN (${record_ids}) AND edge_params.weights=="${weight}" + WHERE distance.distance<${cutoff} AND distance.record_a_id IN (SELECT rec_id FROM rec_ids) + AND distance.record_b_id IN (SELECT rec_id FROM rec_ids) AND edge_params.weights=="${weight}" AND edge_params.alignment_mode=="${alignment_mode}" AND edge_params.extend_strategy=="${extend_strategy}"`)[0].values var bs_sim = {} diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 74ea1741..e1b9698e 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -10,11 +10,12 @@ import click from sqlalchemy import select, alias from typing import Optional +from itertools import combinations # from other modules from big_scape.data import DB from big_scape.comparison import RecordPairGenerator -from big_scape.genbank import GBK, BGCRecord +from big_scape.genbank import GBK, BGCRecord, CandidateCluster, ProtoCluster, ProtoCore from big_scape.trees import generate_newick_tree, save_trees from big_scape.comparison import lcs, get_record_category @@ -259,7 +260,7 @@ def legacy_generate_bin_output( cutoff, pair_generator, run["run_id"] ) write_clustering_file(run, cutoff, pair_generator) - write_cutoff_network_file(run, cutoff, pair_generator) + write_cutoff_network_files(run, cutoff, pair_generator) if click_context and click_context.obj["no_trees"]: return @@ -498,11 +499,13 @@ def write_clustering_file(run, cutoff, pair_generator) -> None: return None -def write_cutoff_network_file( +def write_cutoff_network_files( run: dict, cutoff: float, pair_generator: RecordPairGenerator ) -> None: - """Writes the cutoff network file to the output directory - i.e. 
edge list for a given bin with edges above the cutoff + """Writes the cutoff network files to the output directory + + This includes an edge list for a given bin with edges above the cutoff, as well as + topology links as an edge list, if relevant. Args: run (dict): run parameters @@ -528,6 +531,10 @@ def write_cutoff_network_file( cutoff, ) + if run["record_type"] != bs_enums.RECORD_TYPE.REGION: + topolink_path = pair_generator_path / f"{bin_label}_c{cutoff}_topolinks.network" + write_topolink_file(pair_generator.source_records, topolink_path) + def write_full_network_file(run: dict, all_bgc_records: list[BGCRecord]) -> None: """Writes the full network file to the output directory, @@ -704,3 +711,61 @@ def write_network_file( ) network_file.write(row + "\n") + + +def write_topolink_file(bgc_records: list[BGCRecord], output_path: Path) -> None: + """Write topology links as edges to a network file + + Args: + bgc_records (list[BGCRecord]): BGC records to find and write topolinks for + output_path (Path): output file path + """ + + def find_record_type(record): + """Helper function to correctly spelled record type""" + if isinstance(record, CandidateCluster): + return "cand_cluster" + elif isinstance(record, ProtoCluster): + return "protocluster" + elif isinstance(record, ProtoCore): + return "proto_core" + else: + # redundancy, should never reach this + return "region" + + parent_dict: dict[GBK, list[BGCRecord]] = {} + + for record in bgc_records: + if record.parent_gbk is not None: + parent_dict.setdefault(record.parent_gbk, []).append(record) + + # don't create a file if there are no topolinks in this bin + if len(parent_dict) == len(bgc_records): + return + + with open(output_path, "w") as topolink_file: + header = ( + "GBK_a\tRecord_Type_a\tRecord_Number_a\tFull_Name_a\tGBK_b\t" + "Record_Type_b\tRecord_Number_b\tFull_Name_b\tType\n" + ) + topolink_file.write(header) + for parent, records in parent_dict.items(): + if len(records) > 1: + for rec_a, rec_b in 
combinations(records, 2): + type_a = find_record_type(rec_a) + type_b = find_record_type(rec_b) + + row = "\t".join( + [ + parent.path.stem, + type_a, + str(rec_a.number), + f"{parent.path.name}_{type_a}_{rec_a.number}", + parent.path.stem, + type_b, + str(rec_b.number), + f"{parent.path.name}_{type_b}_{rec_b.number}", + "Topology", + ] + ) + topolink_file.write(row + "\n") diff --git a/pyproject.toml b/pyproject.toml index baa2ff18..215c10f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name ="big-scape" -version = "2.0.0-beta.3" +version = "2.0.0-beta.4" description = "Biosynthetic Gene Similarity Clustering and Prospecting Engine" requires-python = ">=3.11" license = { file = "LICENSE" } diff --git a/test/integration/test_network.py b/test/integration/test_network.py index 5457390f..af49fa80 100644 --- a/test/integration/test_network.py +++ b/test/integration/test_network.py @@ -465,7 +465,7 @@ def test_get_connected_components_no_ref_to_ref_ccs(self): cc, include_records ) if is_ref_only: - bs_network.remove_connected_component(cc, 0.8, 1) + bs_network.remove_connected_component(cc, mix_bin.label, 0.8, 1) cc_table = bs_data.DB.metadata.tables["connected_component"] @@ -488,7 +488,7 @@ def test_get_connected_components_no_ref_to_ref_ccs(self): cc, include_records ) if is_ref_only: - bs_network.remove_connected_component(cc, 0.5, 1) + bs_network.remove_connected_component(cc, mix_bin.label, 0.5, 1) cc_table = bs_data.DB.metadata.tables["connected_component"] diff --git a/test/network/test_network.py b/test/network/test_network.py index 21ed7bff..89b870bd 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -463,7 +463,7 @@ def test_get_connected_component_id(self): cc = next(bs_network.get_connected_components(0.5, 1, mix_bin, 1)) - cc_id = bs_network.get_connected_component_id(cc, 0.5, 1) + cc_id = bs_network.get_connected_component_id(cc, mix_bin.label, 0.5, 1) expected_data = 1 @@ -528,7 +528,7 @@ def 
test_remove_connected_component(self): cc, include_records ) if is_ref_only: - bs_network.remove_connected_component(cc, 0.5, 1) + bs_network.remove_connected_component(cc, mix_bin.label, 0.5, 1) select_statement = select(cc_table.c.id).distinct()