+ *On larger datasets (upwards of 1000 BGCs), this might take a
+ minute to load.
@@ -289,7 +291,14 @@
Network
-
+
+
+
+ If loading takes longer than a minute, your database is likely too
+ large to be loaded. Use the output network tsv files to construct your
+ networks in a networking tool like Cytoscape.
+
+
function dataLoaded() { // only called after a database is loaded
showLoading(false, false);
+ createTmpTables(); // set up the rec_ids/gbk_ids/cds_ids tables that the id-filtered queries select from
populateRunSelect();
}
+ function createTmpTables() {
+ // creates tmp db tables for large query building
+ window.db.run(`CREATE TABLE rec_ids (rec_id int)`)
+ window.db.run(`CREATE TABLE gbk_ids (gbk_id int)`)
+ window.db.run(`CREATE TABLE cds_ids (cds_id int)`)
+ }
+
+ function populateTmpTable(table_name, data) {
+ //populate tmp db tables in batches of 10k insertions
+ window.db.run(`DELETE FROM ${table_name}`) //clear table
+ for (i = 0; i < data.length; i += 10000) {
+ window.db.run(`INSERT INTO ${table_name} VALUES ${data.slice(i, i + 10000).map(idx => "(" + idx + ")")}`)
+ }
+ }
+
function populateRunSelect() {
runs = window.db.exec(`SELECT * FROM run`)[0]
run_data = {}
@@ -637,7 +662,8 @@
Network
}
var bs_families = bs_ccs.flatMap(cc => cc.families)
var bs_records = bs_families.flatMap(fam => fam.members)
- var bs_data = generate_bs_data(bs_records, run_data)
+ populateTmpTable("rec_ids", bs_records)
+ var bs_data = generate_bs_data(run_data)
var bs_alignments = generate_bs_alignments(bs_families, bs_data)
var bs_similarities = generate_bs_similarity(bs_records, run_data)
var bs = new Bigscape(run_data, bs_data, bs_families, bs_alignments, bs_similarities, "network-container")
@@ -647,7 +673,7 @@
Network
// load a single connected component and topolinked ccs if present.
// to distinguish between the selected cc and topolinked ones, pass
// all topolinked record ids in render_options when creating a Bigscape object
- var render_options = { "render_time": 1, "topo_records": [] }
+ var render_options = { "topo_records": [] }
var bs_family = bs_cc["families"]
if (topo_ccs.length && $("#topo-toggle").is(":checked")) {
var topo_cc_ids = topo_ccs.map(cc => cc.id)
@@ -656,7 +682,8 @@
Network
bs_family = bs_family.concat(topo_fams)
}
var bs_records = bs_family.flatMap(fam => fam.members)
- var bs_cc_data = generate_bs_data(bs_records, run_data)
+ populateTmpTable("rec_ids", bs_records)
+ var bs_cc_data = generate_bs_data(run_data)
var bs_alignment = generate_bs_alignments(bs_family, bs_cc_data)
var bs_similarity = generate_bs_similarity(bs_records, run_data)
var bs = new Bigscape(run_data, bs_cc_data, bs_family, bs_alignment, bs_similarity, "network-container", render_options)
@@ -697,8 +724,9 @@
Network
}
function generate_bs_data_domains(cds_ids) {
+ populateTmpTable("cds_ids", cds_ids)
let dom_query = `SELECT hsp.cds_id, hsp.accession, hsp.env_start, hsp.env_stop, hsp.bit_score FROM hsp
- WHERE hsp.cds_id IN (${cds_ids})`
+ WHERE hsp.cds_id IN (SELECT cds_id FROM cds_ids)`
var domains = window.db.exec(dom_query)[0].values
var dom_data = {}
for (i in domains) {
@@ -710,9 +738,11 @@
Network
}
return dom_data
}
+
function generate_bs_data_orfs(gbk_ids) {
+ populateTmpTable("gbk_ids", gbk_ids)
let cds_query = `SELECT cds.gbk_id, cds.orf_num, cds.strand, cds.nt_start, cds.nt_stop, cds.id FROM cds
- WHERE cds.gbk_id IN (${gbk_ids})`
+ WHERE cds.gbk_id IN (SELECT gbk_id FROM gbk_ids)`
var cds = window.db.exec(cds_query)[0].values
var dom_data = generate_bs_data_domains(cds.map(c => c[5]))
@@ -726,12 +756,13 @@
Network
}
return cds_data
}
- function generate_bs_data(record_ids, run_data) {
+
+ function generate_bs_data(run_data) {
var records_query = `SELECT gbk.id, gbk.description, length(gbk.nt_seq), gbk.hash, gbk.path,
bgc_record.record_type, bgc_record.nt_start, bgc_record.nt_stop,
bgc_record.record_number, bgc_record.id FROM bgc_record
INNER JOIN gbk ON bgc_record.gbk_id==gbk.id
- WHERE bgc_record.id IN (${record_ids})`
+ WHERE bgc_record.id IN (SELECT rec_id FROM rec_ids)`
var records = window.db.exec(records_query)[0].values
cds_data = generate_bs_data_orfs(records.map(rec => rec[0]))
@@ -800,8 +831,8 @@
Network
var distances = window.db.exec(`SELECT distance.record_a_id, distance.record_b_id, distance.distance FROM distance
INNER JOIN edge_params ON edge_params.id==distance.edge_param_id
- WHERE distance.distance<${cutoff} AND distance.record_a_id IN (${record_ids})
- AND distance.record_b_id IN (${record_ids}) AND edge_params.weights=="${weight}"
+ WHERE distance.distance<${cutoff} AND distance.record_a_id IN (SELECT rec_id FROM rec_ids)
+ AND distance.record_b_id IN (SELECT rec_id FROM rec_ids) AND edge_params.weights=="${weight}"
AND edge_params.alignment_mode=="${alignment_mode}"
AND edge_params.extend_strategy=="${extend_strategy}"`)[0].values
var bs_sim = {}
diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py
index 74ea1741..e1b9698e 100644
--- a/big_scape/output/legacy_output.py
+++ b/big_scape/output/legacy_output.py
@@ -10,11 +10,12 @@
import click
from sqlalchemy import select, alias
from typing import Optional
+from itertools import combinations
# from other modules
from big_scape.data import DB
from big_scape.comparison import RecordPairGenerator
-from big_scape.genbank import GBK, BGCRecord
+from big_scape.genbank import GBK, BGCRecord, CandidateCluster, ProtoCluster, ProtoCore
from big_scape.trees import generate_newick_tree, save_trees
from big_scape.comparison import lcs, get_record_category
@@ -259,7 +260,7 @@ def legacy_generate_bin_output(
cutoff, pair_generator, run["run_id"]
)
write_clustering_file(run, cutoff, pair_generator)
- write_cutoff_network_file(run, cutoff, pair_generator)
+ write_cutoff_network_files(run, cutoff, pair_generator)
if click_context and click_context.obj["no_trees"]:
return
@@ -498,11 +499,13 @@ def write_clustering_file(run, cutoff, pair_generator) -> None:
return None
-def write_cutoff_network_file(
+def write_cutoff_network_files(
run: dict, cutoff: float, pair_generator: RecordPairGenerator
) -> None:
- """Writes the cutoff network file to the output directory
- i.e. edge list for a given bin with edges above the cutoff
+ """Writes the cutoff network files to the output directory
+
+ This includes an edge list for a given bin with edges above the cutoff, as well as
+ topology links as an edge list, if relevant.
Args:
run (dict): run parameters
@@ -528,6 +531,10 @@ def write_cutoff_network_file(
cutoff,
)
+ if run["record_type"] != bs_enums.RECORD_TYPE.REGION:
+ topolink_path = pair_generator_path / f"{bin_label}_c{cutoff}_topolinks.network"
+ write_topolink_file(pair_generator.source_records, topolink_path)
+
def write_full_network_file(run: dict, all_bgc_records: list[BGCRecord]) -> None:
"""Writes the full network file to the output directory,
@@ -704,3 +711,61 @@ def write_network_file(
)
network_file.write(row + "\n")
+
+
+def write_topolink_file(bgc_records: list[BGCRecord], output_path: Path) -> None:
+ """Write topology links as edges to a network file
+
+ Args:
+ bgc_records (list[BGCRecord]): BGC records to find and write topolinks for
+ output_path (Path): output file path
+ """
+
+ def find_record_type(record):
+ """Helper function to correctly spelled record type"""
+ if isinstance(record, CandidateCluster):
+ return "cand_cluster"
+ elif isinstance(record, ProtoCluster):
+ return "protocluster"
+ elif isinstance(record, ProtoCore):
+ return "proto_core"
+ else:
+ # redundancy, should never reach this
+ return "region"
+
+ parent_dict: dict[GBK, list[BGCRecord]] = {}
+
+ for record in bgc_records:
+ if record.parent_gbk is not None:
+ parent_dict.setdefault(record.parent_gbk, []).append(record)
+
+ # don't create a file if there are no topolinks in this bin
+ if len(parent_dict) == len(bgc_records):
+ return
+
+ with open(output_path, "w") as topolink_file:
+ header = (
+ "GBK_a\tRecord_Type_a\tRecord_Number_a\tFull_Name_a\tGBK_b\t"
+ "Record_Type_b\tRecord_Number_b\tFull_Name_b\tType\n"
+ )
+ topolink_file.write(header)
+ for parent, records in parent_dict.items():
+ if len(records) > 1:
+ for rec_a, rec_b in combinations(records, 2):
+ type_a = find_record_type(rec_a)
+ type_b = find_record_type(rec_b)
+
+ row = "\t".join(
+ [
+ parent.path.stem,
+ type_a,
+ str(rec_a.number),
+ f"{parent.path.name}_{type_a}_{rec_a.number}",
+ parent.path.stem,
+ type_b,
+ str(rec_b.number),
+ f"{parent.path.name}_{type_b}_{rec_b.number}",
+ "Topology",
+ ]
+ )
+ topolink_file.write(row + "\n")
diff --git a/pyproject.toml b/pyproject.toml
index baa2ff18..215c10f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name ="big-scape"
-version = "2.0.0-beta.3"
+version = "2.0.0-beta.4"
description = "Biosynthetic Gene Similarity Clustering and Prospecting Engine"
requires-python = ">=3.11"
license = { file = "LICENSE" }
diff --git a/test/integration/test_network.py b/test/integration/test_network.py
index 5457390f..af49fa80 100644
--- a/test/integration/test_network.py
+++ b/test/integration/test_network.py
@@ -465,7 +465,7 @@ def test_get_connected_components_no_ref_to_ref_ccs(self):
cc, include_records
)
if is_ref_only:
- bs_network.remove_connected_component(cc, 0.8, 1)
+ bs_network.remove_connected_component(cc, mix_bin.label, 0.8, 1)
cc_table = bs_data.DB.metadata.tables["connected_component"]
@@ -488,7 +488,7 @@ def test_get_connected_components_no_ref_to_ref_ccs(self):
cc, include_records
)
if is_ref_only:
- bs_network.remove_connected_component(cc, 0.5, 1)
+ bs_network.remove_connected_component(cc, mix_bin.label, 0.5, 1)
cc_table = bs_data.DB.metadata.tables["connected_component"]
diff --git a/test/network/test_network.py b/test/network/test_network.py
index 21ed7bff..89b870bd 100644
--- a/test/network/test_network.py
+++ b/test/network/test_network.py
@@ -463,7 +463,7 @@ def test_get_connected_component_id(self):
cc = next(bs_network.get_connected_components(0.5, 1, mix_bin, 1))
- cc_id = bs_network.get_connected_component_id(cc, 0.5, 1)
+ cc_id = bs_network.get_connected_component_id(cc, mix_bin.label, 0.5, 1)
expected_data = 1
@@ -528,7 +528,7 @@ def test_remove_connected_component(self):
cc, include_records
)
if is_ref_only:
- bs_network.remove_connected_component(cc, 0.5, 1)
+ bs_network.remove_connected_component(cc, mix_bin.label, 0.5, 1)
select_statement = select(cc_table.c.id).distinct()