
Merge pull request #251 from medema-group/release/2.0.0-beta.4
Release/2.0.0 beta.4
adraismawur authored Jan 22, 2025
2 parents 44a4cc7 + 784b237 commit da5614a
Showing 14 changed files with 149 additions and 45 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -6,7 +6,7 @@
![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg)
![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg)

## _Note: BiG-SCAPE 2.0 is still in beta. Please submit an issue if you find anything wrong with this release!_
## _Notes:<br>BiG-SCAPE 2.0 is still in beta. Please submit an issue if you find anything wrong with this release!<br>BiG-SCAPE 2.0 features several updates to input validation and reference usage. We encourage both experienced BiG-SCAPE 1 users as well as new BiG-SCAPE users to read the updated [documentation](https://github.com/medema-group/BiG-SCAPE/wiki)._

# BiG-SCAPE

@@ -21,7 +21,7 @@ For installation instructions, see [here](https://github.com/medema-group/BiG-SC
Learn more about BiG-SCAPE in the [wiki](https://github.com/medema-group/BiG-SCAPE/wiki).


![BiG-SCAPE workflow](Figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png)
![BiG-SCAPE workflow](figures/BiG-SCAPE-CORASON_Fig1_20171122_v01_MM_nocorason.png)


If you find BiG-SCAPE useful, please cite us:
5 changes: 4 additions & 1 deletion big_scape/data/sqlite.py
@@ -14,6 +14,7 @@
Compiled,
Select,
Insert,
Delete,
CursorResult,
create_engine,
func,
@@ -283,7 +284,9 @@ def execute_raw_query(query: str) -> CursorResult:
return DB.connection.execute(text(query))

@staticmethod
def execute(query: Compiled | Select[Any] | Insert, commit=True) -> CursorResult:
def execute(
query: Compiled | Select[Any] | Insert | Delete, commit=True
) -> CursorResult:
"""Wrapper for SQLAlchemy.connection.execute expecting a Compiled query
Arguments:
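
With `Delete` imported and the `execute` signature widened, SQLAlchemy delete statements can now pass through the same wrapper as selects and inserts. A minimal standalone sketch of the statement types involved, using a throwaway in-memory database rather than BiG-SCAPE's own `DB` class:

```python
from sqlalchemy import (
    Column, Integer, MetaData, String, Table, create_engine, delete, insert, select,
)

# throwaway in-memory database standing in for BiG-SCAPE's SQLite file
engine = create_engine("sqlite:///:memory:")
metadata = MetaData()
demo = Table(
    "demo", metadata, Column("id", Integer, primary_key=True), Column("label", String)
)
metadata.create_all(engine)

with engine.connect() as conn:
    conn.execute(insert(demo), [{"id": 1, "label": "keep"}, {"id": 2, "label": "drop"}])
    # a Delete built like this is the kind of statement the widened
    # DB.execute() signature now accepts alongside Select and Insert
    conn.execute(delete(demo).where(demo.c.label == "drop"))
    conn.commit()
    print(conn.execute(select(demo)).fetchall())  # [(1, 'keep')]
```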
2 changes: 1 addition & 1 deletion big_scape/distances/query.py
@@ -67,7 +67,7 @@ def calculate_distances_query(
query_nodes = bs_network.get_nodes_from_cc(query_connected_component, query_records)

bs_network.remove_connected_component(
query_connected_component, max_cutoff, run["run_id"]
query_connected_component, query_bin.label, max_cutoff, run["run_id"]
)

query_bin_connected = bs_comparison.RecordPairGenerator(
15 changes: 8 additions & 7 deletions big_scape/genbank/gbk.py
@@ -95,11 +95,11 @@ def batch_hash(gbks: list[GBK], n: int):
return temp_table


def create_temp_gbk_id_table(gbks: list[GBK]) -> Table:
def create_temp_gbk_id_table(gbk_ids: list[int]) -> Table:
"""Create a temporary table with ids of given gbks
Args:
gbks (list[GBK]): the gbks to include in the connected component
gbk_ids (list[int]): the ids of the gbks to add to the temporary table
Returns:
Table: the temporary table
@@ -132,12 +132,13 @@ def create_temp_gbk_id_table(gbks: list[GBK]) -> Table:
INSERT INTO {temp_table_name} (gbk_id) VALUES (?);
"""

def batch_hash(gbks: list[GBK], n: int):
l = len(gbks)
# local function for batching
def batch_hash(gbk_ids: list[int], n: int):
l = len(gbk_ids)
for ndx in range(0, l, n):
yield [gbk._db_id for gbk in gbks[ndx : min(ndx + n, l)]]
yield [gbk_id for gbk_id in gbk_ids[ndx : min(ndx + n, l)]]

for hash_batch in batch_hash(gbks, 1000):
for hash_batch in batch_hash(gbk_ids, 1000):
cursor.executemany(insert_query, [(x,) for x in hash_batch]) # type: ignore

cursor.close()
@@ -412,7 +413,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
# load GBK regions. This will also populate all record levels below region
# e.g. candidate cluster, protocore if they exist

temp_gbk_id_table = create_temp_gbk_id_table(input_gbks)
temp_gbk_id_table = create_temp_gbk_id_table(list(gbk_dict.keys()))

Region.load_all(gbk_dict, temp_gbk_id_table)

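
`create_temp_gbk_id_table` now receives plain database ids instead of `GBK` objects and fills the temporary table in batches of 1000. A standalone sketch of that pattern using the `sqlite3` module directly (the table name and id values here are placeholders, not BiG-SCAPE's schema):

```python
import sqlite3


def batch(ids: list[int], n: int):
    """Yield successive chunks of at most n ids."""
    for ndx in range(0, len(ids), n):
        yield ids[ndx : ndx + n]


conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute("CREATE TEMPORARY TABLE temp_gbk_id (gbk_id INTEGER PRIMARY KEY);")

gbk_ids = list(range(2500))  # placeholder; in load_many these come from gbk_dict.keys()
for chunk in batch(gbk_ids, 1000):
    # insert each batch with executemany, mirroring the batched INSERT in the diff
    cursor.executemany("INSERT INTO temp_gbk_id (gbk_id) VALUES (?);", [(i,) for i in chunk])

cursor.close()
conn.commit()
```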
2 changes: 1 addition & 1 deletion big_scape/genbank/proto_cluster.py
@@ -270,7 +270,7 @@ def load_all(
# add to dictionary
protocluster_dict[result.id] = new_proto_cluster

ProtoCore.load_all(protocluster_dict)
ProtoCore.load_all(protocluster_dict, temp_gbk_id_table)


class MergedProtoCluster(ProtoCluster):
2 changes: 1 addition & 1 deletion big_scape/network/families.py
@@ -311,7 +311,7 @@ def run_family_assignments(
connected_component, bin.source_records
):
bs_network.remove_connected_component(
connected_component, cutoff, run["run_id"]
connected_component, bin.label, cutoff, run["run_id"]
)
continue

18 changes: 11 additions & 7 deletions big_scape/network/network.py
@@ -548,7 +548,7 @@ def reference_only_connected_component(connected_component, bgc_records) -> bool


def get_connected_component_id(
connected_component: list, cutoff: float, run_id: int
connected_component: list, bin_label: str, cutoff: float, run_id: int
) -> int:
"""Get the connected component id for the given connected component
expects all edges to be in one connected component, if thats not the
@@ -575,33 +575,37 @@
.distinct()
.where(
and_(
cc_table.c.cutoff == cutoff,
cc_table.c.record_id == record_id,
cc_table.c.bin_label == bin_label,
cc_table.c.cutoff == cutoff,
cc_table.c.run_id == run_id,
)
)
.limit(1)
)

cc_ids = DB.execute(select_statement).fetchone()
cc_id = DB.execute(select_statement).scalar_one()

return cc_ids[0]
return cc_id


def remove_connected_component(
connected_component: list, cutoff: float, run_id: int
connected_component: list, bin_label: str, cutoff: float, run_id: int
) -> None:
"""Removes a connected component from the cc table in the database"""

if DB.metadata is None:
raise RuntimeError("DB.metadata is None")

cc_id = get_connected_component_id(connected_component, cutoff, run_id)
cc_id = get_connected_component_id(connected_component, bin_label, cutoff, run_id)

cc_table = DB.metadata.tables["connected_component"]

delete_statement = delete(cc_table).where(
cc_table.c.id == cc_id, cc_table.c.cutoff == cutoff, cc_table.c.run_id == run_id
cc_table.c.id == cc_id,
cc_table.c.bin_label == bin_label,
cc_table.c.cutoff == cutoff,
cc_table.c.run_id == run_id,
)

DB.execute(delete_statement)
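
Connected components are now looked up and removed per bin: both statements filter on `bin_label`, and the id is read with `scalar_one()` instead of `fetchone()[0]`. A self-contained sketch of the two statements against a toy version of the `connected_component` table (the column subset and values are illustrative only, not the full BiG-SCAPE schema):

```python
from sqlalchemy import (
    Column, Float, Integer, MetaData, String, Table, and_,
    create_engine, delete, insert, select,
)

engine = create_engine("sqlite:///:memory:")
metadata = MetaData()
cc_table = Table(
    "connected_component", metadata,
    Column("id", Integer), Column("record_id", Integer),
    Column("bin_label", String), Column("cutoff", Float), Column("run_id", Integer),
)
metadata.create_all(engine)

with engine.connect() as conn:
    conn.execute(insert(cc_table), [
        {"id": 1, "record_id": 7, "bin_label": "mix", "cutoff": 0.3, "run_id": 1},
        {"id": 2, "record_id": 7, "bin_label": "PKS", "cutoff": 0.3, "run_id": 1},
    ])
    # the same record can sit in connected components of several bins,
    # so the lookup now also matches on bin_label
    cc_id = conn.execute(
        select(cc_table.c.id)
        .distinct()
        .where(and_(
            cc_table.c.record_id == 7,
            cc_table.c.bin_label == "mix",
            cc_table.c.cutoff == 0.3,
            cc_table.c.run_id == 1,
        ))
        .limit(1)
    ).scalar_one()
    # the delete is constrained the same way before being executed
    conn.execute(delete(cc_table).where(
        cc_table.c.id == cc_id,
        cc_table.c.bin_label == "mix",
        cc_table.c.cutoff == 0.3,
        cc_table.c.run_id == 1,
    ))
    conn.commit()
```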
4 changes: 2 additions & 2 deletions big_scape/output/__init__.py
@@ -5,7 +5,7 @@
legacy_generate_bin_output,
write_record_annotations_file,
write_clustering_file,
write_cutoff_network_file,
write_cutoff_network_files,
write_full_network_file,
)

@@ -16,6 +16,6 @@
"legacy_generate_bin_output",
"write_record_annotations_file",
"write_clustering_file",
"write_cutoff_network_file",
"write_cutoff_network_files",
"write_full_network_file",
]
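
The cutoff network writer is now exported under a plural name, so downstream imports change accordingly (one-line illustration, assuming the `big_scape` package is installed):

```python
from big_scape.output import write_cutoff_network_files  # was write_cutoff_network_file
```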
@@ -487,7 +487,7 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n
showSingletons(true);
updateDescription(highlighted_nodes);
net_ui.find("svg").css("height", "100%").css("width", "100%");
var countDown = options["render_time"] ? options["render_time"] : 5 + parseInt(graph.getLinksCount() / 1000);
var countDown = options["render_time"] ? options["render_time"] : 1 + parseInt(graph.getNodesCount() / 300) + parseInt(graph.getLinksCount() / 1000);
var perZoom = 5;
var zoomCount = 0;
info_ui.append("<div>Adjusting network layout for... <span class='network-layout-counter'>" + countDown + "</span> second(s)</div>");
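
The layout countdown now scales with node count as well as link count. The updated expression restated as arithmetic (a Python mirror of the JavaScript; the function name is made up for illustration):

```python
def layout_countdown_seconds(node_count: int, link_count: int, render_time: int = 0) -> int:
    """Seconds to keep adjusting the network layout, unless an explicit render_time overrides it."""
    if render_time:
        return render_time
    return 1 + node_count // 300 + link_count // 1000


print(layout_countdown_seconds(900, 4000))  # 1 + 3 + 4 = 8 seconds
```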
55 changes: 43 additions & 12 deletions big_scape/output/html_template/output/index.html
@@ -269,6 +269,8 @@ <h3>Network</h3>
</div>
<div style="margin:1em">
<button id="load-all" title="Load all CCs currently shown in the table">Visualize All</button>
<span class="w3-tiny" style="margin:0.5em">*On larger datasets (Upward 1000 BGCs), this might take a
minute to load.</span>
</div>
<div id="topo-toggle-container" style="margin:1em">
</div>
@@ -289,7 +291,14 @@ <h3>Network</h3>
<div class="w3-modal-content" style="width: 400px; text-align: center; padding: 1em; border-radius: 1em">
<button id="close-db-select" class="showhide-btn active"></button>
<div class="w3-container">
<img src="./html_content/img/loading.gif"></img>
<div id="loadingImg">
<img src="./html_content/img/loading.gif">
<br>
If loading takes longer than a minute, your database is likely too
large to be loaded. Use the output network tsv files to contruct your
networks in a networking tool like Cytoscape.
<br>
</div>
<br>
Select a database file to read from:
</div>
@@ -358,9 +367,9 @@ <h3>Network</h3>
if (enable) {
$("#loadingWindow").css("display", "block");
if (spinner) {
$("#loadingWindow img").css("display", "inline");
$("#loadingImg").css("display", "inline");
} else {
$("#loadingWindow img").css("display", "none");
$("#loadingImg").css("display", "none");
}
} else {
$("#loadingWindow").css("display", "none");
@@ -453,9 +462,25 @@ <h3>Network</h3>

function dataLoaded() { // only called after a database is loaded
showLoading(false, false);
createTmpTables();
populateRunSelect();
}

function createTmpTables() {
// creates tmp db tables for large query building
window.db.run(`CREATE TABLE rec_ids (rec_id int)`)
window.db.run(`CREATE TABLE gbk_ids (gbk_id int)`)
window.db.run(`CREATE TABLE cds_ids (cds_id int)`)
}

function populateTmpTable(table_name, data) {
//populate tmp db tables in batches of 10k insertions
window.db.run(`DELETE FROM ${table_name}`) //clear table
for (i = 0; i < data.length; i += 10000) {
window.db.run(`INSERT INTO ${table_name} VALUES ${data.slice(i, i + 10000).map(idx => "(" + idx + ")")}`)
}
}

function populateRunSelect() {
runs = window.db.exec(`SELECT * FROM run`)[0]
run_data = {}
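
`populateTmpTable` clears the scratch table and refills it with multi-row `VALUES` batches of 10 000 ids, so no single `INSERT` statement grows without bound. A sketch of the same pattern in Python's `sqlite3` module, standing in for sql.js running in the browser:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE rec_ids (rec_id int)")


def populate_tmp_table(table_name: str, ids: list[int], batch_size: int = 10_000) -> None:
    """Clear the scratch table, then insert ids in multi-row VALUES batches."""
    conn.execute(f"DELETE FROM {table_name}")
    for i in range(0, len(ids), batch_size):
        values = ",".join(f"({v})" for v in ids[i : i + batch_size])
        conn.execute(f"INSERT INTO {table_name} VALUES {values}")


populate_tmp_table("rec_ids", list(range(25_000)))
print(conn.execute("SELECT COUNT(*) FROM rec_ids").fetchone())  # (25000,)
```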
@@ -637,7 +662,8 @@ <h3>Network</h3>
}
var bs_families = bs_ccs.flatMap(cc => cc.families)
var bs_records = bs_families.flatMap(fam => fam.members)
var bs_data = generate_bs_data(bs_records, run_data)
populateTmpTable("rec_ids", bs_records)
var bs_data = generate_bs_data(run_data)
var bs_alignments = generate_bs_alignments(bs_families, bs_data)
var bs_similarities = generate_bs_similarity(bs_records, run_data)
var bs = new Bigscape(run_data, bs_data, bs_families, bs_alignments, bs_similarities, "network-container")
@@ -647,7 +673,7 @@
// load a single connected component and topolinked ccs if present.
// to distinguish between the selected cc and topolinked ones, pass
// all topolinked record ids in render_options when creating a Bigscape object
var render_options = { "render_time": 1, "topo_records": [] }
var render_options = { "topo_records": [] }
var bs_family = bs_cc["families"]
if (topo_ccs.length && $("#topo-toggle").is(":checked")) {
var topo_cc_ids = topo_ccs.map(cc => cc.id)
@@ -656,7 +682,8 @@
bs_family = bs_family.concat(topo_fams)
}
var bs_records = bs_family.flatMap(fam => fam.members)
var bs_cc_data = generate_bs_data(bs_records, run_data)
populateTmpTable("rec_ids", bs_records)
var bs_cc_data = generate_bs_data(run_data)
var bs_alignment = generate_bs_alignments(bs_family, bs_cc_data)
var bs_similarity = generate_bs_similarity(bs_records, run_data)
var bs = new Bigscape(run_data, bs_cc_data, bs_family, bs_alignment, bs_similarity, "network-container", render_options)
@@ -697,8 +724,9 @@ <h3>Network</h3>
}

function generate_bs_data_domains(cds_ids) {
populateTmpTable("cds_ids", cds_ids)
let dom_query = `SELECT hsp.cds_id, hsp.accession, hsp.env_start, hsp.env_stop, hsp.bit_score FROM hsp
WHERE hsp.cds_id IN (${cds_ids})`
WHERE hsp.cds_id IN (SELECT cds_id FROM cds_ids)`
var domains = window.db.exec(dom_query)[0].values
var dom_data = {}
for (i in domains) {
Expand All @@ -710,9 +738,11 @@ <h3>Network</h3>
}
return dom_data
}

function generate_bs_data_orfs(gbk_ids) {
populateTmpTable("gbk_ids", gbk_ids)
let cds_query = `SELECT cds.gbk_id, cds.orf_num, cds.strand, cds.nt_start, cds.nt_stop, cds.id FROM cds
WHERE cds.gbk_id IN (${gbk_ids})`
WHERE cds.gbk_id IN (SELECT gbk_id FROM gbk_ids)`
var cds = window.db.exec(cds_query)[0].values

var dom_data = generate_bs_data_domains(cds.map(c => c[5]))
Expand All @@ -726,12 +756,13 @@ <h3>Network</h3>
}
return cds_data
}
function generate_bs_data(record_ids, run_data) {

function generate_bs_data(run_data) {
var records_query = `SELECT gbk.id, gbk.description, length(gbk.nt_seq), gbk.hash, gbk.path,
bgc_record.record_type, bgc_record.nt_start, bgc_record.nt_stop,
bgc_record.record_number, bgc_record.id FROM bgc_record
INNER JOIN gbk ON bgc_record.gbk_id==gbk.id
WHERE bgc_record.id IN (${record_ids})`
WHERE bgc_record.id IN (SELECT rec_id FROM rec_ids)`
var records = window.db.exec(records_query)[0].values
cds_data = generate_bs_data_orfs(records.map(rec => rec[0]))

@@ -800,8 +831,8 @@ <h3>Network</h3>

var distances = window.db.exec(`SELECT distance.record_a_id, distance.record_b_id, distance.distance FROM distance
INNER JOIN edge_params ON edge_params.id==distance.edge_param_id
WHERE distance.distance<${cutoff} AND distance.record_a_id IN (${record_ids})
AND distance.record_b_id IN (${record_ids}) AND edge_params.weights=="${weight}"
WHERE distance.distance<${cutoff} AND distance.record_a_id IN (SELECT rec_id FROM rec_ids)
AND distance.record_b_id IN (SELECT rec_id FROM rec_ids) AND edge_params.weights=="${weight}"
AND edge_params.alignment_mode=="${alignment_mode}"
AND edge_params.extend_strategy=="${extend_strategy}"`)[0].values
var bs_sim = {}
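
On the query side, `generate_bs_data`, `generate_bs_data_orfs`, `generate_bs_data_domains`, and the distance query no longer interpolate id lists into the SQL text; they read the staged ids back with `IN (SELECT ... FROM ...)` subqueries. A standalone sketch against a toy two-table schema (the real columns live in the BiG-SCAPE database and are only partially reproduced here):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE gbk (id int, path text);
    CREATE TABLE bgc_record (id int, gbk_id int, record_type text);
    CREATE TABLE rec_ids (rec_id int);
    INSERT INTO gbk VALUES (1, 'example.gbk');
    INSERT INTO bgc_record VALUES (10, 1, 'region'), (11, 1, 'proto_cluster');
    INSERT INTO rec_ids VALUES (10);
    """
)

# only records staged in rec_ids come back; the id list never appears in the SQL text
rows = conn.execute(
    """
    SELECT gbk.id, gbk.path, bgc_record.record_type, bgc_record.id
    FROM bgc_record
    INNER JOIN gbk ON bgc_record.gbk_id == gbk.id
    WHERE bgc_record.id IN (SELECT rec_id FROM rec_ids)
    """
).fetchall()
print(rows)  # [(1, 'example.gbk', 'region', 10)]
```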