From ed0de21d6b2c8fa598bc4c7d533fe4000fd9c2bd Mon Sep 17 00:00:00 2001 From: nlouwen Date: Tue, 31 Oct 2023 13:46:47 +0100 Subject: [PATCH 01/15] add GCF newick tree creation --- big_scape/trees/__init__.py | 5 + big_scape/trees/newick_tree.py | 221 +++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 big_scape/trees/__init__.py create mode 100644 big_scape/trees/newick_tree.py diff --git a/big_scape/trees/__init__.py b/big_scape/trees/__init__.py new file mode 100644 index 00000000..27088f50 --- /dev/null +++ b/big_scape/trees/__init__.py @@ -0,0 +1,5 @@ +"""Contains newick tree generation functionality""" + +from .newick_tree import generate_newick_tree + +__all__ = ["generate_newick_tree"] diff --git a/big_scape/trees/newick_tree.py b/big_scape/trees/newick_tree.py new file mode 100644 index 00000000..b03f69f3 --- /dev/null +++ b/big_scape/trees/newick_tree.py @@ -0,0 +1,221 @@ +"""Module to generate newick GCF trees""" + +# from python +import tempfile +import subprocess +import logging +import numpy as np +from typing import TextIO +from Bio import Phylo +from pathlib import Path +from collections import defaultdict +from scipy.optimize import linear_sum_assignment + +# from other modules +from big_scape.genbank import BGCRecord + + +def generate_newick_tree( + records: list[BGCRecord], exemplar: int, family_members: list[int] +) -> str: + """Generate newick formatted tree for each GCF + + Args: + records (list[BGCRecord]): list of records within GCF + exemplar (int): index of exemplar to use during alignment + family_members (list[int]): list of bgc ids in one family + + Returns: + Correctly formatted newick tree + """ + # no need for alignment + if len(records) < 3: + return f"({','.join([str(bgc_id)+':0.0' for bgc_id in family_members])}):0.01;" + + algn = generate_gcf_alignment(records, exemplar, family_members) + with tempfile.TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + tmp_algn = tmppath / Path("alignment.fasta") + tmp_newick = tmppath / Path("alignment.newick") + with open(tmp_algn, "w") as tmp_a: + tmp_a.write(algn) + with open(tmp_newick, "w") as tmp_n: + run_fasttree(tmp_algn, tmp_n) + tree = process_newick_tree(tmp_newick) + return tree + + +def run_fasttree(algn_file: Path, out_file: TextIO): + """Generate FastTree newick GCF tree + + Args: + algn_file (Path): Path to alignment file + out_file (TextIO): Opened output file object + """ + subprocess.run( + ["fasttree", "-quiet", "-nopr", algn_file], stdout=out_file, shell=False + ) + + +def process_newick_tree(tree_file: Path) -> str: + """Process newick tree file format + + Args: + tree_file (Path): Path to tree file + """ + if not tree_file.exists(): + logging.error("Failed to create newick tree") + raise FileNotFoundError() + with open(tree_file, "r") as newick_file: + try: + tree = Phylo.read(newick_file, "newick") + except ValueError as e: + logging.warning("Error encountered while reading newick tree: ", str(e)) + return "" + else: + try: + tree.root_at_midpoint() + except UnboundLocalError: + # Noticed this could happen if the sequences are exactly + # the same and all distances == 0 + logging.debug("Unable to root at midpoint") + return tree.format("newick") + + +def generate_gcf_alignment( + records: list[BGCRecord], exemplar: int, family_members: list[int] +) -> str: + """Write protein domain alignment to file + + Args: + records (list[BGCRecord]): Records within one GCF to align + exemplar (int): Index of exemplar to use during alignment + family_members 
(list[int]): list of bgc ids in one family + + Returns: + Written file containing protein domain alignment + """ + record_ids = list(range(len(records))) + + # collect present domains for each GCF member + domain_sets = {} + # count the frequency of occurrence of each domain (excluding copies) + frequency_table: dict[str, int] = defaultdict(int) + for idx, record in enumerate(records): + domain_sets[idx] = set([domain.domain for domain in record.get_hsps()]) + for domain in domain_sets[idx]: + frequency_table[domain] += 1 + + # Find the set of [(tree domains)]. They should 1) be in the exemplar + # and 2) appear with the most frequency. Iterate over the different + # frequencies (descending) until set is not empty + tree_domains: set[str] = set() + frequencies = sorted(set(frequency_table.values()), reverse=True) + + # first try with domain(s) with max frequency, even if it's just one + f = 0 + while len(tree_domains) == 0 and f < len(frequencies): + for domain in frequency_table: + if ( + frequency_table[domain] == frequencies[f] + and domain in domain_sets[exemplar] + ): + tree_domains.add(domain) + f += 1 + if len(tree_domains) == 1: + logging.debug( + "core shared domains for GCF {} consists of a single domain ({})".format( + exemplar, [x for x in tree_domains][0] + ) + ) + + alignments: dict[int, str] = {} + alignments[exemplar] = "" + + out_of_tree_bgcs = [] # no common domain core + delete_bgc = set() + record_ids.remove(exemplar) + for record_idx in record_ids: + alignments[record_idx] = "" + + match_dict: dict[int, int] = {} + for domain in tree_domains: + specific_domain_list_a = [ + hsp for hsp in records[exemplar].get_hsps() if hsp.domain == domain + ] + num_copies_a = len(specific_domain_list_a) + for hsp in specific_domain_list_a: + if hsp.alignment is not None: + alignments[exemplar] += hsp.alignment.align_string + seq_length = len(hsp.alignment.align_string) # TODO: find better spot + + for bgc in alignments: + match_dict.clear() + if bgc == exemplar: + pass + elif domain not in domain_sets[bgc]: + out_of_tree_bgcs.append(bgc) + delete_bgc.add(bgc) + elif bgc in delete_bgc: + pass + else: + specific_domain_list_b = [ + hsp for hsp in records[bgc].get_hsps() if hsp.domain == domain + ] + num_copies_b = len(specific_domain_list_b) + dist_matrix: np.ndarray = np.ndarray((num_copies_a, num_copies_b)) + + for domsa in range(num_copies_a): + for domsb in range(num_copies_b): + hsp_a = specific_domain_list_a[domsa] + hsp_b = specific_domain_list_b[domsb] + + if hsp_a.alignment is None or hsp_b.alignment is None: + logging.error( + "Trying to compare unaligned domains", hsp_a, hsp_b + ) + raise AttributeError() + aligned_seq_a = hsp_a.alignment.align_string + aligned_seq_b = hsp_b.alignment.align_string + + matches = 0 + gaps = 0 + + for position in range(seq_length): + if aligned_seq_a[position] == aligned_seq_b[position]: + if aligned_seq_a[position] != "-": + matches += 1 + else: + gaps += 1 + + dist_matrix[domsa][domsb] = 1 - (matches / (seq_length - gaps)) + + best_indexes = linear_sum_assignment(dist_matrix) + + # at this point is not ensured that we have the same order + # for the exemplar's copies (rows in BestIndexes) + # ideally they should go from 0-numcopies. 
Better make sure + + for x in range(len(best_indexes[0])): + match_dict[best_indexes[0][x]] = best_indexes[1][x] + + for copy in range(num_copies_a): + try: + hsp_b = specific_domain_list_b[match_dict[copy]] + if hsp_b.alignment is None: + logging.error("Encountered unaligned domain", hsp_b) + raise AttributeError() + alignments[bgc] += hsp_b.alignment.align_string + except KeyError: + # This means that this copy of exemplar did not + # have a match in bgc (i.e. bgc has less copies + # of this domain than exemplar) + alignments[bgc] += "-" * seq_length + for bgc in list(delete_bgc): + del alignments[bgc] + delete_bgc.clear() + algn_string = f">{family_members[exemplar]}\n{alignments[exemplar]}\n" + for bgc in alignments: + if bgc != exemplar: + algn_string += f">{family_members[bgc]}\n{alignments[bgc]}\n" + return algn_string From bf0140484ae870dbc4ed8e0a59be9a923bb2b86d Mon Sep 17 00:00:00 2001 From: nlouwen Date: Tue, 31 Oct 2023 13:58:07 +0100 Subject: [PATCH 02/15] generate families alignments --- big_scape/output/legacy_output.py | 173 ++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 8 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 02ea2a1a..ce88c41d 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -12,9 +12,11 @@ from big_scape.comparison import RecordPairGenerator, legacy_get_class from big_scape.genbank import GBK, CDS from big_scape.enums import SOURCE_TYPE +from big_scape.trees import generate_newick_tree import big_scape.network.network as bs_network import big_scape.network.utility as bs_network_utility +import big_scape.comparison as bs_comparison def copy_base_output_templates(output_dir: Path): @@ -527,7 +529,7 @@ def generate_bs_data_js_orfs(gbk: GBK) -> Any: "id": f"{gbk.path.name[:-4]}_ORF{idx+1}", "start": cds.nt_start, "end": cds.nt_stop, - "strand": 1 if cds.strand else -1, + "strand": cds.strand if cds.strand else 1, "domains": generate_bs_data_js_orfs_domains(cds), } ) @@ -674,6 +676,160 @@ def generate_bs_networks_js_sim_matrix( return sparse_matrix +def generate_bs_families_alignment( + bs_families: dict[int, list[int]], pair_generator: RecordPairGenerator +): + """Generate list of dictionaries with information on bgc alignment of family members""" + bs_families_alignment = [] + records = pair_generator.source_records + # dictionary maps bgc to all orfs that contain domains + domain_genes_to_all_genes: dict[int, dict[int, int]] = {} + # dictionary maps bgc to list of domain count per gene + domain_count_gene: dict[int, list[int]] = {} + + for family_db_id, family_members in bs_families.items(): + # collects records within this GCF + family_records = [records[bgc_num] for bgc_num in family_members] + family_member_db_ids = [rec._db_id for rec in family_records] + fam_record_idx = family_member_db_ids.index(family_db_id) + fam_gbk = family_records[fam_record_idx].parent_gbk + if fam_gbk is None: + raise AttributeError("Record parent GBK is not set!") + for bgc, bgc_db_id in zip(family_members, family_member_db_ids): + bgc_gbk = records[bgc].parent_gbk + if bgc_gbk is None: + raise AttributeError("Record parent GBK is not set!") + if bgc_db_id is None: + raise AttributeError("Record has no database id!") + domain_genes_to_all_genes[bgc_db_id] = {} + domain_count_gene[bgc_db_id] = [] + has_domains = 0 + for cds_idx, cds in enumerate(bgc_gbk.genes): + if len(cds.hsps) > 0: + domain_count_gene[bgc_db_id].append(len(cds.hsps)) + 
domain_genes_to_all_genes[bgc_db_id][has_domains] = cds_idx + has_domains += 1 + + ref_genes_ = set() + aln = [] + for bgc, bgc_db_id in zip(family_members, family_member_db_ids): + bgc_gbk = records[bgc].parent_gbk + if bgc_gbk is None: + raise AttributeError("Record parent GBK is not set!") + if bgc_db_id is None: + raise AttributeError("Record has no database id!") + if bgc_db_id == family_db_id: + aln.append([[gene_num, 0] for gene_num in range(len(bgc_gbk.genes))]) + else: + # TODO: implement database storage of lcs and fetch here + ( + a_start, + a_stop, + b_start, + b_stop, + reverse, + ) = bs_comparison.legacy_lcs.legacy_find_cds_lcs( + family_records[fam_record_idx].get_cds_with_domains(), + records[bgc].get_cds_with_domains(), + ) + # if DB.metadata is None: + # raise RuntimeError("Database metadata is None!") + # dist_table = DB.metadata.tables["distance"] + # select_query = ( + # dist_table.select() + # .add_columns( + # dist_table.c.region_a_id, + # dist_table.c.region_b_id, + # dist_table.c.lcs_a_start, + # dist_table.c.lcs_a_stop, + # dist_table.c.lcs_b_start, + # dist_table.c.lcs_b_stop, + # dist_table.c.lcs_reverse, + # ) + # .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) + # .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) + # .compile() + # ) + # result = DB.execute(select_query).fetchone() + # if result is None: + # logging.error( + # "LCS not found in database (%s, %s)", family_db_id, bgc_db_id + # ) + # raise RuntimeError() + # if result.region_a_id == family_db_id: + # a_start: int = result.lcs_a_start + # a_stop: int = result.lcs_a_stop + # b_start: int = result.lcs_b_start + # b_stop: int = result.lcs_b_stop + # reverse: bool = result.lcs_reverse + # elif result.region_b_id == family_db_id: + # a_start = result.lcs_b_start + # a_stop = result.lcs_b_stop + # b_start = result.lcs_a_start + # b_stop = result.lcs_a_stop + # reverse = result.lcs_reverse + + length = abs(a_start - a_stop) # seed length + a_start = domain_genes_to_all_genes[family_db_id][a_start] + if length == 0: + pass + + elif reverse: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - b_start - 1 + ] + else: + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + if length == 0: + length = 1 + # let's try aligning using the genes with most domains + # after all, they ended up being in the same GCF + # for some reason + x = max(domain_count_gene[family_db_id]) + x = domain_count_gene[family_db_id].index(x) + a_start = domain_genes_to_all_genes[family_db_id][x] + + y = max(list(domain_count_gene[bgc_db_id])) + y = domain_count_gene[bgc_db_id].index(y) + + # check orientation + if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: + b_start = domain_genes_to_all_genes[bgc_db_id][y] + reverse = False + else: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - y - 1 + ] + reverse = True + ref_genes_.add(a_start) + + bgc_algn = [] + for gene_num in range(len(bgc_gbk.genes)): + if gene_num == b_start: + if reverse: + bgc_algn.append([a_start, -100]) + else: + bgc_algn.append([a_start, 100]) + else: + bgc_algn.append([-1, 100]) + aln.append(bgc_algn) + + ref_genes = list(ref_genes_) + + fam_alignment = { + "id": "FAM_{:05d}".format(family_db_id), + "ref": family_members[fam_record_idx], + "newick": generate_newick_tree( + family_records, fam_record_idx, family_members + ), + "ref_genes": ref_genes, + "aln": aln, + } + bs_families_alignment.append(fam_alignment) + return bs_families_alignment + + def 
generate_bs_families_members( cutoff: float, pair_generator: RecordPairGenerator, @@ -750,7 +906,7 @@ def generate_bs_networks_js( label: str, cutoff: float, pair_generator: RecordPairGenerator, - bs_families: list[dict[str, Any]], + bs_families: dict[int, list[int]], ): """Generates the bs_networks.js file located at output/html_content/networks/[label]/[pair_generator] @@ -781,10 +937,13 @@ def generate_bs_networks_js( bs_networks_js_path = pair_generator_path / "bs_networks.js" # TODO: replace with functions to generate objects + networks_families = generate_bs_networks_families(bs_families) bs_similarity: list[Any] = generate_bs_networks_js_sim_matrix( cutoff, pair_generator ) - bs_families_alignment: list[Any] = [] + bs_families_alignment: list[Any] = generate_bs_families_alignment( + bs_families, pair_generator + ) bs_similarity_families: list[Any] = [] with open(bs_networks_js_path, "w") as bs_networks_js: @@ -802,7 +961,7 @@ def generate_bs_networks_js( bs_networks_js.write( "var bs_families={};\n".format( json.dumps( - bs_families, + networks_families, indent=4, separators=(",", ":"), sort_keys=True, @@ -903,9 +1062,7 @@ def legacy_generate_bin_output( network (BSNetwork): the network object for the pair_generator """ families_members = generate_bs_families_members(cutoff, pair_generator) - networks_families = generate_bs_networks_families(families_members) + # networks_families = generate_bs_networks_families(families_members) add_run_data_network(output_dir, label, cutoff, pair_generator, families_members) - generate_bs_networks_js( - output_dir, label, cutoff, pair_generator, networks_families - ) + generate_bs_networks_js(output_dir, label, cutoff, pair_generator, families_members) From 7f8f6114a72dc1a8705064425aeb3b872d671882 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Tue, 31 Oct 2023 16:04:45 +0100 Subject: [PATCH 03/15] implement lcs database storage --- big_scape/comparison/comparable_region.py | 7 ++++++ big_scape/comparison/legacy_workflow_alt.py | 25 +++++++++++++++------ big_scape/comparison/utility.py | 8 ++++--- big_scape/data/schema.sql | 5 +++++ big_scape/file_input/load_files.py | 18 ++++++++++----- big_scape/network/network.py | 6 ++--- 6 files changed, 50 insertions(+), 19 deletions(-) diff --git a/big_scape/comparison/comparable_region.py b/big_scape/comparison/comparable_region.py index 8f7a1fc1..f9c700ee 100644 --- a/big_scape/comparison/comparable_region.py +++ b/big_scape/comparison/comparable_region.py @@ -47,6 +47,7 @@ def __init__( reverse: bool, ): self.pair = pair + # store possibly extended comparable region self.a_start = a_start self.b_start = b_start @@ -60,6 +61,12 @@ def __init__( self.domain_dicts: Optional[ tuple[dict[HSP, list[int]], dict[HSP, list[int]]] ] = None + # store lcs without any extensions + self.lcs_a_start = a_start + self.lcs_b_start = b_start + self.lcs_a_stop = a_stop + self.lcs_b_stop = b_stop + self.lcs_reverse = reverse def get_domain_sets( self, regenerate=False, cache=True diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 550af438..01781429 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -162,8 +162,9 @@ def generate_edges( if len(results) != len(task_batch): raise ValueError("Mismatch between task length and result length") - for idx, pair in enumerate(task_batch): - distance, jaccard, adjacency, dss = results[idx] + # for idx, pair in enumerate(task_batch): + # distance, jaccard, adjacency, dss = 
results[idx] + for pair, distance, jaccard, adjacency, dss in results: yield ( pair.region_a._db_id, pair.region_b._db_id, @@ -172,6 +173,11 @@ def generate_edges( adjacency, dss, pair_generator.weights, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.lcs_reverse, ) done_pairs += len(results) @@ -198,6 +204,11 @@ def do_lcs_pair( ) # set the comparable region + pair.comparable_region.lcs_a_start = a_start + pair.comparable_region.lcs_a_stop = a_stop + pair.comparable_region.lcs_b_start = b_start + pair.comparable_region.lcs_b_stop = b_stop + pair.comparable_region.lcs_reverse = reverse pair.comparable_region.a_start = a_start pair.comparable_region.a_stop = a_stop pair.comparable_region.b_start = b_start @@ -234,7 +245,7 @@ def expand_pair(pair: RecordPair) -> float: def calculate_scores_pair( data: tuple[list[RecordPair], bs_enums.ALIGNMENT_MODE, str] -) -> list[tuple[float, float, float, float]]: # pragma no cover +) -> list[tuple[RecordPair, float, float, float, float]]: # pragma no cover """Calculate the scores for a list of pairs Args: @@ -251,13 +262,13 @@ def calculate_scores_pair( for pair in pairs: if pair.region_a.parent_gbk == pair.region_b.parent_gbk: - results.append((0.0, 1.0, 1.0, 1.0)) + results.append((pair, 0.0, 1.0, 1.0, 1.0)) continue jaccard = calc_jaccard_pair(pair) if jaccard == 0.0: - results.append((1.0, 0.0, 0.0, 0.0)) + results.append((pair, 1.0, 0.0, 0.0, 0.0)) continue # in the form [bool, Pair]. true bools means they need expansion, false they don't @@ -267,7 +278,7 @@ def calculate_scores_pair( jaccard = expand_pair(pair) if jaccard == 0.0: - results.append((1.0, 0.0, 0.0, 0.0)) + results.append((pair, 1.0, 0.0, 0.0, 0.0)) continue if weights_label not in LEGACY_WEIGHTS: @@ -285,6 +296,6 @@ def calculate_scores_pair( similarity = jaccard * jc_weight + adjacency * ai_weight + dss * dss_weight distance = 1 - similarity - results.append((distance, jaccard, adjacency, dss)) + results.append((pair, distance, jaccard, adjacency, dss)) return results diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index 34c87f69..5cf098df 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -50,12 +50,14 @@ def save_edge_to_db( def save_edges_to_db( - edges: list[tuple[int, int, float, float, float, float, str]] + edges: list[ + tuple[int, int, float, float, float, float, str, int, int, int, int, bool] + ] ) -> None: """Save many edges to the database Args: - edges (list[tuple[int, int, float, float, float, float, str]]): list of edges to save + edges (list[tuple[]]): list of edges to save """ # save the comparison data to the database # using raw sqlite for this because sqlalchemy is not fast enough @@ -75,7 +77,7 @@ def save_edges_to_db( # create a query # TODO: this should not need ignore. 
it's there now because protoclusters somehow # trigger an integrityerror - query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?)" + query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" cursor.executemany(query, edges) diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index fdd0d689..0a901f51 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -76,6 +76,11 @@ CREATE TABLE IF NOT EXISTS distance ( adjacency REAL NOT NULL, dss REAL NOT NULL, weights TEXT NOT NULL, + lcs_a_start INTEGER NOT NULL, + lcs_a_stop INTEGER NOT NULL, + lcs_b_start INTEGER NOT NULL, + lcs_b_stop INTEGER NOT NULL, + lcs_reverse BOOLEAN NOT NULL, UNIQUE(region_a_id, region_b_id) FOREIGN KEY(region_a_id) REFERENCES bgc_record(id) FOREIGN KEY(region_b_id) REFERENCES bgc_record(id) diff --git a/big_scape/file_input/load_files.py b/big_scape/file_input/load_files.py index 0f939acf..bba213de 100644 --- a/big_scape/file_input/load_files.py +++ b/big_scape/file_input/load_files.py @@ -91,14 +91,16 @@ def download_dataset(url: str, path: Path, path_compressed: Path) -> None: os.remove(path_compressed) -def load_dataset_folder(run: dict, source_type: bs_enums.SOURCE_TYPE) -> List[GBK]: +def load_dataset_folder( + input_dir: Path, run: dict, source_type: bs_enums.SOURCE_TYPE +) -> List[GBK]: """Loads all gbk files in a given folder Returns empty list if path does not point to a folder or if folder does not contain gbk files """ - path = run["input_dir"] + path = input_dir source_type = source_type mode = run["input_mode"] include_gbk = run["include_gbk"] @@ -248,22 +250,26 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]: # add the query bgc to the exclude list run["exclude_gbk"].append(query_bgc_stem) - gbks = load_dataset_folder(run, bs_enums.SOURCE_TYPE.REFERENCE) + gbks = load_dataset_folder( + run["input_dir"], run, bs_enums.SOURCE_TYPE.REFERENCE + ) input_gbks.extend(gbks) else: - gbks = load_dataset_folder(run, bs_enums.SOURCE_TYPE.QUERY) + gbks = load_dataset_folder(run["input_dir"], run, bs_enums.SOURCE_TYPE.QUERY) input_gbks.extend(gbks) # get reference if either MIBiG version or user-made reference dir passed if run["mibig_version"]: mibig_version_dir = get_mibig(run["mibig_version"], bigscape_dir) - mibig_gbks = load_dataset_folder(mibig_version_dir, bs_enums.SOURCE_TYPE.MIBIG) + mibig_gbks = load_dataset_folder( + mibig_version_dir, run, bs_enums.SOURCE_TYPE.MIBIG + ) input_gbks.extend(mibig_gbks) if run["reference_dir"]: reference_gbks = load_dataset_folder( - run["reference_dir"], bs_enums.SOURCE_TYPE.REFERENCE + run["reference_dir"], run, bs_enums.SOURCE_TYPE.REFERENCE ) input_gbks.extend(reference_gbks) diff --git a/big_scape/network/network.py b/big_scape/network/network.py index fd9a4335..2a0ba91c 100644 --- a/big_scape/network/network.py +++ b/big_scape/network/network.py @@ -96,7 +96,7 @@ def get_edge( if edge is None: return None - return cast(tuple[int, int, float, float, float, float, str], edge) + return cast(tuple[int, int, float, float, float, float, str], edge[0:7]) def get_edges( @@ -123,7 +123,7 @@ def get_edges( .where(distance_table.c.distance < distance_cutoff) ).compile() - edges = DB.execute(select_statement).fetchall() + edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] return cast(list[tuple[int, int, float, float, float, float, str]], edges) @@ -166,6 +166,6 @@ def get_connected_edges( .where(distance_table.c.distance < distance_cutoff).compile() ) - edges = 
DB.execute(select_statement).fetchall() + edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] return cast(list[tuple[int, int, float, float, float, float, str]], edges) From 6c74edaf4f4fa954c1a20004628dd79cd7241dfc Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 10:43:55 +0100 Subject: [PATCH 04/15] Fetch lcs bounds from database --- big_scape/output/legacy_output.py | 117 +++++++++++++++--------------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index ce88c41d..369c4023 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -16,7 +16,6 @@ import big_scape.network.network as bs_network import big_scape.network.utility as bs_network_utility -import big_scape.comparison as bs_comparison def copy_base_output_templates(output_dir: Path): @@ -721,65 +720,65 @@ def generate_bs_families_alignment( if bgc_db_id == family_db_id: aln.append([[gene_num, 0] for gene_num in range(len(bgc_gbk.genes))]) else: - # TODO: implement database storage of lcs and fetch here - ( - a_start, - a_stop, - b_start, - b_stop, - reverse, - ) = bs_comparison.legacy_lcs.legacy_find_cds_lcs( - family_records[fam_record_idx].get_cds_with_domains(), - records[bgc].get_cds_with_domains(), + if DB.metadata is None: + raise RuntimeError("Database metadata is None!") + weights = pair_generator.weights + dist_table = DB.metadata.tables["distance"] + select_query = ( + dist_table.select() + .add_columns( + dist_table.c.region_a_id, + dist_table.c.region_b_id, + dist_table.c.lcs_a_start, + dist_table.c.lcs_a_stop, + dist_table.c.lcs_b_start, + dist_table.c.lcs_reverse, + ) + .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) + .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) + .where(dist_table.c.weights == weights) + .compile() ) - # if DB.metadata is None: - # raise RuntimeError("Database metadata is None!") - # dist_table = DB.metadata.tables["distance"] - # select_query = ( - # dist_table.select() - # .add_columns( - # dist_table.c.region_a_id, - # dist_table.c.region_b_id, - # dist_table.c.lcs_a_start, - # dist_table.c.lcs_a_stop, - # dist_table.c.lcs_b_start, - # dist_table.c.lcs_b_stop, - # dist_table.c.lcs_reverse, - # ) - # .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) - # .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) - # .compile() - # ) - # result = DB.execute(select_query).fetchone() - # if result is None: - # logging.error( - # "LCS not found in database (%s, %s)", family_db_id, bgc_db_id - # ) - # raise RuntimeError() - # if result.region_a_id == family_db_id: - # a_start: int = result.lcs_a_start - # a_stop: int = result.lcs_a_stop - # b_start: int = result.lcs_b_start - # b_stop: int = result.lcs_b_stop - # reverse: bool = result.lcs_reverse - # elif result.region_b_id == family_db_id: - # a_start = result.lcs_b_start - # a_stop = result.lcs_b_stop - # b_start = result.lcs_a_start - # b_stop = result.lcs_a_stop - # reverse = result.lcs_reverse - - length = abs(a_start - a_stop) # seed length - a_start = domain_genes_to_all_genes[family_db_id][a_start] - if length == 0: - pass - - elif reverse: - b_start = domain_genes_to_all_genes[bgc_db_id][ - len(domain_count_gene[bgc_db_id]) - b_start - 1 - ] - else: - b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + result = DB.execute(select_query).fetchone() + if result is None: + raise RuntimeError("LCS not found in database") + if 
result.region_a_id == family_db_id: + a_start: int = result.lcs_a_start + a_stop: int = result.lcs_a_stop + b_start: int = result.lcs_b_start + reverse: bool = result.lcs_reverse + + length = abs(a_start - a_stop) # seed length + a_start = domain_genes_to_all_genes[family_db_id][a_start] + if length == 0: + pass + + elif reverse: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - b_start - 1 + ] + else: + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + elif result.region_b_id == family_db_id: + a_start = result.lcs_b_start + a_stop = result.lcs_b_stop + b_start = result.lcs_a_start + reverse = result.lcs_reverse + + length = abs(a_start - a_stop) # seed length + if length == 0: + pass + elif reverse: + a_start = domain_genes_to_all_genes[family_db_id][ + len(domain_count_gene[family_db_id]) - a_start - length + ] + b_start = domain_genes_to_all_genes[bgc_db_id][ + b_start + length - 1 + ] + else: + a_start = domain_genes_to_all_genes[family_db_id][a_start] + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] if length == 0: length = 1 From 8ecee734a6e4986f342d7797db016e98bbe2235e Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 11:36:35 +0100 Subject: [PATCH 05/15] fix tests and errors --- big_scape/comparison/utility.py | 23 +++++++++++++++-- big_scape/output/legacy_output.py | 5 +++- test/comparison/test_binning.py | 40 ++++++++++++++++++++++++++++++ test/db/test_partial.py | 17 +++++++++++-- test/file_input/test_load_files.py | 10 ++++---- test/network/test_network.py | 17 +++++++++++-- 6 files changed, 100 insertions(+), 12 deletions(-) diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index 5cf098df..e417d24d 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -13,7 +13,8 @@ def save_edge_to_db( - edge: tuple[int, int, float, float, float, float, str], upsert=False + edge: tuple[int, int, float, float, float, float, str, int, int, int, int, bool], + upsert=False, ) -> None: """Save edge to the database @@ -23,7 +24,20 @@ def save_edge_to_db( upsert (bool, optional): whether to upsert the edge into the database. 
""" - region_a_id, region_b_id, distance, jaccard, adjacency, dss, weights = edge + ( + region_a_id, + region_b_id, + distance, + jaccard, + adjacency, + dss, + weights, + a_start, + a_stop, + b_start, + b_stop, + reverse, + ) = edge # save the comparison data to the database @@ -41,6 +55,11 @@ def save_edge_to_db( adjacency=adjacency, dss=dss, weights=weights, + lcs_a_start=a_start, + lcs_a_stop=a_stop, + lcs_b_start=b_start, + lcs_b_stop=b_stop, + lcs_reverse=reverse, ) if upsert: diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 369c4023..b94cb1ff 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -741,7 +741,10 @@ def generate_bs_families_alignment( ) result = DB.execute(select_query).fetchone() if result is None: - raise RuntimeError("LCS not found in database") + raise RuntimeError( + "LCS not found in database (%s %s %s)" + % (family_db_id, bgc_db_id, weights) + ) if result.region_a_id == family_db_id: a_start: int = result.lcs_a_start a_stop: int = result.lcs_a_stop diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index a6f2e340..9391cbcd 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -268,6 +268,11 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) else: @@ -281,6 +286,11 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) @@ -357,6 +367,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) else: @@ -370,6 +385,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) @@ -397,6 +417,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) @@ -410,6 +435,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) save_edge_to_db( @@ -421,6 +451,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) save_edge_to_db( @@ -432,6 +467,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) diff --git a/test/db/test_partial.py b/test/db/test_partial.py index ee046b76..1819e583 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -48,7 +48,7 @@ def add_mock_hsp_alignment_hsp(hsp: HSP) -> None: def gen_mock_edge_list( edge_gbks: list[GBK], -) -> list[tuple[int, int, float, float, float, float, str]]: +) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ -57,7 +57,20 @@ def gen_mock_edge_list( continue edges.append( - (gbk_a.region._db_id, gbk_b.region._db_id, 0.0, 0.0, 0.0, 0.0, "mix") + ( + gbk_a.region._db_id, + gbk_b.region._db_id, + 0.0, + 0.0, + 0.0, + 0.0, + "mix", + 0, + 0, + 0, + 0, + False, + ) ) return edges diff --git a/test/file_input/test_load_files.py b/test/file_input/test_load_files.py index 27f31cd6..344d7d4e 100644 --- a/test/file_input/test_load_files.py +++ b/test/file_input/test_load_files.py @@ -28,7 +28,7 @@ def test_gbk_path_invalid(self): } with self.assertRaises(NotADirectoryError): - load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) def 
test_gbk_path_valid(self): """Tests whether loading a given path returns none""" @@ -45,7 +45,7 @@ def test_gbk_path_valid(self): "legacy_classify": False, } - load_result = load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_result = load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) expected_count = 3 actual_count = len(load_result) @@ -68,7 +68,7 @@ def test_gbk_path_empty(self): } with self.assertRaises(FileNotFoundError): - load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) def test_load_gbk_not_a_file(self): """Tests whether the load_gbk function correctly returns none when input is not a file""" @@ -122,7 +122,7 @@ def test_include_exclude_str(self): "legacy_classify": False, } - load_result = load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_result = load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) expected_count = 1 actual_count = len(load_result) @@ -143,7 +143,7 @@ def test_include_all_override_exclude(self): "legacy_classify": False, } - load_result = load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_result = load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) expected_count = 3 actual_count = len(load_result) diff --git a/test/network/test_network.py b/test/network/test_network.py index a2dae3ed..06c05a11 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -28,7 +28,7 @@ def create_mock_gbk(i) -> bs_gbk.GBK: def gen_mock_edge_list( edge_gbks: list[bs_gbk.GBK], -) -> list[tuple[int, int, float, float, float, float, str]]: +) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ -37,7 +37,20 @@ def gen_mock_edge_list( continue edges.append( - (gbk_a.region._db_id, gbk_b.region._db_id, 0.0, 1.0, 1.0, 1.0, "mix") + ( + gbk_a.region._db_id, + gbk_b.region._db_id, + 0.0, + 1.0, + 1.0, + 1.0, + "mix", + 0, + 0, + 0, + 0, + False, + ) ) return edges From a3a9c940bec90203fc56f3b9638293fbd4799966 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 13:32:37 +0100 Subject: [PATCH 06/15] modularized code --- big_scape/output/legacy_output.py | 215 ++++++++++++++++++------------ 1 file changed, 132 insertions(+), 83 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index b94cb1ff..ef4c9e24 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -675,6 +675,126 @@ def generate_bs_networks_js_sim_matrix( return sparse_matrix +def fetch_lcs_from_db(a_id: int, b_id: int, weights: str) -> dict[str, Any]: + """Find the lcs start, stop and direction for a pair of records in the database + + Args: + a_id (int): record db id of region a + b_id (int): record db id of region b + weights (str): weights used in analysis + """ + if DB.metadata is None: + raise RuntimeError("Database metadata is None!") + dist_table = DB.metadata.tables["distance"] + select_query = ( + dist_table.select() + # .add_columns( + # dist_table.c.region_a_id, + # dist_table.c.region_b_id, + # dist_table.c.lcs_a_start, + # dist_table.c.lcs_a_stop, + # dist_table.c.lcs_b_start, + # dist_table.c.lcs_reverse, + # ) + .where(dist_table.c.region_a_id.in_((a_id, b_id))).where( + dist_table.c.region_b_id.in_((a_id, b_id)) + ) + # .where(dist_table.c.weights == weights) + .compile() + ) + result = DB.execute(select_query).fetchone() + if result is None: + raise RuntimeError( + "LCS not found 
in database (%s %s %s)" % (a_id, b_id, weights) + ) + return dict(result._mapping) + + +def adjust_lcs_to_all_genes( + result: dict[str, Any], + family_db_id: int, + bgc_db_id: int, + fam_gbk: GBK, + bgc_gbk: GBK, + domain_genes_to_all_genes: dict[int, dict[int, int]], + domain_count_gene: dict[int, list[int]], +) -> tuple[int, int, bool]: + """Adjust boundaries of lcs from only genes with domains to all present genes + + Args: + result (dict[str, Any]): database distance row with lcs information + family_db_id (int): database record id of family exemplar + bgc_db_id (int): database record id of family member + fam_gbk (GBK): family exemplar GBK + bgc_gbk (GBK): family member GBK + domain_genes_to_all_genes (dict[int, dict[int, int]]): maps indices from genes + with domains to all present genes + domain_count_gene (dict[int, list[int]]): contains the number of domains each + gene contains + + Returns: + tuple[int, int, bool]: adjusted a_start, b_start and reverse + """ + if result["region_a_id"] == family_db_id: + a_start = result["lcs_a_start"] + a_stop = result["lcs_a_stop"] + b_start = result["lcs_b_start"] + reverse = result["lcs_reverse"] + + length = abs(a_start - a_stop) # seed length + a_start = domain_genes_to_all_genes[family_db_id][a_start] + if length == 0: + pass + + elif reverse: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - b_start - 1 + ] + else: + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + elif result["region_b_id"] == family_db_id: + a_start = result["lcs_b_start"] + a_stop = result["lcs_b_stop"] + b_start = result["lcs_a_start"] + reverse = result["lcs_reverse"] + + length = abs(a_start - a_stop) # seed length + if length == 0: + pass + elif reverse: + a_start = domain_genes_to_all_genes[family_db_id][ + len(domain_count_gene[family_db_id]) - a_start - length + ] + b_start = domain_genes_to_all_genes[bgc_db_id][b_start + length - 1] + else: + a_start = domain_genes_to_all_genes[family_db_id][a_start] + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + if length == 0: + length = 1 + # let's try aligning using the genes with most domains + # after all, they ended up being in the same GCF + # for some reason + x = max(domain_count_gene[family_db_id]) + x = domain_count_gene[family_db_id].index(x) + a_start = domain_genes_to_all_genes[family_db_id][x] + + y = max(list(domain_count_gene[bgc_db_id])) + y = domain_count_gene[bgc_db_id].index(y) + + # check orientation + if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: + b_start = domain_genes_to_all_genes[bgc_db_id][y] + reverse = False + else: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - y - 1 + ] + reverse = True + return a_start, b_start, reverse + + def generate_bs_families_alignment( bs_families: dict[int, list[int]], pair_generator: RecordPairGenerator ): @@ -717,93 +837,22 @@ def generate_bs_families_alignment( raise AttributeError("Record parent GBK is not set!") if bgc_db_id is None: raise AttributeError("Record has no database id!") + if bgc_db_id == family_db_id: aln.append([[gene_num, 0] for gene_num in range(len(bgc_gbk.genes))]) else: - if DB.metadata is None: - raise RuntimeError("Database metadata is None!") - weights = pair_generator.weights - dist_table = DB.metadata.tables["distance"] - select_query = ( - dist_table.select() - .add_columns( - dist_table.c.region_a_id, - dist_table.c.region_b_id, - dist_table.c.lcs_a_start, - dist_table.c.lcs_a_stop, - dist_table.c.lcs_b_start, - 
dist_table.c.lcs_reverse, - ) - .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) - .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) - .where(dist_table.c.weights == weights) - .compile() + result = fetch_lcs_from_db( + family_db_id, bgc_db_id, pair_generator.weights + ) + a_start, b_start, reverse = adjust_lcs_to_all_genes( + result, + family_db_id, + bgc_db_id, + fam_gbk, + bgc_gbk, + domain_genes_to_all_genes, + domain_count_gene, ) - result = DB.execute(select_query).fetchone() - if result is None: - raise RuntimeError( - "LCS not found in database (%s %s %s)" - % (family_db_id, bgc_db_id, weights) - ) - if result.region_a_id == family_db_id: - a_start: int = result.lcs_a_start - a_stop: int = result.lcs_a_stop - b_start: int = result.lcs_b_start - reverse: bool = result.lcs_reverse - - length = abs(a_start - a_stop) # seed length - a_start = domain_genes_to_all_genes[family_db_id][a_start] - if length == 0: - pass - - elif reverse: - b_start = domain_genes_to_all_genes[bgc_db_id][ - len(domain_count_gene[bgc_db_id]) - b_start - 1 - ] - else: - b_start = domain_genes_to_all_genes[bgc_db_id][b_start] - - elif result.region_b_id == family_db_id: - a_start = result.lcs_b_start - a_stop = result.lcs_b_stop - b_start = result.lcs_a_start - reverse = result.lcs_reverse - - length = abs(a_start - a_stop) # seed length - if length == 0: - pass - elif reverse: - a_start = domain_genes_to_all_genes[family_db_id][ - len(domain_count_gene[family_db_id]) - a_start - length - ] - b_start = domain_genes_to_all_genes[bgc_db_id][ - b_start + length - 1 - ] - else: - a_start = domain_genes_to_all_genes[family_db_id][a_start] - b_start = domain_genes_to_all_genes[bgc_db_id][b_start] - - if length == 0: - length = 1 - # let's try aligning using the genes with most domains - # after all, they ended up being in the same GCF - # for some reason - x = max(domain_count_gene[family_db_id]) - x = domain_count_gene[family_db_id].index(x) - a_start = domain_genes_to_all_genes[family_db_id][x] - - y = max(list(domain_count_gene[bgc_db_id])) - y = domain_count_gene[bgc_db_id].index(y) - - # check orientation - if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: - b_start = domain_genes_to_all_genes[bgc_db_id][y] - reverse = False - else: - b_start = domain_genes_to_all_genes[bgc_db_id][ - len(domain_count_gene[bgc_db_id]) - y - 1 - ] - reverse = True ref_genes_.add(a_start) bgc_algn = [] From 0c9086571691eca9503e8a947a8ec46b959a2470 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 16:56:08 +0100 Subject: [PATCH 07/15] Add gcf alignment and tree tests --- test/trees/test_alignment.py | 214 +++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 test/trees/test_alignment.py diff --git a/test/trees/test_alignment.py b/test/trees/test_alignment.py new file mode 100644 index 00000000..cce4647b --- /dev/null +++ b/test/trees/test_alignment.py @@ -0,0 +1,214 @@ +"""Contains test for GCF alignment and tree generation""" + +# from python +from unittest import TestCase + +# from other modules +from big_scape.genbank import GBK, BGCRecord, CDS +from big_scape.hmm import HSP, HSPAlignment +from big_scape.trees import generate_newick_tree +from big_scape.trees.newick_tree import generate_gcf_alignment +from big_scape.output.legacy_output import adjust_lcs_to_all_genes + + +class TestTrees(TestCase): + """Contains alignment and tree generation tests""" + + def test_tree_gen_small(self): + """Tests generated tree for families with less than three 
members""" + + records = [ + BGCRecord(GBK("", ""), 0, 0, 0, False, ""), + BGCRecord(GBK("", ""), 0, 0, 0, False, ""), + ] + exemplar = 0 + mock_family = [0, 1] + + expected_tree = "(0:0.0,1:0.0):0.01;" + tree = generate_newick_tree(records, exemplar, mock_family) + + self.assertEqual(tree, expected_tree) + + def test_gcf_alignment(self): + """Tests alignment of GCF HSP alignments""" + gbk_a = GBK("", "") + gbk_b = GBK("", "") + cds_a = CDS(0, 20) + cds_b = CDS(0, 20) + hsp_a = HSP(cds_a, "PF1", 1, 0, 10) + hsp_b = HSP(cds_b, "PF1", 1, 0, 12) + hsp_a.alignment = HSPAlignment(hsp_a, "TEST-PF1--") + hsp_b.alignment = HSPAlignment(hsp_b, "TEST--P-F1") + cds_a.hsps.append(hsp_a) + cds_b.hsps.append(hsp_b) + gbk_a.genes.append(cds_a) + gbk_b.genes.append(cds_b) + + records = [ + BGCRecord(gbk_a, 0, 0, 100, False, ""), + BGCRecord(gbk_b, 1, 0, 100, False, ""), + ] + exemplar = 0 + family_members = [0, 1] + expected_alignment = ">0\nTEST-PF1--\n>1\nTEST--P-F1\n" + algn = generate_gcf_alignment(records, exemplar, family_members) + + self.assertEqual(algn, expected_alignment) + + def test_lcs_adjust_ex2mem(self): + """Tests adjusted lcs exemplar to member not reversed""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 4, + "lcs_a_stop": 7, + "lcs_b_start": 6, + "lcs_b_stop": 9, + "lcs_reverse": False, + } + dom_to_gene = {0: {4: 5, 7: 9}, 1: {6: 8}} + dom_count = {} + + expected_lcs = (5, 8, False) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_ex2mem_reverse(self): + """Tests adjusted lcs exemplar to member with reverse""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 2, + "lcs_a_stop": 3, + "lcs_b_start": 1, + "lcs_b_stop": 2, + "lcs_reverse": True, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} + + expected_lcs = (4, 2, True) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_mem2ex(self): + """Tests adjusted lcs member to exemplar not reversed""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 2, + "lcs_a_stop": 3, + "lcs_b_start": 1, + "lcs_b_stop": 2, + "lcs_reverse": False, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} + + expected_lcs = (2, 4, False) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 1, 0, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_mem2ex_reverse(self): + """Tests adjusted lcs member to exemplar with reverse""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 2, + "lcs_a_stop": 3, + "lcs_b_start": 1, + "lcs_b_stop": 2, + "lcs_reverse": True, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} + + expected_lcs = (2, 4, True) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 1, 0, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_zero_length_not_reversed(self): + """Tests adjusted lcs for lcs with length 0""" + + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 0, + "lcs_a_stop": 0, + "lcs_b_start": 0, + "lcs_b_stop": 0, + "lcs_reverse": False, + } + 
dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} + + max_dom_cds = CDS(20, 30) + max_dom_cds.strand = 1 + fill_cds = CDS(10, 20) + fill_cds.strand = -1 + + exempl_gbk = GBK("", "") + exempl_gbk.genes = [fill_cds, fill_cds, max_dom_cds, fill_cds, fill_cds] + + mem_gbk = GBK("", "") + mem_gbk.genes = [CDS(0, 1), CDS(2, 3), CDS(4, 5), max_dom_cds] + + expected_lcs = (2, 3, False) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, exempl_gbk, mem_gbk, dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_zero_length_reversed(self): + """Tests adjusted lcs for lcs with length 0""" + + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 0, + "lcs_a_stop": 0, + "lcs_b_start": 0, + "lcs_b_stop": 0, + "lcs_reverse": False, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} + + max_dom_cds = CDS(0, 0) + max_dom_cds.strand = 1 + fill_cds = CDS(0, 0) + fill_cds.strand = -1 + + exempl_gbk = GBK("", "") + exempl_gbk.genes = [fill_cds, fill_cds, max_dom_cds, fill_cds, fill_cds] + + max_dom_cds_rev = CDS(0, 0) + max_dom_cds_rev.strand = -1 + mem_gbk = GBK("", "") + mem_gbk.genes = [CDS(0, 0), CDS(0, 0), CDS(0, 0), max_dom_cds_rev] + + expected_lcs = (2, 1, True) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, exempl_gbk, mem_gbk, dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) From c2b5e3f2072502973ddf1d449b49743047196af3 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Thu, 2 Nov 2023 09:45:32 +0100 Subject: [PATCH 08/15] Store lcs extension and align mode in db --- big_scape/comparison/comparable_region.py | 2 ++ big_scape/comparison/legacy_workflow_alt.py | 7 ++++++ big_scape/comparison/utility.py | 24 +++++++++++++++++++-- big_scape/data/schema.sql | 6 ++++++ big_scape/output/legacy_output.py | 5 ++++- 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/big_scape/comparison/comparable_region.py b/big_scape/comparison/comparable_region.py index f9c700ee..ac311d78 100644 --- a/big_scape/comparison/comparable_region.py +++ b/big_scape/comparison/comparable_region.py @@ -10,6 +10,7 @@ # from other modules from big_scape.genbank import BGCRecord from big_scape.hmm import HSP +from big_scape.enums import ALIGNMENT_MODE # from this module from .legacy_lcs import legacy_find_cds_lcs @@ -67,6 +68,7 @@ def __init__( self.lcs_a_stop = a_stop self.lcs_b_stop = b_stop self.lcs_reverse = reverse + self.alignment_mode: ALIGNMENT_MODE = ALIGNMENT_MODE.GLOBAL def get_domain_sets( self, regenerate=False, cache=True diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 01781429..7d9f1356 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -178,6 +178,12 @@ def generate_edges( pair.comparable_region.lcs_b_start, pair.comparable_region.lcs_b_stop, pair.comparable_region.lcs_reverse, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, ) done_pairs += len(results) @@ -238,6 +244,7 @@ def expand_pair(pair: RecordPair) -> float: jc = calc_jaccard_pair(pair) return jc + pair.comparable_region.alignment_mode = bs_enums.ALIGNMENT_MODE.GLOCAL jc = calc_jaccard_pair(pair) return jc diff --git a/big_scape/comparison/utility.py 
b/big_scape/comparison/utility.py index e417d24d..d4c3cd5f 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -10,6 +10,7 @@ # from other modules from big_scape.data import DB from big_scape.comparison.binning import RecordPairGenerator, RecordPair +from big_scape.enums import ALIGNMENT_MODE def save_edge_to_db( @@ -70,7 +71,26 @@ def save_edge_to_db( def save_edges_to_db( edges: list[ - tuple[int, int, float, float, float, float, str, int, int, int, int, bool] + tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + ALIGNMENT_MODE, + ] ] ) -> None: """Save many edges to the database @@ -96,7 +116,7 @@ def save_edges_to_db( # create a query # TODO: this should not need ignore. it's there now because protoclusters somehow # trigger an integrityerror - query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" cursor.executemany(query, edges) diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index 0a901f51..2547cc95 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -81,6 +81,12 @@ CREATE TABLE IF NOT EXISTS distance ( lcs_b_start INTEGER NOT NULL, lcs_b_stop INTEGER NOT NULL, lcs_reverse BOOLEAN NOT NULL, + ext_a_start INTEGER NOT NULL, + ext_a_stop INTEGER NOT NULL, + ext_b_start INTEGER NOT NULL, + ext_b_stop INTEGER NOT NULL, + ext_reverse BOOLEAN NOT NULL, + alignment_mode TEXT NOT NULL, UNIQUE(region_a_id, region_b_id) FOREIGN KEY(region_a_id) REFERENCES bgc_record(id) FOREIGN KEY(region_b_id) REFERENCES bgc_record(id) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index ef4c9e24..a5558f25 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -784,7 +784,10 @@ def adjust_lcs_to_all_genes( y = domain_count_gene[bgc_db_id].index(y) # check orientation - if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: + if ( + fam_gbk.genes[domain_genes_to_all_genes[family_db_id][x]].strand + == bgc_gbk.genes[domain_genes_to_all_genes[bgc_db_id][y]].strand + ): b_start = domain_genes_to_all_genes[bgc_db_id][y] reverse = False else: From 61de2a150c9a56cd7e5c9daf77fd6b01330e3050 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Thu, 2 Nov 2023 09:53:48 +0100 Subject: [PATCH 09/15] fix tests --- big_scape/comparison/utility.py | 53 ++++++++++++++++++++++++++------- test/comparison/test_binning.py | 48 +++++++++++++++++++++++++++++ test/db/test_partial.py | 29 +++++++++++++++++- test/network/test_network.py | 29 +++++++++++++++++- 4 files changed, 146 insertions(+), 13 deletions(-) diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index d4c3cd5f..df807099 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -14,7 +14,26 @@ def save_edge_to_db( - edge: tuple[int, int, float, float, float, float, str, int, int, int, int, bool], + edge: tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + ALIGNMENT_MODE, + ], upsert=False, ) -> None: """Save edge to the database @@ -33,11 +52,17 @@ def save_edge_to_db( adjacency, dss, weights, - a_start, - a_stop, - b_start, - b_stop, - reverse, + lcs_a_start, + lcs_a_stop, + lcs_b_start, + lcs_b_stop, + lcs_reverse, + ext_a_start, + ext_a_stop, + ext_b_start, + ext_b_stop, + 
ext_reverse, + alignment_mode, ) = edge # save the comparison data to the database @@ -56,11 +81,17 @@ def save_edge_to_db( adjacency=adjacency, dss=dss, weights=weights, - lcs_a_start=a_start, - lcs_a_stop=a_stop, - lcs_b_start=b_start, - lcs_b_stop=b_stop, - lcs_reverse=reverse, + lcs_a_start=lcs_a_start, + lcs_a_stop=lcs_a_stop, + lcs_b_start=lcs_b_start, + lcs_b_stop=lcs_b_stop, + lcs_reverse=lcs_reverse, + ext_a_start=ext_a_start, + ext_a_stop=ext_a_stop, + ext_b_start=ext_b_start, + ext_b_stop=ext_b_stop, + ext_reverse=ext_reverse, + alignment_mode=alignment_mode.value, ) if upsert: diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index 9391cbcd..9254d50b 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -273,6 +273,12 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) else: @@ -291,6 +297,12 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -372,6 +384,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) else: @@ -390,6 +408,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -422,6 +446,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -440,6 +470,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) save_edge_to_db( @@ -456,6 +492,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) save_edge_to_db( @@ -472,6 +514,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) diff --git a/test/db/test_partial.py b/test/db/test_partial.py index 1819e583..7cd5d052 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -48,7 +48,28 @@ def add_mock_hsp_alignment_hsp(hsp: HSP) -> None: def gen_mock_edge_list( edge_gbks: list[GBK], -) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: +) -> list[ + tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + bs_enums.ALIGNMENT_MODE, + ] +]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ -70,6 +91,12 @@ def gen_mock_edge_list( 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) diff --git a/test/network/test_network.py b/test/network/test_network.py index 06c05a11..7f3e1f3a 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -28,7 +28,28 @@ def create_mock_gbk(i) -> bs_gbk.GBK: def gen_mock_edge_list( edge_gbks: list[bs_gbk.GBK], -) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: +) -> list[ + tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + bs_enums.ALIGNMENT_MODE, + ] +]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ 
-50,6 +71,12 @@ def gen_mock_edge_list( 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) From 9f3508157b8a309c2225dd72195f518a47ffbf77 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Thu, 2 Nov 2023 16:22:46 +0100 Subject: [PATCH 10/15] highlight compared region --- .../output/html_content/js/bigscape.js | 48 +++++++++++++------ big_scape/output/legacy_output.py | 23 +++++---- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/big_scape/output/html_template/output/html_content/js/bigscape.js b/big_scape/output/html_template/output/html_content/js/bigscape.js index df501359..bd0c1997 100644 --- a/big_scape/output/html_template/output/html_content/js/bigscape.js +++ b/big_scape/output/html_template/output/html_content/js/bigscape.js @@ -21,9 +21,29 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n var bs_to_cl = []; var bs_svg = []; for (var i in bs_data) { - var svg = $(Arrower.drawClusterSVG(bs_data[i])); + var cl_data = bs_data[i]; + var ext_start = cl_data.record_start ? cl_data.record_start - 1 : 0; + var ext_stop = cl_data.record_stop ? cl_data.record_stop - 1 : cl_data.orfs.length; + var svg = $(Arrower.drawClusterSVG(cl_data)); svg.css("clear", "both"); svg.addClass("arrower-svg"); + // opacity for domains/orfs outside of compared region + if ((ext_start !== 0) || (ext_stop !== cl_data.orfs.length)) { + var outside_ext_bounds = false + svg.find("polygon").each(function () { + if ((outside_ext_bounds) && ($(this).attr("class") === "arrower-domain")) { + $(this).css("opacity", "0.4") + } else if ( + ($(this).attr("class") === "arrower-orf") && + (($(this).attr("idx") < ext_start) || ($(this).attr("idx") > ext_stop)) + ) { + $(this).css("opacity", "0.4") + outside_ext_bounds = true + } else { + outside_ext_bounds = false + } + }) + } bs_svg.push(svg); } // for search optimization @@ -322,19 +342,19 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n var ui = Viva.Graph.svg('circle') .attr('r', 10) .attr('fill', (fam_colors[bs_to_cl[node.id]])); - if (run_data["mode"] == "Cluster") { - if (bs_data[node.id]["source"] == ("mibig" || "reference")) { - ui.attr("stroke", "blue"); - ui.attr("stroke-width", "4px"); - } + if (run_data["mode"] == "Cluster") { + if (bs_data[node.id]["source"] == ("mibig" || "reference")) { + ui.attr("stroke", "blue"); + ui.attr("stroke-width", "4px"); } - if (run_data["mode"] == "Query") { - if (bs_data[node.id]["source"] == "query") { - // ui.attr("r", "20") - ui.attr("stroke", "green"); - ui.attr("stroke-width", "6px"); - } + } + if (run_data["mode"] == "Query") { + if (bs_data[node.id]["source"] == "query") { + // ui.attr("r", "20") + ui.attr("stroke", "green"); + ui.attr("stroke-width", "6px"); } + } $(ui).hover(function () { // mouse over var temp_highlight = []; for (var i in highlighted_nodes) { @@ -426,8 +446,8 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n graphics.link(function (link) { let line = Viva.Graph.svg('line') - .attr("stroke", "#777") - .attr("stroke-width", link["data"]["weight"] * 10); + .attr("stroke", "#777") + .attr("stroke-width", link["data"]["weight"] * 10); if (graph.getNode(link.fromId).data.id === graph.getNode(link.toId).data.id) { line = line.attr("stroke-dasharray", "10,10") diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index a5569e18..6c7395ad 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -576,6 +576,8 @@ 
def generate_bs_data_js( "id": str, (e.g. AL645882.2.cluster010), "mibig": bool, "source": str, (e.g. mibig, reference, or query), + "record_start": int, (e.g. cds boundaries of protocluster, index starts at 1) + "record_stop": int, "orfs": [ { "domains": [ @@ -612,6 +614,9 @@ def generate_bs_data_js( if record.parent_gbk is None: raise AttributeError("Record parent GBK is not set!") + rec_cds = record.get_cds() + rec_start = rec_cds[0].orf_num + rec_stop = rec_cds[-1].orf_num gbk = record.parent_gbk organism = "Unknown" if "organism" in gbk.metadata: @@ -625,6 +630,8 @@ def generate_bs_data_js( "id": gbk.path.name, "mibig": gbk.source_type == SOURCE_TYPE.MIBIG, "source": gbk.source_type.name.lower(), + "record_start": rec_start, + "record_stop": rec_stop, "orfs": generate_bs_data_js_orfs(gbk), } ) @@ -714,18 +721,10 @@ def fetch_lcs_from_db(a_id: int, b_id: int, weights: str) -> dict[str, Any]: dist_table = DB.metadata.tables["distance"] select_query = ( dist_table.select() - # .add_columns( - # dist_table.c.region_a_id, - # dist_table.c.region_b_id, - # dist_table.c.lcs_a_start, - # dist_table.c.lcs_a_stop, - # dist_table.c.lcs_b_start, - # dist_table.c.lcs_reverse, - # ) - .where(dist_table.c.region_a_id.in_((a_id, b_id))).where( - dist_table.c.region_b_id.in_((a_id, b_id)) - ) - # .where(dist_table.c.weights == weights) + .where(dist_table.c.region_a_id != dist_table.c.region_b_id) + .where(dist_table.c.region_a_id.in_((a_id, b_id))) + .where(dist_table.c.region_b_id.in_((a_id, b_id))) + .where(dist_table.c.weights == weights) .compile() ) result = DB.execute(select_query).fetchone() From 40b559ed64bf58f2e20dd0708be66024daa07298 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 10:20:29 +0100 Subject: [PATCH 11/15] add todo region type lcs adjust --- big_scape/output/legacy_output.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 6c7395ad..9de4f0b4 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -765,7 +765,6 @@ def adjust_lcs_to_all_genes( a_stop = result["lcs_a_stop"] b_start = result["lcs_b_start"] reverse = result["lcs_reverse"] - length = abs(a_start - a_stop) # seed length a_start = domain_genes_to_all_genes[family_db_id][a_start] if length == 0: @@ -783,7 +782,6 @@ def adjust_lcs_to_all_genes( a_stop = result["lcs_b_stop"] b_start = result["lcs_a_start"] reverse = result["lcs_reverse"] - length = abs(a_start - a_stop) # seed length if length == 0: pass @@ -796,6 +794,11 @@ def adjust_lcs_to_all_genes( a_start = domain_genes_to_all_genes[family_db_id][a_start] b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + # TODO: adjust lcs bounds for protocluster/protocore mode once lcs is implemented + # a_start, b_start = adjust_lcs_record_bounds( + # a_start, b_start, fam_rec, bgc_rec, reverse + # ) + if length == 0: length = 1 # let's try aligning using the genes with most domains From 9a8af288ed23829569a2a11898eb3dbe12437c03 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 10:20:53 +0100 Subject: [PATCH 12/15] test fix --- test/comparison/test_binning.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index fd5352e5..f9b8a8b0 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -656,6 +656,17 @@ def test_cull_singletons_cutoff(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, + 
0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -668,6 +679,17 @@ def test_cull_singletons_cutoff(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -680,6 +702,17 @@ def test_cull_singletons_cutoff(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) From 700c2ba8cef351b082d3d98cebc21114f57a08d8 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 14:11:38 +0100 Subject: [PATCH 13/15] Simplify lcs extend db storage and fetching --- big_scape/comparison/comparable_region.py | 2 +- big_scape/comparison/legacy_workflow_alt.py | 135 +++++++++++++++----- big_scape/comparison/utility.py | 16 +-- big_scape/data/schema.sql | 3 +- big_scape/network/network.py | 47 +++++-- big_scape/output/legacy_output.py | 4 +- big_scape/trees/newick_tree.py | 10 +- 7 files changed, 157 insertions(+), 60 deletions(-) diff --git a/big_scape/comparison/comparable_region.py b/big_scape/comparison/comparable_region.py index ac311d78..70e2c1e8 100644 --- a/big_scape/comparison/comparable_region.py +++ b/big_scape/comparison/comparable_region.py @@ -36,6 +36,7 @@ class ComparableRegion: domain_lists: Optional[tuple[list[HSP], list[HSP]]] domain_sets: Optional[tuple[set[HSP], set[HSP]]] domain_dicts: Optional[tuple[dict[HSP, list[int]], dict[HSP, list[int]]]] + alignment_mode: ALIGNMENT_MODE """ def __init__( @@ -67,7 +68,6 @@ def __init__( self.lcs_b_start = b_start self.lcs_a_stop = a_stop self.lcs_b_stop = b_stop - self.lcs_reverse = reverse self.alignment_mode: ALIGNMENT_MODE = ALIGNMENT_MODE.GLOBAL def get_domain_sets( diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 7d9f1356..53f49113 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -162,29 +162,7 @@ def generate_edges( if len(results) != len(task_batch): raise ValueError("Mismatch between task length and result length") - # for idx, pair in enumerate(task_batch): - # distance, jaccard, adjacency, dss = results[idx] - for pair, distance, jaccard, adjacency, dss in results: - yield ( - pair.region_a._db_id, - pair.region_b._db_id, - distance, - jaccard, - adjacency, - dss, - pair_generator.weights, - pair.comparable_region.lcs_a_start, - pair.comparable_region.lcs_a_stop, - pair.comparable_region.lcs_b_start, - pair.comparable_region.lcs_b_stop, - pair.comparable_region.lcs_reverse, - pair.comparable_region.a_start, - pair.comparable_region.a_stop, - pair.comparable_region.b_start, - pair.comparable_region.b_stop, - pair.comparable_region.reverse, - pair.comparable_region.alignment_mode.value, - ) + yield from results done_pairs += len(results) if callback: @@ -214,7 +192,6 @@ def do_lcs_pair( pair.comparable_region.lcs_a_stop = a_stop pair.comparable_region.lcs_b_start = b_start pair.comparable_region.lcs_b_stop = b_stop - pair.comparable_region.lcs_reverse = reverse pair.comparable_region.a_start = a_start pair.comparable_region.a_stop = a_stop pair.comparable_region.b_start = b_start @@ -252,7 +229,26 @@ def expand_pair(pair: RecordPair) -> float: def calculate_scores_pair( data: tuple[list[RecordPair], bs_enums.ALIGNMENT_MODE, str] -) -> list[tuple[RecordPair, float, float, float, float]]: # pragma no cover +) -> list[ + tuple[ + Optional[int], + Optional[int], + float, + float, + float, + float, + int, + int, + int, + int, + int, + int, + int, + int, + bool, + str, + ] +]: # 
pragma no cover """Calculate the scores for a list of pairs Args: @@ -260,8 +256,9 @@ def calculate_scores_pair( label Returns: - list[tuple[float, float, float, float]]: list of scores for each pair in the - order as the input data list + list[tuple[int, int, float, float, float, float, int, int, int, int, int, int, + int, int, bool, str,]]: list of scores for each pair in the + order as the input data list, including lcs and extension coordinates """ pairs, alignment_mode, weights_label = data @@ -269,13 +266,51 @@ def calculate_scores_pair( for pair in pairs: if pair.region_a.parent_gbk == pair.region_b.parent_gbk: - results.append((pair, 0.0, 1.0, 1.0, 1.0)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + 0.0, + 1.0, + 1.0, + 1.0, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) continue jaccard = calc_jaccard_pair(pair) if jaccard == 0.0: - results.append((pair, 1.0, 0.0, 0.0, 0.0)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + 1.0, + 0.0, + 0.0, + 0.0, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) continue # in the form [bool, Pair]. true bools means they need expansion, false they don't @@ -285,7 +320,26 @@ def calculate_scores_pair( jaccard = expand_pair(pair) if jaccard == 0.0: - results.append((pair, 1.0, 0.0, 0.0, 0.0)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + 1.0, + 0.0, + 0.0, + 0.0, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) continue if weights_label not in LEGACY_WEIGHTS: @@ -303,6 +357,25 @@ def calculate_scores_pair( similarity = jaccard * jc_weight + adjacency * ai_weight + dss * dss_weight distance = 1 - similarity - results.append((pair, distance, jaccard, adjacency, dss)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + distance, + jaccard, + adjacency, + dss, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) return results diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index df807099..b1788a7a 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -26,7 +26,6 @@ def save_edge_to_db( int, int, int, - bool, int, int, int, @@ -39,8 +38,10 @@ def save_edge_to_db( """Save edge to the database Args: - edge (tuple[int, int, float, 
float, float, float]): edge tuple containing - region_a_id, region_b_id, distance, jaccard, adjacency, dss + edge (tuple[int, int, float, float, float, float, str, int, int, int, int, int, + int, int, int, bool, ALIGNMENT_MODE,]): edge tuple containing + region_a_id, region_b_id, distance, jaccard, adjacency, dss, weights, + lcs start/stop, extension start/stop, reverse, alignment_mode upsert (bool, optional): whether to upsert the edge into the database. """ @@ -56,12 +57,11 @@ def save_edge_to_db( lcs_a_stop, lcs_b_start, lcs_b_stop, - lcs_reverse, ext_a_start, ext_a_stop, ext_b_start, ext_b_stop, - ext_reverse, + reverse, alignment_mode, ) = edge @@ -85,12 +85,11 @@ def save_edge_to_db( lcs_a_stop=lcs_a_stop, lcs_b_start=lcs_b_start, lcs_b_stop=lcs_b_stop, - lcs_reverse=lcs_reverse, ext_a_start=ext_a_start, ext_a_stop=ext_a_stop, ext_b_start=ext_b_start, ext_b_stop=ext_b_stop, - ext_reverse=ext_reverse, + reverse=reverse, alignment_mode=alignment_mode.value, ) @@ -114,7 +113,6 @@ def save_edges_to_db( int, int, int, - bool, int, int, int, @@ -147,7 +145,7 @@ def save_edges_to_db( # create a query # TODO: this should not need ignore. it's there now because protoclusters somehow # trigger an integrityerror - query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" cursor.executemany(query, edges) diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index ce49ac2a..e7906562 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -79,12 +79,11 @@ CREATE TABLE IF NOT EXISTS distance ( lcs_a_stop INTEGER NOT NULL, lcs_b_start INTEGER NOT NULL, lcs_b_stop INTEGER NOT NULL, - lcs_reverse BOOLEAN NOT NULL, ext_a_start INTEGER NOT NULL, ext_a_stop INTEGER NOT NULL, ext_b_start INTEGER NOT NULL, ext_b_stop INTEGER NOT NULL, - ext_reverse BOOLEAN NOT NULL, + reverse BOOLEAN NOT NULL, alignment_mode TEXT NOT NULL, UNIQUE(region_a_id, region_b_id, weights) FOREIGN KEY(region_a_id) REFERENCES bgc_record(id) diff --git a/big_scape/network/network.py b/big_scape/network/network.py index f1f69b83..dd26f5fe 100644 --- a/big_scape/network/network.py +++ b/big_scape/network/network.py @@ -3,7 +3,7 @@ # from dependencies from typing import Optional, Generator, cast -from sqlalchemy import tuple_ +from sqlalchemy import tuple_, select # from other modules from big_scape.data import DB @@ -82,13 +82,20 @@ def get_edge( if not DB.metadata: raise RuntimeError("DB.metadata is None") - + distance_table = DB.metadata.tables["distance"] select_statment = ( - DB.metadata.tables["distance"] - .select() - .where(DB.metadata.tables["distance"].c.region_a_id.notin_(exclude_nodes)) - .where(DB.metadata.tables["distance"].c.region_b_id.notin_(exclude_nodes)) - .where(DB.metadata.tables["distance"].c.distance < cutoff) + select( + distance_table.c.region_a_id, + distance_table.c.region_b_id, + distance_table.c.distance, + distance_table.c.jaccard, + distance_table.c.adjacency, + distance_table.c.dss, + distance_table.c.weights, + ) + .where(distance_table.c.region_a_id.notin_(exclude_nodes)) + .where(distance_table.c.region_b_id.notin_(exclude_nodes)) + .where(distance_table.c.distance < cutoff) ) edge = DB.execute(select_statment).fetchone() @@ -96,7 +103,7 @@ def get_edge( if edge is None: return None - return cast(tuple[int, int, float, float, float, float, str], edge[0:7]) + return cast(tuple[int, int, float, float, float, float, str], edge) 
def get_edges( @@ -113,7 +120,15 @@ def get_edges( distance_table = DB.metadata.tables["distance"] select_statement = ( - distance_table.select() + select( + distance_table.c.region_a_id, + distance_table.c.region_b_id, + distance_table.c.distance, + distance_table.c.jaccard, + distance_table.c.adjacency, + distance_table.c.dss, + distance_table.c.weights, + ) # equivalent to WHERE (region_a_id in (...) OR region_b_id in (...)) .where( distance_table.c.region_a_id.in_(include_nodes) @@ -123,7 +138,7 @@ def get_edges( .where(distance_table.c.distance < distance_cutoff) ).compile() - edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] + edges = DB.execute(select_statement).fetchall() return cast(list[tuple[int, int, float, float, float, float, str]], edges) @@ -144,7 +159,15 @@ def get_connected_edges( distance_table = DB.metadata.tables["distance"] select_statement = ( - distance_table.select() + select( + distance_table.c.region_a_id, + distance_table.c.region_b_id, + distance_table.c.distance, + distance_table.c.jaccard, + distance_table.c.adjacency, + distance_table.c.dss, + distance_table.c.weights, + ) # equivalent to WHERE (region_a_id in (...) OR region_b_id in (...)) .where( distance_table.c.region_a_id.in_(include_nodes) @@ -166,7 +189,7 @@ def get_connected_edges( .where(distance_table.c.distance < distance_cutoff).compile() ) - edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] + edges = DB.execute(select_statement).fetchall() return cast(list[tuple[int, int, float, float, float, float, str]], edges) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 9de4f0b4..c6b84a21 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -764,7 +764,7 @@ def adjust_lcs_to_all_genes( a_start = result["lcs_a_start"] a_stop = result["lcs_a_stop"] b_start = result["lcs_b_start"] - reverse = result["lcs_reverse"] + reverse = result["reverse"] length = abs(a_start - a_stop) # seed length a_start = domain_genes_to_all_genes[family_db_id][a_start] if length == 0: @@ -781,7 +781,7 @@ def adjust_lcs_to_all_genes( a_start = result["lcs_b_start"] a_stop = result["lcs_b_stop"] b_start = result["lcs_a_start"] - reverse = result["lcs_reverse"] + reverse = result["reverse"] length = abs(a_start - a_stop) # seed length if length == 0: pass diff --git a/big_scape/trees/newick_tree.py b/big_scape/trees/newick_tree.py index b03f69f3..6a769331 100644 --- a/big_scape/trees/newick_tree.py +++ b/big_scape/trees/newick_tree.py @@ -26,7 +26,7 @@ def generate_newick_tree( family_members (list[int]): list of bgc ids in one family Returns: - Correctly formatted newick tree + str: Correctly formatted newick tree """ # no need for alignment if len(records) < 3: @@ -62,6 +62,9 @@ def process_newick_tree(tree_file: Path) -> str: Args: tree_file (Path): Path to tree file + + Returns: + str: processed newick tree """ if not tree_file.exists(): logging.error("Failed to create newick tree") @@ -85,7 +88,7 @@ def process_newick_tree(tree_file: Path) -> str: def generate_gcf_alignment( records: list[BGCRecord], exemplar: int, family_members: list[int] ) -> str: - """Write protein domain alignment to file + """Generate protein domain alignment for records in GCF Args: records (list[BGCRecord]): Records within one GCF to align @@ -93,7 +96,8 @@ def generate_gcf_alignment( family_members (list[int]): list of bgc ids in one family Returns: - Written file containing protein domain alignment + str: alignment of GCF based on 
protein domain + TODO: refactor """ record_ids = list(range(len(records))) From 6e08c7013facfee3edf51cbdd0c58d6389fe5c46 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 14:12:16 +0100 Subject: [PATCH 14/15] fix tests after db change --- test/comparison/test_binning.py | 11 ----------- test/db/test_partial.py | 2 -- test/network/test_network.py | 2 -- test/trees/test_alignment.py | 12 ++++++------ 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index f9b8a8b0..f079c13a 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -273,7 +273,6 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -297,7 +296,6 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -384,7 +382,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -408,7 +405,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -446,7 +442,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -470,7 +465,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -492,7 +486,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -514,7 +507,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -660,7 +652,6 @@ def test_cull_singletons_cutoff(self): 0, 0, 0, - False, 0, 0, 0, @@ -683,7 +674,6 @@ def test_cull_singletons_cutoff(self): 0, 0, 0, - False, 0, 0, 0, @@ -706,7 +696,6 @@ def test_cull_singletons_cutoff(self): 0, 0, 0, - False, 0, 0, 0, diff --git a/test/db/test_partial.py b/test/db/test_partial.py index 7cd5d052..6e934685 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -61,7 +61,6 @@ def gen_mock_edge_list( int, int, int, - bool, int, int, int, @@ -90,7 +89,6 @@ def gen_mock_edge_list( 0, 0, 0, - False, 0, 0, 0, diff --git a/test/network/test_network.py b/test/network/test_network.py index 7f3e1f3a..ee09ecbd 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -41,7 +41,6 @@ def gen_mock_edge_list( int, int, int, - bool, int, int, int, @@ -70,7 +69,6 @@ def gen_mock_edge_list( 0, 0, 0, - False, 0, 0, 0, diff --git a/test/trees/test_alignment.py b/test/trees/test_alignment.py index cce4647b..ff75aab9 100644 --- a/test/trees/test_alignment.py +++ b/test/trees/test_alignment.py @@ -64,7 +64,7 @@ def test_lcs_adjust_ex2mem(self): "lcs_a_stop": 7, "lcs_b_start": 6, "lcs_b_stop": 9, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {4: 5, 7: 9}, 1: {6: 8}} dom_count = {} @@ -86,7 +86,7 @@ def test_lcs_adjust_ex2mem_reverse(self): "lcs_a_stop": 3, "lcs_b_start": 1, "lcs_b_stop": 2, - "lcs_reverse": True, + "reverse": True, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} @@ -108,7 +108,7 @@ def test_lcs_adjust_mem2ex(self): "lcs_a_stop": 3, "lcs_b_start": 1, "lcs_b_stop": 2, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} @@ -130,7 +130,7 @@ def test_lcs_adjust_mem2ex_reverse(self): "lcs_a_stop": 3, "lcs_b_start": 1, "lcs_b_stop": 2, - "lcs_reverse": True, + "reverse": True, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} @@ -153,7 +153,7 
@@ def test_lcs_adjust_zero_length_not_reversed(self): "lcs_a_stop": 0, "lcs_b_start": 0, "lcs_b_stop": 0, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} @@ -187,7 +187,7 @@ def test_lcs_adjust_zero_length_reversed(self): "lcs_a_stop": 0, "lcs_b_start": 0, "lcs_b_stop": 0, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} From 8dfeb5fd4196af45271164b8a0b8c8b7e33fc9d8 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 14:49:52 +0100 Subject: [PATCH 15/15] minor docs and typing --- big_scape/comparison/legacy_workflow_alt.py | 5 +++++ big_scape/comparison/utility.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 53f49113..fb104f8d 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -237,6 +237,7 @@ def calculate_scores_pair( float, float, float, + str, int, int, int, @@ -274,6 +275,7 @@ def calculate_scores_pair( 1.0, 1.0, 1.0, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, @@ -299,6 +301,7 @@ def calculate_scores_pair( 0.0, 0.0, 0.0, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, @@ -328,6 +331,7 @@ def calculate_scores_pair( 0.0, 0.0, 0.0, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, @@ -365,6 +369,7 @@ def calculate_scores_pair( jaccard, adjacency, dss, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index b1788a7a..2d049a0c 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -118,14 +118,15 @@ def save_edges_to_db( int, int, bool, - ALIGNMENT_MODE, + str, ] ] ) -> None: """Save many edges to the database Args: - edges (list[tuple[]]): list of edges to save + edges (list[tuple[int, int, float, float, float, float, str, int, int, int, int, + int, int, int, int, bool, str]]): list of edges to save """ # save the comparison data to the database # using raw sqlite for this because sqlalchemy is not fast enough
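
Note on the final edge layout (an editorial sketch, not part of the patches above): after PATCH 13 through 15 the distance table and the edge tuples share a single 17-column layout of (region_a_id, region_b_id, distance, jaccard, adjacency, dss, weights, lcs_a_start, lcs_a_stop, lcs_b_start, lcs_b_stop, ext_a_start, ext_a_stop, ext_b_start, ext_b_stop, reverse, alignment_mode). The snippet below illustrates how one mock edge in that layout could be handed to save_edges_to_db, assuming a BiG-SCAPE database has already been created and opened the way the test setup does; the ids, scores and coordinates are placeholder values rather than output of the real comparison workflow.

    # minimal sketch, assuming the post-PATCH-15 17-element tuple layout
    # and an already initialized in-memory database (as in the tests)
    import big_scape.enums as bs_enums
    from big_scape.comparison.utility import save_edges_to_db

    mock_edge = (
        1,      # region_a_id (placeholder bgc_record id)
        2,      # region_b_id (placeholder bgc_record id)
        0.25,   # distance
        0.90,   # jaccard
        0.80,   # adjacency
        0.70,   # dss
        "mix",  # weights label, as used in the test suite
        0, 5,   # lcs_a_start, lcs_a_stop
        0, 5,   # lcs_b_start, lcs_b_stop
        0, 10,  # ext_a_start, ext_a_stop (extension bounds)
        0, 10,  # ext_b_start, ext_b_stop
        False,  # reverse
        bs_enums.ALIGNMENT_MODE.GLOBAL.value,  # alignment_mode, stored as TEXT
    )

    # executemany-based INSERT into the 17-column distance table
    save_edges_to_db([mock_edge])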