From ed0de21d6b2c8fa598bc4c7d533fe4000fd9c2bd Mon Sep 17 00:00:00 2001 From: nlouwen Date: Tue, 31 Oct 2023 13:46:47 +0100 Subject: [PATCH 01/15] add GCF newick tree creation --- big_scape/trees/__init__.py | 5 + big_scape/trees/newick_tree.py | 221 +++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 big_scape/trees/__init__.py create mode 100644 big_scape/trees/newick_tree.py diff --git a/big_scape/trees/__init__.py b/big_scape/trees/__init__.py new file mode 100644 index 00000000..27088f50 --- /dev/null +++ b/big_scape/trees/__init__.py @@ -0,0 +1,5 @@ +"""Contains newick tree generation functionality""" + +from .newick_tree import generate_newick_tree + +__all__ = ["generate_newick_tree"] diff --git a/big_scape/trees/newick_tree.py b/big_scape/trees/newick_tree.py new file mode 100644 index 00000000..b03f69f3 --- /dev/null +++ b/big_scape/trees/newick_tree.py @@ -0,0 +1,221 @@ +"""Module to generate newick GCF trees""" + +# from python +import tempfile +import subprocess +import logging +import numpy as np +from typing import TextIO +from Bio import Phylo +from pathlib import Path +from collections import defaultdict +from scipy.optimize import linear_sum_assignment + +# from other modules +from big_scape.genbank import BGCRecord + + +def generate_newick_tree( + records: list[BGCRecord], exemplar: int, family_members: list[int] +) -> str: + """Generate newick formatted tree for each GCF + + Args: + records (list[BGCRecord]): list of records within GCF + exemplar (int): index of exemplar to use during alignment + family_members (list[int]): list of bgc ids in one family + + Returns: + Correctly formatted newick tree + """ + # no need for alignment + if len(records) < 3: + return f"({','.join([str(bgc_id)+':0.0' for bgc_id in family_members])}):0.01;" + + algn = generate_gcf_alignment(records, exemplar, family_members) + with tempfile.TemporaryDirectory() as tmpdir: + tmppath = Path(tmpdir) + tmp_algn = tmppath / Path("alignment.fasta") + tmp_newick = tmppath / Path("alignment.newick") + with open(tmp_algn, "w") as tmp_a: + tmp_a.write(algn) + with open(tmp_newick, "w") as tmp_n: + run_fasttree(tmp_algn, tmp_n) + tree = process_newick_tree(tmp_newick) + return tree + + +def run_fasttree(algn_file: Path, out_file: TextIO): + """Generate FastTree newick GCF tree + + Args: + algn_file (Path): Path to alignment file + out_file (TextIO): Opened output file object + """ + subprocess.run( + ["fasttree", "-quiet", "-nopr", algn_file], stdout=out_file, shell=False + ) + + +def process_newick_tree(tree_file: Path) -> str: + """Process newick tree file format + + Args: + tree_file (Path): Path to tree file + """ + if not tree_file.exists(): + logging.error("Failed to create newick tree") + raise FileNotFoundError() + with open(tree_file, "r") as newick_file: + try: + tree = Phylo.read(newick_file, "newick") + except ValueError as e: + logging.warning("Error encountered while reading newick tree: ", str(e)) + return "" + else: + try: + tree.root_at_midpoint() + except UnboundLocalError: + # Noticed this could happen if the sequences are exactly + # the same and all distances == 0 + logging.debug("Unable to root at midpoint") + return tree.format("newick") + + +def generate_gcf_alignment( + records: list[BGCRecord], exemplar: int, family_members: list[int] +) -> str: + """Write protein domain alignment to file + + Args: + records (list[BGCRecord]): Records within one GCF to align + exemplar (int): Index of exemplar to use during alignment + family_members 
(list[int]): list of bgc ids in one family + + Returns: + Written file containing protein domain alignment + """ + record_ids = list(range(len(records))) + + # collect present domains for each GCF member + domain_sets = {} + # count the frequency of occurrence of each domain (excluding copies) + frequency_table: dict[str, int] = defaultdict(int) + for idx, record in enumerate(records): + domain_sets[idx] = set([domain.domain for domain in record.get_hsps()]) + for domain in domain_sets[idx]: + frequency_table[domain] += 1 + + # Find the set of [(tree domains)]. They should 1) be in the exemplar + # and 2) appear with the most frequency. Iterate over the different + # frequencies (descending) until set is not empty + tree_domains: set[str] = set() + frequencies = sorted(set(frequency_table.values()), reverse=True) + + # first try with domain(s) with max frequency, even if it's just one + f = 0 + while len(tree_domains) == 0 and f < len(frequencies): + for domain in frequency_table: + if ( + frequency_table[domain] == frequencies[f] + and domain in domain_sets[exemplar] + ): + tree_domains.add(domain) + f += 1 + if len(tree_domains) == 1: + logging.debug( + "core shared domains for GCF {} consists of a single domain ({})".format( + exemplar, [x for x in tree_domains][0] + ) + ) + + alignments: dict[int, str] = {} + alignments[exemplar] = "" + + out_of_tree_bgcs = [] # no common domain core + delete_bgc = set() + record_ids.remove(exemplar) + for record_idx in record_ids: + alignments[record_idx] = "" + + match_dict: dict[int, int] = {} + for domain in tree_domains: + specific_domain_list_a = [ + hsp for hsp in records[exemplar].get_hsps() if hsp.domain == domain + ] + num_copies_a = len(specific_domain_list_a) + for hsp in specific_domain_list_a: + if hsp.alignment is not None: + alignments[exemplar] += hsp.alignment.align_string + seq_length = len(hsp.alignment.align_string) # TODO: find better spot + + for bgc in alignments: + match_dict.clear() + if bgc == exemplar: + pass + elif domain not in domain_sets[bgc]: + out_of_tree_bgcs.append(bgc) + delete_bgc.add(bgc) + elif bgc in delete_bgc: + pass + else: + specific_domain_list_b = [ + hsp for hsp in records[bgc].get_hsps() if hsp.domain == domain + ] + num_copies_b = len(specific_domain_list_b) + dist_matrix: np.ndarray = np.ndarray((num_copies_a, num_copies_b)) + + for domsa in range(num_copies_a): + for domsb in range(num_copies_b): + hsp_a = specific_domain_list_a[domsa] + hsp_b = specific_domain_list_b[domsb] + + if hsp_a.alignment is None or hsp_b.alignment is None: + logging.error( + "Trying to compare unaligned domains", hsp_a, hsp_b + ) + raise AttributeError() + aligned_seq_a = hsp_a.alignment.align_string + aligned_seq_b = hsp_b.alignment.align_string + + matches = 0 + gaps = 0 + + for position in range(seq_length): + if aligned_seq_a[position] == aligned_seq_b[position]: + if aligned_seq_a[position] != "-": + matches += 1 + else: + gaps += 1 + + dist_matrix[domsa][domsb] = 1 - (matches / (seq_length - gaps)) + + best_indexes = linear_sum_assignment(dist_matrix) + + # at this point is not ensured that we have the same order + # for the exemplar's copies (rows in BestIndexes) + # ideally they should go from 0-numcopies. 
Better make sure + + for x in range(len(best_indexes[0])): + match_dict[best_indexes[0][x]] = best_indexes[1][x] + + for copy in range(num_copies_a): + try: + hsp_b = specific_domain_list_b[match_dict[copy]] + if hsp_b.alignment is None: + logging.error("Encountered unaligned domain", hsp_b) + raise AttributeError() + alignments[bgc] += hsp_b.alignment.align_string + except KeyError: + # This means that this copy of exemplar did not + # have a match in bgc (i.e. bgc has less copies + # of this domain than exemplar) + alignments[bgc] += "-" * seq_length + for bgc in list(delete_bgc): + del alignments[bgc] + delete_bgc.clear() + algn_string = f">{family_members[exemplar]}\n{alignments[exemplar]}\n" + for bgc in alignments: + if bgc != exemplar: + algn_string += f">{family_members[bgc]}\n{alignments[bgc]}\n" + return algn_string From bf0140484ae870dbc4ed8e0a59be9a923bb2b86d Mon Sep 17 00:00:00 2001 From: nlouwen Date: Tue, 31 Oct 2023 13:58:07 +0100 Subject: [PATCH 02/15] generate families alignments --- big_scape/output/legacy_output.py | 173 ++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 8 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 02ea2a1a..ce88c41d 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -12,9 +12,11 @@ from big_scape.comparison import RecordPairGenerator, legacy_get_class from big_scape.genbank import GBK, CDS from big_scape.enums import SOURCE_TYPE +from big_scape.trees import generate_newick_tree import big_scape.network.network as bs_network import big_scape.network.utility as bs_network_utility +import big_scape.comparison as bs_comparison def copy_base_output_templates(output_dir: Path): @@ -527,7 +529,7 @@ def generate_bs_data_js_orfs(gbk: GBK) -> Any: "id": f"{gbk.path.name[:-4]}_ORF{idx+1}", "start": cds.nt_start, "end": cds.nt_stop, - "strand": 1 if cds.strand else -1, + "strand": cds.strand if cds.strand else 1, "domains": generate_bs_data_js_orfs_domains(cds), } ) @@ -674,6 +676,160 @@ def generate_bs_networks_js_sim_matrix( return sparse_matrix +def generate_bs_families_alignment( + bs_families: dict[int, list[int]], pair_generator: RecordPairGenerator +): + """Generate list of dictionaries with information on bgc alignment of family members""" + bs_families_alignment = [] + records = pair_generator.source_records + # dictionary maps bgc to all orfs that contain domains + domain_genes_to_all_genes: dict[int, dict[int, int]] = {} + # dictionary maps bgc to list of domain count per gene + domain_count_gene: dict[int, list[int]] = {} + + for family_db_id, family_members in bs_families.items(): + # collects records within this GCF + family_records = [records[bgc_num] for bgc_num in family_members] + family_member_db_ids = [rec._db_id for rec in family_records] + fam_record_idx = family_member_db_ids.index(family_db_id) + fam_gbk = family_records[fam_record_idx].parent_gbk + if fam_gbk is None: + raise AttributeError("Record parent GBK is not set!") + for bgc, bgc_db_id in zip(family_members, family_member_db_ids): + bgc_gbk = records[bgc].parent_gbk + if bgc_gbk is None: + raise AttributeError("Record parent GBK is not set!") + if bgc_db_id is None: + raise AttributeError("Record has no database id!") + domain_genes_to_all_genes[bgc_db_id] = {} + domain_count_gene[bgc_db_id] = [] + has_domains = 0 + for cds_idx, cds in enumerate(bgc_gbk.genes): + if len(cds.hsps) > 0: + domain_count_gene[bgc_db_id].append(len(cds.hsps)) + 
domain_genes_to_all_genes[bgc_db_id][has_domains] = cds_idx + has_domains += 1 + + ref_genes_ = set() + aln = [] + for bgc, bgc_db_id in zip(family_members, family_member_db_ids): + bgc_gbk = records[bgc].parent_gbk + if bgc_gbk is None: + raise AttributeError("Record parent GBK is not set!") + if bgc_db_id is None: + raise AttributeError("Record has no database id!") + if bgc_db_id == family_db_id: + aln.append([[gene_num, 0] for gene_num in range(len(bgc_gbk.genes))]) + else: + # TODO: implement database storage of lcs and fetch here + ( + a_start, + a_stop, + b_start, + b_stop, + reverse, + ) = bs_comparison.legacy_lcs.legacy_find_cds_lcs( + family_records[fam_record_idx].get_cds_with_domains(), + records[bgc].get_cds_with_domains(), + ) + # if DB.metadata is None: + # raise RuntimeError("Database metadata is None!") + # dist_table = DB.metadata.tables["distance"] + # select_query = ( + # dist_table.select() + # .add_columns( + # dist_table.c.region_a_id, + # dist_table.c.region_b_id, + # dist_table.c.lcs_a_start, + # dist_table.c.lcs_a_stop, + # dist_table.c.lcs_b_start, + # dist_table.c.lcs_b_stop, + # dist_table.c.lcs_reverse, + # ) + # .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) + # .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) + # .compile() + # ) + # result = DB.execute(select_query).fetchone() + # if result is None: + # logging.error( + # "LCS not found in database (%s, %s)", family_db_id, bgc_db_id + # ) + # raise RuntimeError() + # if result.region_a_id == family_db_id: + # a_start: int = result.lcs_a_start + # a_stop: int = result.lcs_a_stop + # b_start: int = result.lcs_b_start + # b_stop: int = result.lcs_b_stop + # reverse: bool = result.lcs_reverse + # elif result.region_b_id == family_db_id: + # a_start = result.lcs_b_start + # a_stop = result.lcs_b_stop + # b_start = result.lcs_a_start + # b_stop = result.lcs_a_stop + # reverse = result.lcs_reverse + + length = abs(a_start - a_stop) # seed length + a_start = domain_genes_to_all_genes[family_db_id][a_start] + if length == 0: + pass + + elif reverse: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - b_start - 1 + ] + else: + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + if length == 0: + length = 1 + # let's try aligning using the genes with most domains + # after all, they ended up being in the same GCF + # for some reason + x = max(domain_count_gene[family_db_id]) + x = domain_count_gene[family_db_id].index(x) + a_start = domain_genes_to_all_genes[family_db_id][x] + + y = max(list(domain_count_gene[bgc_db_id])) + y = domain_count_gene[bgc_db_id].index(y) + + # check orientation + if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: + b_start = domain_genes_to_all_genes[bgc_db_id][y] + reverse = False + else: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - y - 1 + ] + reverse = True + ref_genes_.add(a_start) + + bgc_algn = [] + for gene_num in range(len(bgc_gbk.genes)): + if gene_num == b_start: + if reverse: + bgc_algn.append([a_start, -100]) + else: + bgc_algn.append([a_start, 100]) + else: + bgc_algn.append([-1, 100]) + aln.append(bgc_algn) + + ref_genes = list(ref_genes_) + + fam_alignment = { + "id": "FAM_{:05d}".format(family_db_id), + "ref": family_members[fam_record_idx], + "newick": generate_newick_tree( + family_records, fam_record_idx, family_members + ), + "ref_genes": ref_genes, + "aln": aln, + } + bs_families_alignment.append(fam_alignment) + return bs_families_alignment + + def 
generate_bs_families_members( cutoff: float, pair_generator: RecordPairGenerator, @@ -750,7 +906,7 @@ def generate_bs_networks_js( label: str, cutoff: float, pair_generator: RecordPairGenerator, - bs_families: list[dict[str, Any]], + bs_families: dict[int, list[int]], ): """Generates the bs_networks.js file located at output/html_content/networks/[label]/[pair_generator] @@ -781,10 +937,13 @@ def generate_bs_networks_js( bs_networks_js_path = pair_generator_path / "bs_networks.js" # TODO: replace with functions to generate objects + networks_families = generate_bs_networks_families(bs_families) bs_similarity: list[Any] = generate_bs_networks_js_sim_matrix( cutoff, pair_generator ) - bs_families_alignment: list[Any] = [] + bs_families_alignment: list[Any] = generate_bs_families_alignment( + bs_families, pair_generator + ) bs_similarity_families: list[Any] = [] with open(bs_networks_js_path, "w") as bs_networks_js: @@ -802,7 +961,7 @@ def generate_bs_networks_js( bs_networks_js.write( "var bs_families={};\n".format( json.dumps( - bs_families, + networks_families, indent=4, separators=(",", ":"), sort_keys=True, @@ -903,9 +1062,7 @@ def legacy_generate_bin_output( network (BSNetwork): the network object for the pair_generator """ families_members = generate_bs_families_members(cutoff, pair_generator) - networks_families = generate_bs_networks_families(families_members) + # networks_families = generate_bs_networks_families(families_members) add_run_data_network(output_dir, label, cutoff, pair_generator, families_members) - generate_bs_networks_js( - output_dir, label, cutoff, pair_generator, networks_families - ) + generate_bs_networks_js(output_dir, label, cutoff, pair_generator, families_members) From 7f8f6114a72dc1a8705064425aeb3b872d671882 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Tue, 31 Oct 2023 16:04:45 +0100 Subject: [PATCH 03/15] implement lcs database storage --- big_scape/comparison/comparable_region.py | 7 ++++++ big_scape/comparison/legacy_workflow_alt.py | 25 +++++++++++++++------ big_scape/comparison/utility.py | 8 ++++--- big_scape/data/schema.sql | 5 +++++ big_scape/file_input/load_files.py | 18 ++++++++++----- big_scape/network/network.py | 6 ++--- 6 files changed, 50 insertions(+), 19 deletions(-) diff --git a/big_scape/comparison/comparable_region.py b/big_scape/comparison/comparable_region.py index 8f7a1fc1..f9c700ee 100644 --- a/big_scape/comparison/comparable_region.py +++ b/big_scape/comparison/comparable_region.py @@ -47,6 +47,7 @@ def __init__( reverse: bool, ): self.pair = pair + # store possibly extended comparable region self.a_start = a_start self.b_start = b_start @@ -60,6 +61,12 @@ def __init__( self.domain_dicts: Optional[ tuple[dict[HSP, list[int]], dict[HSP, list[int]]] ] = None + # store lcs without any extensions + self.lcs_a_start = a_start + self.lcs_b_start = b_start + self.lcs_a_stop = a_stop + self.lcs_b_stop = b_stop + self.lcs_reverse = reverse def get_domain_sets( self, regenerate=False, cache=True diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 550af438..01781429 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -162,8 +162,9 @@ def generate_edges( if len(results) != len(task_batch): raise ValueError("Mismatch between task length and result length") - for idx, pair in enumerate(task_batch): - distance, jaccard, adjacency, dss = results[idx] + # for idx, pair in enumerate(task_batch): + # distance, jaccard, adjacency, dss = 
results[idx] + for pair, distance, jaccard, adjacency, dss in results: yield ( pair.region_a._db_id, pair.region_b._db_id, @@ -172,6 +173,11 @@ def generate_edges( adjacency, dss, pair_generator.weights, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.lcs_reverse, ) done_pairs += len(results) @@ -198,6 +204,11 @@ def do_lcs_pair( ) # set the comparable region + pair.comparable_region.lcs_a_start = a_start + pair.comparable_region.lcs_a_stop = a_stop + pair.comparable_region.lcs_b_start = b_start + pair.comparable_region.lcs_b_stop = b_stop + pair.comparable_region.lcs_reverse = reverse pair.comparable_region.a_start = a_start pair.comparable_region.a_stop = a_stop pair.comparable_region.b_start = b_start @@ -234,7 +245,7 @@ def expand_pair(pair: RecordPair) -> float: def calculate_scores_pair( data: tuple[list[RecordPair], bs_enums.ALIGNMENT_MODE, str] -) -> list[tuple[float, float, float, float]]: # pragma no cover +) -> list[tuple[RecordPair, float, float, float, float]]: # pragma no cover """Calculate the scores for a list of pairs Args: @@ -251,13 +262,13 @@ def calculate_scores_pair( for pair in pairs: if pair.region_a.parent_gbk == pair.region_b.parent_gbk: - results.append((0.0, 1.0, 1.0, 1.0)) + results.append((pair, 0.0, 1.0, 1.0, 1.0)) continue jaccard = calc_jaccard_pair(pair) if jaccard == 0.0: - results.append((1.0, 0.0, 0.0, 0.0)) + results.append((pair, 1.0, 0.0, 0.0, 0.0)) continue # in the form [bool, Pair]. true bools means they need expansion, false they don't @@ -267,7 +278,7 @@ def calculate_scores_pair( jaccard = expand_pair(pair) if jaccard == 0.0: - results.append((1.0, 0.0, 0.0, 0.0)) + results.append((pair, 1.0, 0.0, 0.0, 0.0)) continue if weights_label not in LEGACY_WEIGHTS: @@ -285,6 +296,6 @@ def calculate_scores_pair( similarity = jaccard * jc_weight + adjacency * ai_weight + dss * dss_weight distance = 1 - similarity - results.append((distance, jaccard, adjacency, dss)) + results.append((pair, distance, jaccard, adjacency, dss)) return results diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index 34c87f69..5cf098df 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -50,12 +50,14 @@ def save_edge_to_db( def save_edges_to_db( - edges: list[tuple[int, int, float, float, float, float, str]] + edges: list[ + tuple[int, int, float, float, float, float, str, int, int, int, int, bool] + ] ) -> None: """Save many edges to the database Args: - edges (list[tuple[int, int, float, float, float, float, str]]): list of edges to save + edges (list[tuple[]]): list of edges to save """ # save the comparison data to the database # using raw sqlite for this because sqlalchemy is not fast enough @@ -75,7 +77,7 @@ def save_edges_to_db( # create a query # TODO: this should not need ignore. 
it's there now because protoclusters somehow # trigger an integrityerror - query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?)" + query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" cursor.executemany(query, edges) diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index fdd0d689..0a901f51 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -76,6 +76,11 @@ CREATE TABLE IF NOT EXISTS distance ( adjacency REAL NOT NULL, dss REAL NOT NULL, weights TEXT NOT NULL, + lcs_a_start INTEGER NOT NULL, + lcs_a_stop INTEGER NOT NULL, + lcs_b_start INTEGER NOT NULL, + lcs_b_stop INTEGER NOT NULL, + lcs_reverse BOOLEAN NOT NULL, UNIQUE(region_a_id, region_b_id) FOREIGN KEY(region_a_id) REFERENCES bgc_record(id) FOREIGN KEY(region_b_id) REFERENCES bgc_record(id) diff --git a/big_scape/file_input/load_files.py b/big_scape/file_input/load_files.py index 0f939acf..bba213de 100644 --- a/big_scape/file_input/load_files.py +++ b/big_scape/file_input/load_files.py @@ -91,14 +91,16 @@ def download_dataset(url: str, path: Path, path_compressed: Path) -> None: os.remove(path_compressed) -def load_dataset_folder(run: dict, source_type: bs_enums.SOURCE_TYPE) -> List[GBK]: +def load_dataset_folder( + input_dir: Path, run: dict, source_type: bs_enums.SOURCE_TYPE +) -> List[GBK]: """Loads all gbk files in a given folder Returns empty list if path does not point to a folder or if folder does not contain gbk files """ - path = run["input_dir"] + path = input_dir source_type = source_type mode = run["input_mode"] include_gbk = run["include_gbk"] @@ -248,22 +250,26 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]: # add the query bgc to the exclude list run["exclude_gbk"].append(query_bgc_stem) - gbks = load_dataset_folder(run, bs_enums.SOURCE_TYPE.REFERENCE) + gbks = load_dataset_folder( + run["input_dir"], run, bs_enums.SOURCE_TYPE.REFERENCE + ) input_gbks.extend(gbks) else: - gbks = load_dataset_folder(run, bs_enums.SOURCE_TYPE.QUERY) + gbks = load_dataset_folder(run["input_dir"], run, bs_enums.SOURCE_TYPE.QUERY) input_gbks.extend(gbks) # get reference if either MIBiG version or user-made reference dir passed if run["mibig_version"]: mibig_version_dir = get_mibig(run["mibig_version"], bigscape_dir) - mibig_gbks = load_dataset_folder(mibig_version_dir, bs_enums.SOURCE_TYPE.MIBIG) + mibig_gbks = load_dataset_folder( + mibig_version_dir, run, bs_enums.SOURCE_TYPE.MIBIG + ) input_gbks.extend(mibig_gbks) if run["reference_dir"]: reference_gbks = load_dataset_folder( - run["reference_dir"], bs_enums.SOURCE_TYPE.REFERENCE + run["reference_dir"], run, bs_enums.SOURCE_TYPE.REFERENCE ) input_gbks.extend(reference_gbks) diff --git a/big_scape/network/network.py b/big_scape/network/network.py index fd9a4335..2a0ba91c 100644 --- a/big_scape/network/network.py +++ b/big_scape/network/network.py @@ -96,7 +96,7 @@ def get_edge( if edge is None: return None - return cast(tuple[int, int, float, float, float, float, str], edge) + return cast(tuple[int, int, float, float, float, float, str], edge[0:7]) def get_edges( @@ -123,7 +123,7 @@ def get_edges( .where(distance_table.c.distance < distance_cutoff) ).compile() - edges = DB.execute(select_statement).fetchall() + edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] return cast(list[tuple[int, int, float, float, float, float, str]], edges) @@ -166,6 +166,6 @@ def get_connected_edges( .where(distance_table.c.distance < distance_cutoff).compile() ) - edges = 
DB.execute(select_statement).fetchall() + edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] return cast(list[tuple[int, int, float, float, float, float, str]], edges) From 6c74edaf4f4fa954c1a20004628dd79cd7241dfc Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 10:43:55 +0100 Subject: [PATCH 04/15] Fetch lcs bounds from database --- big_scape/output/legacy_output.py | 117 +++++++++++++++--------------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index ce88c41d..369c4023 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -16,7 +16,6 @@ import big_scape.network.network as bs_network import big_scape.network.utility as bs_network_utility -import big_scape.comparison as bs_comparison def copy_base_output_templates(output_dir: Path): @@ -721,65 +720,65 @@ def generate_bs_families_alignment( if bgc_db_id == family_db_id: aln.append([[gene_num, 0] for gene_num in range(len(bgc_gbk.genes))]) else: - # TODO: implement database storage of lcs and fetch here - ( - a_start, - a_stop, - b_start, - b_stop, - reverse, - ) = bs_comparison.legacy_lcs.legacy_find_cds_lcs( - family_records[fam_record_idx].get_cds_with_domains(), - records[bgc].get_cds_with_domains(), + if DB.metadata is None: + raise RuntimeError("Database metadata is None!") + weights = pair_generator.weights + dist_table = DB.metadata.tables["distance"] + select_query = ( + dist_table.select() + .add_columns( + dist_table.c.region_a_id, + dist_table.c.region_b_id, + dist_table.c.lcs_a_start, + dist_table.c.lcs_a_stop, + dist_table.c.lcs_b_start, + dist_table.c.lcs_reverse, + ) + .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) + .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) + .where(dist_table.c.weights == weights) + .compile() ) - # if DB.metadata is None: - # raise RuntimeError("Database metadata is None!") - # dist_table = DB.metadata.tables["distance"] - # select_query = ( - # dist_table.select() - # .add_columns( - # dist_table.c.region_a_id, - # dist_table.c.region_b_id, - # dist_table.c.lcs_a_start, - # dist_table.c.lcs_a_stop, - # dist_table.c.lcs_b_start, - # dist_table.c.lcs_b_stop, - # dist_table.c.lcs_reverse, - # ) - # .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) - # .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) - # .compile() - # ) - # result = DB.execute(select_query).fetchone() - # if result is None: - # logging.error( - # "LCS not found in database (%s, %s)", family_db_id, bgc_db_id - # ) - # raise RuntimeError() - # if result.region_a_id == family_db_id: - # a_start: int = result.lcs_a_start - # a_stop: int = result.lcs_a_stop - # b_start: int = result.lcs_b_start - # b_stop: int = result.lcs_b_stop - # reverse: bool = result.lcs_reverse - # elif result.region_b_id == family_db_id: - # a_start = result.lcs_b_start - # a_stop = result.lcs_b_stop - # b_start = result.lcs_a_start - # b_stop = result.lcs_a_stop - # reverse = result.lcs_reverse - - length = abs(a_start - a_stop) # seed length - a_start = domain_genes_to_all_genes[family_db_id][a_start] - if length == 0: - pass - - elif reverse: - b_start = domain_genes_to_all_genes[bgc_db_id][ - len(domain_count_gene[bgc_db_id]) - b_start - 1 - ] - else: - b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + result = DB.execute(select_query).fetchone() + if result is None: + raise RuntimeError("LCS not found in database") + if 
result.region_a_id == family_db_id: + a_start: int = result.lcs_a_start + a_stop: int = result.lcs_a_stop + b_start: int = result.lcs_b_start + reverse: bool = result.lcs_reverse + + length = abs(a_start - a_stop) # seed length + a_start = domain_genes_to_all_genes[family_db_id][a_start] + if length == 0: + pass + + elif reverse: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - b_start - 1 + ] + else: + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + elif result.region_b_id == family_db_id: + a_start = result.lcs_b_start + a_stop = result.lcs_b_stop + b_start = result.lcs_a_start + reverse = result.lcs_reverse + + length = abs(a_start - a_stop) # seed length + if length == 0: + pass + elif reverse: + a_start = domain_genes_to_all_genes[family_db_id][ + len(domain_count_gene[family_db_id]) - a_start - length + ] + b_start = domain_genes_to_all_genes[bgc_db_id][ + b_start + length - 1 + ] + else: + a_start = domain_genes_to_all_genes[family_db_id][a_start] + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] if length == 0: length = 1 From 8ecee734a6e4986f342d7797db016e98bbe2235e Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 11:36:35 +0100 Subject: [PATCH 05/15] fix tests and errors --- big_scape/comparison/utility.py | 23 +++++++++++++++-- big_scape/output/legacy_output.py | 5 +++- test/comparison/test_binning.py | 40 ++++++++++++++++++++++++++++++ test/db/test_partial.py | 17 +++++++++++-- test/file_input/test_load_files.py | 10 ++++---- test/network/test_network.py | 17 +++++++++++-- 6 files changed, 100 insertions(+), 12 deletions(-) diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index 5cf098df..e417d24d 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -13,7 +13,8 @@ def save_edge_to_db( - edge: tuple[int, int, float, float, float, float, str], upsert=False + edge: tuple[int, int, float, float, float, float, str, int, int, int, int, bool], + upsert=False, ) -> None: """Save edge to the database @@ -23,7 +24,20 @@ def save_edge_to_db( upsert (bool, optional): whether to upsert the edge into the database. 
""" - region_a_id, region_b_id, distance, jaccard, adjacency, dss, weights = edge + ( + region_a_id, + region_b_id, + distance, + jaccard, + adjacency, + dss, + weights, + a_start, + a_stop, + b_start, + b_stop, + reverse, + ) = edge # save the comparison data to the database @@ -41,6 +55,11 @@ def save_edge_to_db( adjacency=adjacency, dss=dss, weights=weights, + lcs_a_start=a_start, + lcs_a_stop=a_stop, + lcs_b_start=b_start, + lcs_b_stop=b_stop, + lcs_reverse=reverse, ) if upsert: diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 369c4023..b94cb1ff 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -741,7 +741,10 @@ def generate_bs_families_alignment( ) result = DB.execute(select_query).fetchone() if result is None: - raise RuntimeError("LCS not found in database") + raise RuntimeError( + "LCS not found in database (%s %s %s)" + % (family_db_id, bgc_db_id, weights) + ) if result.region_a_id == family_db_id: a_start: int = result.lcs_a_start a_stop: int = result.lcs_a_stop diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index a6f2e340..9391cbcd 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -268,6 +268,11 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) else: @@ -281,6 +286,11 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) @@ -357,6 +367,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) else: @@ -370,6 +385,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) @@ -397,6 +417,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) @@ -410,6 +435,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) save_edge_to_db( @@ -421,6 +451,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) save_edge_to_db( @@ -432,6 +467,11 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, ) ) diff --git a/test/db/test_partial.py b/test/db/test_partial.py index ee046b76..1819e583 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -48,7 +48,7 @@ def add_mock_hsp_alignment_hsp(hsp: HSP) -> None: def gen_mock_edge_list( edge_gbks: list[GBK], -) -> list[tuple[int, int, float, float, float, float, str]]: +) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ -57,7 +57,20 @@ def gen_mock_edge_list( continue edges.append( - (gbk_a.region._db_id, gbk_b.region._db_id, 0.0, 0.0, 0.0, 0.0, "mix") + ( + gbk_a.region._db_id, + gbk_b.region._db_id, + 0.0, + 0.0, + 0.0, + 0.0, + "mix", + 0, + 0, + 0, + 0, + False, + ) ) return edges diff --git a/test/file_input/test_load_files.py b/test/file_input/test_load_files.py index 27f31cd6..344d7d4e 100644 --- a/test/file_input/test_load_files.py +++ b/test/file_input/test_load_files.py @@ -28,7 +28,7 @@ def test_gbk_path_invalid(self): } with self.assertRaises(NotADirectoryError): - load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) def 
test_gbk_path_valid(self): """Tests whether loading a given path returns none""" @@ -45,7 +45,7 @@ def test_gbk_path_valid(self): "legacy_classify": False, } - load_result = load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_result = load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) expected_count = 3 actual_count = len(load_result) @@ -68,7 +68,7 @@ def test_gbk_path_empty(self): } with self.assertRaises(FileNotFoundError): - load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) def test_load_gbk_not_a_file(self): """Tests whether the load_gbk function correctly returns none when input is not a file""" @@ -122,7 +122,7 @@ def test_include_exclude_str(self): "legacy_classify": False, } - load_result = load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_result = load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) expected_count = 1 actual_count = len(load_result) @@ -143,7 +143,7 @@ def test_include_all_override_exclude(self): "legacy_classify": False, } - load_result = load_dataset_folder(run, SOURCE_TYPE.QUERY) + load_result = load_dataset_folder(run["input_dir"], run, SOURCE_TYPE.QUERY) expected_count = 3 actual_count = len(load_result) diff --git a/test/network/test_network.py b/test/network/test_network.py index a2dae3ed..06c05a11 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -28,7 +28,7 @@ def create_mock_gbk(i) -> bs_gbk.GBK: def gen_mock_edge_list( edge_gbks: list[bs_gbk.GBK], -) -> list[tuple[int, int, float, float, float, float, str]]: +) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ -37,7 +37,20 @@ def gen_mock_edge_list( continue edges.append( - (gbk_a.region._db_id, gbk_b.region._db_id, 0.0, 1.0, 1.0, 1.0, "mix") + ( + gbk_a.region._db_id, + gbk_b.region._db_id, + 0.0, + 1.0, + 1.0, + 1.0, + "mix", + 0, + 0, + 0, + 0, + False, + ) ) return edges From a3a9c940bec90203fc56f3b9638293fbd4799966 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 13:32:37 +0100 Subject: [PATCH 06/15] modularized code --- big_scape/output/legacy_output.py | 215 ++++++++++++++++++------------ 1 file changed, 132 insertions(+), 83 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index b94cb1ff..ef4c9e24 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -675,6 +675,126 @@ def generate_bs_networks_js_sim_matrix( return sparse_matrix +def fetch_lcs_from_db(a_id: int, b_id: int, weights: str) -> dict[str, Any]: + """Find the lcs start, stop and direction for a pair of records in the database + + Args: + a_id (int): record db id of region a + b_id (int): record db id of region b + weights (str): weights used in analysis + """ + if DB.metadata is None: + raise RuntimeError("Database metadata is None!") + dist_table = DB.metadata.tables["distance"] + select_query = ( + dist_table.select() + # .add_columns( + # dist_table.c.region_a_id, + # dist_table.c.region_b_id, + # dist_table.c.lcs_a_start, + # dist_table.c.lcs_a_stop, + # dist_table.c.lcs_b_start, + # dist_table.c.lcs_reverse, + # ) + .where(dist_table.c.region_a_id.in_((a_id, b_id))).where( + dist_table.c.region_b_id.in_((a_id, b_id)) + ) + # .where(dist_table.c.weights == weights) + .compile() + ) + result = DB.execute(select_query).fetchone() + if result is None: + raise RuntimeError( + "LCS not found 
in database (%s %s %s)" % (a_id, b_id, weights) + ) + return dict(result._mapping) + + +def adjust_lcs_to_all_genes( + result: dict[str, Any], + family_db_id: int, + bgc_db_id: int, + fam_gbk: GBK, + bgc_gbk: GBK, + domain_genes_to_all_genes: dict[int, dict[int, int]], + domain_count_gene: dict[int, list[int]], +) -> tuple[int, int, bool]: + """Adjust boundaries of lcs from only genes with domains to all present genes + + Args: + result (dict[str, Any]): database distance row with lcs information + family_db_id (int): database record id of family exemplar + bgc_db_id (int): database record id of family member + fam_gbk (GBK): family exemplar GBK + bgc_gbk (GBK): family member GBK + domain_genes_to_all_genes (dict[int, dict[int, int]]): maps indices from genes + with domains to all present genes + domain_count_gene (dict[int, list[int]]): contains the number of domains each + gene contains + + Returns: + tuple[int, int, bool]: adjusted a_start, b_start and reverse + """ + if result["region_a_id"] == family_db_id: + a_start = result["lcs_a_start"] + a_stop = result["lcs_a_stop"] + b_start = result["lcs_b_start"] + reverse = result["lcs_reverse"] + + length = abs(a_start - a_stop) # seed length + a_start = domain_genes_to_all_genes[family_db_id][a_start] + if length == 0: + pass + + elif reverse: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - b_start - 1 + ] + else: + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + elif result["region_b_id"] == family_db_id: + a_start = result["lcs_b_start"] + a_stop = result["lcs_b_stop"] + b_start = result["lcs_a_start"] + reverse = result["lcs_reverse"] + + length = abs(a_start - a_stop) # seed length + if length == 0: + pass + elif reverse: + a_start = domain_genes_to_all_genes[family_db_id][ + len(domain_count_gene[family_db_id]) - a_start - length + ] + b_start = domain_genes_to_all_genes[bgc_db_id][b_start + length - 1] + else: + a_start = domain_genes_to_all_genes[family_db_id][a_start] + b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + + if length == 0: + length = 1 + # let's try aligning using the genes with most domains + # after all, they ended up being in the same GCF + # for some reason + x = max(domain_count_gene[family_db_id]) + x = domain_count_gene[family_db_id].index(x) + a_start = domain_genes_to_all_genes[family_db_id][x] + + y = max(list(domain_count_gene[bgc_db_id])) + y = domain_count_gene[bgc_db_id].index(y) + + # check orientation + if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: + b_start = domain_genes_to_all_genes[bgc_db_id][y] + reverse = False + else: + b_start = domain_genes_to_all_genes[bgc_db_id][ + len(domain_count_gene[bgc_db_id]) - y - 1 + ] + reverse = True + return a_start, b_start, reverse + + def generate_bs_families_alignment( bs_families: dict[int, list[int]], pair_generator: RecordPairGenerator ): @@ -717,93 +837,22 @@ def generate_bs_families_alignment( raise AttributeError("Record parent GBK is not set!") if bgc_db_id is None: raise AttributeError("Record has no database id!") + if bgc_db_id == family_db_id: aln.append([[gene_num, 0] for gene_num in range(len(bgc_gbk.genes))]) else: - if DB.metadata is None: - raise RuntimeError("Database metadata is None!") - weights = pair_generator.weights - dist_table = DB.metadata.tables["distance"] - select_query = ( - dist_table.select() - .add_columns( - dist_table.c.region_a_id, - dist_table.c.region_b_id, - dist_table.c.lcs_a_start, - dist_table.c.lcs_a_stop, - dist_table.c.lcs_b_start, - 
dist_table.c.lcs_reverse, - ) - .where(dist_table.c.region_a_id.in_((family_db_id, bgc_db_id))) - .where(dist_table.c.region_b_id.in_((family_db_id, bgc_db_id))) - .where(dist_table.c.weights == weights) - .compile() + result = fetch_lcs_from_db( + family_db_id, bgc_db_id, pair_generator.weights + ) + a_start, b_start, reverse = adjust_lcs_to_all_genes( + result, + family_db_id, + bgc_db_id, + fam_gbk, + bgc_gbk, + domain_genes_to_all_genes, + domain_count_gene, ) - result = DB.execute(select_query).fetchone() - if result is None: - raise RuntimeError( - "LCS not found in database (%s %s %s)" - % (family_db_id, bgc_db_id, weights) - ) - if result.region_a_id == family_db_id: - a_start: int = result.lcs_a_start - a_stop: int = result.lcs_a_stop - b_start: int = result.lcs_b_start - reverse: bool = result.lcs_reverse - - length = abs(a_start - a_stop) # seed length - a_start = domain_genes_to_all_genes[family_db_id][a_start] - if length == 0: - pass - - elif reverse: - b_start = domain_genes_to_all_genes[bgc_db_id][ - len(domain_count_gene[bgc_db_id]) - b_start - 1 - ] - else: - b_start = domain_genes_to_all_genes[bgc_db_id][b_start] - - elif result.region_b_id == family_db_id: - a_start = result.lcs_b_start - a_stop = result.lcs_b_stop - b_start = result.lcs_a_start - reverse = result.lcs_reverse - - length = abs(a_start - a_stop) # seed length - if length == 0: - pass - elif reverse: - a_start = domain_genes_to_all_genes[family_db_id][ - len(domain_count_gene[family_db_id]) - a_start - length - ] - b_start = domain_genes_to_all_genes[bgc_db_id][ - b_start + length - 1 - ] - else: - a_start = domain_genes_to_all_genes[family_db_id][a_start] - b_start = domain_genes_to_all_genes[bgc_db_id][b_start] - - if length == 0: - length = 1 - # let's try aligning using the genes with most domains - # after all, they ended up being in the same GCF - # for some reason - x = max(domain_count_gene[family_db_id]) - x = domain_count_gene[family_db_id].index(x) - a_start = domain_genes_to_all_genes[family_db_id][x] - - y = max(list(domain_count_gene[bgc_db_id])) - y = domain_count_gene[bgc_db_id].index(y) - - # check orientation - if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: - b_start = domain_genes_to_all_genes[bgc_db_id][y] - reverse = False - else: - b_start = domain_genes_to_all_genes[bgc_db_id][ - len(domain_count_gene[bgc_db_id]) - y - 1 - ] - reverse = True ref_genes_.add(a_start) bgc_algn = [] From 0c9086571691eca9503e8a947a8ec46b959a2470 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Wed, 1 Nov 2023 16:56:08 +0100 Subject: [PATCH 07/15] Add gcf alignment and tree tests --- test/trees/test_alignment.py | 214 +++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 test/trees/test_alignment.py diff --git a/test/trees/test_alignment.py b/test/trees/test_alignment.py new file mode 100644 index 00000000..cce4647b --- /dev/null +++ b/test/trees/test_alignment.py @@ -0,0 +1,214 @@ +"""Contains test for GCF alignment and tree generation""" + +# from python +from unittest import TestCase + +# from other modules +from big_scape.genbank import GBK, BGCRecord, CDS +from big_scape.hmm import HSP, HSPAlignment +from big_scape.trees import generate_newick_tree +from big_scape.trees.newick_tree import generate_gcf_alignment +from big_scape.output.legacy_output import adjust_lcs_to_all_genes + + +class TestTrees(TestCase): + """Contains alignment and tree generation tests""" + + def test_tree_gen_small(self): + """Tests generated tree for families with less than three 
members""" + + records = [ + BGCRecord(GBK("", ""), 0, 0, 0, False, ""), + BGCRecord(GBK("", ""), 0, 0, 0, False, ""), + ] + exemplar = 0 + mock_family = [0, 1] + + expected_tree = "(0:0.0,1:0.0):0.01;" + tree = generate_newick_tree(records, exemplar, mock_family) + + self.assertEqual(tree, expected_tree) + + def test_gcf_alignment(self): + """Tests alignment of GCF HSP alignments""" + gbk_a = GBK("", "") + gbk_b = GBK("", "") + cds_a = CDS(0, 20) + cds_b = CDS(0, 20) + hsp_a = HSP(cds_a, "PF1", 1, 0, 10) + hsp_b = HSP(cds_b, "PF1", 1, 0, 12) + hsp_a.alignment = HSPAlignment(hsp_a, "TEST-PF1--") + hsp_b.alignment = HSPAlignment(hsp_b, "TEST--P-F1") + cds_a.hsps.append(hsp_a) + cds_b.hsps.append(hsp_b) + gbk_a.genes.append(cds_a) + gbk_b.genes.append(cds_b) + + records = [ + BGCRecord(gbk_a, 0, 0, 100, False, ""), + BGCRecord(gbk_b, 1, 0, 100, False, ""), + ] + exemplar = 0 + family_members = [0, 1] + expected_alignment = ">0\nTEST-PF1--\n>1\nTEST--P-F1\n" + algn = generate_gcf_alignment(records, exemplar, family_members) + + self.assertEqual(algn, expected_alignment) + + def test_lcs_adjust_ex2mem(self): + """Tests adjusted lcs exemplar to member not reversed""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 4, + "lcs_a_stop": 7, + "lcs_b_start": 6, + "lcs_b_stop": 9, + "lcs_reverse": False, + } + dom_to_gene = {0: {4: 5, 7: 9}, 1: {6: 8}} + dom_count = {} + + expected_lcs = (5, 8, False) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_ex2mem_reverse(self): + """Tests adjusted lcs exemplar to member with reverse""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 2, + "lcs_a_stop": 3, + "lcs_b_start": 1, + "lcs_b_stop": 2, + "lcs_reverse": True, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} + + expected_lcs = (4, 2, True) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_mem2ex(self): + """Tests adjusted lcs member to exemplar not reversed""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 2, + "lcs_a_stop": 3, + "lcs_b_start": 1, + "lcs_b_stop": 2, + "lcs_reverse": False, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} + + expected_lcs = (2, 4, False) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 1, 0, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_mem2ex_reverse(self): + """Tests adjusted lcs member to exemplar with reverse""" + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 2, + "lcs_a_stop": 3, + "lcs_b_start": 1, + "lcs_b_stop": 2, + "lcs_reverse": True, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} + + expected_lcs = (2, 4, True) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 1, 0, GBK("", ""), GBK("", ""), dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_zero_length_not_reversed(self): + """Tests adjusted lcs for lcs with length 0""" + + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 0, + "lcs_a_stop": 0, + "lcs_b_start": 0, + "lcs_b_stop": 0, + "lcs_reverse": False, + } + 
dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} + + max_dom_cds = CDS(20, 30) + max_dom_cds.strand = 1 + fill_cds = CDS(10, 20) + fill_cds.strand = -1 + + exempl_gbk = GBK("", "") + exempl_gbk.genes = [fill_cds, fill_cds, max_dom_cds, fill_cds, fill_cds] + + mem_gbk = GBK("", "") + mem_gbk.genes = [CDS(0, 1), CDS(2, 3), CDS(4, 5), max_dom_cds] + + expected_lcs = (2, 3, False) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, exempl_gbk, mem_gbk, dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) + + def test_lcs_adjust_zero_length_reversed(self): + """Tests adjusted lcs for lcs with length 0""" + + mock_result = { + "region_a_id": 0, + "region_b_id": 1, + "lcs_a_start": 0, + "lcs_a_stop": 0, + "lcs_b_start": 0, + "lcs_b_stop": 0, + "lcs_reverse": False, + } + dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} + dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} + + max_dom_cds = CDS(0, 0) + max_dom_cds.strand = 1 + fill_cds = CDS(0, 0) + fill_cds.strand = -1 + + exempl_gbk = GBK("", "") + exempl_gbk.genes = [fill_cds, fill_cds, max_dom_cds, fill_cds, fill_cds] + + max_dom_cds_rev = CDS(0, 0) + max_dom_cds_rev.strand = -1 + mem_gbk = GBK("", "") + mem_gbk.genes = [CDS(0, 0), CDS(0, 0), CDS(0, 0), max_dom_cds_rev] + + expected_lcs = (2, 1, True) + + new_lcs = adjust_lcs_to_all_genes( + mock_result, 0, 1, exempl_gbk, mem_gbk, dom_to_gene, dom_count + ) + + self.assertEqual(new_lcs, expected_lcs) From c2b5e3f2072502973ddf1d449b49743047196af3 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Thu, 2 Nov 2023 09:45:32 +0100 Subject: [PATCH 08/15] Store lcs extension and align mode in db --- big_scape/comparison/comparable_region.py | 2 ++ big_scape/comparison/legacy_workflow_alt.py | 7 ++++++ big_scape/comparison/utility.py | 24 +++++++++++++++++++-- big_scape/data/schema.sql | 6 ++++++ big_scape/output/legacy_output.py | 5 ++++- 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/big_scape/comparison/comparable_region.py b/big_scape/comparison/comparable_region.py index f9c700ee..ac311d78 100644 --- a/big_scape/comparison/comparable_region.py +++ b/big_scape/comparison/comparable_region.py @@ -10,6 +10,7 @@ # from other modules from big_scape.genbank import BGCRecord from big_scape.hmm import HSP +from big_scape.enums import ALIGNMENT_MODE # from this module from .legacy_lcs import legacy_find_cds_lcs @@ -67,6 +68,7 @@ def __init__( self.lcs_a_stop = a_stop self.lcs_b_stop = b_stop self.lcs_reverse = reverse + self.alignment_mode: ALIGNMENT_MODE = ALIGNMENT_MODE.GLOBAL def get_domain_sets( self, regenerate=False, cache=True diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 01781429..7d9f1356 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -178,6 +178,12 @@ def generate_edges( pair.comparable_region.lcs_b_start, pair.comparable_region.lcs_b_stop, pair.comparable_region.lcs_reverse, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, ) done_pairs += len(results) @@ -238,6 +244,7 @@ def expand_pair(pair: RecordPair) -> float: jc = calc_jaccard_pair(pair) return jc + pair.comparable_region.alignment_mode = bs_enums.ALIGNMENT_MODE.GLOCAL jc = calc_jaccard_pair(pair) return jc diff --git a/big_scape/comparison/utility.py 
b/big_scape/comparison/utility.py index e417d24d..d4c3cd5f 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -10,6 +10,7 @@ # from other modules from big_scape.data import DB from big_scape.comparison.binning import RecordPairGenerator, RecordPair +from big_scape.enums import ALIGNMENT_MODE def save_edge_to_db( @@ -70,7 +71,26 @@ def save_edge_to_db( def save_edges_to_db( edges: list[ - tuple[int, int, float, float, float, float, str, int, int, int, int, bool] + tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + ALIGNMENT_MODE, + ] ] ) -> None: """Save many edges to the database @@ -96,7 +116,7 @@ def save_edges_to_db( # create a query # TODO: this should not need ignore. it's there now because protoclusters somehow # trigger an integrityerror - query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" cursor.executemany(query, edges) diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index 0a901f51..2547cc95 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -81,6 +81,12 @@ CREATE TABLE IF NOT EXISTS distance ( lcs_b_start INTEGER NOT NULL, lcs_b_stop INTEGER NOT NULL, lcs_reverse BOOLEAN NOT NULL, + ext_a_start INTEGER NOT NULL, + ext_a_stop INTEGER NOT NULL, + ext_b_start INTEGER NOT NULL, + ext_b_stop INTEGER NOT NULL, + ext_reverse BOOLEAN NOT NULL, + alignment_mode TEXT NOT NULL, UNIQUE(region_a_id, region_b_id) FOREIGN KEY(region_a_id) REFERENCES bgc_record(id) FOREIGN KEY(region_b_id) REFERENCES bgc_record(id) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index ef4c9e24..a5558f25 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -784,7 +784,10 @@ def adjust_lcs_to_all_genes( y = domain_count_gene[bgc_db_id].index(y) # check orientation - if fam_gbk.genes[x].strand == bgc_gbk.genes[y].strand: + if ( + fam_gbk.genes[domain_genes_to_all_genes[family_db_id][x]].strand + == bgc_gbk.genes[domain_genes_to_all_genes[bgc_db_id][y]].strand + ): b_start = domain_genes_to_all_genes[bgc_db_id][y] reverse = False else: From 61de2a150c9a56cd7e5c9daf77fd6b01330e3050 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Thu, 2 Nov 2023 09:53:48 +0100 Subject: [PATCH 09/15] fix tests --- big_scape/comparison/utility.py | 53 ++++++++++++++++++++++++++------- test/comparison/test_binning.py | 48 +++++++++++++++++++++++++++++ test/db/test_partial.py | 29 +++++++++++++++++- test/network/test_network.py | 29 +++++++++++++++++- 4 files changed, 146 insertions(+), 13 deletions(-) diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index d4c3cd5f..df807099 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -14,7 +14,26 @@ def save_edge_to_db( - edge: tuple[int, int, float, float, float, float, str, int, int, int, int, bool], + edge: tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + ALIGNMENT_MODE, + ], upsert=False, ) -> None: """Save edge to the database @@ -33,11 +52,17 @@ def save_edge_to_db( adjacency, dss, weights, - a_start, - a_stop, - b_start, - b_stop, - reverse, + lcs_a_start, + lcs_a_stop, + lcs_b_start, + lcs_b_stop, + lcs_reverse, + ext_a_start, + ext_a_stop, + ext_b_start, + ext_b_stop, + 
ext_reverse, + alignment_mode, ) = edge # save the comparison data to the database @@ -56,11 +81,17 @@ def save_edge_to_db( adjacency=adjacency, dss=dss, weights=weights, - lcs_a_start=a_start, - lcs_a_stop=a_stop, - lcs_b_start=b_start, - lcs_b_stop=b_stop, - lcs_reverse=reverse, + lcs_a_start=lcs_a_start, + lcs_a_stop=lcs_a_stop, + lcs_b_start=lcs_b_start, + lcs_b_stop=lcs_b_stop, + lcs_reverse=lcs_reverse, + ext_a_start=ext_a_start, + ext_a_stop=ext_a_stop, + ext_b_start=ext_b_start, + ext_b_stop=ext_b_stop, + ext_reverse=ext_reverse, + alignment_mode=alignment_mode.value, ) if upsert: diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index 9391cbcd..9254d50b 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -273,6 +273,12 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) else: @@ -291,6 +297,12 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -372,6 +384,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) else: @@ -390,6 +408,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -422,6 +446,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -440,6 +470,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) save_edge_to_db( @@ -456,6 +492,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) save_edge_to_db( @@ -472,6 +514,12 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) diff --git a/test/db/test_partial.py b/test/db/test_partial.py index 1819e583..7cd5d052 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -48,7 +48,28 @@ def add_mock_hsp_alignment_hsp(hsp: HSP) -> None: def gen_mock_edge_list( edge_gbks: list[GBK], -) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: +) -> list[ + tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + bs_enums.ALIGNMENT_MODE, + ] +]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ -70,6 +91,12 @@ def gen_mock_edge_list( 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) diff --git a/test/network/test_network.py b/test/network/test_network.py index 06c05a11..7f3e1f3a 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -28,7 +28,28 @@ def create_mock_gbk(i) -> bs_gbk.GBK: def gen_mock_edge_list( edge_gbks: list[bs_gbk.GBK], -) -> list[tuple[int, int, float, float, float, float, str, int, int, int, int, bool]]: +) -> list[ + tuple[ + int, + int, + float, + float, + float, + float, + str, + int, + int, + int, + int, + bool, + int, + int, + int, + int, + bool, + bs_enums.ALIGNMENT_MODE, + ] +]: edges = [] for gbk_a, gbk_b in combinations(edge_gbks, 2): if gbk_a.region is None or gbk_b.region is None: @@ 
-50,6 +71,12 @@ def gen_mock_edge_list( 0, 0, False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) From 9f3508157b8a309c2225dd72195f518a47ffbf77 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Thu, 2 Nov 2023 16:22:46 +0100 Subject: [PATCH 10/15] highlight compared region --- .../output/html_content/js/bigscape.js | 48 +++++++++++++------ big_scape/output/legacy_output.py | 23 +++++---- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/big_scape/output/html_template/output/html_content/js/bigscape.js b/big_scape/output/html_template/output/html_content/js/bigscape.js index df501359..bd0c1997 100644 --- a/big_scape/output/html_template/output/html_content/js/bigscape.js +++ b/big_scape/output/html_template/output/html_content/js/bigscape.js @@ -21,9 +21,29 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n var bs_to_cl = []; var bs_svg = []; for (var i in bs_data) { - var svg = $(Arrower.drawClusterSVG(bs_data[i])); + var cl_data = bs_data[i]; + var ext_start = cl_data.record_start ? cl_data.record_start - 1 : 0; + var ext_stop = cl_data.record_stop ? cl_data.record_stop - 1 : cl_data.orfs.length; + var svg = $(Arrower.drawClusterSVG(cl_data)); svg.css("clear", "both"); svg.addClass("arrower-svg"); + // opacity for domains/orfs outside of compared region + if ((ext_start !== 0) || (ext_stop !== cl_data.orfs.length)) { + var outside_ext_bounds = false + svg.find("polygon").each(function () { + if ((outside_ext_bounds) && ($(this).attr("class") === "arrower-domain")) { + $(this).css("opacity", "0.4") + } else if ( + ($(this).attr("class") === "arrower-orf") && + (($(this).attr("idx") < ext_start) || ($(this).attr("idx") > ext_stop)) + ) { + $(this).css("opacity", "0.4") + outside_ext_bounds = true + } else { + outside_ext_bounds = false + } + }) + } bs_svg.push(svg); } // for search optimization @@ -322,19 +342,19 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n var ui = Viva.Graph.svg('circle') .attr('r', 10) .attr('fill', (fam_colors[bs_to_cl[node.id]])); - if (run_data["mode"] == "Cluster") { - if (bs_data[node.id]["source"] == ("mibig" || "reference")) { - ui.attr("stroke", "blue"); - ui.attr("stroke-width", "4px"); - } + if (run_data["mode"] == "Cluster") { + if (bs_data[node.id]["source"] == ("mibig" || "reference")) { + ui.attr("stroke", "blue"); + ui.attr("stroke-width", "4px"); } - if (run_data["mode"] == "Query") { - if (bs_data[node.id]["source"] == "query") { - // ui.attr("r", "20") - ui.attr("stroke", "green"); - ui.attr("stroke-width", "6px"); - } + } + if (run_data["mode"] == "Query") { + if (bs_data[node.id]["source"] == "query") { + // ui.attr("r", "20") + ui.attr("stroke", "green"); + ui.attr("stroke-width", "6px"); } + } $(ui).hover(function () { // mouse over var temp_highlight = []; for (var i in highlighted_nodes) { @@ -426,8 +446,8 @@ function Bigscape(run_data, bs_data, bs_families, bs_alignment, bs_similarity, n graphics.link(function (link) { let line = Viva.Graph.svg('line') - .attr("stroke", "#777") - .attr("stroke-width", link["data"]["weight"] * 10); + .attr("stroke", "#777") + .attr("stroke-width", link["data"]["weight"] * 10); if (graph.getNode(link.fromId).data.id === graph.getNode(link.toId).data.id) { line = line.attr("stroke-dasharray", "10,10") diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index a5569e18..6c7395ad 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -576,6 +576,8 @@ 
def generate_bs_data_js( "id": str, (e.g. AL645882.2.cluster010), "mibig": bool, "source": str, (e.g. mibig, reference, or query), + "record_start": int, (e.g. cds boundaries of protocluster, index starts at 1) + "record_stop": int, "orfs": [ { "domains": [ @@ -612,6 +614,9 @@ def generate_bs_data_js( if record.parent_gbk is None: raise AttributeError("Record parent GBK is not set!") + rec_cds = record.get_cds() + rec_start = rec_cds[0].orf_num + rec_stop = rec_cds[-1].orf_num gbk = record.parent_gbk organism = "Unknown" if "organism" in gbk.metadata: @@ -625,6 +630,8 @@ def generate_bs_data_js( "id": gbk.path.name, "mibig": gbk.source_type == SOURCE_TYPE.MIBIG, "source": gbk.source_type.name.lower(), + "record_start": rec_start, + "record_stop": rec_stop, "orfs": generate_bs_data_js_orfs(gbk), } ) @@ -714,18 +721,10 @@ def fetch_lcs_from_db(a_id: int, b_id: int, weights: str) -> dict[str, Any]: dist_table = DB.metadata.tables["distance"] select_query = ( dist_table.select() - # .add_columns( - # dist_table.c.region_a_id, - # dist_table.c.region_b_id, - # dist_table.c.lcs_a_start, - # dist_table.c.lcs_a_stop, - # dist_table.c.lcs_b_start, - # dist_table.c.lcs_reverse, - # ) - .where(dist_table.c.region_a_id.in_((a_id, b_id))).where( - dist_table.c.region_b_id.in_((a_id, b_id)) - ) - # .where(dist_table.c.weights == weights) + .where(dist_table.c.region_a_id != dist_table.c.region_b_id) + .where(dist_table.c.region_a_id.in_((a_id, b_id))) + .where(dist_table.c.region_b_id.in_((a_id, b_id))) + .where(dist_table.c.weights == weights) .compile() ) result = DB.execute(select_query).fetchone() From 40b559ed64bf58f2e20dd0708be66024daa07298 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 10:20:29 +0100 Subject: [PATCH 11/15] add todo region type lcs adjust --- big_scape/output/legacy_output.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 6c7395ad..9de4f0b4 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -765,7 +765,6 @@ def adjust_lcs_to_all_genes( a_stop = result["lcs_a_stop"] b_start = result["lcs_b_start"] reverse = result["lcs_reverse"] - length = abs(a_start - a_stop) # seed length a_start = domain_genes_to_all_genes[family_db_id][a_start] if length == 0: @@ -783,7 +782,6 @@ def adjust_lcs_to_all_genes( a_stop = result["lcs_b_stop"] b_start = result["lcs_a_start"] reverse = result["lcs_reverse"] - length = abs(a_start - a_stop) # seed length if length == 0: pass @@ -796,6 +794,11 @@ def adjust_lcs_to_all_genes( a_start = domain_genes_to_all_genes[family_db_id][a_start] b_start = domain_genes_to_all_genes[bgc_db_id][b_start] + # TODO: adjust lcs bounds for protocluster/protocore mode once lcs is implemented + # a_start, b_start = adjust_lcs_record_bounds( + # a_start, b_start, fam_rec, bgc_rec, reverse + # ) + if length == 0: length = 1 # let's try aligning using the genes with most domains From 9a8af288ed23829569a2a11898eb3dbe12437c03 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 10:20:53 +0100 Subject: [PATCH 12/15] test fix --- test/comparison/test_binning.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index fd5352e5..f9b8a8b0 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -656,6 +656,17 @@ def test_cull_singletons_cutoff(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, + 
0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -668,6 +679,17 @@ def test_cull_singletons_cutoff(self): 0.0, 0.0, "mix", + 0, + 0, + 0, + 0, + False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) @@ -680,6 +702,17 @@ def test_cull_singletons_cutoff(self): 1.0, 1.0, "mix", + 0, + 0, + 0, + 0, + False, + 0, + 0, + 0, + 0, + False, + bs_enums.ALIGNMENT_MODE.GLOBAL, ) ) From 700c2ba8cef351b082d3d98cebc21114f57a08d8 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 14:11:38 +0100 Subject: [PATCH 13/15] Simplify lcs extend db storage and fetching --- big_scape/comparison/comparable_region.py | 2 +- big_scape/comparison/legacy_workflow_alt.py | 135 +++++++++++++++----- big_scape/comparison/utility.py | 16 +-- big_scape/data/schema.sql | 3 +- big_scape/network/network.py | 47 +++++-- big_scape/output/legacy_output.py | 4 +- big_scape/trees/newick_tree.py | 10 +- 7 files changed, 157 insertions(+), 60 deletions(-) diff --git a/big_scape/comparison/comparable_region.py b/big_scape/comparison/comparable_region.py index ac311d78..70e2c1e8 100644 --- a/big_scape/comparison/comparable_region.py +++ b/big_scape/comparison/comparable_region.py @@ -36,6 +36,7 @@ class ComparableRegion: domain_lists: Optional[tuple[list[HSP], list[HSP]]] domain_sets: Optional[tuple[set[HSP], set[HSP]]] domain_dicts: Optional[tuple[dict[HSP, list[int]], dict[HSP, list[int]]]] + alignment_mode: ALIGNMENT_MODE """ def __init__( @@ -67,7 +68,6 @@ def __init__( self.lcs_b_start = b_start self.lcs_a_stop = a_stop self.lcs_b_stop = b_stop - self.lcs_reverse = reverse self.alignment_mode: ALIGNMENT_MODE = ALIGNMENT_MODE.GLOBAL def get_domain_sets( diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 7d9f1356..53f49113 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -162,29 +162,7 @@ def generate_edges( if len(results) != len(task_batch): raise ValueError("Mismatch between task length and result length") - # for idx, pair in enumerate(task_batch): - # distance, jaccard, adjacency, dss = results[idx] - for pair, distance, jaccard, adjacency, dss in results: - yield ( - pair.region_a._db_id, - pair.region_b._db_id, - distance, - jaccard, - adjacency, - dss, - pair_generator.weights, - pair.comparable_region.lcs_a_start, - pair.comparable_region.lcs_a_stop, - pair.comparable_region.lcs_b_start, - pair.comparable_region.lcs_b_stop, - pair.comparable_region.lcs_reverse, - pair.comparable_region.a_start, - pair.comparable_region.a_stop, - pair.comparable_region.b_start, - pair.comparable_region.b_stop, - pair.comparable_region.reverse, - pair.comparable_region.alignment_mode.value, - ) + yield from results done_pairs += len(results) if callback: @@ -214,7 +192,6 @@ def do_lcs_pair( pair.comparable_region.lcs_a_stop = a_stop pair.comparable_region.lcs_b_start = b_start pair.comparable_region.lcs_b_stop = b_stop - pair.comparable_region.lcs_reverse = reverse pair.comparable_region.a_start = a_start pair.comparable_region.a_stop = a_stop pair.comparable_region.b_start = b_start @@ -252,7 +229,26 @@ def expand_pair(pair: RecordPair) -> float: def calculate_scores_pair( data: tuple[list[RecordPair], bs_enums.ALIGNMENT_MODE, str] -) -> list[tuple[RecordPair, float, float, float, float]]: # pragma no cover +) -> list[ + tuple[ + Optional[int], + Optional[int], + float, + float, + float, + float, + int, + int, + int, + int, + int, + int, + int, + int, + bool, + str, + ] +]: # 
pragma no cover """Calculate the scores for a list of pairs Args: @@ -260,8 +256,9 @@ def calculate_scores_pair( label Returns: - list[tuple[float, float, float, float]]: list of scores for each pair in the - order as the input data list + list[tuple[int, int, float, float, float, float, int, int, int, int, int, int, + int, int, bool, str,]]: list of scores for each pair in the + order as the input data list, including lcs and extension coordinates """ pairs, alignment_mode, weights_label = data @@ -269,13 +266,51 @@ def calculate_scores_pair( for pair in pairs: if pair.region_a.parent_gbk == pair.region_b.parent_gbk: - results.append((pair, 0.0, 1.0, 1.0, 1.0)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + 0.0, + 1.0, + 1.0, + 1.0, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) continue jaccard = calc_jaccard_pair(pair) if jaccard == 0.0: - results.append((pair, 1.0, 0.0, 0.0, 0.0)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + 1.0, + 0.0, + 0.0, + 0.0, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) continue # in the form [bool, Pair]. true bools means they need expansion, false they don't @@ -285,7 +320,26 @@ def calculate_scores_pair( jaccard = expand_pair(pair) if jaccard == 0.0: - results.append((pair, 1.0, 0.0, 0.0, 0.0)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + 1.0, + 0.0, + 0.0, + 0.0, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) continue if weights_label not in LEGACY_WEIGHTS: @@ -303,6 +357,25 @@ def calculate_scores_pair( similarity = jaccard * jc_weight + adjacency * ai_weight + dss * dss_weight distance = 1 - similarity - results.append((pair, distance, jaccard, adjacency, dss)) + results.append( + ( + pair.region_a._db_id, + pair.region_b._db_id, + distance, + jaccard, + adjacency, + dss, + pair.comparable_region.lcs_a_start, + pair.comparable_region.lcs_a_stop, + pair.comparable_region.lcs_b_start, + pair.comparable_region.lcs_b_stop, + pair.comparable_region.a_start, + pair.comparable_region.a_stop, + pair.comparable_region.b_start, + pair.comparable_region.b_stop, + pair.comparable_region.reverse, + pair.comparable_region.alignment_mode.value, + ) + ) return results diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index df807099..b1788a7a 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -26,7 +26,6 @@ def save_edge_to_db( int, int, int, - bool, int, int, int, @@ -39,8 +38,10 @@ def save_edge_to_db( """Save edge to the database Args: - edge (tuple[int, int, float, 
float, float, float]): edge tuple containing - region_a_id, region_b_id, distance, jaccard, adjacency, dss + edge (tuple[int, int, float, float, float, float, str, int, int, int, int, int, + int, int, int, bool, ALIGNMENT_MODE,]): edge tuple containing + region_a_id, region_b_id, distance, jaccard, adjacency, dss, weights, + lcs start/stop, extension start/stop, reverse, alignment_mode upsert (bool, optional): whether to upsert the edge into the database. """ @@ -56,12 +57,11 @@ def save_edge_to_db( lcs_a_stop, lcs_b_start, lcs_b_stop, - lcs_reverse, ext_a_start, ext_a_stop, ext_b_start, ext_b_stop, - ext_reverse, + reverse, alignment_mode, ) = edge @@ -85,12 +85,11 @@ def save_edge_to_db( lcs_a_stop=lcs_a_stop, lcs_b_start=lcs_b_start, lcs_b_stop=lcs_b_stop, - lcs_reverse=lcs_reverse, ext_a_start=ext_a_start, ext_a_stop=ext_a_stop, ext_b_start=ext_b_start, ext_b_stop=ext_b_stop, - ext_reverse=ext_reverse, + reverse=reverse, alignment_mode=alignment_mode.value, ) @@ -114,7 +113,6 @@ def save_edges_to_db( int, int, int, - bool, int, int, int, @@ -147,7 +145,7 @@ def save_edges_to_db( # create a query # TODO: this should not need ignore. it's there now because protoclusters somehow # trigger an integrityerror - query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + query = "INSERT OR IGNORE INTO distance VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" cursor.executemany(query, edges) diff --git a/big_scape/data/schema.sql b/big_scape/data/schema.sql index ce49ac2a..e7906562 100644 --- a/big_scape/data/schema.sql +++ b/big_scape/data/schema.sql @@ -79,12 +79,11 @@ CREATE TABLE IF NOT EXISTS distance ( lcs_a_stop INTEGER NOT NULL, lcs_b_start INTEGER NOT NULL, lcs_b_stop INTEGER NOT NULL, - lcs_reverse BOOLEAN NOT NULL, ext_a_start INTEGER NOT NULL, ext_a_stop INTEGER NOT NULL, ext_b_start INTEGER NOT NULL, ext_b_stop INTEGER NOT NULL, - ext_reverse BOOLEAN NOT NULL, + reverse BOOLEAN NOT NULL, alignment_mode TEXT NOT NULL, UNIQUE(region_a_id, region_b_id, weights) FOREIGN KEY(region_a_id) REFERENCES bgc_record(id) diff --git a/big_scape/network/network.py b/big_scape/network/network.py index f1f69b83..dd26f5fe 100644 --- a/big_scape/network/network.py +++ b/big_scape/network/network.py @@ -3,7 +3,7 @@ # from dependencies from typing import Optional, Generator, cast -from sqlalchemy import tuple_ +from sqlalchemy import tuple_, select # from other modules from big_scape.data import DB @@ -82,13 +82,20 @@ def get_edge( if not DB.metadata: raise RuntimeError("DB.metadata is None") - + distance_table = DB.metadata.tables["distance"] select_statment = ( - DB.metadata.tables["distance"] - .select() - .where(DB.metadata.tables["distance"].c.region_a_id.notin_(exclude_nodes)) - .where(DB.metadata.tables["distance"].c.region_b_id.notin_(exclude_nodes)) - .where(DB.metadata.tables["distance"].c.distance < cutoff) + select( + distance_table.c.region_a_id, + distance_table.c.region_b_id, + distance_table.c.distance, + distance_table.c.jaccard, + distance_table.c.adjacency, + distance_table.c.dss, + distance_table.c.weights, + ) + .where(distance_table.c.region_a_id.notin_(exclude_nodes)) + .where(distance_table.c.region_b_id.notin_(exclude_nodes)) + .where(distance_table.c.distance < cutoff) ) edge = DB.execute(select_statment).fetchone() @@ -96,7 +103,7 @@ def get_edge( if edge is None: return None - return cast(tuple[int, int, float, float, float, float, str], edge[0:7]) + return cast(tuple[int, int, float, float, float, float, str], edge) 
def get_edges( @@ -113,7 +120,15 @@ def get_edges( distance_table = DB.metadata.tables["distance"] select_statement = ( - distance_table.select() + select( + distance_table.c.region_a_id, + distance_table.c.region_b_id, + distance_table.c.distance, + distance_table.c.jaccard, + distance_table.c.adjacency, + distance_table.c.dss, + distance_table.c.weights, + ) # equivalent to WHERE (region_a_id in (...) OR region_b_id in (...)) .where( distance_table.c.region_a_id.in_(include_nodes) @@ -123,7 +138,7 @@ def get_edges( .where(distance_table.c.distance < distance_cutoff) ).compile() - edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] + edges = DB.execute(select_statement).fetchall() return cast(list[tuple[int, int, float, float, float, float, str]], edges) @@ -144,7 +159,15 @@ def get_connected_edges( distance_table = DB.metadata.tables["distance"] select_statement = ( - distance_table.select() + select( + distance_table.c.region_a_id, + distance_table.c.region_b_id, + distance_table.c.distance, + distance_table.c.jaccard, + distance_table.c.adjacency, + distance_table.c.dss, + distance_table.c.weights, + ) # equivalent to WHERE (region_a_id in (...) OR region_b_id in (...)) .where( distance_table.c.region_a_id.in_(include_nodes) @@ -166,7 +189,7 @@ def get_connected_edges( .where(distance_table.c.distance < distance_cutoff).compile() ) - edges = [edge[0:7] for edge in DB.execute(select_statement).fetchall()] + edges = DB.execute(select_statement).fetchall() return cast(list[tuple[int, int, float, float, float, float, str]], edges) diff --git a/big_scape/output/legacy_output.py b/big_scape/output/legacy_output.py index 9de4f0b4..c6b84a21 100644 --- a/big_scape/output/legacy_output.py +++ b/big_scape/output/legacy_output.py @@ -764,7 +764,7 @@ def adjust_lcs_to_all_genes( a_start = result["lcs_a_start"] a_stop = result["lcs_a_stop"] b_start = result["lcs_b_start"] - reverse = result["lcs_reverse"] + reverse = result["reverse"] length = abs(a_start - a_stop) # seed length a_start = domain_genes_to_all_genes[family_db_id][a_start] if length == 0: @@ -781,7 +781,7 @@ def adjust_lcs_to_all_genes( a_start = result["lcs_b_start"] a_stop = result["lcs_b_stop"] b_start = result["lcs_a_start"] - reverse = result["lcs_reverse"] + reverse = result["reverse"] length = abs(a_start - a_stop) # seed length if length == 0: pass diff --git a/big_scape/trees/newick_tree.py b/big_scape/trees/newick_tree.py index b03f69f3..6a769331 100644 --- a/big_scape/trees/newick_tree.py +++ b/big_scape/trees/newick_tree.py @@ -26,7 +26,7 @@ def generate_newick_tree( family_members (list[int]): list of bgc ids in one family Returns: - Correctly formatted newick tree + str: Correctly formatted newick tree """ # no need for alignment if len(records) < 3: @@ -62,6 +62,9 @@ def process_newick_tree(tree_file: Path) -> str: Args: tree_file (Path): Path to tree file + + Returns: + str: processed newick tree """ if not tree_file.exists(): logging.error("Failed to create newick tree") @@ -85,7 +88,7 @@ def process_newick_tree(tree_file: Path) -> str: def generate_gcf_alignment( records: list[BGCRecord], exemplar: int, family_members: list[int] ) -> str: - """Write protein domain alignment to file + """Generate protein domain alignment for records in GCF Args: records (list[BGCRecord]): Records within one GCF to align @@ -93,7 +96,8 @@ def generate_gcf_alignment( family_members (list[int]): list of bgc ids in one family Returns: - Written file containing protein domain alignment + str: alignment of GCF based on 
protein domain + TODO: refactor """ record_ids = list(range(len(records))) From 6e08c7013facfee3edf51cbdd0c58d6389fe5c46 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 14:12:16 +0100 Subject: [PATCH 14/15] fix tests after db change --- test/comparison/test_binning.py | 11 ----------- test/db/test_partial.py | 2 -- test/network/test_network.py | 2 -- test/trees/test_alignment.py | 12 ++++++------ 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/test/comparison/test_binning.py b/test/comparison/test_binning.py index f9b8a8b0..f079c13a 100644 --- a/test/comparison/test_binning.py +++ b/test/comparison/test_binning.py @@ -273,7 +273,6 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -297,7 +296,6 @@ def test_ref_to_ref_pair_generator_first_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -384,7 +382,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -408,7 +405,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -446,7 +442,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -470,7 +465,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -492,7 +486,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -514,7 +507,6 @@ def test_ref_to_ref_pair_generator_second_iteration(self): 0, 0, 0, - False, 0, 0, 0, @@ -660,7 +652,6 @@ def test_cull_singletons_cutoff(self): 0, 0, 0, - False, 0, 0, 0, @@ -683,7 +674,6 @@ def test_cull_singletons_cutoff(self): 0, 0, 0, - False, 0, 0, 0, @@ -706,7 +696,6 @@ def test_cull_singletons_cutoff(self): 0, 0, 0, - False, 0, 0, 0, diff --git a/test/db/test_partial.py b/test/db/test_partial.py index 7cd5d052..6e934685 100644 --- a/test/db/test_partial.py +++ b/test/db/test_partial.py @@ -61,7 +61,6 @@ def gen_mock_edge_list( int, int, int, - bool, int, int, int, @@ -90,7 +89,6 @@ def gen_mock_edge_list( 0, 0, 0, - False, 0, 0, 0, diff --git a/test/network/test_network.py b/test/network/test_network.py index 7f3e1f3a..ee09ecbd 100644 --- a/test/network/test_network.py +++ b/test/network/test_network.py @@ -41,7 +41,6 @@ def gen_mock_edge_list( int, int, int, - bool, int, int, int, @@ -70,7 +69,6 @@ def gen_mock_edge_list( 0, 0, 0, - False, 0, 0, 0, diff --git a/test/trees/test_alignment.py b/test/trees/test_alignment.py index cce4647b..ff75aab9 100644 --- a/test/trees/test_alignment.py +++ b/test/trees/test_alignment.py @@ -64,7 +64,7 @@ def test_lcs_adjust_ex2mem(self): "lcs_a_stop": 7, "lcs_b_start": 6, "lcs_b_stop": 9, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {4: 5, 7: 9}, 1: {6: 8}} dom_count = {} @@ -86,7 +86,7 @@ def test_lcs_adjust_ex2mem_reverse(self): "lcs_a_stop": 3, "lcs_b_start": 1, "lcs_b_stop": 2, - "lcs_reverse": True, + "reverse": True, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} @@ -108,7 +108,7 @@ def test_lcs_adjust_mem2ex(self): "lcs_a_stop": 3, "lcs_b_start": 1, "lcs_b_stop": 2, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} @@ -130,7 +130,7 @@ def test_lcs_adjust_mem2ex_reverse(self): "lcs_a_stop": 3, "lcs_b_start": 1, "lcs_b_stop": 2, - "lcs_reverse": True, + "reverse": True, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 1, 1], 1: [1, 1, 1]} @@ -153,7 +153,7 
@@ def test_lcs_adjust_zero_length_not_reversed(self): "lcs_a_stop": 0, "lcs_b_start": 0, "lcs_b_stop": 0, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} @@ -187,7 +187,7 @@ def test_lcs_adjust_zero_length_reversed(self): "lcs_a_stop": 0, "lcs_b_start": 0, "lcs_b_stop": 0, - "lcs_reverse": False, + "reverse": False, } dom_to_gene = {0: {0: 0, 1: 2, 2: 4}, 1: {0: 1, 1: 2, 2: 3}} dom_count = {0: [1, 2, 1], 1: [1, 1, 2]} From 8dfeb5fd4196af45271164b8a0b8c8b7e33fc9d8 Mon Sep 17 00:00:00 2001 From: nlouwen Date: Fri, 3 Nov 2023 14:49:52 +0100 Subject: [PATCH 15/15] minor docs and typing --- big_scape/comparison/legacy_workflow_alt.py | 5 +++++ big_scape/comparison/utility.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/big_scape/comparison/legacy_workflow_alt.py b/big_scape/comparison/legacy_workflow_alt.py index 53f49113..fb104f8d 100644 --- a/big_scape/comparison/legacy_workflow_alt.py +++ b/big_scape/comparison/legacy_workflow_alt.py @@ -237,6 +237,7 @@ def calculate_scores_pair( float, float, float, + str, int, int, int, @@ -274,6 +275,7 @@ def calculate_scores_pair( 1.0, 1.0, 1.0, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, @@ -299,6 +301,7 @@ def calculate_scores_pair( 0.0, 0.0, 0.0, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, @@ -328,6 +331,7 @@ def calculate_scores_pair( 0.0, 0.0, 0.0, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, @@ -365,6 +369,7 @@ def calculate_scores_pair( jaccard, adjacency, dss, + weights_label, pair.comparable_region.lcs_a_start, pair.comparable_region.lcs_a_stop, pair.comparable_region.lcs_b_start, diff --git a/big_scape/comparison/utility.py b/big_scape/comparison/utility.py index b1788a7a..2d049a0c 100644 --- a/big_scape/comparison/utility.py +++ b/big_scape/comparison/utility.py @@ -118,14 +118,15 @@ def save_edges_to_db( int, int, bool, - ALIGNMENT_MODE, + str, ] ] ) -> None: """Save many edges to the database Args: - edges (list[tuple[]]): list of edges to save + edges (list[tuple[int, int, float, float, float, float, str, int, int, int, int, + int, int, int, int, bool, str]]): list of edges to save """ # save the comparison data to the database # using raw sqlite for this because sqlalchemy is not fast enough
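
Note on the final edge layout (an editorial sketch, not part of the patches above): after PATCH 13 through 15 the distance table and the edge tuples share a single 17-column layout of (region_a_id, region_b_id, distance, jaccard, adjacency, dss, weights, lcs_a_start, lcs_a_stop, lcs_b_start, lcs_b_stop, ext_a_start, ext_a_stop, ext_b_start, ext_b_stop, reverse, alignment_mode). The snippet below illustrates how one mock edge in that layout could be handed to save_edges_to_db, assuming a BiG-SCAPE database has already been created and opened the way the test setup does; the ids, scores and coordinates are placeholder values rather than output of the real comparison workflow.

    # minimal sketch, assuming the post-PATCH-15 17-element tuple layout
    # and an already initialized in-memory database (as in the tests)
    import big_scape.enums as bs_enums
    from big_scape.comparison.utility import save_edges_to_db

    mock_edge = (
        1,      # region_a_id (placeholder bgc_record id)
        2,      # region_b_id (placeholder bgc_record id)
        0.25,   # distance
        0.90,   # jaccard
        0.80,   # adjacency
        0.70,   # dss
        "mix",  # weights label, as used in the test suite
        0, 5,   # lcs_a_start, lcs_a_stop
        0, 5,   # lcs_b_start, lcs_b_stop
        0, 10,  # ext_a_start, ext_a_stop (extension bounds)
        0, 10,  # ext_b_start, ext_b_stop
        False,  # reverse
        bs_enums.ALIGNMENT_MODE.GLOBAL.value,  # alignment_mode, stored as TEXT
    )

    # executemany-based INSERT into the 17-column distance table
    save_edges_to_db([mock_edge])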