From 418198911af178b7a1ac2ff5d8906379327a238e Mon Sep 17 00:00:00 2001
From: nlouwen
Date: Thu, 7 Nov 2024 15:01:26 +0100
Subject: [PATCH 1/5] parse correct family, add tool to axis label

---
 big_scape/benchmark.py                          |  2 +-
 big_scape/benchmarking/benchmark_data_loader.py | 11 ++++++++++-
 big_scape/benchmarking/benchmark_output.py      |  9 ++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/big_scape/benchmark.py b/big_scape/benchmark.py
index 1068adc1..2338b9ba 100644
--- a/big_scape/benchmark.py
+++ b/big_scape/benchmark.py
@@ -35,7 +35,7 @@ def run_bigscape_benchmark(run: dict):
     logging.info("Generating summary output")
     # output summary of all cutoffs
     metadata = OutputGenerator.generate_metadata(run)
-    outputter = OutputGenerator(run["output_dir"], metadata, run["label"])
+    outputter = OutputGenerator(run["output_dir"], metadata, run["label"], data.tool)
     outputter.plot_per_cutoff(metrics)
     outputter.output_summary_per_cutoff(metrics)
     logging.info("Benchmark done!")
diff --git a/big_scape/benchmarking/benchmark_data_loader.py b/big_scape/benchmarking/benchmark_data_loader.py
index 144b0a92..efd18624 100644
--- a/big_scape/benchmarking/benchmark_data_loader.py
+++ b/big_scape/benchmarking/benchmark_data_loader.py
@@ -60,7 +60,7 @@ def read_gcf_long_tsv(self, infile: TextIO) -> dict[str, str]:
             else:
                 bgc = f"{clean_name}_{parts[1]}_{parts[2]}"
 
-            data[bgc] = parts[3]
+            data[bgc] = parts[4].replace("FAM_", "")
         return data
 
     def load_curated_labels(self) -> None:
@@ -106,6 +106,8 @@ def load_computed_bs2_labels(self, data_path: Path) -> None:
             data_path (Path): Path pointing to output files of BS2 output directory
         """
         logging.info("Loading computed GCFs from BiG-SCAPE 2 output")
+        self.tool = "BiG-SCAPE 2"
+
         run_times = [
             p.stem.replace("_full", "") for p in data_path.glob("*_full.network")
         ]
@@ -148,6 +150,8 @@ def load_computed_bs1_labels(self, data_path: Path) -> None:
             FileNotFoundError: Missing BS1 results in given output directory
         """
         logging.info("Loading computed GCFs from BiG-SCAPE 1 output")
+        self.tool = "BiG-SCAPE 1"
+
         runs = list(data_path.glob("*"))
         if len(runs) == 0:
             raise FileNotFoundError("No BiG-SCAPE 1 output found")
@@ -197,6 +201,11 @@ def load_computed_bslice_labels(self, db_path: Path) -> None:
         )
         threshs = {thresh: run_id for thresh, run_id in thresh_data}
 
+        if max(threshs.keys()) > 1.2:
+            self.tool = "BiG-SLiCE 1"
+        else:
+            self.tool = "BiG-SLiCE 2"
+
         # collect bgc and their family assignment per threshold
         cursor_results = cur.execute(
             "SELECT bgc.orig_filename, gcf_membership.gcf_id, clustering.threshold "
diff --git a/big_scape/benchmarking/benchmark_output.py b/big_scape/benchmarking/benchmark_output.py
index cb176776..81e17003 100644
--- a/big_scape/benchmarking/benchmark_output.py
+++ b/big_scape/benchmarking/benchmark_output.py
@@ -24,10 +24,13 @@ class OutputGenerator:
         run_name (str): Dataset name and starttime added to each filename
     """
 
-    def __init__(self, output_dir: Path, metadata: str, name: str) -> None:
+    def __init__(
+        self, output_dir: Path, metadata: str, name: str, tool: Optional[str] = None
+    ) -> None:
         self.output_dir = output_dir
         self.metadata = metadata
         self.name = name
+        self.tool = tool
 
     def initialize_output_dir(self) -> None:
         """Set up output directory"""
@@ -163,7 +166,7 @@ def plot_per_cutoff(self, metrics: dict[str, dict[str, Any]]) -> None:
         Args:
             metrics: data dictionary storing all metrics per used cutoff
         """
-        cutoffs = sorted(metrics.keys())
+        cutoffs = sorted(metrics.keys(), key=float)
         homogeneity = [metrics[cut]["homogeneity"] for cut in cutoffs]
         completeness = [metrics[cut]["completeness"] for cut in cutoffs]
         v_measure = [metrics[cut]["v_measure"] for cut in cutoffs]
@@ -206,7 +209,7 @@ def plot_per_cutoff(self, metrics: dict[str, dict[str, Any]]) -> None:
         )
         ax.text(0, -0.4, self.metadata, transform=ax.transAxes)
         plt.title("External cluster evaluation metrics per used cutoff")
-        plt.xlabel("BiG-SCAPE family cutoff")
+        plt.xlabel(f"{self.tool} family cutoff")
         plt.ylabel("Score")
         plots = h + c + v + wl + ml
         ax.legend(plots, [p.get_label() for p in plots], loc=0)

From 4b082f56245e746db9e6963b019e1a1f8a3d151a Mon Sep 17 00:00:00 2001
From: nlouwen
Date: Thu, 7 Nov 2024 15:02:03 +0100
Subject: [PATCH 2/5] only stop run if config hash is present

---
 big_scape/data/sqlite.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py
index 53d6c6ce..d477b7d8 100644
--- a/big_scape/data/sqlite.py
+++ b/big_scape/data/sqlite.py
@@ -477,9 +477,9 @@ def check_config_hash():
         run_table = DB.metadata.tables["run"]
         latest_config = DB.execute(
             select(run_table.c.config_hash).order_by(desc(run_table.c.id)).limit(1)
-        ).scalar_one()
+        ).scalar_one_or_none()
 
-        if BigscapeConfig.HASH != latest_config:
+        if latest_config and BigscapeConfig.HASH != latest_config:
             raise RuntimeError(
                 "Config file values have changed from the previous run! "
                 "Existing data is not guarenteed to be reusable, please "

From 2d0c5bbd6232be6daf8acdd29dc8fcb362fef128 Mon Sep 17 00:00:00 2001
From: nlouwen
Date: Thu, 7 Nov 2024 15:05:29 +0100
Subject: [PATCH 3/5] make classify category default

---
 big_scape/cli/cluster_cli.py       | 4 ++--
 big_scape/file_input/load_files.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/big_scape/cli/cluster_cli.py b/big_scape/cli/cluster_cli.py
index fa80ec20..31060088 100644
--- a/big_scape/cli/cluster_cli.py
+++ b/big_scape/cli/cluster_cli.py
@@ -28,7 +28,7 @@
 @click.option(
     "--classify",
     type=click.Choice(["none", "class", "category", "legacy"]),
-    default="class",
+    default="category",
     callback=validate_classify,
     help=(
         """Define which method BiG-SCAPE should use to separate BGC records into
@@ -49,7 +49,7 @@
     higher. For older antiSMASH versions, either use --classify 'legacy' or
     do not select --legacy_weights, which will perform the weighted distance
     calculations based on the generic 'mix' weights. For more detail, see wiki.
-(default: class)"""
+(default: category)"""
     ),
 )
 @click.option(
diff --git a/big_scape/file_input/load_files.py b/big_scape/file_input/load_files.py
index 7cd0f7e6..c9313cd4 100644
--- a/big_scape/file_input/load_files.py
+++ b/big_scape/file_input/load_files.py
@@ -367,7 +367,7 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
     # if we end up here, we are in some halfway state and need to load in the new data
     logging.info("Adding new data to the database...")
     missing_gbks = bs_data.get_missing_gbks(input_gbks)
-    logging.info("Found %d new gbks to process", len(missing_gbks))
+    logging.info("Found %d new GBKs to process", len(missing_gbks))
 
     for gbk in missing_gbks:
         gbk.save_all()

From 14cde19dbe25ed45857d40d51dd9cd6616d3771b Mon Sep 17 00:00:00 2001
From: nlouwen
Date: Thu, 7 Nov 2024 15:52:41 +0100
Subject: [PATCH 4/5] update outdated test data

---
 test/benchmark/test_data_loading.py          | 16 ++++++++--------
 .../mix/mix_clustering_c0.5.tsv              | 10 +++++-----
 .../mix/mix_clustering_c0.5.tsv              | 10 +++++-----
 .../mix/mix_clustering_c0.7.tsv              | 10 +++++-----
 .../curated_gcfs/valid_protocluster_gcfs.tsv | 10 +++++-----
 5 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/test/benchmark/test_data_loading.py b/test/benchmark/test_data_loading.py
index e9313d47..f2546a01 100644
--- a/test/benchmark/test_data_loading.py
+++ b/test/benchmark/test_data_loading.py
@@ -87,16 +87,16 @@ def test_bs2_computed_gcf_loading(self):
 
         expected_data = {
             "0.5": {
-                "CM000578.1.cluster047_protocluster_1": "72",
-                "CM000578.1.cluster038_protocluster_1": "77",
-                "CM000578.1.cluster038_protocluster_2": "179",
-                "CM000578.1.cluster033": "167",
+                "CM000578.1.cluster047_protocluster_1": "00072",
+                "CM000578.1.cluster038_protocluster_1": "00077",
+                "CM000578.1.cluster038_protocluster_2": "00179",
+                "CM000578.1.cluster033": "00167",
             },
             "0.7": {
-                "CM000578.1.cluster038_protocluster_1": "6",
-                "CM000578.1.cluster038_protocluster_2": "6",
-                "CM000578.1.cluster042_protocluster_1_2": "10",
-                "CM000578.1.cluster033": "62",
+                "CM000578.1.cluster038_protocluster_1": "00006",
+                "CM000578.1.cluster038_protocluster_2": "00006",
+                "CM000578.1.cluster042_protocluster_1_2": "00010",
+                "CM000578.1.cluster033": "00062",
             },
         }
 
diff --git a/test/test_data/bs2_output/01-12-2023_12-01-01_c0.3/mix/mix_clustering_c0.5.tsv b/test/test_data/bs2_output/01-12-2023_12-01-01_c0.3/mix/mix_clustering_c0.5.tsv
index f4347474..77bc9300 100644
--- a/test/test_data/bs2_output/01-12-2023_12-01-01_c0.3/mix/mix_clustering_c0.5.tsv
+++ b/test/test_data/bs2_output/01-12-2023_12-01-01_c0.3/mix/mix_clustering_c0.5.tsv
@@ -1,5 +1,5 @@
-GBK Record_Type Record_Number GCF_number
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 72
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 77
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 179
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 167
+GBK Record_Type Record_Number CC Family
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 1 FAM_00072
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 1 FAM_00077
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 1 FAM_00179
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 1 FAM_00167
diff --git a/test/test_data/bs2_output/07-12-2023_12-15-35_c0.5/mix/mix_clustering_c0.5.tsv b/test/test_data/bs2_output/07-12-2023_12-15-35_c0.5/mix/mix_clustering_c0.5.tsv
index f4347474..77bc9300 100644
--- a/test/test_data/bs2_output/07-12-2023_12-15-35_c0.5/mix/mix_clustering_c0.5.tsv
+++ b/test/test_data/bs2_output/07-12-2023_12-15-35_c0.5/mix/mix_clustering_c0.5.tsv
@@ -1,5 +1,5 @@
-GBK Record_Type Record_Number GCF_number
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 72
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 77
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 179
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 167
+GBK Record_Type Record_Number CC Family
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 1 FAM_00072
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 1 FAM_00077
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 1 FAM_00179
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 1 FAM_00167
diff --git a/test/test_data/bs2_output/07-12-2023_12-15-35_c0.7/mix/mix_clustering_c0.7.tsv b/test/test_data/bs2_output/07-12-2023_12-15-35_c0.7/mix/mix_clustering_c0.7.tsv
index 613c3a2f..10aa2811 100644
--- a/test/test_data/bs2_output/07-12-2023_12-15-35_c0.7/mix/mix_clustering_c0.7.tsv
+++ b/test/test_data/bs2_output/07-12-2023_12-15-35_c0.7/mix/mix_clustering_c0.7.tsv
@@ -1,5 +1,5 @@
-GBK Record_Type Record_Number GCF_number
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 6
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 6
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_19/CM000578.1.cluster042.gbk protocluster 1_2 10
-../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 62
+GBK Record_Type Record_Number CC Family
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 1 FAM_00006
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 1 FAM_00006
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_19/CM000578.1.cluster042.gbk protocluster 1_2 1 FAM_00010
+../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 1 FAM_00062
diff --git a/test/test_data/curated_gcfs/valid_protocluster_gcfs.tsv b/test/test_data/curated_gcfs/valid_protocluster_gcfs.tsv
index 40d5f31e..c9ea6ee1 100644
--- a/test/test_data/curated_gcfs/valid_protocluster_gcfs.tsv
+++ b/test/test_data/curated_gcfs/valid_protocluster_gcfs.tsv
@@ -1,5 +1,5 @@
-BGC_filename record_type record_number GCF_number product(s)
-AGCF01000001.1.cluster077 protocluster 1 ectoine ['ectoine']
-CM001149.1.cluster003 protocluster 1 ectoine ['ectoine']
-CM001149.1.cluster001 protocluster 1 FAS ['fatty_acid']
-CM002177.1.cluster025 region 1 FAS ['fatty_acid']
+BGC_filename record_type record_number product(s) GCF_number
+AGCF01000001.1.cluster077 protocluster 1 ['ectoine'] ectoine
+CM001149.1.cluster003 protocluster 1 ['ectoine'] ectoine
+CM001149.1.cluster001 protocluster 1 ['fatty_acid'] FAS
+CM002177.1.cluster025 region 1 ['fatty_acid'] FAS

From b38cff770dbb067ee2bf3a7f68175b30ca2ed5f1 Mon Sep 17 00:00:00 2001
From: nlouwen
Date: Fri, 8 Nov 2024 13:36:09 +0100
Subject: [PATCH 5/5] add BiG-SCAPE version to log

---
 big_scape/run_bigscape.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/big_scape/run_bigscape.py b/big_scape/run_bigscape.py
index 25af26af..c378b92b 100644
--- a/big_scape/run_bigscape.py
+++ b/big_scape/run_bigscape.py
@@ -45,8 +45,9 @@ def run_bigscape(run: dict) -> None:
     command line arguments, loads the data, runs the analysis and saves the output.
     """
     # starting information
+    # TODO: add automatic updating of version number
     logging.info(
-        "Starting BiG-SCAPE %s run on %s level with %s alignment and %s weights",
+        "Starting BiG-SCAPE 2.0.0 %s run on %s level with %s alignment and %s weights",
         run["mode"],
         run["record_type"].value,
         run["alignment_mode"].value,
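
A minimal standalone sketch of the family parsing that PATCH 1/5 switches to, assuming tab-separated columns laid out like the updated long-format clustering TSVs above; read_families and EXAMPLE_TSV are hypothetical names for illustration only, and the real loader builds its BGC key from cleaned record names rather than the raw first column.

    # Illustrative parser for the long-format clustering TSV: the family label
    # now sits in the fifth column as "FAM_00072" and is stored without its
    # "FAM_" prefix, keeping the zero padding.
    import csv
    from io import StringIO

    EXAMPLE_TSV = (
        "GBK\tRecord_Type\tRecord_Number\tCC\tFamily\n"
        "CM000578.1.cluster047.gbk\tprotocluster\t1\t1\tFAM_00072\n"
        "CM000578.1.cluster033.gbk\tregion\t33\t1\tFAM_00167\n"
    )

    def read_families(handle) -> dict[str, str]:
        """Map '<gbk>_<record_type>_<record_number>' to its family id."""
        families: dict[str, str] = {}
        reader = csv.reader(handle, delimiter="\t")
        next(reader)  # skip the header row
        for parts in reader:
            bgc = f"{parts[0]}_{parts[1]}_{parts[2]}"
            families[bgc] = parts[4].replace("FAM_", "")
        return families

    print(read_families(StringIO(EXAMPLE_TSV)))
    # {'CM000578.1.cluster047.gbk_protocluster_1': '00072',
    #  'CM000578.1.cluster033.gbk_region_33': '00167'}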