Hotfix/benchmark figs #206

Merged
5 commits merged on Nov 8, 2024
2 changes: 1 addition & 1 deletion big_scape/benchmark.py
@@ -35,7 +35,7 @@ def run_bigscape_benchmark(run: dict):
logging.info("Generating summary output")
# output summary of all cutoffs
metadata = OutputGenerator.generate_metadata(run)
outputter = OutputGenerator(run["output_dir"], metadata, run["label"])
outputter = OutputGenerator(run["output_dir"], metadata, run["label"], data.tool)
outputter.plot_per_cutoff(metrics)
outputter.output_summary_per_cutoff(metrics)
logging.info("Benchmark done!")
11 changes: 10 additions & 1 deletion big_scape/benchmarking/benchmark_data_loader.py
@@ -60,7 +60,7 @@ def read_gcf_long_tsv(self, infile: TextIO) -> dict[str, str]:
else:
bgc = f"{clean_name}_{parts[1]}_{parts[2]}"

data[bgc] = parts[3]
data[bgc] = parts[4].replace("FAM_", "")
return data

def load_curated_labels(self) -> None:
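The long-format GCF table this parser reads gained a connected-component column, and the family label now sits in the fifth column with a FAM_ prefix (see the updated test data further down). A small illustration of the new indexing, assuming a tab-separated row; the example row is made up:

```python
# Hypothetical row following the new header
# "GBK  Record_Type  Record_Number  CC  Family" shown in the test data below.
line = "CM000578.1.cluster047.gbk\tprotocluster\t1\t1\tFAM_00072"
parts = line.split("\t")

# Column 5 now holds the family label; stripping the "FAM_" prefix leaves the
# zero-padded number that the benchmark tests expect.
family = parts[4].replace("FAM_", "")
print(family)  # 00072
```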
@@ -106,6 +106,8 @@ def load_computed_bs2_labels(self, data_path: Path) -> None:
data_path (Path): Path pointing to output files of BS2 output directory
"""
logging.info("Loading computed GCFs from BiG-SCAPE 2 output")
self.tool = "BiG-SCAPE 2"

run_times = [
p.stem.replace("_full", "") for p in data_path.glob("*_full.network")
]
@@ -148,6 +150,8 @@ def load_computed_bs1_labels(self, data_path: Path) -> None:
FileNotFoundError: Missing BS1 results in given output directory
"""
logging.info("Loading computed GCFs from BiG-SCAPE 1 output")
self.tool = "BiG-SCAPE 1"

runs = list(data_path.glob("*"))
if len(runs) == 0:
raise FileNotFoundError("No BiG-SCAPE 1 output found")
@@ -197,6 +201,11 @@ def load_computed_bslice_labels(self, db_path: Path) -> None:
)
threshs = {thresh: run_id for thresh, run_id in thresh_data}

if max(threshs.keys()) > 1.2:
self.tool = "BiG-SLiCE 1"
else:
self.tool = "BiG-SLiCE 2"

# collect bgc and their family assignment per threshold
cursor_results = cur.execute(
"SELECT bgc.orig_filename, gcf_membership.gcf_id, clustering.threshold "
9 changes: 6 additions & 3 deletions big_scape/benchmarking/benchmark_output.py
@@ -24,10 +24,13 @@ class OutputGenerator:
run_name (str): Dataset name and starttime added to each filename
"""

def __init__(self, output_dir: Path, metadata: str, name: str) -> None:
def __init__(
self, output_dir: Path, metadata: str, name: str, tool: Optional[str] = None
) -> None:
self.output_dir = output_dir
self.metadata = metadata
self.name = name
self.tool = tool

def initialize_output_dir(self) -> None:
"""Set up output directory"""
@@ -163,7 +166,7 @@ def plot_per_cutoff(self, metrics: dict[str, dict[str, Any]]) -> None:
Args:
metrics: data dictionary storing all metrics per used cutoff
"""
cutoffs = sorted(metrics.keys())
cutoffs = sorted(metrics.keys(), key=float)
homogeneity = [metrics[cut]["homogeneity"] for cut in cutoffs]
completeness = [metrics[cut]["completeness"] for cut in cutoffs]
v_measure = [metrics[cut]["v_measure"] for cut in cutoffs]
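Cutoffs arrive here as dictionary keys, i.e. strings, so the previous sorted(metrics.keys()) ordered them lexicographically. For cutoffs between 0 and 1 that often happens to work, but for larger threshold values (such as BiG-SLiCE distances) the x-axis came out scrambled; key=float sorts numerically. A quick illustration with made-up values:

```python
cutoffs = ["300.0", "900.0", "1200.0"]

print(sorted(cutoffs))             # ['1200.0', '300.0', '900.0']  (lexicographic)
print(sorted(cutoffs, key=float))  # ['300.0', '900.0', '1200.0']  (numeric)
```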
@@ -206,7 +209,7 @@ def plot_per_cutoff(self, metrics: dict[str, dict[str, Any]]) -> None:
)
ax.text(0, -0.4, self.metadata, transform=ax.transAxes)
plt.title("External cluster evaluation metrics per used cutoff")
plt.xlabel("BiG-SCAPE family cutoff")
plt.xlabel(f"{self.tool} family cutoff")
plt.ylabel("Score")
plots = h + c + v + wl + ml
ax.legend(plots, [p.get_label() for p in plots], loc=0)
4 changes: 2 additions & 2 deletions big_scape/cli/cluster_cli.py
@@ -28,7 +28,7 @@
@click.option(
"--classify",
type=click.Choice(["none", "class", "category", "legacy"]),
default="class",
default="category",
callback=validate_classify,
help=(
"""Define which method BiG-SCAPE should use to separate BGC records into
@@ -49,7 +49,7 @@
higher. For older antiSMASH versions, either use --classify 'legacy' or do not
select --legacy_weights, which will perform the weighted distance calculations
based on the generic 'mix' weights. For more detail, see wiki.
(default: class)"""
(default: category)"""
),
)
@click.option(
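A minimal standalone click sketch (not the project's full option set) of how the new default resolves when --classify is omitted on the command line:

```python
import click


@click.command()
@click.option(
    "--classify",
    type=click.Choice(["none", "class", "category", "legacy"]),
    default="category",
    help="Method used to separate BGC records into groups before comparison.",
)
def cluster(classify: str) -> None:
    # With no --classify on the command line, click now fills in "category".
    click.echo(f"classify = {classify}")


if __name__ == "__main__":
    cluster()
```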
4 changes: 2 additions & 2 deletions big_scape/data/sqlite.py
@@ -477,9 +477,9 @@ def check_config_hash():
run_table = DB.metadata.tables["run"]
latest_config = DB.execute(
select(run_table.c.config_hash).order_by(desc(run_table.c.id)).limit(1)
).scalar_one()
).scalar_one_or_none()

if BigscapeConfig.HASH != latest_config:
if latest_config and BigscapeConfig.HASH != latest_config:
raise RuntimeError(
"Config file values have changed from the previous run! "
"Existing data is not guarenteed to be reusable, please "
2 changes: 1 addition & 1 deletion big_scape/file_input/load_files.py
@@ -367,7 +367,7 @@ def load_gbks(run: dict, bigscape_dir: Path) -> list[GBK]:
# if we end up here, we are in some halfway state and need to load in the new data
logging.info("Adding new data to the database...")
missing_gbks = bs_data.get_missing_gbks(input_gbks)
logging.info("Found %d new gbks to process", len(missing_gbks))
logging.info("Found %d new GBKs to process", len(missing_gbks))

for gbk in missing_gbks:
gbk.save_all()
16 changes: 8 additions & 8 deletions test/benchmark/test_data_loading.py
@@ -87,16 +87,16 @@ def test_bs2_computed_gcf_loading(self):

expected_data = {
"0.5": {
"CM000578.1.cluster047_protocluster_1": "72",
"CM000578.1.cluster038_protocluster_1": "77",
"CM000578.1.cluster038_protocluster_2": "179",
"CM000578.1.cluster033": "167",
"CM000578.1.cluster047_protocluster_1": "00072",
"CM000578.1.cluster038_protocluster_1": "00077",
"CM000578.1.cluster038_protocluster_2": "00179",
"CM000578.1.cluster033": "00167",
},
"0.7": {
"CM000578.1.cluster038_protocluster_1": "6",
"CM000578.1.cluster038_protocluster_2": "6",
"CM000578.1.cluster042_protocluster_1_2": "10",
"CM000578.1.cluster033": "62",
"CM000578.1.cluster038_protocluster_1": "00006",
"CM000578.1.cluster038_protocluster_2": "00006",
"CM000578.1.cluster042_protocluster_1_2": "00010",
"CM000578.1.cluster033": "00062",
},
}

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
GBK Record_Type Record_Number GCF_number
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 72
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 77
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 179
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 167
GBK Record_Type Record_Number CC Family
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 1 FAM_00072
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 1 FAM_00077
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 1 FAM_00179
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 1 FAM_00167
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
GBK Record_Type Record_Number GCF_number
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 72
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 77
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 179
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 167
GBK Record_Type Record_Number CC Family
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_16/CM000578.1.cluster047.gbk protocluster 1 1 FAM_00072
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 1 FAM_00077
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 1 FAM_00179
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 1 FAM_00167
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
GBK Record_Type Record_Number GCF_number
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 6
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 6
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_19/CM000578.1.cluster042.gbk protocluster 1_2 10
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 62
GBK Record_Type Record_Number CC Family
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 1 1 FAM_00006
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_108/CM000578.1.cluster038.gbk protocluster 2 1 FAM_00006
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_19/CM000578.1.cluster042.gbk protocluster 1_2 1 FAM_00010
../data/benchmark_antismash_files/extracted/Fusarium/Fusarium_75/CM000578.1.cluster033.gbk region 33 1 FAM_00062
10 changes: 5 additions & 5 deletions test/test_data/curated_gcfs/valid_protocluster_gcfs.tsv
@@ -1,5 +1,5 @@
BGC_filename record_type record_number GCF_number product(s)
AGCF01000001.1.cluster077 protocluster 1 ectoine ['ectoine']
CM001149.1.cluster003 protocluster 1 ectoine ['ectoine']
CM001149.1.cluster001 protocluster 1 FAS ['fatty_acid']
CM002177.1.cluster025 region 1 FAS ['fatty_acid']
BGC_filename record_type record_number product(s) GCF_number
AGCF01000001.1.cluster077 protocluster 1 ['ectoine'] ectoine
CM001149.1.cluster003 protocluster 1 ['ectoine'] ectoine
CM001149.1.cluster001 protocluster 1 ['fatty_acid'] FAS
CM002177.1.cluster025 region 1 ['fatty_acid'] FAS