diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 03f974a..2f4c7f8 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -10,11 +10,13 @@ on:
       - main
       - dev
       - dev_1.0
+      - dev_1.0_weighting
   pull_request:
     branches:
       - main
       - dev
       - dev_1.0
+      - dev_1.0_weighting
   workflow_dispatch:  # make it manually start-able

 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
diff --git a/datasail/cluster/ecfp.py b/datasail/cluster/ecfp.py
index 951679f..42826fb 100644
--- a/datasail/cluster/ecfp.py
+++ b/datasail/cluster/ecfp.py
@@ -19,26 +19,31 @@ def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
     """
     lg = RDLogger.logger()
     lg.setLevel(RDLogger.CRITICAL)
+
     if dataset.type != "M":
         raise ValueError("ECFP with Tanimoto-scores can only be applied to molecular data.")
-    scaffolds = {}
     LOGGER.info("Start ECFP clustering")
     invalid_mols = []
+    scaffolds = {}
     for name in dataset.names:
-        scaffold = read_molecule_encoding(dataset.data[name])
-        if scaffold is None:
+        mol = Chem.MolFromSmiles(dataset.data[name])
+        # scaffold = read_molecule_encoding(dataset.data[name])
+        # if scaffold is None:
+        if mol is None:
             bo, bc = "{", "}"
             LOGGER.warning(f"RDKit cannot parse {name} {bo}{dataset.data[name]}{bc}")
             invalid_mols.append(name)
             continue
-        try:
-            scaffolds[name] = MakeScaffoldGeneric(scaffold)
-        except MolSanitizeException:
-            LOGGER.warning(f"RDKit cannot parse {name} ({dataset.data[name]})")
-            invalid_mols.append(name)
-            continue
+        scaffolds[name] = mol
+        # try:
+        #     scaffolds[name] = MakeScaffoldGeneric(scaffold)
+        # except MolSanitizeException:
+        #     LOGGER.warning(f"RDKit cannot parse {name} ({dataset.data[name]})")
+        #     invalid_mols.append(name)
+        #     continue
+
     for invalid_name in invalid_mols:  # obsolete code?
         dataset.names.remove(invalid_name)
         dataset.data.pop(invalid_name)
@@ -48,15 +53,20 @@ def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
             poppable.append(key)
     for pop in poppable:
         dataset.id_map.pop(pop)
-
+
     fps = []
-    dataset.cluster_names = list(set(Chem.MolToSmiles(s) for s in list(scaffolds.values())))
-    for scaffold in dataset.cluster_names:
-        fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(scaffold), 2, nBits=1024))
+    # dataset.cluster_names = list(set(Chem.MolToSmiles(s) for s in list(scaffolds.values())))
+    # for scaffold in dataset.cluster_names:
+    #     fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(scaffold), 2, nBits=1024))
+    dataset.cluster_names = dataset.names
+    for name in dataset.names:
+        fps.append(AllChem.GetMorganFingerprintAsBitVect(scaffolds[name], 2, nBits=1024))
     LOGGER.info(f"Reduced {len(dataset.names)} molecules to {len(dataset.cluster_names)}")
     LOGGER.info("Compute Tanimoto Coefficients")
     run(dataset, fps, method)
-    dataset.cluster_map = dict((name, Chem.MolToSmiles(scaffolds[name])) for name in dataset.names)
+    dataset.cluster_map = {name: name for name in dataset.names}
+    dataset.cluster_weights = {name: 1 for name in dataset.names}
+
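The ecfp.py hunks replace the MakeScaffoldGeneric scaffold clustering with plain per-molecule Morgan fingerprints: every parseable SMILES becomes its own cluster with weight 1, and unparseable entries are dropped. A minimal standalone sketch of the new path (toy SMILES and hypothetical variable names, not DataSAIL's API):

import from rdkit is required; the snippet mirrors the hunk above on three made-up molecules.

from rdkit import Chem
from rdkit.Chem import AllChem

data = {"m1": "CCO", "m2": "c1ccccc1O", "m3": "not-a-smiles"}
mols = {name: Chem.MolFromSmiles(smi) for name, smi in data.items()}
valid = {name: mol for name, mol in mols.items() if mol is not None}  # "m3" is dropped
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in valid.values()]
cluster_map = {name: name for name in valid}    # identity mapping: one cluster per molecule
cluster_weights = {name: 1 for name in valid}   # uniform weights, consumed by the solver hunks below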
diff --git a/datasail/cluster/vectors.py b/datasail/cluster/vectors.py
index e9bb764..6fe0516 100644
--- a/datasail/cluster/vectors.py
+++ b/datasail/cluster/vectors.py
@@ -8,9 +8,10 @@ from datasail.reader.utils import DataSet
 from datasail.settings import LOGGER

+# TODO: Exclude all those that do not automatically scale between 0 and 1
 SIM_OPTIONS = Literal[
     "allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "mcconnaughey", "onbit", "rogotgoldberg",
-    "russel", "sokal"
+    "russel", "sokal", "tanimoto"
 ]

 # produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
@@ -18,7 +19,7 @@
 # matching == hamming, manhattan == cityblock (unofficial)
 DIST_OPTIONS = Literal[
     "canberra", "chebyshev", "cityblock", "euclidean", "hamming", "jaccard", "mahalanobis", "manhattan", "matching",
-    "minkowski", "sqeuclidean", "tanimoto"
+    "minkowski", "sqeuclidean"
 ]

@@ -52,6 +53,8 @@ def get_rdkit_fct(method: SIM_OPTIONS):
         return DataStructs.BulkRogotGoldbergSimilarity
     if method == "russel":
         return DataStructs.BulkRusselSimilarity
+    if method == "tanimoto":
+        return DataStructs.BulkTanimotoSimilarity
     if method == "sokal":
         return DataStructs.BulkSokalSimilarity
     raise ValueError(f"Unknown method {method}")
@@ -182,7 +185,8 @@ def run(
         method: The similarity measure to use.
     """
     if method in get_args(SIM_OPTIONS):
-        dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method))
+        # dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method))
+        dataset.cluster_similarity = rdkit_sim(fps, method)
     elif method in get_args(DIST_OPTIONS):
         if method == "mahalanobis" and len(fps) <= len(fps[0]):
             raise ValueError(
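Moving "tanimoto" from DIST_OPTIONS into SIM_OPTIONS goes together with dropping scale_min_max in run(): RDKit's bulk Tanimoto scores already lie in [0, 1], so no rescaling is needed (the TODO above flags the remaining metrics that do not). A quick self-contained check with toy SMILES:

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=1024)
       for smi in ("CCO", "CCN", "c1ccccc1")]
for i, fp in enumerate(fps):
    sims = DataStructs.BulkTanimotoSimilarity(fp, fps)
    assert all(0.0 <= s <= 1.0 for s in sims)        # already a bounded similarity
    print(i, [round(s, 3) for s in sims])            # self-similarity entries are 1.0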
""" if method in get_args(SIM_OPTIONS): - dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method)) + # dataset.cluster_similarity = scale_min_max(rdkit_sim(fps, method)) + dataset.cluster_similarity = rdkit_sim(fps, method) elif method in get_args(DIST_OPTIONS): if method == "mahalanobis" and len(fps) <= len(fps[0]): raise ValueError( diff --git a/datasail/solver/cluster_1d.py b/datasail/solver/cluster_1d.py index f2bd2f4..3f06bba 100644 --- a/datasail/solver/cluster_1d.py +++ b/datasail/solver/cluster_1d.py @@ -6,6 +6,7 @@ import numpy as np from datasail.solver.utils import solve, cluster_y_constraints, compute_limits, stratification_constraints +from experiments.ablation import david def solve_c1( @@ -47,7 +48,7 @@ def solve_c1( min_lim = compute_limits(epsilon, sum(weights), splits) x = cvxpy.Variable((len(splits), len(clusters)), boolean=True) # 19 - y = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(clusters))] # 20 + # y = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(clusters))] # 20 constraints = [cvxpy.sum(x, axis=0) == np.ones((len(clusters)))] # 16 @@ -57,14 +58,35 @@ def solve_c1( if s_matrix is not None: constraints.append(stratification_constraints(s_matrix, splits, delta, x)) - constraints += cluster_y_constraints(clusters, y, x, splits) # 18 + # constraints += cluster_y_constraints(clusters, y, x, splits) # 18 + + intra_weights = similarities if similarities is not None else np.max(distances) - distances + # tmp = [[intra_weights[e1, e2] * y[e1][e2] for e2 in range(e1)] for e1 in range(len(clusters))] # 15 + + # Because of different weights tmp != len(clusters) * (len(clusters) - 1) / 2 + tmp = [[weights[e1] * weights[e2] * intra_weights[e1, e2] * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(len(splits))])) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))] # 15 - intra_weights = similarities if similarities is not None else distances - tmp = [[intra_weights[e1, e2] * y[e1][e2] for e2 in range(e1)] for e1 in range(len(clusters))] # 15 loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list]) - if distances is not None: - loss = -loss + # if distances is not None: + # loss = -loss + # loss += cvxpy.sum([cvxpy.sum([y[e1][e2] for e2 in range(e1)]) for e1 in range(len(clusters))]) # 14 problem = solve(loss, constraints, max_sec, solver, log_file) + # print("============= Evaluation =============") + # y_mat = np.full((len(clusters), len(clusters)), 0) + # w_mat = np.full((len(clusters), len(clusters)), 0) + # for e1 in range(len(clusters)): + # w_mat[e1, e1] = weights[e1] ** 2 + # for e2 in range(e1): + # y_mat[e1, e2] = np.max([x[s, e1].value - x[s, e2].value for s in range(len(splits))]) + # w_mat[e1, e2] = weights[e1] * weights[e2] + # y_mat[e2, e1] = y_mat[e1, e2] + # w_mat[e2, e1] = w_mat[e1, e2] + # print(problem.value) + # weights = np.array(weights).reshape(-1, 1) + # print(david.eval(np.array([ + # [1 if x[0, i].value > 0.1 else -1] for i in range(len(clusters)) + # ]), similarities, weights @ weights.T)) # , y_mat=y_mat, w_mat=w_mat)) + # print("======================================") return None if problem is None else { e: names[s] for s in range(len(splits)) for i, e in enumerate(clusters) if x[s, i].value > 0.1 diff --git a/datasail/solver/cluster_2d.py b/datasail/solver/cluster_2d.py index 1c4bad1..418651f 100644 --- a/datasail/solver/cluster_2d.py +++ b/datasail/solver/cluster_2d.py @@ -12,10 +12,12 @@ def solve_c2( e_s_matrix: Optional[np.ndarray], 
diff --git a/datasail/solver/cluster_2d.py b/datasail/solver/cluster_2d.py
index 1c4bad1..418651f 100644
--- a/datasail/solver/cluster_2d.py
+++ b/datasail/solver/cluster_2d.py
@@ -12,10 +12,12 @@ def solve_c2(
         e_s_matrix: Optional[np.ndarray],
         e_similarities: Optional[np.ndarray],
         e_distances: Optional[np.ndarray],
+        e_weights: Optional[np.ndarray],
         f_clusters: List[Union[str, int]],
         f_s_matrix: Optional[np.ndarray],
         f_similarities: Optional[np.ndarray],
         f_distances: Optional[np.ndarray],
+        f_weights: Optional[np.ndarray],
         inter: np.ndarray,
         delta: float,
         epsilon: float,
@@ -35,10 +37,12 @@ def solve_c2(
         e_s_matrix: Stratification for the e-dataset
         e_similarities: Pairwise similarity matrix of clusters in the order of their names
         e_distances: Pairwise distance matrix of clusters in the order of their names
+        e_weights: Weights of the clusters in the order of their names in e_clusters
         f_clusters: List of cluster names to split from the f-dataset
         f_s_matrix: Stratification for the f-dataset
         f_similarities: Pairwise similarity matrix of clusters in the order of their names
         f_distances: Pairwise distance matrix of clusters in the order of their names
+        f_weights: Weights of the clusters in the order of their names in f_clusters
         inter: Matrix storing the amount of interactions between the entities in the e-clusters and f-clusters
         delta: Additive bound for stratification imbalance
         epsilon: Additive bound for exceeding the requested split size
@@ -58,14 +62,16 @@ def solve_c2(
     x_f = cvxpy.Variable((len(splits), len(f_clusters)), boolean=True)
     x_i = {(e, f): cvxpy.Variable(len(splits), boolean=True) for e in range(len(e_clusters))
            for f in range(len(f_clusters)) if inter[e, f] != 0}
-    y_e = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(e_clusters))]
-    y_f = [[cvxpy.Variable(1, boolean=True) for _ in range(f)] for f in range(len(f_clusters))]
+    # y_e = [[cvxpy.Variable(1, boolean=True) for _ in range(e)] for e in range(len(e_clusters))]
+    # y_f = [[cvxpy.Variable(1, boolean=True) for _ in range(f)] for f in range(len(f_clusters))]

     # check if the cluster relations are uniform
-    e_intra_weights = e_similarities if e_similarities is not None else e_distances
-    f_intra_weights = f_similarities if f_similarities is not None else f_distances
-    e_uniform = e_intra_weights is None or np.allclose(e_intra_weights, np.ones_like(e_intra_weights))
-    f_uniform = f_intra_weights is None or np.allclose(f_intra_weights, np.ones_like(f_intra_weights))
+    e_intra_weights = e_similarities if e_similarities is not None else 1 - e_distances
+    f_intra_weights = f_similarities if f_similarities is not None else 1 - f_distances
+    e_uniform = e_intra_weights is None or np.allclose(e_intra_weights, np.ones_like(e_intra_weights)) or \
+        np.allclose(e_intra_weights, np.zeros_like(e_intra_weights))
+    f_uniform = f_intra_weights is None or np.allclose(f_intra_weights, np.ones_like(f_intra_weights)) or \
+        np.allclose(f_intra_weights, np.zeros_like(f_intra_weights))

     def index(x, y):
         return (x, y) if (x, y) in x_i else None
@@ -83,12 +89,12 @@ def index(x, y):
     interaction_contraints(e_clusters, f_clusters, x_i, constraints, splits, x_e, x_f, min_lim,
                            lambda key: inter[key], index)

-    constraints += cluster_y_constraints(e_clusters, y_e, x_e, splits) + \
-        cluster_y_constraints(f_clusters, y_f, x_f, splits)
+    # constraints += cluster_y_constraints(e_clusters, y_e, x_e, splits) + \
+    #     cluster_y_constraints(f_clusters, y_f, x_f, splits)

     # inter_loss = (np.sum(inter) - sum(cvxpy.sum(x) for x in x_i.values())) / np.sum(inter)
-    e_loss = leakage_loss(e_uniform, e_intra_weights, y_e, e_clusters, e_similarities)
-    f_loss = leakage_loss(f_uniform, f_intra_weights, y_f, f_clusters, f_similarities)
+    e_loss = leakage_loss(e_uniform, e_intra_weights, x_e, e_clusters, e_weights, len(splits))
+    f_loss = leakage_loss(f_uniform, f_intra_weights, x_f, f_clusters, f_weights, len(splits))

     problem = solve(e_loss + f_loss, constraints, max_sec, solver, log_file)
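Note the widened uniformity test above: intra weights are now derived as 1 - distances, so a constant distance matrix of ones collapses to all zeros, and a constant weight matrix (all ones or all zeros) gives every pair the same contribution, leaving nothing for the leakage term to discriminate between assignments. A small numpy illustration of why all-zeros now also short-circuits the loss:

import numpy as np

e_distances = np.ones((3, 3))        # maximally distant clusters
e_intra_weights = 1 - e_distances    # -> all zeros
e_uniform = np.allclose(e_intra_weights, np.ones_like(e_intra_weights)) or \
    np.allclose(e_intra_weights, np.zeros_like(e_intra_weights))
print(e_uniform)                     # True -> leakage_loss returns 0 for this dataset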
diff --git a/datasail/solver/solve.py b/datasail/solver/solve.py
index 65dd203..beb7437 100644
--- a/datasail/solver/solve.py
+++ b/datasail/solver/solve.py
@@ -196,11 +196,13 @@ def run_solver(
                            for c in e_dataset.cluster_names]) if e_dataset.cluster_stratification is not None else None,
                 e_similarities=e_dataset.cluster_similarity,
                 e_distances=e_dataset.cluster_distance,
+                e_weights=np.array([e_dataset.cluster_weights.get(c, 1) for c in e_dataset.cluster_names]),
                 f_clusters=f_dataset.cluster_names,
                 f_s_matrix=np.stack([f_dataset.cluster_stratification.get(c, np.zeros(len(dataset.classes)))
                                      for c in f_dataset.cluster_names]) if f_dataset.cluster_stratification is not None else None,
                 f_similarities=f_dataset.cluster_similarity,
                 f_distances=f_dataset.cluster_distance,
+                f_weights=np.array([f_dataset.cluster_weights.get(c, 1) for c in f_dataset.cluster_names]),
                 inter=cluster_inter,
                 delta=delta,
                 epsilon=epsilon,
diff --git a/datasail/solver/utils.py b/datasail/solver/utils.py
index 1249813..32918f5 100644
--- a/datasail/solver/utils.py
+++ b/datasail/solver/utils.py
@@ -390,9 +390,10 @@ def collect_results_2d(
 def leakage_loss(
         uniform: bool,
         intra_weights,
-        y,
+        x,
         clusters,
-        similarities
+        weights,
+        num_splits: int,
 ):
     """
     Compute the leakage loss for the cluster-based double-cold splitting.
@@ -400,9 +401,10 @@ def leakage_loss(
     Args:
         uniform: Boolean flag if the cluster metric is uniform
         intra_weights: Weights of the intra-cluster edges
-        y: Helper variables
+        x: Variables of the optimization problem
         clusters: List of cluster names
-        similarities: Pairwise similarity matrix of clusters in the order of their names
+        weights: Weights of the clusters
+        num_splits: Number of splits

     Returns:
         Loss describing the leakage between clusters
@@ -410,8 +412,8 @@ def leakage_loss(
     if uniform:
         return 0
     else:
-        if similarities is None:
-            intra_weights = 1 - intra_weights
-        tmp = [intra_weights[c1, c2] * y[c1][c2] for c1 in range(len(clusters)) for c2 in range(c1)]
-        e_loss = cvxpy.sum(tmp)
-        return e_loss
+        # tmp = [intra_weights[c1, c2] * y[c1][c2] for c1 in range(len(clusters)) for c2 in range(c1)]
+        # e_loss = cvxpy.sum(tmp)
+        tmp = [[weights[e1] * weights[e2] * intra_weights[e1, e2] * cvxpy.max(cvxpy.vstack([x[s, e1] - x[s, e2] for s in range(num_splits)])) for e2 in range(e1 + 1, len(clusters))] for e1 in range(len(clusters))]
+        loss = cvxpy.sum([t for tmp_list in tmp for t in tmp_list])
+        return loss
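One detail worth flagging in the solve.py hunk: the weight vectors have to be built with np.array, not np.ndarray, since the np.ndarray constructor interprets a list argument as a shape and returns an uninitialized buffer rather than the weights. A quick demonstration of the difference:

import numpy as np

weights = [3, 3, 3]
print(np.array(weights))          # [3 3 3] -- the intended per-cluster weight vector
print(np.ndarray(weights).shape)  # (3, 3, 3) -- list taken as a *shape*, contents arbitrary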
""" split_w_datasail(full_path / "datasail" / name, name, techniques=["I1e", "C1e"], solver=solver) - split_w_deepchem(full_path / "deepchem" / name, name, techniques=SPLITTERS.keys()) - split_w_lohi(full_path / "lohi" / name, name) + # split_w_deepchem(full_path / "deepchem" / name, name, techniques=SPLITTERS.keys()) + # split_w_lohi(full_path / "lohi" / name, name) def specific(): @@ -182,9 +182,9 @@ def specific(): if __name__ == '__main__': if len(sys.argv) == 1: specific() - if len(sys.argv) == 2: + elif len(sys.argv) == 2: split_all(Path(sys.argv[1])) - elif len(sys.argv) >= 3: + elif len(sys.argv) == 3: split(Path(sys.argv[1]), sys.argv[2]) elif len(sys.argv) >= 4: split(Path(sys.argv[1]), sys.argv[2], sys.argv[3]) diff --git a/experiments/MPP/visualize.py b/experiments/MPP/visualize.py index 8202e21..bec0459 100644 --- a/experiments/MPP/visualize.py +++ b/experiments/MPP/visualize.py @@ -26,26 +26,41 @@ def compute_il(name, tools, techniques): data=dict(df[["ID", "SMILES"]].values.tolist()), id_map={x: x for x in df["ID"].tolist()}, ) + # print(len(dataset.names)) dataset.cluster_names, dataset.cluster_map, dataset.cluster_similarity, dataset.cluster_weights = run_ecfp( dataset ) + # print(dataset.cluster_similarity.shape) output = {} + if "deepchem" in tools: + print("Use v0.3") + root = Path("/") / "scratch" / "SCRATCH_SAS" / "roman" / "DataSAIL" / "v03" / "MPP" + else: + print("Use v1.0") + root = Path("/") / "home" / "rjo21" / "Desktop" / "DataSAIL" / "v1.0_test" for tool in tools: if tool not in output: output[tool] = {} for technique in techniques: + # print(technique) if technique not in TECHNIQUES[tool]: continue if technique not in output[tool]: output[tool][technique] = [] for run in range(RUNS): - base = Path("/") / "scratch" / "SCRATCH_SAS" / "roman" / "DataSAIL" / "v03" / "MPP" / tool / name / technique / f"split_{run}" + base = root / tool / name / technique / f"split_{run}" train_ids = pd.read_csv(base / "train.csv")["ID"] test_ids = pd.read_csv(base / "test.csv")["ID"] - df["assi"] = df["ID"].apply(lambda x: 1 if x in train_ids.values else -1 if x in test_ids.values else None) + df["assi"] = df["ID"].apply(lambda x: 1 if x in train_ids.values else -1 if x in test_ids.values else 0) df.dropna(subset=["assi"], inplace=True) - il = david.eval(df["assi"].to_numpy().reshape(-1, 1), dataset.cluster_similarity) + il = david.eval( + df["assi"].to_numpy().reshape(-1, 1), + dataset.cluster_similarity, + [dataset.cluster_weights[c] for c in dataset.cluster_names], + ) output[tool][technique].append(il) + # print(output) + # break return output @@ -184,12 +199,15 @@ def comp_all_il(): except Exception as e: print(f"Failed for {name}") print(e) - with open("il.pkl", "wb") as f: + with open(f"il.pkl", "wb") as f: pickle.dump(output, f) if __name__ == '__main__': - comp_all_il() + # print("ESOL :", compute_il("esol", ["datasail"], ["I1e", "C1e"])) + print("FreeSolv:", compute_il("freesolv", ["datasail"], ["I1e", "C1e"])) + print("FreeSolv:", compute_il("freesolv", ["deepchem", "lohi"], TECHNIQUES["deepchem"] + TECHNIQUES["lohi"])) + # comp_all_il() # compute_il("esol", ["datasail"], ["I1e", "C1e"]) # plot_double(Path(sys.argv[1]), ["QM8", "Tox21"]) # heatmap_plot(Path(sys.argv[1])) diff --git a/experiments/ablation/david.py b/experiments/ablation/david.py index 667385a..db02e53 100644 --- a/experiments/ablation/david.py +++ b/experiments/ablation/david.py @@ -61,7 +61,8 @@ def run_ecfp(dataset): invalid_mols = [] molecules = {} for name in dataset.names: - mol = 
diff --git a/experiments/ablation/david.py b/experiments/ablation/david.py
index 667385a..db02e53 100644
--- a/experiments/ablation/david.py
+++ b/experiments/ablation/david.py
@@ -61,7 +61,8 @@ def run_ecfp(dataset):
     invalid_mols = []
     molecules = {}
     for name in dataset.names:
-        mol = read_molecule_encoding(dataset.data[name])
+        mol = Chem.MolFromSmiles(dataset.data[name])
+        # mol = read_molecule_encoding(dataset.data[name])
         if mol is None:
             invalid_mols.append(name)
             continue
@@ -89,13 +90,8 @@ def run_ecfp(dataset):
         sim_matrix[i, :i] = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
         sim_matrix[:i, i] = sim_matrix[i, :i]

-    cluster_map = dict((name, Chem.MolToSmiles(molecules[name])) for name in dataset.names)
-
-    cluster_weights = {}
-    for key, value in cluster_map.items():
-        if value not in cluster_weights:
-            cluster_weights[value] = 0
-        cluster_weights[value] += 1
+    cluster_map = {name: name for name in dataset.names}
+    cluster_weights = {name: 1 for name in dataset.names}

     return dataset.names, cluster_map, sim_matrix, cluster_weights

@@ -179,10 +175,14 @@ def time_overhead(full_path):
     return times


-def eval(assignments, similarity):
+def eval(assignments, similarity, weights=None):
+    if weights is None:
+        weights = np.ones_like(similarity)
     mask = assignments @ assignments.T
     mask[mask == -1] = 0
-    return 1 - np.sum(similarity * mask) / np.sum(similarity)
+    mask = (1 - mask) * (1 - np.eye(mask.shape[0], dtype=np.int64))
+    leak = (np.sum(similarity * weights * mask) / np.sum(similarity * weights)) / 2
+    return leak


 def visualize(full_path: Path, clusters: List[int], solvers, ax: Optional[Tuple] = None, fig=None):
diff --git a/experiments/utils.py b/experiments/utils.py
index 0923ff8..48d4c3a 100644
--- a/experiments/utils.py
+++ b/experiments/utils.py
@@ -118,8 +118,8 @@ def save_datasail_splits(base: Path, df: pd.DataFrame, key: str, techniques: Lis
         inter_splits: Interactions splits
     """
     for name, tech in techniques:
-        # for run in range(RUNS):
-        for run in range(1):
+        for run in range(RUNS):
+            # for run in range(1):
             path = base / name / f"split_{run}"
             path.mkdir(parents=True, exist_ok=True)
diff --git a/tests/test_basic_ilp.py b/tests/test_basic_ilp.py
index 8a71cd7..3861f7d 100644
--- a/tests/test_basic_ilp.py
+++ b/tests/test_basic_ilp.py
@@ -79,6 +79,7 @@ def test_ccd():
         ]),
         e_distances=None,
         e_s_matrix=None,
+        e_weights=np.array([3, 3, 3]),
         f_clusters=["P1", "P2", "P3"],
         f_similarities=np.asarray([
             [5, 5, 0],
@@ -87,6 +88,7 @@ def test_ccd():
             [5, 5, 0],
             [0, 0, 5],
         ]),
         f_distances=None,
         f_s_matrix=None,
+        f_weights=np.array([3, 3, 3]),
         inter=np.asarray([
             [9, 9, 0],
             [9, 9, 0],
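A worked toy run of the reworked eval(): the mask now marks exactly the off-diagonal pairs split across train and test, and the leakage is the weighted similarity mass on those pairs over the total weighted similarity, halved because the symmetric mask counts each pair twice. The function is copied from the hunk above; the numbers are made up:

import numpy as np

def eval(assignments, similarity, weights=None):  # as in david.py (shadows the builtin)
    if weights is None:
        weights = np.ones_like(similarity)
    mask = assignments @ assignments.T                               # +1 same side, -1 across
    mask[mask == -1] = 0
    mask = (1 - mask) * (1 - np.eye(mask.shape[0], dtype=np.int64))  # 1 only on separated off-diagonal pairs
    return (np.sum(similarity * weights * mask) / np.sum(similarity * weights)) / 2

assignments = np.array([[1], [1], [-1]])   # clusters 0 and 1 in train, cluster 2 in test
similarity = np.array([[1.0, 0.9, 0.1],
                       [0.9, 1.0, 0.2],
                       [0.1, 0.2, 1.0]])
print(eval(assignments, similarity))       # 0.6 / 5.4 / 2 ~= 0.056: little mass crosses the split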