From 2e8b7e449569c6ebc9c64088e25a0abf545ceed0 Mon Sep 17 00:00:00 2001 From: Roman Joeres Date: Thu, 21 Mar 2024 18:15:56 +0100 Subject: [PATCH] More tests on different molecule formats --- .github/workflows/test.yaml | 2 +- datasail/reader/read_genomes.py | 5 ++-- datasail/reader/read_molecules.py | 16 +++++------ datasail/reader/read_other.py | 5 ++-- datasail/reader/read_proteins.py | 4 +-- datasail/reader/utils.py | 4 +-- tests/data/molecules.csv | 36 ++++++++++++++++++++++++ tests/test_pipeline.py | 46 +++++++++++++++++++++++++------ 8 files changed, 93 insertions(+), 25 deletions(-) create mode 100644 tests/data/molecules.csv diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 02f821e..03f974a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -37,7 +37,7 @@ jobs: - name: Install environment shell: bash -l {0} run: | - mamba install -c conda-forge -c bioconda -y numpy pandas networkx matplotlib pytest setuptools pyscipopt"<4.0.0" foldseek mmseqs2 cd-hit mash tmalign diamond cvxpy pytest-cov rdkit">=2022.09.1" pytest-cases scikit-learn">=1.2,<1.6" pyyaml h5py + mamba install -c conda-forge -c bioconda -y numpy pandas networkx matplotlib pytest setuptools pyscipopt"<4.0.0" foldseek mmseqs2 cd-hit mash tmalign diamond cvxpy pytest-cov rdkit">=2023.09.1" pytest-cases scikit-learn pyyaml h5py pip install grakel - name: Run tests diff --git a/datasail/reader/read_genomes.py b/datasail/reader/read_genomes.py index 9e934b2..88afac4 100644 --- a/datasail/reader/read_genomes.py +++ b/datasail/reader/read_genomes.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import List, Tuple, Optional from datasail.reader.read_molecules import remove_duplicate_values @@ -35,8 +36,8 @@ def read_genome_data( """ dataset = DataSet(type=G_TYPE, location=UNK_LOCATION, format=FORM_FASTA) - def read_dir(ds): - ds.data = dict(read_folder(data)) + def read_dir(ds: DataSet, path: Path) -> None: + ds.data = dict(read_folder(path)) ds.format = FORM_GENOMES read_data_input(data, dataset, read_dir) diff --git a/datasail/reader/read_molecules.py b/datasail/reader/read_molecules.py index a6baff4..10a3f6a 100644 --- a/datasail/reader/read_molecules.py +++ b/datasail/reader/read_molecules.py @@ -14,13 +14,13 @@ mol_reader = { - ".mol": MolFromMolFile, - ".mol2": MolFromMol2File, - ".mrv": MolFromMrvFile, + "mol": MolFromMolFile, + "mol2": MolFromMol2File, + "mrv": MolFromMrvFile, # "sdf": MolFromMol2File, - ".pdb": MolFromPDBFile, - ".tpl": MolFromTPLFile, - ".xyz": MolFromXYZFile, + "pdb": MolFromPDBFile, + "tpl": MolFromTPLFile, + "xyz": MolFromXYZFile, } @@ -54,11 +54,11 @@ def read_molecule_data( """ dataset = DataSet(type=M_TYPE, format=FORM_SMILES, location=UNK_LOCATION) - def read_dir(ds: DataSet, path: Path): + def read_dir(ds: DataSet, path: Path) -> None: ds.data = {} for file in path.iterdir(): if file.suffix[1:].lower() != "sdf" and mol_reader[file.suffix[1:].lower()] is not None: - ds.data[file.stem] = mol_reader[file.suffix[1:].lower()](file) + ds.data[file.stem] = Chem.MolToSmiles(mol_reader[file.suffix[1:].lower()](str(file))) else: ds.data = read_sdf_file(file) diff --git a/datasail/reader/read_other.py b/datasail/reader/read_other.py index c96ecae..69d6c8e 100644 --- a/datasail/reader/read_other.py +++ b/datasail/reader/read_other.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import List, Tuple, Optional from datasail.reader.read_genomes import read_folder @@ -37,8 +38,8 @@ def read_other_data( """ dataset = DataSet(type=O_TYPE, location=UNK_LOCATION, format=FORM_OTHER) - def read_dir(ds): - ds.data = dict(read_folder(data)) + def read_dir(ds: DataSet, path: Path) -> None: + ds.data = dict(read_folder(path)) read_data_input(data, dataset, read_dir) diff --git a/datasail/reader/read_proteins.py b/datasail/reader/read_proteins.py index d73212e..44477db 100644 --- a/datasail/reader/read_proteins.py +++ b/datasail/reader/read_proteins.py @@ -38,8 +38,8 @@ def read_protein_data( """ dataset = DataSet(type=P_TYPE, location=UNK_LOCATION) - def read_dir(ds): - ds.data = dict(read_folder(data, "pdb")) + def read_dir(ds: DataSet, path: Path) -> None: + ds.data = dict(read_folder(path, "pdb")) read_data_input(data, dataset, read_dir) diff --git a/datasail/reader/utils.py b/datasail/reader/utils.py index 465c95a..4eab400 100644 --- a/datasail/reader/utils.py +++ b/datasail/reader/utils.py @@ -358,7 +358,7 @@ def read_folder(folder_path: Path, file_extension: Optional[str] = None) -> Gene yield filename.stem, filename -def read_data_input(data: DATA_INPUT, dataset: DataSet, read_dir: Callable[[DataSet], None]): +def read_data_input(data: DATA_INPUT, dataset: DataSet, read_dir: Callable[[DataSet, Path], None]): """ Read in the data from different sources and store it in the dataset. @@ -386,7 +386,7 @@ def read_data_input(data: DATA_INPUT, dataset: DataSet, read_dir: Callable[[Data else: raise ValueError("Unknown file format. Supported formats are: .fasta, .fna, .fa, tsv, .csv, .pkl, .h5") elif data.is_dir(): - read_dir(dataset) + read_dir(dataset, data) else: raise ValueError("Unknown data input type. Path encodes neither a file nor a directory.") dataset.location = data diff --git a/tests/data/molecules.csv b/tests/data/molecules.csv new file mode 100644 index 0000000..6b7c0d7 --- /dev/null +++ b/tests/data/molecules.csv @@ -0,0 +1,36 @@ +Drug_ID, SMILES +D001, C +D002, N +D003, O +D004, C#C +D005, C#N +D006, C=O +D007, CC +D008, CO +D009, CC#C +D010, CC#N +D011, CC=O +D012, NC=O +D013, CCC +D014, CCO +D015, COC +D016, C1CC1 +D017, C1CO1 +D018, CC(C)=O +D019, CC(N)=O +D020, NC(N)=O +D021, CC(C)C +D022, CC(C)O +D023, C#CC#C +D024, C#CC#N +D025, N#CC#N +D026, O=CC#C +D027, O=CC#N +D028, O=CC=O +D029, CC#CC +D030, CCC#C +D031, CCC#N +D032, NCC#N +D033, OCC#C +D034, OCC#N +D035, CCC=O diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index c52d1cb..b7ba0ee 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -234,31 +234,31 @@ def md_calculator(): @pytest.mark.parametrize("mode", ["CSV", "TSV", "PKL", "H5PY", "SDF"]) def test_input_formats(mode, md_calculator): - base = Path("data") / "pipeline" + base = Path("data") / "pipeline" / "input_forms" drugs = pd.read_csv(base / "drugs.tsv", sep="\t") ddict = {row["Drug_ID"]: row["SMILES"] for index, row in drugs.iterrows()} - (base / "input_forms").mkdir(exist_ok=True, parents=True) + base.mkdir(exist_ok=True, parents=True) if mode == "CSV": - filepath = base / "input_forms" / "drugs.csv" + filepath = base / "drugs.csv" drugs.to_csv(filepath, sep=",", index=False) elif mode == "TSV": - filepath = base / "input_forms" / "drugs.tsv" + filepath = base / "drugs.tsv" drugs.to_csv(filepath, sep="\t", index=False) elif mode == "PKL": data = {} for k, v in ddict.items(): data[k] = AllChem.MolToSmiles(Chem.MolFromSmiles(v)) - filepath = base / "input_forms" / "drugs.pkl" + filepath = base / "drugs.pkl" with open(filepath, "wb") as f: pickle.dump(data, f) elif mode == "H5PY": - filepath = base / "input_forms" / "drugs.h5" + filepath = base / "drugs.h5" with h5py.File(filepath, "w") as f: for k, v in ddict.items(): f[k] = list(md_calculator.CalcDescriptors(Chem.MolFromSmiles(v))) elif mode == "SDF": - filepath = base / "input_forms" / "drugs.sdf" + filepath = base / "drugs.sdf" with Chem.SDWriter(str(filepath)) as w: for k, v in ddict.items(): mol = Chem.MolFromSmiles(v) @@ -269,11 +269,41 @@ def test_input_formats(mode, md_calculator): dataset = read_molecule_data(filepath) - shutil.rmtree(base / "input_forms", ignore_errors=True) + shutil.rmtree(base, ignore_errors=True) assert set(dataset.names) == set(ddict.keys()) +@pytest.mark.parametrize("mode", ["MOL", "MRV", "PDB", "TPL"]) # , "XYZ"]) +def test_molecule_formats(mode): + base = Path("data") / "pipeline" / "input_forms" + base.mkdir(exist_ok=True, parents=True) + mols = {} + with open(Path("data") / "molecules.csv", "r") as f: + for line in f.readlines()[1:]: + k, v = line.strip().split(",") + mols[k] = Chem.MolFromSmiles(v) + + for k, mol in mols.items(): + AllChem.EmbedMultipleConfs(mol, numConfs=1) + if mode == "MOL": + Chem.MolToMolFile(mol, str(base / f"{k}.mol")) + elif mode == "MRV": + Chem.MolToMrvFile(mol, str(base / f"{k}.mrv")) + elif mode == "PDB": + Chem.MolToPDBFile(mol, str(base / f"{k}.pdb")) # , removeHs=False) + elif mode == "TPL": + Chem.MolToTPLFile(mol, str(base / f"{k}.tpl")) + # elif mode == "XYZ": + # Chem.MolToXYZFile(mol, str(base / f"{k}.xyz")) + else: + raise ValueError(f"Unknown mode: {mode}") + + dataset = read_molecule_data(base) + shutil.rmtree(base, ignore_errors=True) + assert set(dataset.names) == set(mols.keys()) + + @pytest.mark.todo def test_genomes(): base = Path("data") / "genomes"