From 4b04c1cb19fa8e4e184802d6913d403d94ad5f3e Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 8 May 2024 06:42:59 -0400 Subject: [PATCH 01/39] add ipython image --- auroris/report/_report.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/auroris/report/_report.py b/auroris/report/_report.py index bd00929..b07c1c8 100644 --- a/auroris/report/_report.py +++ b/auroris/report/_report.py @@ -2,9 +2,12 @@ from datetime import datetime from typing import ByteString, List, Optional, Union +from io import BytesIO from matplotlib import pyplot as plt from matplotlib.figure import Figure from PIL.Image import Image as ImageType +from PIL import Image as PILImage +from IPython.core.display import Image as IpythonImage from pydantic import BaseModel, ConfigDict, Field, PrivateAttr from auroris import __version__ @@ -71,13 +74,15 @@ def log_new_column(self, name: str): def log_image( self, - image_or_figure: Union[ImageType, Figure, ByteString], + image_or_figure: Union[ImageType, Figure, ByteString, IpythonImage], title: Optional[str] = None, description: Optional[str] = None, ): """Logs an image. 
Also accepts Matplotlib figures, which will be converted to images.""" self._check_active_section() - if isinstance(image_or_figure, Figure): + if isinstance(image_or_figure, IpythonImage): + image = PILImage.open(BytesIO(image_or_figure.data)) + elif isinstance(image_or_figure, Figure): image = fig2img(image_or_figure) plt.close(image_or_figure) else: From 9096b3039dd808d8b66aa97325bfdaad331747c9 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 8 May 2024 06:43:21 -0400 Subject: [PATCH 02/39] minor fix --- auroris/curation/actions/_discretize.py | 2 +- auroris/visualization/_distribution.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index d9be8f9..965f06f 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -82,7 +82,7 @@ class Discretization(BaseAction): inplace: bool = False allow_nan: bool = True label_order: Literal["ascending", "descending"] = "ascending" - log_scale: bool = True + log_scale: bool = False def transform( self, diff --git a/auroris/visualization/_distribution.py b/auroris/visualization/_distribution.py index 8bd47c6..07900bd 100644 --- a/auroris/visualization/_distribution.py +++ b/auroris/visualization/_distribution.py @@ -42,11 +42,11 @@ def visualize_continuous_distribution( for threshold in bins: if log_scale and lower != -np.inf: lower = np.log(lower) + if log_scale and threshold != np.inf: threshold = np.log(threshold) mask = (xs > lower) & (xs <= threshold) - lower = threshold # Update xs to make sure they cover the range even if the # coordinates don't fully cover it @@ -75,6 +75,7 @@ def _format(val): ax.fill_between(masked_xs, ys[mask], alpha=0.5, label=label) ax.plot([threshold, threshold], [ylim[0], ys[mask][-1]], "k--") + lower = threshold ax.legend() return fig From 42b833c602e2cfb600ecb681441cd991d60529b5 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 9 May 2024 10:47:49 
-0400 Subject: [PATCH 03/39] allow image export to remote path --- auroris/report/_report.py | 6 ++-- auroris/report/broadcaster/_html.py | 14 ++++++--- auroris/utils.py | 44 +++++++++++++++++++++++++---- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/auroris/report/_report.py b/auroris/report/_report.py index b07c1c8..ef8d685 100644 --- a/auroris/report/_report.py +++ b/auroris/report/_report.py @@ -2,16 +2,14 @@ from datetime import datetime from typing import ByteString, List, Optional, Union -from io import BytesIO from matplotlib import pyplot as plt from matplotlib.figure import Figure from PIL.Image import Image as ImageType -from PIL import Image as PILImage from IPython.core.display import Image as IpythonImage from pydantic import BaseModel, ConfigDict, Field, PrivateAttr from auroris import __version__ -from auroris.utils import fig2img +from auroris.utils import fig2img, ipyimg2img class AnnotatedImage(BaseModel): @@ -81,7 +79,7 @@ def log_image( """Logs an image. 
Also accepts Matplotlib figures, which will be converted to images.""" self._check_active_section() if isinstance(image_or_figure, IpythonImage): - image = PILImage.open(BytesIO(image_or_figure.data)) + image = ipyimg2img(image_or_figure) elif isinstance(image_or_figure, Figure): image = fig2img(image_or_figure) plt.close(image_or_figure) diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index a855d65..e25f62c 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -1,5 +1,6 @@ import base64 import os +import re import pathlib from copy import deepcopy from importlib import resources @@ -8,7 +9,7 @@ import fsspec from auroris.report import CurationReport -from auroris.utils import img2bytes +from auroris.utils import img2bytes, save_image, path2url from ._base import ReportBroadcaster @@ -61,9 +62,14 @@ def broadcast(self): src = f"data:image/png;base64,{image_data}" else: # Save as separate file - path = dm.fs.join(self._image_dir, f"{image_counter}.png") - image.image.save(path) - src = os.path.relpath(path, self._destination) + filename = ( + f"{re.sub(r'[^\w\-\.]', '_', image.title)}.png" + if image.title is not None + else f"{image_counter}.png" + ) + path = dm.fs.join(self._image_dir, filename) + save_image(image.image, path, self._destination) + src = path2url(path, self._destination) image.image = src image_counter += 1 diff --git a/auroris/utils.py b/auroris/utils.py index fb59f38..868a0d4 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -1,10 +1,16 @@ +import os from io import BytesIO import numpy as np from matplotlib.figure import Figure from PIL import Image from PIL.Image import Image as ImageType +from IPython.core.display import Image as IpythonImage +import fsspec +from google.cloud import storage + from sklearn.utils.multiclass import type_of_target +import datamol as dm def is_regression(values: np.ndarray): @@ -19,12 +25,17 @@ def is_regression(values: np.ndarray): 
def fig2img(fig: Figure) -> ImageType: """Convert a Matplotlib figure to a PIL Image""" - fig.canvas.draw() - return Image.frombytes( - "RGBA", - fig.canvas.get_width_height(), - fig.canvas.buffer_rgba(), - ) + if isinstance(fig, Figure): + fig.canvas.draw() + return Image.frombytes( + "RGBA", + fig.canvas.get_width_height(), + fig.canvas.buffer_rgba(), + ) + + +def ipyimg2img(fig: IpythonImage) -> ImageType: + return Image.open(BytesIO(fig.data)) def img2bytes(image: ImageType): @@ -33,3 +44,24 @@ def img2bytes(image: ImageType): image.save(image_bytes, format="PNG") image_bytes = image_bytes.getvalue() return image_bytes + + +def path2url(path: str, destination: str): + if not os.path.isfile(path): + if path.startswith("gs://"): + return path.replace("gs://", "https://storage.googleapis.com/") + else: + raise ValueError("Only GCP path is supported.") + else: + return os.path.relpath(path, destination) + + +def save_image(image: ImageType, path: str, destination: str): + """Save image to local and remote path""" + if dm.fs.is_local_path(destination): + image.save(path) + else: + # Lu: couldn't find a way to save image directly to remote path + image_bytes = img2bytes(image) + with fsspec.open(path, "wb") as f: + f.write(image_bytes) From f1f3abbf02ea9f7c69d2557d33d75a8e71021c86 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 9 May 2024 10:48:18 -0400 Subject: [PATCH 04/39] minor changes --- auroris/curation/actions/_ac_stereoisomer.py | 4 ++-- auroris/curation/actions/_distribution.py | 4 +--- auroris/curation/actions/_outlier.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index 10357c3..5ae4c2e 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -54,7 +54,7 @@ class StereoIsomerACDetection(BaseAction): Automatic detection of outliers. 
""" - stereoisomer_id_col: str + stereoisomer_id_col: str = "MOL_molhash_id_no_stereo" y_cols: List[str] threshold: float = 2.0 prefix: str = "AC_" @@ -80,7 +80,7 @@ def transform( col_with_prefix = self.get_column_name(col) report.log_new_column(col_with_prefix) - has_cliff = dataset[col_with_prefix].notna() + has_cliff = dataset[col_with_prefix] == True num_cliff = has_cliff.sum() if num_cliff > 0: diff --git a/auroris/curation/actions/_distribution.py b/auroris/curation/actions/_distribution.py index 97f225b..9b001ae 100644 --- a/auroris/curation/actions/_distribution.py +++ b/auroris/curation/actions/_distribution.py @@ -27,9 +27,7 @@ def transform( ): if report is not None: for y_col in self.y_cols: - fig = visualize_continuous_distribution( - data=dataset[y_col], label_name=y_col, log_scale=self.log_scale - ) + fig = visualize_continuous_distribution(data=dataset[y_col], log_scale=self.log_scale) report.log_image(fig, title=f"Data distribution - {y_col}") return dataset diff --git a/auroris/curation/actions/_outlier.py b/auroris/curation/actions/_outlier.py index 5731bb3..cfdf5d0 100644 --- a/auroris/curation/actions/_outlier.py +++ b/auroris/curation/actions/_outlier.py @@ -150,7 +150,7 @@ def transform( report.log_new_column(is_outlier_col_label) fig = visualize_distribution_with_outliers(values=values, is_outlier=is_outlier) - report.log_image(fig) + report.log_image(fig, title=f"Outlier detection - {column}") return dataset From 9d365ae49ad14a509123d296dd6c43daa67993d1 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 15 May 2024 14:21:52 -0400 Subject: [PATCH 05/39] change loglevel --- auroris/report/broadcaster/_logger.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/auroris/report/broadcaster/_logger.py b/auroris/report/broadcaster/_logger.py index b6daa9a..317f740 100644 --- a/auroris/report/broadcaster/_logger.py +++ b/auroris/report/broadcaster/_logger.py @@ -34,7 +34,8 @@ class 
LoggerBroadcaster(ReportBroadcaster): def __init__(self, report: CurationReport): super().__init__(report) self.logger = logging.getLogger() - self.logger.setLevel(logging.DEBUG) + # Lu: debug level might log other irrelevant debugging logs + self.logger.setLevel(logging.INFO) handler = logging.StreamHandler(sys.stdout) handler.setFormatter(ColoredFormatter()) @@ -51,18 +52,22 @@ def broadcast(self): self.render_log(log) for image in section.images: self.render_image(image) + self.on_report_end(self._report) def render_log(self, message: str): - self.logger.debug(f"[LOG]: {message}") + self.logger.info(f"[LOG]: {message}") def render_image(self, image: AnnotatedImage): width, height = image.image.size - self.logger.debug(f"[IMG]: Dimensions {width} x {height}") + self.logger.info(f"[IMG]: Dimensions {width} x {height}") def on_section_start(self, section: Section): self.logger.info(f"===== {section.title} =====") def on_report_start(self, report: CurationReport): self.logger.critical("===== Curation Report =====") - self.logger.debug(f"Time: {report.time_stamp.strftime('%Y-%m-%d %H:%M:%S')}") - self.logger.debug(f"Version: {report.auroris_version}") + self.logger.info(f"Time: {report.time_stamp.strftime('%Y-%m-%d %H:%M:%S')}") + self.logger.info(f"Version: {report.auroris_version}") + + def on_report_end(self, report: CurationReport): + self.logger.critical("===== Curation Report END =====") From 2ab83d7f0394a6837e79141a00ad65634f02baa1 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 15 May 2024 14:22:30 -0400 Subject: [PATCH 06/39] update chemspace viz --- auroris/curation/actions/_mol.py | 32 +++++++++++++++++++++++------ auroris/visualization/_chemspace.py | 28 +++++++++++++------------ 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index dcff30c..aca6c01 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -11,6 +11,11 @@ from auroris.types 
import VerbosityLevel from auroris.visualization import visualize_chemspace +try: + from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer +except: + PretrainedHFTransformer = None + def curate_molecules( mols: List[Union[str, dm.Mol]], @@ -228,6 +233,8 @@ class MoleculeCuration(BaseAction): remove_stereo: bool = False count_stereoisomers: bool = True count_stereocenters: bool = True + y_cols: Optional[List[str]] = None + fast: Optional[bool] = True def transform( self, @@ -264,12 +271,25 @@ def transform( smiles_col = self.get_column_name("smiles") smiles = dataset[smiles_col].dropna().values - with dm.without_rdkit_log(): - # Temporary disable logs because of deprecation warning - X = np.array([dm.to_fp(smi) for smi in smiles]) - - fig = visualize_chemspace(X=X) - report.log_image(fig, "Distribution in Chemical Space") + if PretrainedHFTransformer and not self.fast: + featurizer = "ChemBERTa-77M-MTR" + transformer = PretrainedHFTransformer(kind=featurizer, notation="smiles", dtype=float) + X = transformer(smiles) + report.log( + "`ChemBERTa-77M-MTR` embedding is used to compute the distributionin chemical space." 
+ ) + else: + featurizer = "ECFP" + with dm.without_rdkit_log(): + # Temporary disable logs because of deprecation warning + X = np.array([dm.to_fp(smi) for smi in smiles]) + report.log("Default `ecfp` fingerprint is used to compute the distributionin chemical space.") + + # list of data per column + y = dataset[self.y_cols].T.values.tolist() if self.y_cols else None + + fig = visualize_chemspace(X=X, y=y, labels=self.y_cols) + report.log_image(fig, title=f"Distribution in Chemical Space - {featurizer}") if self.count_stereocenters: # Plot all compounds with undefined stereocenters for visual inspection diff --git a/auroris/visualization/_chemspace.py b/auroris/visualization/_chemspace.py index 16d6246..e662a22 100644 --- a/auroris/visualization/_chemspace.py +++ b/auroris/visualization/_chemspace.py @@ -15,7 +15,7 @@ def visualize_chemspace( X: Union[List[np.ndarray], np.ndarray], y: Optional[Union[List[np.ndarray], np.ndarray]] = None, labels: Optional[List[str]] = None, - n_cols: int = 3, + n_cols: int = 2, fig_base_size: float = 8, w_h_ratio: float = 0.5, dpi: int = 150, @@ -39,36 +39,38 @@ def visualize_chemspace( if umap is None: raise ImportError("Please run `pip install umap-learn` to use UMAP visualizations for the chemspace.") - if isinstance(X, np.ndarray): - X = [X] if isinstance(y, np.ndarray): - y = [y] + y = list(y) + if y is None: - y = [None for _ in range(len(X))] - if len(X) != len(y): - raise ValueError("X and y must have the same length.") + y = [None] if labels is None: - labels = ["" for i in range(len(X))] + labels = ["" for _ in range(len(y))] + + if len(y) != len(labels): + raise ValueError("`labels` and `y` must have the same length.") + + embedding = umap.UMAP(**umap_kwargs).fit_transform(X) + umap_0, umap_1 = embedding[:, 0], embedding[:, 1] with create_figure( - n_plots=len(X), + n_plots=len(y), n_cols=n_cols, fig_base_size=fig_base_size, w_h_ratio=w_h_ratio, dpi=dpi, seaborn_theme=seaborn_theme, ) as (fig, axes): - for idx, (X_i, y_i, 
label) in enumerate(zip(X, y, labels)): - embedding = umap.UMAP(**umap_kwargs).fit_transform(X_i) - umap_0, umap_1 = embedding[:, 0], embedding[:, 1] - + for idx, (y_i, label) in enumerate(zip(y, labels)): ax = sns.scatterplot( x=umap_0, y=umap_1, hue=y_i, ax=axes[idx], ) + ax.set_xlabel("Component 0") + ax.set_xlabel("Component 1") ax.set_title(label) return fig From 8412c9ad7a7391ae1cf0c3e69f9e59ef11ff6f59 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 15 May 2024 14:23:18 -0400 Subject: [PATCH 07/39] add dup logger --- auroris/curation/actions/_deduplicate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/auroris/curation/actions/_deduplicate.py b/auroris/curation/actions/_deduplicate.py index 30cb7aa..d10e94a 100644 --- a/auroris/curation/actions/_deduplicate.py +++ b/auroris/curation/actions/_deduplicate.py @@ -67,10 +67,14 @@ def transform( verbosity: VerbosityLevel = VerbosityLevel.NORMAL, parallelized_kwargs: Optional[Dict] = None, ): - return deduplicate( + dataset_dedup = deduplicate( dataset, deduplicate_on=self.deduplicate_on, y_cols=self.y_cols, keep=self.keep, method=self.method, ) + if report is not None: + num_duplicates = dataset.shape[0] - dataset_dedup.shape[0] + report.log(f"Deduplication merged and removed {num_duplicates} duplicated molecules from dataset") + return dataset_dedup From 041cc7a0519db624a78f37978b29352596002838 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 15 May 2024 14:23:39 -0400 Subject: [PATCH 08/39] simplify distribution viz --- auroris/visualization/_distribution.py | 34 +++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/auroris/visualization/_distribution.py b/auroris/visualization/_distribution.py index 07900bd..c49a469 100644 --- a/auroris/visualization/_distribution.py +++ b/auroris/visualization/_distribution.py @@ -3,6 +3,7 @@ import numpy as np import seaborn as sns from scipy import stats +import matplotlib.pyplot as plt from 
auroris.visualization.utils import create_figure @@ -82,8 +83,7 @@ def _format(val): def visualize_distribution_with_outliers( - values: np.ndarray, - is_outlier: Optional[List[bool]] = None, + values: np.ndarray, is_outlier: Optional[List[bool]] = None, title: str = "Probability Plot" ): """Visualize the distribution of the data and highlight the potential outliers.""" @@ -98,14 +98,26 @@ def visualize_distribution_with_outliers( values = values[sorted_ind] is_outlier = is_outlier[sorted_ind] - with create_figure(n_plots=2) as (fig, axes): - sns.scatterplot( - x=np.arange(len(values)), - y=values, - hue=is_outlier, - palette={1.0: "red", 0.0: "navy", 0.5: "grey"}, - ax=axes[0], - ) - stats.probplot(values, dist="norm", plot=axes[1]) + fig = plt.figure() + res = stats.probplot(values, dist="norm", plot=plt, fit=True) + x = res[0][0] + y = res[0][1] + + # Specify the indices of data points to highlight + highlight_indices = np.argwhere(is_outlier == True).flatten() + highlight_color = "red" + + # Plot the probability plot + # plt.plot(x, y, "bo") # Blue circles for regular points + + # Overlay specific points with different colors + for idx in highlight_indices: + plt.plot( + x[idx], y[idx], marker="o", markersize=8, color=highlight_color + ) # Red circles for highlighted points + + plt.xlabel("Theoretical quantiles") + plt.ylabel("Ordered Values") + plt.title(title) return fig From d1f41de2eb5e4f42c6a53fa65a86f587b74eca83 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 15 May 2024 14:24:18 -0400 Subject: [PATCH 09/39] update outlier logs --- auroris/curation/actions/_outlier.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/auroris/curation/actions/_outlier.py b/auroris/curation/actions/_outlier.py index cfdf5d0..af66dd8 100644 --- a/auroris/curation/actions/_outlier.py +++ b/auroris/curation/actions/_outlier.py @@ -145,11 +145,17 @@ def transform( is_outlier_col_label = self.get_column_name(column) dataset[is_outlier_col_label] = 
is_outlier + num_outliers = sum(is_outlier) if report is not None: report.log_new_column(is_outlier_col_label) - - fig = visualize_distribution_with_outliers(values=values, is_outlier=is_outlier) + report.log( + f"Found {num_outliers} potential outliers " + f"with respect to the {column} column for review." + ) + fig = visualize_distribution_with_outliers( + values=values, is_outlier=is_outlier, title=f"Probability Plot - {column}" + ) report.log_image(fig, title=f"Outlier detection - {column}") return dataset From f841f12e6532b55fbef794f733764e6f7205304a Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 15 May 2024 14:24:40 -0400 Subject: [PATCH 10/39] minor changes --- auroris/curation/_curator.py | 2 +- auroris/curation/actions/_ac_stereoisomer.py | 8 +++++--- auroris/report/broadcaster/_html.py | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index 57025a8..66b6322 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -38,7 +38,7 @@ def _serialize_verbosity(self, value: VerbosityLevel): def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]: report = CurationReport() - dataset = dataset.copy(deep=True) + dataset = dataset.copy() action: BaseAction for action in self.steps: diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index 5ae4c2e..53069ec 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -51,7 +51,7 @@ def detect_streoisomer_activity_cliff( class StereoIsomerACDetection(BaseAction): """ - Automatic detection of outliers. + Automatic detection of activity shift between stereoisomers. 
""" stereoisomer_id_col: str = "MOL_molhash_id_no_stereo" @@ -92,10 +92,12 @@ def transform( legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist() image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False) - report.log_image(image) + report.log_image( + image_or_figure=image, title="Detection of activity shifts among stereoisomers" + ) else: report.log( - "Found no activity cliffs among stereoisomers with respect to the {col} column." + f"Found no activity cliffs among stereoisomers with respect to the {col} column." ) return dataset diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index e25f62c..97396c9 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -87,3 +87,5 @@ def broadcast(self): path = dm.fs.join(self._destination, "index.html") with fsspec.open(path, "w") as fd: fd.write(html) + + return path From 1f25fa57e196a0629cb3d409dcdab65ae1b17eb4 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 01:37:30 -0400 Subject: [PATCH 11/39] fix serialization --- auroris/cli.py | 5 ++-- auroris/curation/_curator.py | 48 ++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/auroris/cli.py b/auroris/cli.py index 0bbcb89..0bc3301 100644 --- a/auroris/cli.py +++ b/auroris/cli.py @@ -2,6 +2,7 @@ import pandas as pd import typer +from typing import Optional from auroris.curation import Curator from auroris.report.broadcaster import HTMLBroadcaster @@ -9,9 +10,9 @@ @app.command() -def curate(config_path: str, dataset_path: str, destination: str, overwrite: bool = False): +def curate(config_path: str, destination: str, dataset_path: Optional[str] = None, overwrite: bool = False): # Load data - dataset = pd.read_csv(dataset_path) + dataset = pd.read_csv(dataset_path) if dataset_path else None curator = Curator.from_json(config_path) # Run curation diff --git a/auroris/curation/_curator.py 
b/auroris/curation/_curator.py index 66b6322..e8d7734 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -1,10 +1,11 @@ import json -from typing import List, Tuple, Union +from typing import List, Tuple, Union, Optional +from os import PathLike import fsspec import pandas as pd from loguru import logger -from pydantic import BaseModel, Field, field_serializer, field_validator +from pydantic import BaseModel, Field, field_serializer, field_validator, ValidationError from auroris.curation.actions._base import ACTION_REGISTRY, BaseAction from auroris.report import CurationReport @@ -21,8 +22,9 @@ class Curator(BaseModel): # This is the recommended way to add all subclasses in the type. # See e.g. https://github.com/pydantic/pydantic/issues/2200 # and https://github.com/pydantic/pydantic/issues/2036 - steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore + data_path: Optional[Union[str, PathLike]] = None + steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore verbosity: VerbosityLevel = VerbosityLevel.NORMAL parallelized_kwargs: dict = Field(default_factory=dict) @@ -36,7 +38,28 @@ def _validate_verbosity(cls, v): def _serialize_verbosity(self, value: VerbosityLevel): return value.name - def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]: + @field_validator("data_path", mode="before") + def _validate_data_path(cls, value: Union[str, PathLike]): + try: + pd.read_csv(value, nrows=5) + return value + except: + raise ValueError( + f"Dataset cann't be loaded by `panda.read_csv('{value}')`." + f"Consider to directly pass the loaded the data to `Curator.curate()`." 
+ ) + + @field_serializer("verbosity") + def _serialize_verbosity(self, value: Union[str, PathLike]): + return value.name + + def _load_data(self): + return pd.read_csv(self.data_path) + + def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]: + if dataset is None: + dataset = self._load_data() + report = CurationReport() dataset = dataset.copy() @@ -57,6 +80,13 @@ def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport def __call__(self, dataset): return self.transform(dataset) + @classmethod + def _get_action(cls, name: str): + for action in ACTION_REGISTRY: + if action.__name__ == name: + return action + return None + @classmethod def from_json(cls, path: str): """Loads a curation workflow from a JSON file. @@ -66,6 +96,9 @@ def from_json(cls, path: str): """ with fsspec.open(path, "r") as f: data = json.load(f) + + steps = [cls._get_action(name)(**args) for step in data["steps"] for name, args in step.items()] + data["steps"] = steps return cls.model_validate(data) def to_json(self, path: str): @@ -74,6 +107,11 @@ def to_json(self, path: str): Args: path: The destination to save to """ + serialization = self.model_dump(exclude="steps") + # # save steps in defined order + serialization["steps"] = [{step.name: step.model_dump()} for step in self.steps] with fsspec.open(path, "w") as f: - json.dump(self.model_dump(), f) + json.dump(serialization, f) + # with fsspec.open(path, "w") as f: + # json.dump(self.model_dump(), f) return path From a9669a6ebf8eda1f20d47fcadcb4b40603e39b35 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 14:53:19 -0400 Subject: [PATCH 12/39] add more docstrings --- auroris/curation/_curator.py | 11 +- auroris/curation/actions/_ac_stereoisomer.py | 23 +++- auroris/curation/actions/_deduplicate.py | 6 + auroris/curation/actions/_discretize.py | 34 +++++- auroris/curation/actions/_distribution.py | 15 ++- auroris/curation/actions/_mol.py | 33 +++++- 
auroris/curation/actions/_outlier.py | 118 ++++++++++--------- auroris/report/broadcaster/_html.py | 10 +- auroris/utils.py | 11 +- auroris/visualization/_distribution.py | 15 ++- auroris/visualization/utils.py | 2 +- 11 files changed, 196 insertions(+), 82 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index e8d7734..2f7969c 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -5,7 +5,7 @@ import fsspec import pandas as pd from loguru import logger -from pydantic import BaseModel, Field, field_serializer, field_validator, ValidationError +from pydantic import BaseModel, Field, field_serializer, field_validator from auroris.curation.actions._base import ACTION_REGISTRY, BaseAction from auroris.report import CurationReport @@ -16,6 +16,13 @@ class Curator(BaseModel): """ A curator is a collection of actions that are applied to a dataset. Can be serialized. + + Args: + data_path: Data path. + The data must be loadable by `pd.read_csv` with default parameters. + steps: List of curation actions. + Check all the available action . + """ # To know which Action object to create, we need a discriminated union. @@ -43,7 +50,7 @@ def _validate_data_path(cls, value: Union[str, PathLike]): try: pd.read_csv(value, nrows=5) return value - except: + except Exception: raise ValueError( f"Dataset cann't be loaded by `panda.read_csv('{value}')`." f"Consider to directly pass the loaded the data to `Curator.curate()`." 
diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index 53069ec..7ecf336 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -15,9 +15,19 @@ def detect_streoisomer_activity_cliff( dataset: pd.DataFrame, stereoisomer_id_col: str, y_cols: List[str], - threshold: float = 1.0, + threshold: float = 2.0, prefix: str = "AC_", -): +) -> pd.DataFrame: + """ + Detect activity cliff among stereoisomers based on classification label or pre-defined threshold for continuous values. + + Args: + dataset: Dataframe + stereoisomer_id_col: Column which identifies the stereoisomers + y_cols: List of columns for bioactivities + threshold: Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification. + prefix: Prefix for the adding columns + """ dataset_ori = dataset.copy(deep=True) ac_cols = {y_col: [] for y_col in y_cols} group_index_list = np.array( @@ -52,6 +62,13 @@ def detect_streoisomer_activity_cliff( class StereoIsomerACDetection(BaseAction): """ Automatic detection of activity shift between stereoisomers. + + Args: + stereoisomer_id_col: Column which identifies the stereoisomers. + y_cols: List of columns for bioactivities. + threshold: Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification. + prefix: Prefix for the adding columns. 
+ mol_col: Column for molecule strings """ stereoisomer_id_col: str = "MOL_molhash_id_no_stereo" @@ -80,7 +97,7 @@ def transform( col_with_prefix = self.get_column_name(col) report.log_new_column(col_with_prefix) - has_cliff = dataset[col_with_prefix] == True + has_cliff = dataset[col_with_prefix].__eq__(True) num_cliff = has_cliff.sum() if num_cliff > 0: diff --git a/auroris/curation/actions/_deduplicate.py b/auroris/curation/actions/_deduplicate.py index d10e94a..893a1c7 100644 --- a/auroris/curation/actions/_deduplicate.py +++ b/auroris/curation/actions/_deduplicate.py @@ -53,6 +53,12 @@ def deduplicate( class Deduplication(BaseAction): """ Automatic detection of outliers. + + Args: + deduplicate_on: A subset of the columns to deduplicate on (can be default). + y_cols: The columns to aggregate. + keep: Whether to keep the first or last copy of the duplicates. + method: The method to aggregate the data. """ deduplicate_on: Optional[Union[str, List[str]]] = None diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index 965f06f..3241bbf 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -17,7 +17,8 @@ def discretize( allow_nan: bool = True, label_order: Literal["ascending", "descending"] = "ascending", ) -> np.ndarray: - """Thresholding of array-like or scipy.sparse matrix into binary or multiclass labels. + """ + Thresholding of array-like or scipy.sparse matrix into binary or multiclass labels. Args: X : The data to discretize, element by element. @@ -76,6 +77,37 @@ def discretize( class Discretization(BaseAction): + """ + Thresholding bioactivity columns to binary or multiclass labels. + + Args: + X : The data to discretize, element by element. + scipy.sparse matrices should be in CSR or CSC format to avoid an + un-necessary copy. + + thresholds: Interval boundaries that include the right bin edge. 
+ + inplace: Set to True to perform inplace discretization and avoid a copy + (if the input is already a numpy array or a scipy.sparse CSR / CSC + matrix and if axis is 1). + + allow_nan: Set to True to allow nans in the array for discretization. Otherwise, + an error will be raised instead. + + label_order: The continuous values are discretized to labels 0, 1, 2, .., N with respect to given + threshold bins [threshold_1, threshold_2,.., threshould_n]. + When set to 'ascending', the class label is in ascending order with the threshold + bins that `0` represents negative class or lower class, while 1, 2, 3 are for higher classes. + When set to 'descending' the class label is in ascending order with the threshold bins. + Sometimes the positive labels are on the left side of provided threshold. + E.g. For binarization with threshold [0.5], the positive label is defined + by`X < 0.5`. In this case, `label_order` should be `descending`. + + log_scale: Whether visualize distribution in log scale. + See more in + + """ + input_column: str prefix: str = "CLS_" thresholds: List[float] diff --git a/auroris/curation/actions/_distribution.py b/auroris/curation/actions/_distribution.py index 9b001ae..9b0efd7 100644 --- a/auroris/curation/actions/_distribution.py +++ b/auroris/curation/actions/_distribution.py @@ -1,7 +1,6 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import pandas as pd -from pydantic import Field from auroris.curation.actions._base import BaseAction from auroris.report import CurationReport @@ -12,11 +11,17 @@ class ContinuousDistributionVisualization(BaseAction): """ Visualize a continuous distribution + + Args: + y_cols: List of columns for bioactivity for visualization + log_scale: Whether visualize distribution in log scale. + bins: The bin boundaries to color the area under the KDE curve. 
+ """ y_cols: Optional[List[str]] = None log_scale: bool = False - kwargs: Dict = Field(default_factory=dict) + bins: Optional[Sequence[float]] = None def transform( self, @@ -27,7 +32,9 @@ def transform( ): if report is not None: for y_col in self.y_cols: - fig = visualize_continuous_distribution(data=dataset[y_col], log_scale=self.log_scale) + fig = visualize_continuous_distribution( + data=dataset[y_col], log_scale=self.log_scale, bins=self.bins + ) report.log_image(fig, title=f"Data distribution - {y_col}") return dataset diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index aca6c01..d2d7684 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -13,7 +13,7 @@ try: from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer -except: +except Exception: PretrainedHFTransformer = None @@ -26,6 +26,24 @@ def curate_molecules( count_stereocenters: bool = True, **parallelized_kwargs, ): + """ + Curate a list of molecules. + + Args: + mols: List of molecules + progress: Whether show curation progress + remove_salt_solvent: Whether remove salt and solvent from molecule + remove_stereo: Whether remove stereo chemistry information from molecule + count_stereoisomers: Whether count the number of stereoisomers of molecule + count_stereocenters: Whether count the number of stereocenters of molecule + parallelized_kwargs: Additional argument for the parallelizarion process. + See more about + + Returns: + mol_dict: Dictionary of molecule and additional metadata + num_invalid: Number of invalid molecules + + """ fn = partial( _curate_molecule, remove_salt_solvent=remove_salt_solvent, @@ -191,10 +209,11 @@ def _get_mol_dict( def _num_stereo_centers(mol: dm.Mol) -> Tuple[int]: - """Get the number of defined and undefined stereo centers of a given molecule - by accessing the all and only defined stereo centers. - It's to facilitate the analysis of the stereo isomers. 
- None will be return if there is no stereo centers in the molecule. + """ + Get the number of defined and undefined stereo centers of a given molecule + by accessing the all and only defined stereo centers. + It's to facilitate the analysis of the stereo isomers. + None will be return if there is no stereo centers in the molecule. Args: mol: Molecule @@ -218,7 +237,9 @@ def _num_stereo_centers(mol: dm.Mol) -> Tuple[int]: class MoleculeCuration(BaseAction): """ - Attributes: + Automated molecule curation and chemistry space distribution + + Args: input_column: The name of the column that has the molecules (either `dm.Mol` objects or SMILES). remove_salt_solvent: When set to 'True', all disconnected salts and solvents will be removed from molecule. In most of the cases, it is recommended to remove the salts/solvents. diff --git a/auroris/curation/actions/_outlier.py b/auroris/curation/actions/_outlier.py index af66dd8..0a008aa 100644 --- a/auroris/curation/actions/_outlier.py +++ b/auroris/curation/actions/_outlier.py @@ -18,52 +18,6 @@ OutlierDetectionMethod: TypeAlias = Literal["iso", "lof", "svm", "ee", "zscore"] -def detect_outliers(X: np.ndarray, method: OutlierDetectionMethod = "zscore", **kwargs: Any): - """Functional interface for detecting outliers - - Args: - X: The observations that we want to classify as inliers or outliers. - method: The method to use for outlier detection. - **kwargs: Keyword arguments for the outlier detection method. 
- """ - - if X.ndim != 1: - raise ValueError("X must be a 1D array for outlier detection.") - - detector_cls = _OUTLIER_METHODS[method] - detector = detector_cls(**kwargs) - indices = np.flatnonzero(~np.isnan(X)) - - in_ = X[indices].reshape(-1, 1) - out_ = detector.fit_predict(in_) - - is_inlier = np.full_like(X, np.nan) - is_inlier[indices] = out_.flatten() - - is_outlier = is_inlier == -1 - return is_outlier - - -def modified_zscore(data: np.ndarray, consistency_correction: float = 1.4826): - """ - The modified z score is calculated from the median absolute deviation (MAD). - These values must be multiplied by a constant to approximate the standard deviation. - - The modified z score might be more robust than the standard z score because it relies - on the median (MED) for calculating the z score. - - modified Z score = (X-MED) / (consistency_correction*MAD) - - """ - median = np.nanmedian(data) - - deviation_from_med = np.array(data) - median - - mad = np.nanmedian(np.abs(deviation_from_med)) - mod_zscore = deviation_from_med / (consistency_correction * mad) - return mod_zscore - - class ZscoreOutlier(OutlierMixin): """ Detect outliers by the absolute value of the Z-score. @@ -122,9 +76,70 @@ def fit_predict(self, X: np.ndarray) -> np.ndarray: return self.predict(X) +_OUTLIER_METHODS: Dict[OutlierDetectionMethod, OutlierMixin] = { + "iso": IsolationForest, + "lof": LocalOutlierFactor, + "svm": OneClassSVM, + "ee": EllipticEnvelope, + "zscore": ZscoreOutlier, +} + + +def detect_outliers(X: np.ndarray, method: OutlierDetectionMethod = "zscore", **kwargs: Any): + """Functional interface for detecting outliers + + Args: + X: The observations that we want to classify as inliers or outliers. + method: The method to use for outlier detection. + **kwargs: Keyword arguments for the outlier detection method. 
+ """ + + if X.ndim != 1: + raise ValueError("X must be a 1D array for outlier detection.") + + detector_cls = _OUTLIER_METHODS[method] + detector = detector_cls(**kwargs) + indices = np.flatnonzero(~np.isnan(X)) + + in_ = X[indices].reshape(-1, 1) + out_ = detector.fit_predict(in_) + + is_inlier = np.full_like(X, np.nan) + is_inlier[indices] = out_.flatten() + + is_outlier = is_inlier == -1 + return is_outlier + + +def modified_zscore(data: np.ndarray, consistency_correction: float = 1.4826): + """ + The modified z score is calculated from the median absolute deviation (MAD). + These values must be multiplied by a constant to approximate the standard deviation. + + The modified z score might be more robust than the standard z score because it relies + on the median (MED) for calculating the z score. + + modified Z score = (X-MED) / (consistency_correction*MAD) + + """ + median = np.nanmedian(data) + + deviation_from_med = np.array(data) - median + + mad = np.nanmedian(np.abs(deviation_from_med)) + mod_zscore = deviation_from_med / (consistency_correction * mad) + return mod_zscore + + class OutlierDetection(BaseAction): """ - Automatic detection of outliers. + Automatic detection of outliers + + Args: + method: Method name for outlier detection. 
+ columns: Column names to detect outliers + prefix: Prefix for added column names + """ method: OutlierDetectionMethod @@ -159,12 +174,3 @@ def transform( report.log_image(fig, title=f"Outlier detection - {column}") return dataset - - -_OUTLIER_METHODS: Dict[OutlierDetectionMethod, OutlierMixin] = { - "iso": IsolationForest, - "lof": LocalOutlierFactor, - "svm": OneClassSVM, - "ee": EllipticEnvelope, - "zscore": ZscoreOutlier, -} diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 97396c9..6e5c7f9 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -1,5 +1,4 @@ import base64 -import os import re import pathlib from copy import deepcopy @@ -20,7 +19,14 @@ class HTMLBroadcaster(ReportBroadcaster): - """Render a simple HTML page""" + """ + Render a simple HTML page + + Args: + report: Curation report object. + destination: Destination folder for exporting the report. + embed_images: Whether embed image bytes in HTML report. + """ def __init__( self, diff --git a/auroris/utils.py b/auroris/utils.py index 868a0d4..41d28fb 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -7,13 +7,13 @@ from PIL.Image import Image as ImageType from IPython.core.display import Image as IpythonImage import fsspec -from google.cloud import storage from sklearn.utils.multiclass import type_of_target import datamol as dm -def is_regression(values: np.ndarray): +def is_regression(values: np.ndarray) -> bool: + """Whether the input values are for regreesion""" target_type = type_of_target(values) if target_type == "continuous": return True @@ -35,6 +35,7 @@ def fig2img(fig: Figure) -> ImageType: def ipyimg2img(fig: IpythonImage) -> ImageType: + """Convert Ipython image to PIL image""" return Image.open(BytesIO(fig.data)) @@ -47,6 +48,10 @@ def img2bytes(image: ImageType): def path2url(path: str, destination: str): + """ + Convert path to an local or remote url for html report. 
+ Currently, only GCP is supported. + """ if not os.path.isfile(path): if path.startswith("gs://"): return path.replace("gs://", "https://storage.googleapis.com/") @@ -62,6 +67,8 @@ def save_image(image: ImageType, path: str, destination: str): image.save(path) else: # Lu: couldn't find a way to save image directly to remote path + # convert to bytes image_bytes = img2bytes(image) + # save bytes as image to remote path with fsspec.open(path, "wb") as f: f.write(image_bytes) diff --git a/auroris/visualization/_distribution.py b/auroris/visualization/_distribution.py index c49a469..6ae6883 100644 --- a/auroris/visualization/_distribution.py +++ b/auroris/visualization/_distribution.py @@ -85,7 +85,15 @@ def _format(val): def visualize_distribution_with_outliers( values: np.ndarray, is_outlier: Optional[List[bool]] = None, title: str = "Probability Plot" ): - """Visualize the distribution of the data and highlight the potential outliers.""" + """ + Visualize the distribution of the data and highlight the potential outliers. + + Args: + values: Values for visulization. + is_outlier: List of outlier flag. 
+ title: Title of plot + + """ if is_outlier is None: # Import here to prevent ciruclar imports @@ -104,12 +112,9 @@ def visualize_distribution_with_outliers( y = res[0][1] # Specify the indices of data points to highlight - highlight_indices = np.argwhere(is_outlier == True).flatten() + highlight_indices = np.argwhere(is_outlier.__eq__(True)).flatten() highlight_color = "red" - # Plot the probability plot - # plt.plot(x, y, "bo") # Blue circles for regular points - # Overlay specific points with different colors for idx in highlight_indices: plt.plot( diff --git a/auroris/visualization/utils.py b/auroris/visualization/utils.py index 690cb97..f4f4503 100644 --- a/auroris/visualization/utils.py +++ b/auroris/visualization/utils.py @@ -15,7 +15,7 @@ def create_figure( dpi: int = 150, seaborn_theme: Optional[str] = "whitegrid", ): - """Creates a figure with the desired size""" + """Creates a figure with the desired size and layout""" if seaborn_theme is not None: sns.set_theme(style=seaborn_theme) From 582070a0a0b3a38414679360b0b2f810072286f0 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 22:16:50 -0400 Subject: [PATCH 13/39] fix docstring and extension --- auroris/curation/_curator.py | 23 ++-- auroris/curation/actions/_ac_stereoisomer.py | 24 ++-- auroris/curation/actions/_base.py | 9 +- auroris/curation/actions/_deduplicate.py | 21 ++-- auroris/curation/actions/_discretize.py | 52 ++++---- auroris/curation/actions/_distribution.py | 20 ++- auroris/curation/actions/_mol.py | 59 +++++---- auroris/curation/actions/_outlier.py | 14 +-- docs/index.md | 35 +++++- docs/tutorials/getting_started.ipynb | 124 ++++++++++++------- mkdocs.yml | 3 +- 11 files changed, 219 insertions(+), 165 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index 2f7969c..de571d2 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -17,21 +17,22 @@ class Curator(BaseModel): A curator is a collection of actions that are 
applied to a dataset. Can be serialized. - Args: - data_path: Data path. - The data must be loadable by `pd.read_csv` with default parameters. - steps: List of curation actions. - Check all the available action . - """ # To know which Action object to create, we need a discriminated union. # This is the recommended way to add all subclasses in the type. # See e.g. https://github.com/pydantic/pydantic/issues/2200 # and https://github.com/pydantic/pydantic/issues/2036 - data_path: Optional[Union[str, PathLike]] = None - - steps: List[Union[tuple(ACTION_REGISTRY)]] = Field(..., discriminator="name") # type: ignore + data_path: Optional[Union[str, PathLike]] = Field( + default=None, + description="Data path. The data must be loadable by `pd.read_csv` with default parameters.", + ) + + steps: List[Union[tuple(ACTION_REGISTRY)]] = Field( + ..., + discriminator="name", + description="List of curation actions. Check all the available action .", + ) verbosity: VerbosityLevel = VerbosityLevel.NORMAL parallelized_kwargs: dict = Field(default_factory=dict) @@ -56,10 +57,6 @@ def _validate_data_path(cls, value: Union[str, PathLike]): f"Consider to directly pass the loaded the data to `Curator.curate()`." 
) - @field_serializer("verbosity") - def _serialize_verbosity(self, value: Union[str, PathLike]): - return value.name - def _load_data(self): return pd.read_csv(self.data_path) diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index 7ecf336..c1a4bb0 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -1,9 +1,11 @@ from typing import Dict, List, Optional +from pydantic import Field import datamol as dm import numpy as np import pandas as pd + from auroris.curation.actions._base import BaseAction from auroris.curation.actions._outlier import modified_zscore from auroris.report import CurationReport @@ -62,20 +64,18 @@ def detect_streoisomer_activity_cliff( class StereoIsomerACDetection(BaseAction): """ Automatic detection of activity shift between stereoisomers. - - Args: - stereoisomer_id_col: Column which identifies the stereoisomers. - y_cols: List of columns for bioactivities. - threshold: Threshold to identify the activity cliff. Currently, the difference of zscores between isomers are used for identification. - prefix: Prefix for the adding columns. - mol_col: Column for molecule strings """ - stereoisomer_id_col: str = "MOL_molhash_id_no_stereo" - y_cols: List[str] - threshold: float = 2.0 - prefix: str = "AC_" - mol_col: str = "MOL_smiles" + stereoisomer_id_col: str = Field( + default="MOL_molhash_id_no_stereo", description="Column which identifies the stereoisomers." + ) + y_cols: List[str] = Field(..., description="List of columns for bioactivities.") + threshold: float = Field( + default=2.0, + description=" Threshold to identify the activity cliff. 
Currently, the difference of zscores between isomers are used for identification.", + ) + prefix: str = Field(default="AC_", description="Prefix for the adding columns.") + mol_col: str = Field(default="MOL_smiles", description="Column for molecule strings.") def transform( self, diff --git a/auroris/curation/actions/_base.py b/auroris/curation/actions/_base.py index 589e211..8df6ca9 100644 --- a/auroris/curation/actions/_base.py +++ b/auroris/curation/actions/_base.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict, Optional import pandas as pd -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, model_validator, Field from auroris.types import VerbosityLevel @@ -16,14 +16,9 @@ class BaseAction(BaseModel, abc.ABC): """ An action in the curation process. - - Args: - prefix: If the action adds columns, use this prefix. - completed: If the action has completed. - dep_action: Name of dependent action. """ - prefix: str = None + prefix: str = Field(default=None, description="If the action adds columns, use this prefix.") @property def name(self) -> str: diff --git a/auroris/curation/actions/_deduplicate.py b/auroris/curation/actions/_deduplicate.py index 893a1c7..bd18e50 100644 --- a/auroris/curation/actions/_deduplicate.py +++ b/auroris/curation/actions/_deduplicate.py @@ -1,4 +1,5 @@ from typing import Dict, List, Literal, Optional, Union +from pydantic import Field import pandas as pd @@ -53,18 +54,18 @@ def deduplicate( class Deduplication(BaseAction): """ Automatic detection of outliers. - - Args: - deduplicate_on: A subset of the columns to deduplicate on (can be default). - y_cols: The columns to aggregate. - keep: Whether to keep the first or last copy of the duplicates. - method: The method to aggregate the data. 
""" - deduplicate_on: Optional[Union[str, List[str]]] = None - y_cols: Optional[Union[str, List[str]]] = None - keep: Literal["first", "last"] = "first" - method: Literal["mean", "median"] = "median" + deduplicate_on: Optional[Union[str, List[str]]] = Field( + default=None, description="A subset of the columns to deduplicate on (can be default)." + ) + y_cols: Optional[Union[str, List[str]]] = Field(default=None, description="The columns to aggregate.") + keep: Literal["first", "last"] = Field( + default="first", description="Whether to keep the first or last copy of the duplicates." + ) + method: Literal["mean", "median"] = Field( + default="median", description="The method to aggregate the data." + ) def transform( self, diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index 3241bbf..9af9083 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -1,4 +1,5 @@ from typing import Dict, List, Literal, Optional, Union +from pydantic import Field import numpy as np import pandas as pd @@ -79,42 +80,37 @@ def discretize( class Discretization(BaseAction): """ Thresholding bioactivity columns to binary or multiclass labels. + """ - Args: - X : The data to discretize, element by element. - scipy.sparse matrices should be in CSR or CSC format to avoid an - un-necessary copy. - - thresholds: Interval boundaries that include the right bin edge. - - inplace: Set to True to perform inplace discretization and avoid a copy + input_column: str = Field(..., description="Column to be discretized.") + prefix: str = "CLS_" + thresholds: List[float] = Field(..., description="Interval boundaries that include the right bin edge.") + inplace: bool = Field( + default=False, + description="""Set to True to perform inplace discretization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR / CSC - matrix and if axis is 1). 
- - allow_nan: Set to True to allow nans in the array for discretization. Otherwise, - an error will be raised instead. - - label_order: The continuous values are discretized to labels 0, 1, 2, .., N with respect to given + matrix and if axis is 1).""", + ) + allow_nan: bool = Field( + default=True, + description="Set to True to allow nans in the array for discretization. Otherwise, an error will be raised instead.", + ) + label_order: Literal["ascending", "descending"] = Field( + default="ascending", + description="""The continuous values are discretized to labels 0, 1, 2, .., N with respect to given threshold bins [threshold_1, threshold_2,.., threshould_n]. When set to 'ascending', the class label is in ascending order with the threshold bins that `0` represents negative class or lower class, while 1, 2, 3 are for higher classes. When set to 'descending' the class label is in ascending order with the threshold bins. Sometimes the positive labels are on the left side of provided threshold. E.g. For binarization with threshold [0.5], the positive label is defined - by`X < 0.5`. In this case, `label_order` should be `descending`. - - log_scale: Whether visualize distribution in log scale. - See more in - - """ - - input_column: str - prefix: str = "CLS_" - thresholds: List[float] - inplace: bool = False - allow_nan: bool = True - label_order: Literal["ascending", "descending"] = "ascending" - log_scale: bool = False + by`X < 0.5`. In this case, `label_order` should be `descending`.""", + ) + log_scale: bool = Field( + default=False, + description="""Whether visualize distribution in log scale. 
+ See more in """, + ) def transform( self, diff --git a/auroris/curation/actions/_distribution.py b/auroris/curation/actions/_distribution.py index 9b0efd7..755ec8d 100644 --- a/auroris/curation/actions/_distribution.py +++ b/auroris/curation/actions/_distribution.py @@ -1,5 +1,5 @@ from typing import Dict, List, Optional, Sequence - +from pydantic import Field import pandas as pd from auroris.curation.actions._base import BaseAction @@ -10,18 +10,16 @@ class ContinuousDistributionVisualization(BaseAction): """ - Visualize a continuous distribution - - Args: - y_cols: List of columns for bioactivity for visualization - log_scale: Whether visualize distribution in log scale. - bins: The bin boundaries to color the area under the KDE curve. - + Visualize a continuous distribution. """ - y_cols: Optional[List[str]] = None - log_scale: bool = False - bins: Optional[Sequence[float]] = None + y_cols: Optional[List[str]] = Field( + default=None, description="List of columns for bioactivity for visualization." + ) + log_scale: bool = Field(default=False, description="Whether visualize distribution in log scale.") + bins: Optional[Sequence[float]] = Field( + default=None, description="The bin boundaries to color the area under the KDE curve." + ) def transform( self, diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index d2d7684..bfdc410 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -1,5 +1,6 @@ from functools import partial from typing import Dict, List, Optional, Tuple, Union +from pydantic import Field import datamol as dm import numpy as np @@ -25,23 +26,21 @@ def curate_molecules( count_stereoisomers: bool = True, count_stereocenters: bool = True, **parallelized_kwargs, -): +) -> Tuple: """ Curate a list of molecules. 
 Args: - mols: List of molecules - progress: Whether show curation progress - remove_salt_solvent: Whether remove salt and solvent from molecule - remove_stereo: Whether remove stereo chemistry information from molecule - count_stereoisomers: Whether count the number of stereoisomers of molecule - count_stereocenters: Whether count the number of stereocenters of molecule - parallelized_kwargs: Additional argument for the parallelizarion process. - See more about + mols: List of molecules. + progress: Whether show curation progress. + remove_salt_solvent: Whether remove salt and solvent from molecule. + remove_stereo: Whether remove stereo chemistry information from molecule. + count_stereoisomers: Whether count the number of stereoisomers of molecule. + count_stereocenters: Whether count the number of stereocenters of molecule. Returns: mol_dict: Dictionary of molecule and additional metadata - num_invalid: Number of invalid molecules + num_invalid: Number of invalid molecules """ fn = partial( @@ -237,25 +236,31 @@ def _num_stereo_centers(mol: dm.Mol) -> Tuple[int]: class MoleculeCuration(BaseAction): """ - Automated molecule curation and chemistry space distribution - - Args: - input_column: The name of the column that has the molecules (either `dm.Mol` objects or SMILES). - remove_salt_solvent: When set to 'True', all disconnected salts and solvents - will be removed from molecule. In most of the cases, it is recommended to remove the salts/solvents. - remove_stereo: Whether remove stereochemistry information from molecule. - If it's known that the stereochemistry do not contribute to the bioactivity of interest, - the stereochemistry information can be removed. + Automated molecule curation and chemistry space distribution. 
""" - input_column: str - prefix: str = "MOL_" - remove_salt_solvent: bool = True - remove_stereo: bool = False - count_stereoisomers: bool = True - count_stereocenters: bool = True - y_cols: Optional[List[str]] = None - fast: Optional[bool] = True + input_column: str = Field( + ..., description="The name of the column that has the molecules (either `dm.Mol` objects or SMILES)." + ) + prefix: str = Field(default="MOL_", description="Prefix for added column names") + remove_salt_solvent: bool = Field( + default=True, description="When set to 'True', all disconnected salts and solvents" + ) + remove_stereo: bool = Field( + default=False, + description="Whether remove stereochemistry information from molecule. If it's known that the stereochemistry do not contribute to the bioactivity of interest, the stereochemistry information can be removed.", + ) + count_stereoisomers: bool = Field( + default=True, description="Whether count the number of stereoisomers of molecule." + ) + count_stereocenters: bool = Field( + default=True, description="Whether count the number of stereocenter of molecule." + ) + y_cols: Optional[List[str]] = Field(default=None, description="Column names for bioactivities") + fast: Optional[bool] = Field( + default=True, + description="Whether compute molecule features with default ECFP for visualizing distribution in chemical space.", + ) def transform( self, diff --git a/auroris/curation/actions/_outlier.py b/auroris/curation/actions/_outlier.py index 0a008aa..4cbd572 100644 --- a/auroris/curation/actions/_outlier.py +++ b/auroris/curation/actions/_outlier.py @@ -133,18 +133,12 @@ def modified_zscore(data: np.ndarray, consistency_correction: float = 1.4826): class OutlierDetection(BaseAction): """ - Automatic detection of outliers - - Args: - method: Method name for outlier detection. - columns: Column names to detect outliers - prefix: Prefix for added column names - + Automatic detection of outliers. 
""" - method: OutlierDetectionMethod - columns: List[str] - prefix: str = "OUTLIER_" + method: OutlierDetectionMethod = Field(..., description="Method name for outlier detection.") + columns: List[str] = Field(..., description="Column names to detect outliers.") + prefix: str = Field(default="OUTLIER_", description="Prefix for added column names.") kwargs: Dict = Field(default_factory=dict) def transform( diff --git a/docs/index.md b/docs/index.md index e80fbeb..290df24 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,4 +1,37 @@ # Introduction -Welcome to the Auroris documentation! +Welcome to the Auroris - Simplifying Drug Discovery Data Curation +--- + +## What is Auroris? + +Auroris is a comprehensive Python library designed to assist researchers, scientists in managing, cleaning, and preparing data relevant to drug discovery. The library offers a range of features and techniques to handle diverse data types commonly encountered in drug discovery, including chemical structures, biological assays, and potentially to be extended to cell imaging data, genomic data, and clinical trial information. + + +## Where to next? +--- + +**:fontawesome-solid-rocket: Quickstart** + +Dive deeper into the Auroris code and learn about how to curate data for your ML-powered drug discovery program. + +[:material-arrow-right: Let's get started](./tutorials/getting_started.ipynb) + +--- + +**:fontawesome-solid-code: API Reference** + +Explore the technical documentation here to delve into the inner workings of the code. Gain insights into the intricate details of how different methods and classes function. + +[:material-arrow-right: Let's get started](./api/curator.md) + +--- + +**:fontawesome-solid-comments: Community** + +We're excited to have you join us in revolutionizing drug discovery data curation! 
Explore Polaris, provide feedback, share your use cases, and collaborate with us to enhance and expand the capabilities of Polaris for the benefit of the drug discovery community. + +[:material-arrow-right: Let's get started](https://discord.gg/vBFd8p6H7u) + +--- \ No newline at end of file diff --git a/docs/tutorials/getting_started.ipynb b/docs/tutorials/getting_started.ipynb index 4a03831..d3f1e09 100644 --- a/docs/tutorials/getting_started.ipynb +++ b/docs/tutorials/getting_started.ipynb @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "id": "976c5af4-2819-4cd6-8a57-072441eb9305", "metadata": { "editable": true, @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 12, "id": "a8f88963-f561-4cdc-8bfb-0a3045743e98", "metadata": { "editable": true, @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 13, "id": "5483a1fd-b22e-43e5-92f4-e6e8f4f3e629", "metadata": { "editable": true, @@ -142,7 +142,7 @@ " \n", " \n", " 0\n", - " <rdkit.Chem.rdchem.Mol object at 0x1570f9d90>\n", + " <rdkit.Chem.rdchem.Mol object at 0x288ce21f0>\n", " 1\n", " n-pentane\n", " -3.18\n", @@ -152,7 +152,7 @@ " \n", " \n", " 1\n", - " <rdkit.Chem.rdchem.Mol object at 0x1570f9ee0>\n", + " <rdkit.Chem.rdchem.Mol object at 0x288ce11c0>\n", " 2\n", " cyclopentane\n", " -2.64\n", @@ -162,7 +162,7 @@ " \n", " \n", " 2\n", - " <rdkit.Chem.rdchem.Mol object at 0x1570f9f50>\n", + " <rdkit.Chem.rdchem.Mol object at 0x288ce2c00>\n", " 3\n", " n-hexane\n", " -3.84\n", @@ -172,7 +172,7 @@ " \n", " \n", " 3\n", - " <rdkit.Chem.rdchem.Mol object at 0x1570f9fc0>\n", + " <rdkit.Chem.rdchem.Mol object at 0x288ce2340>\n", " 4\n", " 2-methylpentane\n", " -3.74\n", @@ -182,7 +182,7 @@ " \n", " \n", " 4\n", - " <rdkit.Chem.rdchem.Mol object at 0x1570fa030>\n", + " <rdkit.Chem.rdchem.Mol object at 0x288ce1ee0>\n", " 6\n", " 2,2-dimethylbutane\n", " -3.55\n", @@ -196,11 +196,11 @@ ], "text/plain": [ " 
mol ID NAME \\\n", - "0 1 n-pentane \n", - "1 2 cyclopentane \n", - "2 3 n-hexane \n", - "3 4 2-methylpentane \n", - "4 6 2,2-dimethylbutane \n", + "0 1 n-pentane \n", + "1 2 cyclopentane \n", + "2 3 n-hexane \n", + "3 4 2-methylpentane \n", + "4 6 2,2-dimethylbutane \n", "\n", " SOL SOL_classification smiles split \n", "0 -3.18 (A) low CCCCC train \n", @@ -210,7 +210,7 @@ "4 -3.55 (A) low CCC(C)(C)C train " ] }, - "execution_count": 4, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -242,10 +242,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 14, "id": "f9dbba41-2bd2-4321-b948-13877cee5b13", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-05-16 17:56:13.689\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mPerforming step: MoleculeCuration\u001b[0m\n", + "\u001b[32m2024-05-16 17:56:17.051\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mPerforming step: OutlierDetection\u001b[0m\n", + "\u001b[32m2024-05-16 17:56:17.079\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mPerforming step: Discretization\u001b[0m\n" + ] + } + ], "source": [ "from auroris.curation import Curator\n", "from auroris.curation.actions import MoleculeCuration, OutlierDetection, Discretization\n", @@ -274,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 15, "id": "98aaadd9-8c0f-412e-ab74-79b8f9a3d75a", "metadata": {}, "outputs": [ @@ -283,27 +293,32 @@ "output_type": "stream", "text": [ "\u001b[31;1m===== Curation Report =====\u001b[0m\n", - "\u001b[38;20mTime: 2024-05-02 13:20:55\u001b[0m\n", - "\u001b[38;20mVersion: dev\u001b[0m\n", + 
"\u001b[34;1mTime: 2024-05-16 17:56:13\u001b[0m\n", + "\u001b[34;1mVersion: dev\u001b[0m\n", "\u001b[34;1m===== MoleculeCuration =====\u001b[0m\n", - "\u001b[38;20m[LOG]: Couldn't preprocess 18 / 1282 molecules.\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_smiles\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_molhash_id\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_molhash_id_no_stereo\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_num_stereoisomers\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_num_undefined_stereoisomers\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_num_defined_stereo_center\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_num_undefined_stereo_center\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_num_stereo_center\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_undefined_E_D\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: MOL_undefined_E/Z\u001b[0m\n", - "\u001b[38;20m[IMG]: Dimensions 1200 x 600\u001b[0m\n", - "\u001b[38;20m[IMG]: Dimensions None x None\u001b[0m\n", + "\u001b[34;1m[LOG]: Couldn't preprocess 18 / 1282 molecules.\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_smiles\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_molhash_id\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_molhash_id_no_stereo\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_num_stereoisomers\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_num_undefined_stereoisomers\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_num_defined_stereo_center\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_num_undefined_stereo_center\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_num_stereo_center\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_undefined_E_D\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: MOL_undefined_E/Z\u001b[0m\n", + "\u001b[34;1m[LOG]: Default `ecfp` fingerprint is used to 
compute the distributionin chemical space.\u001b[0m\n", + "\u001b[34;1m[LOG]: Molecules with undefined stereocenter detected: 253.\u001b[0m\n", + "\u001b[34;1m[IMG]: Dimensions 1200 x 600\u001b[0m\n", + "\u001b[34;1m[IMG]: Dimensions 1200 x 2400\u001b[0m\n", "\u001b[34;1m===== OutlierDetection =====\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: OUTLIER_SOL\u001b[0m\n", - "\u001b[38;20m[IMG]: Dimensions 2400 x 600\u001b[0m\n", + "\u001b[34;1m[LOG]: New column added: OUTLIER_SOL\u001b[0m\n", + "\u001b[34;1m[LOG]: Found 7 potential outliers with respect to the SOL column for review.\u001b[0m\n", + "\u001b[34;1m[IMG]: Dimensions 640 x 480\u001b[0m\n", "\u001b[34;1m===== Discretization =====\u001b[0m\n", - "\u001b[38;20m[LOG]: New column added: CLS_SOL\u001b[0m\n" + "\u001b[34;1m[LOG]: New column added: CLS_SOL\u001b[0m\n", + "\u001b[34;1m[IMG]: Dimensions 1200 x 600\u001b[0m\n", + "\u001b[31;1m===== Curation Report END =====\u001b[0m\n" ] } ], @@ -324,14 +339,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "9b896b12-fbae-4b7b-b62a-f2d2d15075c1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/lu.zhu/Documents/Codebase/ValenceLab/auroris/docs/tutorials/test/index.html'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from auroris.report.broadcaster import HTMLBroadcaster\n", "\n", - "broadcaster = HTMLBroadcaster(report, \"/path/to/broadcaster\")\n", + "broadcaster = HTMLBroadcaster(report= report, \n", + " destination=\"/Users/lu.zhu/Documents/Codebase/ValenceLab/auroris/docs/tutorials/test\", \n", + " embed_images=True)\n", "broadcaster.broadcast()" ] }, @@ -345,14 +373,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 17, "id": "97be9d29-03eb-4eb7-b9c0-ac84413f6dca", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "mol \n", + "mol \n", "ID 1\n", "NAME n-pentane\n", "SOL 
-3.18\n", @@ -369,12 +397,12 @@ "MOL_num_stereo_center 0.0\n", "MOL_undefined_E_D False\n", "MOL_undefined_E/Z 0\n", - "OUTLIER_SOL 0.0\n", + "OUTLIER_SOL False\n", "CLS_SOL 0.0\n", "Name: 0, dtype: object" ] }, - "execution_count": 8, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -400,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 18, "id": "f5044a09-c34f-4888-a5ac-65fb62225129", "metadata": { "editable": true, @@ -412,9 +440,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -443,6 +471,12 @@ "id": "e5b79da1", "metadata": {}, "source": [] + }, + { + "cell_type": "markdown", + "id": "b020ee56", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/mkdocs.yml b/mkdocs.yml index ee47e12..247c546 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,7 +27,6 @@ nav: - Stereoisomer AC: api/actions/stereo_ac.md - Functional: api/functional.md - Visualization: api/visualization.md - - Types: api/types.md - Community: https://discord.gg/vBFd8p6H7u - Polaris Hub: https://polarishub.io/ @@ -99,6 +98,8 @@ plugins: separate_signature: true show_signature_annotations: true line_length: 80 + extensions: + - griffe_fieldz: {include_inherited: true} # support pydantic data-class - mkdocs-jupyter: execute: False remove_tag_config: From 021c11e7d3a11bd2efcb70585d224f1e7bb049a8 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 22:52:30 -0400 Subject: [PATCH 14/39] update css --- auroris/curation/_curator.py | 5 ++++- .../css/{custom-alchemy.css => custom-auroris.css} | 14 +++++++------- mkdocs.yml | 1 + 3 files changed, 12 insertions(+), 8 deletions(-) rename docs/assets/css/{custom-alchemy.css => custom-auroris.css} (58%) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index de571d2..f4407d2 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -112,7 +112,10 @@ def to_json(self, path: str): path: The destination to save to """ serialization = self.model_dump(exclude="steps") - # # save steps in defined order + # remove data_path + if self.data_path is None: + serialization.pop("data_path") + # save steps in defined order serialization["steps"] = [{step.name: step.model_dump()} for step in self.steps] with fsspec.open(path, "w") as f: json.dump(serialization, f) diff --git a/docs/assets/css/custom-alchemy.css b/docs/assets/css/custom-auroris.css similarity index 58% rename from docs/assets/css/custom-alchemy.css rename to docs/assets/css/custom-auroris.css index 
bbac249..6fbf63d 100644 --- a/docs/assets/css/custom-alchemy.css +++ b/docs/assets/css/custom-auroris.css @@ -4,25 +4,25 @@ For a list of all available variables, see https://github.com/squidfunk/mkdocs-material/blob/master/src/assets/stylesheets/main/_colors.scss */ - --polaris-primary: hsla(236, 100%, 19%, 1.0); - --polaris-secondary: hsla(290, 61%, 43%, 1.0); - --polaris-ternary: hsla(236, 100%, 9%, 1.0); + --auroris-primary: rgb(36, 82, 97); + --auroris-secondary: rgb(70, 201, 190); + --auroris-ternary: rgb(0, 61, 94); } /* Change the header background to use a gradient */ .md-header { - background-image: linear-gradient(to right, var(--polaris-secondary), var(--polaris-primary)); + background-image: linear-gradient(to right, var(--auroris-secondary), var(--auroris-primary)); } /* Change the footer background to use a gradient */ .md-footer { - background-image: linear-gradient(to right, var(--polaris-primary), var(--polaris-ternary)); + background-image: linear-gradient(to right, var(--auroris-primary), var(--auroris-ternary)); } /* Change the tabs background to use a gradient */ .md-tabs { - background-image: linear-gradient(to right, #F4F6F9, #dfc3e2); - color: var(--polaris-ternary); + background-image: linear-gradient(to right, #F4F6F9, #d7f2c3); + color: var(--auroris-ternary); } /* Remove the `In` and `Out` block in rendered Jupyter notebooks */ diff --git a/mkdocs.yml b/mkdocs.yml index 247c546..52aec8a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - Stereoisomer AC: api/actions/stereo_ac.md - Functional: api/functional.md - Visualization: api/visualization.md + - Types: api/types.md - Community: https://discord.gg/vBFd8p6H7u - Polaris Hub: https://polarishub.io/ From d73f6850a25c5c961ba118b5e3e7be308ea2dd63 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 23:09:37 -0400 Subject: [PATCH 15/39] minor change --- auroris/report/broadcaster/_html.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 6e5c7f9..3ead388 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -68,11 +68,11 @@ def broadcast(self): src = f"data:image/png;base64,{image_data}" else: # Save as separate file - filename = ( - f"{re.sub(r'[^\w\-\.]', '_', image.title)}.png" - if image.title is not None - else f"{image_counter}.png" - ) + if image.title: + filename = f"{re.sub(r'[^\w\-\.]', '_', image.title)}.png" + else: + filename = f"{image_counter}.png" + path = dm.fs.join(self._image_dir, filename) save_image(image.image, path, self._destination) src = path2url(path, self._destination) From 085da1cd6a2ddb10eb27c7fe01a33de2cffaf89f Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 23:19:55 -0400 Subject: [PATCH 16/39] minor fix --- auroris/curation/_curator.py | 2 -- auroris/report/broadcaster/_html.py | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index f4407d2..dc1449b 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -119,6 +119,4 @@ def to_json(self, path: str): serialization["steps"] = [{step.name: step.model_dump()} for step in self.steps] with fsspec.open(path, "w") as f: json.dump(serialization, f) - # with fsspec.open(path, "w") as f: - # json.dump(self.model_dump(), f) return path diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 3ead388..4897be0 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -68,12 +68,8 @@ def broadcast(self): src = f"data:image/png;base64,{image_data}" else: # Save as separate file - if image.title: - filename = f"{re.sub(r'[^\w\-\.]', '_', image.title)}.png" - else: - filename = f"{image_counter}.png" - - path = dm.fs.join(self._image_dir, filename) + filename = re.sub(r"[^\w\-\.]", "_", image.title) if image.title else 
image_counter + path = dm.fs.join(self._image_dir, f"{filename}.png") save_image(image.image, path, self._destination) src = path2url(path, self._destination) From 8454d5ef7f9a9d84ae8f61302dc48523c26f4ff7 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 16 May 2024 23:27:07 -0400 Subject: [PATCH 17/39] add dep --- env.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/env.yml b/env.yml index 52c9fd7..e2c8206 100644 --- a/env.yml +++ b/env.yml @@ -45,3 +45,7 @@ dependencies: - mdx_truly_sane_lists - nbconvert - mike >=1.0.0 + + - pip + - pip: + - griffe_fieldz \ No newline at end of file From 9e75e011c49de7e80af5fbd2298cad0387770c8f Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 10:02:58 -0400 Subject: [PATCH 18/39] Update auroris/curation/_curator.py Co-authored-by: Cas Wognum --- auroris/curation/_curator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index dc1449b..dab661e 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -53,8 +53,8 @@ def _validate_data_path(cls, value: Union[str, PathLike]): return value except Exception: raise ValueError( - f"Dataset cann't be loaded by `panda.read_csv('{value}')`." - f"Consider to directly pass the loaded the data to `Curator.curate()`." + f"Dataset can't be loaded by `pandas.read_csv('{value}')`." + f"Consider passing the DataFrame directly to `Curator.curate(dataset=...)`." 
) def _load_data(self): From 53ad1da7389fd882bd46f5d5494c0e274618f30f Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 10:09:59 -0400 Subject: [PATCH 19/39] Update auroris/curation/actions/_mol.py Co-authored-by: Cas Wognum --- auroris/curation/actions/_mol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index bfdc410..750e821 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -14,7 +14,7 @@ try: from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer -except Exception: +except ImportError: PretrainedHFTransformer = None From ecdf78596bcf16e526455a5e5a325348a3a0aaa2 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 10:10:44 -0400 Subject: [PATCH 20/39] Update docs/index.md Co-authored-by: Cas Wognum --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 290df24..062fa4f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,7 +14,7 @@ Auroris is a comprehensive Python library designed to assist researchers, scient **:fontawesome-solid-rocket: Quickstart** -Dive deeper into the Auroris code and learn about how to curate data for your ML-powered drug discovery program. +Dive deeper into the Auroris code and learn how to curate data for your ML-powered drug discovery program. 
[:material-arrow-right: Let's get started](./tutorials/getting_started.ipynb) From 33569ebb7428166133f0dc24bae25dfba289f819 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 10:18:27 -0400 Subject: [PATCH 21/39] Update auroris/visualization/_distribution.py Co-authored-by: Cas Wognum --- auroris/visualization/_distribution.py | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/auroris/visualization/_distribution.py b/auroris/visualization/_distribution.py index 6ae6883..0ca674b 100644 --- a/auroris/visualization/_distribution.py +++ b/auroris/visualization/_distribution.py @@ -106,23 +106,23 @@ def visualize_distribution_with_outliers( values = values[sorted_ind] is_outlier = is_outlier[sorted_ind] - fig = plt.figure() - res = stats.probplot(values, dist="norm", plot=plt, fit=True) - x = res[0][0] - y = res[0][1] - - # Specify the indices of data points to highlight - highlight_indices = np.argwhere(is_outlier.__eq__(True)).flatten() - highlight_color = "red" - - # Overlay specific points with different colors - for idx in highlight_indices: - plt.plot( - x[idx], y[idx], marker="o", markersize=8, color=highlight_color - ) # Red circles for highlighted points - - plt.xlabel("Theoretical quantiles") - plt.ylabel("Ordered Values") - plt.title(title) + with create_figure(n_plots=1) as (fig, axes): + res = stats.probplot(values, dist="norm", plot=plt, fit=True, plot=axes[0]) + x = res[0][0] + y = res[0][1] + + # Specify the indices of data points to highlight + highlight_indices = np.argwhere(is_outlier.__eq__(True)).flatten() + highlight_color = "red" + + # Overlay specific points with different colors + for idx in highlight_indices: + ax.plot( + x[idx], y[idx], marker="o", markersize=8, color=highlight_color + ) # Red circles for highlighted points + + ax.xlabel("Theoretical quantiles") + ax.ylabel("Ordered Values") + ax.title(title) return fig From 83cf9492bdbf6497504f1c7fe1f4b28a500081be Mon Sep 17 00:00:00 2001 
From: Lu Zhu Date: Fri, 17 May 2024 10:50:50 -0400 Subject: [PATCH 22/39] Update docs/index.md Co-authored-by: Cas Wognum --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 062fa4f..36ae5be 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ Welcome to the Auroris - Simplifying Drug Discovery Data Curation ## What is Auroris? -Auroris is a comprehensive Python library designed to assist researchers, scientists in managing, cleaning, and preparing data relevant to drug discovery. The library offers a range of features and techniques to handle diverse data types commonly encountered in drug discovery, including chemical structures, biological assays, and potentially to be extended to cell imaging data, genomic data, and clinical trial information. +Auroris is a comprehensive Python library designed to assist researchers and scientists in managing, cleaning, and preparing data relevant to drug discovery. Our mission is to implement a range of techniques to handle, transform, filter, analyze, or visualize the diverse data types commonly encountered in drug discovery. ## Where to next? 
From 26f0a6f529c085da4622bdc3fc9ada6e80550d95 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 13:20:22 -0400 Subject: [PATCH 23/39] update test_curator_save_load --- tests/test_curator.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_curator.py b/tests/test_curator.py index 221b934..d9152df 100644 --- a/tests/test_curator.py +++ b/tests/test_curator.py @@ -13,17 +13,21 @@ def test_curator_save_load(tmpdir): curator = Curator( steps=[ - OutlierDetection(method="zscore", columns=["outlier_column"]), MoleculeCuration(input_column="smiles"), + OutlierDetection(method="zscore", columns=["outlier_column"]), ], ) path = os.path.join(tmpdir, "curator.json") + curator.to_json(path) - curator.from_json(path) + curator_reload = curator.from_json(path) + + assert len(curator.steps) == len(curator_reload.steps) + for step1, step2 in zip(curator.steps, curator_reload.steps): + assert step1 == step2 - assert len(curator.steps) == 2 - assert curator.steps[0].method == "zscore" - assert curator.steps[0].columns == ["outlier_column"] + assert curator.steps[1].method == "zscore" + assert curator.steps[1].columns == ["outlier_column"] def test_curator_integration(dataset, tmpdir): From 3a3b5e861590ced85bd1f7c39b1643944e510506 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 13:25:54 -0400 Subject: [PATCH 24/39] minor fix --- auroris/visualization/_distribution.py | 35 +++++++++++++------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/auroris/visualization/_distribution.py b/auroris/visualization/_distribution.py index 0ca674b..39039d3 100644 --- a/auroris/visualization/_distribution.py +++ b/auroris/visualization/_distribution.py @@ -3,7 +3,6 @@ import numpy as np import seaborn as sns from scipy import stats -import matplotlib.pyplot as plt from auroris.visualization.utils import create_figure @@ -107,22 +106,22 @@ def visualize_distribution_with_outliers( is_outlier = is_outlier[sorted_ind] 
with create_figure(n_plots=1) as (fig, axes): - res = stats.probplot(values, dist="norm", plot=plt, fit=True, plot=axes[0]) - x = res[0][0] - y = res[0][1] - - # Specify the indices of data points to highlight - highlight_indices = np.argwhere(is_outlier.__eq__(True)).flatten() - highlight_color = "red" - - # Overlay specific points with different colors - for idx in highlight_indices: - ax.plot( - x[idx], y[idx], marker="o", markersize=8, color=highlight_color - ) # Red circles for highlighted points - - ax.xlabel("Theoretical quantiles") - ax.ylabel("Ordered Values") - ax.title(title) + res = stats.probplot(values, dist="norm", fit=True, plot=axes[0]) + x = res[0][0] + y = res[0][1] + + # Specify the indices of data points to highlight + highlight_indices = np.argwhere(is_outlier.__eq__(True)).flatten() + highlight_color = "red" + + # Overlay specific points with different colors + for idx in highlight_indices: + axes[0].plot( + x[idx], y[idx], marker="o", markersize=8, color=highlight_color + ) # Red circles for highlighted points + + axes[0].set_xlabel("Theoretical quantiles") + axes[0].set_ylabel("Ordered Values") + axes[0].set_title(title) return fig From 78c731db1c922e380b149f3a124dba9458c7f8a1 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Fri, 17 May 2024 15:46:34 -0400 Subject: [PATCH 25/39] Update auroris/report/broadcaster/_logger.py Co-authored-by: Cas Wognum --- auroris/report/broadcaster/_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auroris/report/broadcaster/_logger.py b/auroris/report/broadcaster/_logger.py index 317f740..26aa89d 100644 --- a/auroris/report/broadcaster/_logger.py +++ b/auroris/report/broadcaster/_logger.py @@ -33,7 +33,7 @@ class LoggerBroadcaster(ReportBroadcaster): def __init__(self, report: CurationReport): super().__init__(report) - self.logger = logging.getLogger() + self.logger = logging.getLogger(self.__class__.__name__) # Lu: debug level might log other irrelevant debugging logs 
self.logger.setLevel(logging.INFO) From ed0c7f6f7906853f146d24f97b66c38579030984 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 00:38:10 -0400 Subject: [PATCH 26/39] curator save/load --- auroris/curation/_curator.py | 25 +++++++++++-------------- auroris/curation/actions/_base.py | 3 ++- tests/test_curator.py | 4 ++-- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index dab661e..0d94beb 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -23,7 +23,7 @@ class Curator(BaseModel): # This is the recommended way to add all subclasses in the type. # See e.g. https://github.com/pydantic/pydantic/issues/2200 # and https://github.com/pydantic/pydantic/issues/2036 - data_path: Optional[Union[str, PathLike]] = Field( + src_dataset_path: Optional[Union[str, PathLike]] = Field( default=None, description="Data path. The data must be loadable by `pd.read_csv` with default parameters.", ) @@ -46,8 +46,8 @@ def _validate_verbosity(cls, v): def _serialize_verbosity(self, value: VerbosityLevel): return value.name - @field_validator("data_path", mode="before") - def _validate_data_path(cls, value: Union[str, PathLike]): + @field_validator("src_dataset_path", mode="before") + def _validate_src_dataset_path(cls, value: Union[str, PathLike]): try: pd.read_csv(value, nrows=5) return value @@ -58,14 +58,14 @@ def _validate_data_path(cls, value: Union[str, PathLike]): ) def _load_data(self): - return pd.read_csv(self.data_path) + return pd.read_csv(self.src_dataset_path) def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]: if dataset is None: dataset = self._load_data() report = CurationReport() - dataset = dataset.copy() + dataset = dataset.copy(deep=True) action: BaseAction for action in self.steps: @@ -101,22 +101,19 @@ def from_json(cls, path: str): with fsspec.open(path, "r") as f: data = json.load(f) - steps = 
[cls._get_action(name)(**args) for step in data["steps"] for name, args in step.items()] - data["steps"] = steps + data["steps"] = [cls._get_action(step["name"]).model_validate(step) for step in data["steps"]] return cls.model_validate(data) def to_json(self, path: str): """Saves the curation workflow to a JSON file. Args: - path: The destination to save to + path: The destination to save to. """ - serialization = self.model_dump(exclude="steps") - # remove data_path - if self.data_path is None: - serialization.pop("data_path") - # save steps in defined order - serialization["steps"] = [{step.name: step.model_dump()} for step in self.steps] + serialization = self.model_dump() + # remove src_dataset_path if unavailable + if self.src_dataset_path is None: + serialization.pop("src_dataset_path") with fsspec.open(path, "w") as f: json.dump(serialization, f) return path diff --git a/auroris/curation/actions/_base.py b/auroris/curation/actions/_base.py index 8df6ca9..9cbc908 100644 --- a/auroris/curation/actions/_base.py +++ b/auroris/curation/actions/_base.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict, Optional import pandas as pd -from pydantic import BaseModel, model_validator, Field +from pydantic import BaseModel, model_validator, Field, computed_field from auroris.types import VerbosityLevel @@ -20,6 +20,7 @@ class BaseAction(BaseModel, abc.ABC): prefix: str = Field(default=None, description="If the action adds columns, use this prefix.") + @computed_field @property def name(self) -> str: """The name of the action. 
Needs to be unique.""" diff --git a/tests/test_curator.py b/tests/test_curator.py index d9152df..5a8b0c1 100644 --- a/tests/test_curator.py +++ b/tests/test_curator.py @@ -17,8 +17,8 @@ def test_curator_save_load(tmpdir): OutlierDetection(method="zscore", columns=["outlier_column"]), ], ) - path = os.path.join(tmpdir, "curator.json") - + path = os.path.join(tmpdir, "curator_lu.json") + print(path) curator.to_json(path) curator_reload = curator.from_json(path) From 499eb2604e432df7460378fceab6a656166fe76a Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 00:39:40 -0400 Subject: [PATCH 27/39] avoid ipython image --- auroris/curation/actions/_mol.py | 24 ++++++++++-------------- auroris/utils.py | 14 ++++++++------ 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index 750e821..790c7e2 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -297,19 +297,12 @@ def transform( smiles_col = self.get_column_name("smiles") smiles = dataset[smiles_col].dropna().values - if PretrainedHFTransformer and not self.fast: - featurizer = "ChemBERTa-77M-MTR" - transformer = PretrainedHFTransformer(kind=featurizer, notation="smiles", dtype=float) - X = transformer(smiles) - report.log( - "`ChemBERTa-77M-MTR` embedding is used to compute the distributionin chemical space." - ) - else: - featurizer = "ECFP" - with dm.without_rdkit_log(): - # Temporary disable logs because of deprecation warning - X = np.array([dm.to_fp(smi) for smi in smiles]) - report.log("Default `ecfp` fingerprint is used to compute the distributionin chemical space.") + # Lu: User can call visulize_chemspace for the customized molecular features. 
+ featurizer = "ECFP" + with dm.without_rdkit_log(): + # Temporary disable logs because of deprecation warning + X = np.array([dm.to_fp(smi) for smi in smiles]) + report.log("Default `ecfp` fingerprint is used to compute the distributionin chemical space.") # list of data per column y = dataset[self.y_cols].T.values.tolist() if self.y_cols else None @@ -334,7 +327,10 @@ def transform( defined = row[defined_col] legends.append(f"Undefined:{undefined}\n Definded:{defined}") - image = dm.to_image(to_plot[smiles_col].tolist(), legends=legends, use_svg=False) + # returnPNG to avoid ipythonImage + image = dm.to_image( + to_plot[smiles_col].tolist(), legends=legends, use_svg=False, returnPNG=True + ) report.log_image( image, title="Molecules with undefined stereocenters", diff --git a/auroris/utils.py b/auroris/utils.py index 41d28fb..a2e705b 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -1,11 +1,11 @@ import os from io import BytesIO +from typing import ByteString import numpy as np from matplotlib.figure import Figure from PIL import Image from PIL.Image import Image as ImageType -from IPython.core.display import Image as IpythonImage import fsspec from sklearn.utils.multiclass import type_of_target @@ -34,11 +34,6 @@ def fig2img(fig: Figure) -> ImageType: ) -def ipyimg2img(fig: IpythonImage) -> ImageType: - """Convert Ipython image to PIL image""" - return Image.open(BytesIO(fig.data)) - - def img2bytes(image: ImageType): """Convert png image to bytes""" image_bytes = BytesIO() @@ -47,6 +42,13 @@ def img2bytes(image: ImageType): return image_bytes +def bytes2imf(image_bytes: ByteString): + image_stream = BytesIO(image_bytes) + # Open the image using PIL + image = Image.open(image_stream) + return image + + def path2url(path: str, destination: str): """ Convert path to an local or remote url for html report. 
From d90123831c443c7d571e99fcfcd3e0106dc47609 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 00:40:18 -0400 Subject: [PATCH 28/39] minor viz fix --- auroris/visualization/_chemspace.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/auroris/visualization/_chemspace.py b/auroris/visualization/_chemspace.py index e662a22..35dba00 100644 --- a/auroris/visualization/_chemspace.py +++ b/auroris/visualization/_chemspace.py @@ -12,7 +12,7 @@ def visualize_chemspace( - X: Union[List[np.ndarray], np.ndarray], + X: np.ndarray, y: Optional[Union[List[np.ndarray], np.ndarray]] = None, labels: Optional[List[str]] = None, n_cols: int = 2, @@ -22,10 +22,10 @@ def visualize_chemspace( seaborn_theme: Optional[str] = "whitegrid", **umap_kwargs: Any, ): - """Plot the chemical space. Also, color based on the target values. + """Plot the coverage in chemical space. Also, color based on the target values. Args: - X: A list of arrays with the features. + X: Array the molecular features. y: A list of arrays with the target values. labels: Optional list of labels for each set of features. n_cols: Number of columns in the subplots. 
@@ -40,7 +40,7 @@ def visualize_chemspace( raise ImportError("Please run `pip install umap-learn` to use UMAP visualizations for the chemspace.") if isinstance(y, np.ndarray): - y = list(y) + y = [y] if y is None: y = [None] @@ -72,5 +72,4 @@ def visualize_chemspace( ax.set_xlabel("Component 0") ax.set_xlabel("Component 1") ax.set_title(label) - return fig From 97c9bd3038fbe727547138ff5d322e92c839635c Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 00:43:44 -0400 Subject: [PATCH 29/39] wip --- auroris/report/_report.py | 11 +++++------ auroris/utils.py | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/auroris/report/_report.py b/auroris/report/_report.py index ef8d685..4024982 100644 --- a/auroris/report/_report.py +++ b/auroris/report/_report.py @@ -5,11 +5,10 @@ from matplotlib import pyplot as plt from matplotlib.figure import Figure from PIL.Image import Image as ImageType -from IPython.core.display import Image as IpythonImage from pydantic import BaseModel, ConfigDict, Field, PrivateAttr from auroris import __version__ -from auroris.utils import fig2img, ipyimg2img +from auroris.utils import fig2img, bytes2img class AnnotatedImage(BaseModel): @@ -72,17 +71,17 @@ def log_new_column(self, name: str): def log_image( self, - image_or_figure: Union[ImageType, Figure, ByteString, IpythonImage], + image_or_figure: Union[ImageType, Figure, ByteString], title: Optional[str] = None, description: Optional[str] = None, ): """Logs an image. 
Also accepts Matplotlib figures, which will be converted to images.""" self._check_active_section() - if isinstance(image_or_figure, IpythonImage): - image = ipyimg2img(image_or_figure) - elif isinstance(image_or_figure, Figure): + if isinstance(image_or_figure, Figure): image = fig2img(image_or_figure) plt.close(image_or_figure) + elif isinstance(image_or_figure, ByteString): + image = bytes2img(image_or_figure) else: image = image_or_figure diff --git a/auroris/utils.py b/auroris/utils.py index a2e705b..ba4bfdc 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -42,7 +42,8 @@ def img2bytes(image: ImageType): return image_bytes -def bytes2imf(image_bytes: ByteString): +def bytes2img(image_bytes: ByteString): + """Convert bytes to PIL image""" image_stream = BytesIO(image_bytes) # Open the image using PIL image = Image.open(image_stream) From ed7a16e23cb6f255b2864700d5b578be88b317ea Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 00:44:13 -0400 Subject: [PATCH 30/39] refactor image name --- auroris/report/broadcaster/_html.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 4897be0..1a78e45 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -68,7 +68,9 @@ def broadcast(self): src = f"data:image/png;base64,{image_data}" else: # Save as separate file - filename = re.sub(r"[^\w\-\.]", "_", image.title) if image.title else image_counter + # add image title to the file name. 
(Replace space, slash, dot by hyphen) + filename = re.sub(r"[ ./]", "_", image.title) if image.title else "" + filename = "-".join([str(image_counter), filename]) path = dm.fs.join(self._image_dir, f"{filename}.png") save_image(image.image, path, self._destination) src = path2url(path, self._destination) From e0699c530d0fe1a252301a2b7efd7dc3cca2c735 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 11:23:11 -0400 Subject: [PATCH 31/39] Update docs/index.md Co-authored-by: Cas Wognum --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 36ae5be..8b80e7b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -30,7 +30,7 @@ Explore the technical documentation here to delve into the inner workings of the **:fontawesome-solid-comments: Community** -We're excited to have you join us in revolutionizing drug discovery data curation! Explore Polaris, provide feedback, share your use cases, and collaborate with us to enhance and expand the capabilities of Polaris for the benefit of the drug discovery community. +We're excited to have you join us in revolutionizing drug discovery data curation! Explore Auroris and the broader Polaris ecosystem it is part of, provide feedback, share your use cases, and collaborate with us to enhance and expand the capabilities of Auroris for the benefit of the drug discovery community. 
[:material-arrow-right: Let's get started](https://discord.gg/vBFd8p6H7u) From b41793535441c7b1de8e7af9aedae4f3503615ba Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 13:54:15 -0400 Subject: [PATCH 32/39] Update auroris/utils.py Co-authored-by: Cas Wognum --- auroris/utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/auroris/utils.py b/auroris/utils.py index ba4bfdc..04ddd37 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -50,18 +50,18 @@ def bytes2img(image_bytes: ByteString): return image -def path2url(path: str, destination: str): +def _img_to_html_src(self, path: str): """ - Convert path to an local or remote url for html report. - Currently, only GCP is supported. + Convert a path to a corresponding `src` attribute for an `` tag. + Currently only supports GCP and local paths. """ - if not os.path.isfile(path): - if path.startswith("gs://"): - return path.replace("gs://", "https://storage.googleapis.com/") - else: - raise ValueError("Only GCP path is supported.") - else: - return os.path.relpath(path, destination) + protocol = dm.utils.fs.get_protocol(path) + if protocol == "gs": + return path.replace("gs://", "https://storage.googleapis.com/") + elif protocol == "file": + return os.path.relpath(path, self._destination) + else: + raise ValueError("We only support images hosted in GCP or locally") def save_image(image: ImageType, path: str, destination: str): From c50af5b88f5ab282c2a48a63e76e77c6c3ced685 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 13:59:54 -0400 Subject: [PATCH 33/39] fix attribute discriminator --- auroris/curation/_curator.py | 29 +++++++++----------- auroris/curation/actions/_ac_stereoisomer.py | 3 +- auroris/curation/actions/_base.py | 8 +----- auroris/curation/actions/_deduplicate.py | 1 + auroris/curation/actions/_discretize.py | 1 + auroris/curation/actions/_distribution.py | 3 +- auroris/curation/actions/_mol.py | 21 +++++++------- 
auroris/curation/actions/_outlier.py | 1 + 8 files changed, 32 insertions(+), 35 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index 0d94beb..6ce27dc 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -1,5 +1,5 @@ import json -from typing import List, Tuple, Union, Optional +from typing import List, Tuple, Union, Optional, Annotated from os import PathLike import fsspec @@ -7,7 +7,7 @@ from loguru import logger from pydantic import BaseModel, Field, field_serializer, field_validator -from auroris.curation.actions._base import ACTION_REGISTRY, BaseAction +from auroris.curation.actions import BaseAction from auroris.report import CurationReport from auroris.types import VerbosityLevel @@ -28,11 +28,16 @@ class Curator(BaseModel): description="Data path. The data must be loadable by `pd.read_csv` with default parameters.", ) - steps: List[Union[tuple(ACTION_REGISTRY)]] = Field( - ..., - discriminator="name", - description="List of curation actions. Check all the available action .", - ) + steps: List[ + Annotated[ + Union[tuple(BaseAction.__subclasses__())], + Field( + ..., + discriminator="name", + description="List of curation actions. Check all the available action .", + ), + ] + ] verbosity: VerbosityLevel = VerbosityLevel.NORMAL parallelized_kwargs: dict = Field(default_factory=dict) @@ -84,13 +89,6 @@ def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFram def __call__(self, dataset): return self.transform(dataset) - @classmethod - def _get_action(cls, name: str): - for action in ACTION_REGISTRY: - if action.__name__ == name: - return action - return None - @classmethod def from_json(cls, path: str): """Loads a curation workflow from a JSON file. 
@@ -101,8 +99,7 @@ def from_json(cls, path: str): with fsspec.open(path, "r") as f: data = json.load(f) - data["steps"] = [cls._get_action(step["name"]).model_validate(step) for step in data["steps"]] - return cls.model_validate(data) + return cls(**data) def to_json(self, path: str): """Saves the curation workflow to a JSON file. diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index c1a4bb0..006d4b9 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Literal from pydantic import Field import datamol as dm @@ -76,6 +76,7 @@ class StereoIsomerACDetection(BaseAction): ) prefix: str = Field(default="AC_", description="Prefix for the adding columns.") mol_col: str = Field(default="MOL_smiles", description="Column for molecule strings.") + name: Literal["ac_stereoisomer"] = "ac_stereoisomer" def transform( self, diff --git a/auroris/curation/actions/_base.py b/auroris/curation/actions/_base.py index 9cbc908..f90fb37 100644 --- a/auroris/curation/actions/_base.py +++ b/auroris/curation/actions/_base.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict, Optional import pandas as pd -from pydantic import BaseModel, model_validator, Field, computed_field +from pydantic import BaseModel, model_validator, Field from auroris.types import VerbosityLevel @@ -20,12 +20,6 @@ class BaseAction(BaseModel, abc.ABC): prefix: str = Field(default=None, description="If the action adds columns, use this prefix.") - @computed_field - @property - def name(self) -> str: - """The name of the action. 
Needs to be unique.""" - return self.__class__.__name__ - @model_validator(mode="after") @classmethod def _validate_model(cls, m: "BaseAction"): diff --git a/auroris/curation/actions/_deduplicate.py b/auroris/curation/actions/_deduplicate.py index bd18e50..fe7bcda 100644 --- a/auroris/curation/actions/_deduplicate.py +++ b/auroris/curation/actions/_deduplicate.py @@ -66,6 +66,7 @@ class Deduplication(BaseAction): method: Literal["mean", "median"] = Field( default="median", description="The method to aggregate the data." ) + name: Literal["deduplicate"] = "deduplicate" def transform( self, diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index 9af9083..548692d 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -111,6 +111,7 @@ class Discretization(BaseAction): description="""Whether visualize distribution in log scale. See more in """, ) + name: Literal["discretize"] = "discretize" def transform( self, diff --git a/auroris/curation/actions/_distribution.py b/auroris/curation/actions/_distribution.py index 755ec8d..4422f73 100644 --- a/auroris/curation/actions/_distribution.py +++ b/auroris/curation/actions/_distribution.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Sequence +from typing import Dict, List, Optional, Sequence, Literal from pydantic import Field import pandas as pd @@ -20,6 +20,7 @@ class ContinuousDistributionVisualization(BaseAction): bins: Optional[Sequence[float]] = Field( default=None, description="The bin boundaries to color the area under the KDE curve." 
) + name: Literal["distribution"] = "distribution" def transform( self, diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index 790c7e2..1552f6a 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Literal, Optional, Tuple, Union from pydantic import Field import datamol as dm @@ -11,11 +11,7 @@ from auroris.report import CurationReport from auroris.types import VerbosityLevel from auroris.visualization import visualize_chemspace - -try: - from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer -except ImportError: - PretrainedHFTransformer = None +from auroris.visualization.utils import create_figure def curate_molecules( @@ -261,6 +257,7 @@ class MoleculeCuration(BaseAction): default=True, description="Whether compute molecule features with default ECFP for visualizing distribution in chemical space.", ) + name: Literal["mol_curation"] = "mol_curation" def transform( self, @@ -327,10 +324,14 @@ def transform( defined = row[defined_col] legends.append(f"Undefined:{undefined}\n Definded:{defined}") - # returnPNG to avoid ipythonImage - image = dm.to_image( - to_plot[smiles_col].tolist(), legends=legends, use_svg=False, returnPNG=True - ) + with create_figure( + n_plots=1, + n_cols=1, + ) as (image, _): + dm.to_image( + to_plot[smiles_col].tolist(), legends=legends, use_svg=False, returnPNG=True + ) + report.log_image( image, title="Molecules with undefined stereocenters", diff --git a/auroris/curation/actions/_outlier.py b/auroris/curation/actions/_outlier.py index 4cbd572..398169d 100644 --- a/auroris/curation/actions/_outlier.py +++ b/auroris/curation/actions/_outlier.py @@ -140,6 +140,7 @@ class OutlierDetection(BaseAction): columns: List[str] = Field(..., description="Column names to detect outliers.") prefix: str = Field(default="OUTLIER_", 
description="Prefix for added column names.") kwargs: Dict = Field(default_factory=dict) + name: Literal["outlier_detection"] = "outlier_detection" def transform( self, From 4ee0270670090105547e1ed510e99deb3e3917a1 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 14:00:15 -0400 Subject: [PATCH 34/39] fix report --- auroris/report/_report.py | 6 ++++- auroris/report/broadcaster/_html.py | 6 ++--- auroris/report/broadcaster/_logger.py | 15 ++++++------ auroris/utils.py | 34 ++++++++++----------------- tests/test_curator.py | 3 +-- 5 files changed, 29 insertions(+), 35 deletions(-) diff --git a/auroris/report/_report.py b/auroris/report/_report.py index 4024982..526147c 100644 --- a/auroris/report/_report.py +++ b/auroris/report/_report.py @@ -71,7 +71,11 @@ def log_new_column(self, name: str): def log_image( self, - image_or_figure: Union[ImageType, Figure, ByteString], + image_or_figure: Union[ + ImageType, + Figure, + ByteString, + ], title: Optional[str] = None, description: Optional[str] = None, ): diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 1a78e45..99ad393 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -8,7 +8,7 @@ import fsspec from auroris.report import CurationReport -from auroris.utils import img2bytes, save_image, path2url +from auroris.utils import img2bytes, save_image, _img_to_html_src from ._base import ReportBroadcaster @@ -72,8 +72,8 @@ def broadcast(self): filename = re.sub(r"[ ./]", "_", image.title) if image.title else "" filename = "-".join([str(image_counter), filename]) path = dm.fs.join(self._image_dir, f"{filename}.png") - save_image(image.image, path, self._destination) - src = path2url(path, self._destination) + save_image(image.image, path) + src = _img_to_html_src(path, self._destination) image.image = src image_counter += 1 diff --git a/auroris/report/broadcaster/_logger.py b/auroris/report/broadcaster/_logger.py index 
26aa89d..0cad0ee 100644 --- a/auroris/report/broadcaster/_logger.py +++ b/auroris/report/broadcaster/_logger.py @@ -17,7 +17,7 @@ class ColoredFormatter(logging.Formatter): FORMATS = { logging.DEBUG: grey + format + reset, - logging.INFO: blue + format + reset, + logging.INFO: grey + format + reset, logging.WARNING: yellow + format + reset, logging.ERROR: red + format + reset, logging.CRITICAL: bold_red + format + reset, @@ -34,8 +34,7 @@ class LoggerBroadcaster(ReportBroadcaster): def __init__(self, report: CurationReport): super().__init__(report) self.logger = logging.getLogger(self.__class__.__name__) - # Lu: debug level might log other irrelevant debugging logs - self.logger.setLevel(logging.INFO) + self.logger.setLevel(logging.DEBUG) handler = logging.StreamHandler(sys.stdout) handler.setFormatter(ColoredFormatter()) @@ -55,19 +54,19 @@ def broadcast(self): self.on_report_end(self._report) def render_log(self, message: str): - self.logger.info(f"[LOG]: {message}") + self.logger.debug(f"[LOG]: {message}") def render_image(self, image: AnnotatedImage): width, height = image.image.size - self.logger.info(f"[IMG]: Dimensions {width} x {height}") + self.logger.debug(f"[IMG]: Dimensions {width} x {height}") def on_section_start(self, section: Section): - self.logger.info(f"===== {section.title} =====") + self.logger.debug(f"===== {section.title} =====") def on_report_start(self, report: CurationReport): self.logger.critical("===== Curation Report =====") - self.logger.info(f"Time: {report.time_stamp.strftime('%Y-%m-%d %H:%M:%S')}") - self.logger.info(f"Version: {report.auroris_version}") + self.logger.debug(f"Time: {report.time_stamp.strftime('%Y-%m-%d %H:%M:%S')}") + self.logger.debug(f"Version: {report.auroris_version}") def on_report_end(self, report: CurationReport): self.logger.critical("===== Curation Report END =====") diff --git a/auroris/utils.py b/auroris/utils.py index 04ddd37..4402137 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -1,14 +1,15 
@@ import os from io import BytesIO from typing import ByteString +import fsspec import numpy as np from matplotlib.figure import Figure from PIL import Image from PIL.Image import Image as ImageType -import fsspec from sklearn.utils.multiclass import type_of_target + import datamol as dm @@ -25,13 +26,12 @@ def is_regression(values: np.ndarray) -> bool: def fig2img(fig: Figure) -> ImageType: """Convert a Matplotlib figure to a PIL Image""" - if isinstance(fig, Figure): - fig.canvas.draw() - return Image.frombytes( - "RGBA", - fig.canvas.get_width_height(), - fig.canvas.buffer_rgba(), - ) + fig.canvas.draw() + return Image.frombytes( + "RGBA", + fig.canvas.get_width_height(), + fig.canvas.buffer_rgba(), + ) def img2bytes(image: ImageType): @@ -60,18 +60,10 @@ def _img_to_html_src(self, path: str): return path.replace("gs://", "https://storage.googleapis.com/") elif protocol == "file": return os.path.relpath(path, self._destination) - else: - raise ValueError("We only support images hosted in GCP or locally") + else: + raise ValueError("We only support images hosted in GCP or locally") -def save_image(image: ImageType, path: str, destination: str): - """Save image to local and remote path""" - if dm.fs.is_local_path(destination): - image.save(path) - else: - # Lu: couldn't find a way to save image directly to remote path - # convert to bytes - image_bytes = img2bytes(image) - # save bytes as image to remote path - with fsspec.open(path, "wb") as f: - f.write(image_bytes) +def save_image(image: ImageType, path: str): + with fsspec.open(path, "wb") as fd: + image.save(fd, format="png") diff --git a/tests/test_curator.py b/tests/test_curator.py index 5a8b0c1..885cb05 100644 --- a/tests/test_curator.py +++ b/tests/test_curator.py @@ -17,8 +17,7 @@ def test_curator_save_load(tmpdir): OutlierDetection(method="zscore", columns=["outlier_column"]), ], ) - path = os.path.join(tmpdir, "curator_lu.json") - print(path) + path = os.path.join(tmpdir, "curator.json") 
curator.to_json(path) curator_reload = curator.from_json(path) From b6757ca8885c90c461951cdb02b71337d24212ba Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 16:38:05 -0400 Subject: [PATCH 35/39] minor fix --- auroris/curation/_curator.py | 38 ++++++++++++-------- auroris/curation/actions/_ac_stereoisomer.py | 2 +- auroris/curation/actions/_deduplicate.py | 2 +- auroris/report/broadcaster/_html.py | 18 ++++++++-- auroris/utils.py | 29 ++++++--------- tests/test_curator.py | 2 ++ 6 files changed, 55 insertions(+), 36 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index 6ce27dc..faef993 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -3,14 +3,14 @@ from os import PathLike import fsspec -import pandas as pd from loguru import logger from pydantic import BaseModel, Field, field_serializer, field_validator +import pandas as pd from auroris.curation.actions import BaseAction from auroris.report import CurationReport from auroris.types import VerbosityLevel - +from auroris.utils import is_parquet_file class Curator(BaseModel): """ @@ -53,21 +53,31 @@ def _serialize_verbosity(self, value: VerbosityLevel): @field_validator("src_dataset_path", mode="before") def _validate_src_dataset_path(cls, value: Union[str, PathLike]): - try: - pd.read_csv(value, nrows=5) - return value - except Exception: - raise ValueError( - f"Dataset can't be loaded by `pandas.read_csv('{value}')`." - f"Consider passing the DataFrame directly to `Curator.curate(dataset=...)`." - ) - - def _load_data(self): - return pd.read_csv(self.src_dataset_path) + # Use pyarrow reader interface for parquet validation which is also used in `pandas.read_parquet` + if not is_parquet_file(value): + try: + pd.read_csv(value, nrows=5) + return value + except Exception: + raise ValueError( + f"Dataset can't be loaded by `pandas.read_csv('{value}')` nor `pandas.read_parquet('{value}')`." 
+ f"Consider passing the DataFrame directly to `Curator.curate(dataset=...)`." + ) + return value def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]: + if self.src_dataset_path is not None: + if dataset is not None: + logger.warning( + "`self.scr_dataset_path` takes precedence over the `dataset` parameter. Overwriting the dataset!" + ) + if is_parquet_file(self.src_dataset_path): + dataset = pd.read_csv(self.src_dataset_path) + else: + dataset = pd.read_parquet(self.src_dataset_path) + if dataset is None: - dataset = self._load_data() + raise ValueError("Running the curator requires a source dataset.") report = CurationReport() dataset = dataset.copy(deep=True) diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index 006d4b9..4a88cde 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -98,7 +98,7 @@ def transform( col_with_prefix = self.get_column_name(col) report.log_new_column(col_with_prefix) - has_cliff = dataset[col_with_prefix].__eq__(True) + has_cliff = dataset[col_with_prefix] num_cliff = has_cliff.sum() if num_cliff > 0: diff --git a/auroris/curation/actions/_deduplicate.py b/auroris/curation/actions/_deduplicate.py index fe7bcda..75e4681 100644 --- a/auroris/curation/actions/_deduplicate.py +++ b/auroris/curation/actions/_deduplicate.py @@ -83,6 +83,6 @@ def transform( method=self.method, ) if report is not None: - num_duplicates = dataset.shape[0] - dataset_dedup.shape[0] + num_duplicates = len(dataset) - len(dataset_dedup) report.log(f"Deduplication merged and removed {num_duplicates} duplicated molecules from dataset") return dataset_dedup diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 99ad393..73962df 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -1,5 +1,6 @@ import base64 import re +import os import 
pathlib from copy import deepcopy from importlib import resources @@ -8,7 +9,7 @@ import fsspec from auroris.report import CurationReport -from auroris.utils import img2bytes, save_image, _img_to_html_src +from auroris.utils import img2bytes, save_image from ._base import ReportBroadcaster @@ -73,7 +74,7 @@ def broadcast(self): filename = "-".join([str(image_counter), filename]) path = dm.fs.join(self._image_dir, f"{filename}.png") save_image(image.image, path) - src = _img_to_html_src(path, self._destination) + src = self._img_to_html_src(path) image.image = src image_counter += 1 @@ -93,3 +94,16 @@ def broadcast(self): fd.write(html) return path + + def _img_to_html_src(self, path: str): + """ + Convert a path to a corresponding `src` attribute for an `` tag. + Currently only supports GCP and local paths. + """ + protocol = dm.utils.fs.get_protocol(path) + if protocol == "gs": + return path.replace("gs://", "https://storage.googleapis.com/") + elif protocol == "file": + return os.path.relpath(path, self._destination) + else: + raise ValueError("We only support images hosted in GCP or locally") diff --git a/auroris/utils.py b/auroris/utils.py index 4402137..329b857 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -1,8 +1,7 @@ -import os from io import BytesIO from typing import ByteString import fsspec - +import pyarrow.parquet as pq import numpy as np from matplotlib.figure import Figure from PIL import Image @@ -10,8 +9,6 @@ from sklearn.utils.multiclass import type_of_target -import datamol as dm - def is_regression(values: np.ndarray) -> bool: """Whether the input values are for regreesion""" @@ -50,20 +47,16 @@ def bytes2img(image_bytes: ByteString): return image -def _img_to_html_src(self, path: str): - """ - Convert a path to a corresponding `src` attribute for an `` tag. - Currently only supports GCP and local paths. 
- """ - protocol = dm.utils.fs.get_protocol(path) - if protocol == "gs": - return path.replace("gs://", "https://storage.googleapis.com/") - elif protocol == "file": - return os.path.relpath(path, self._destination) - else: - raise ValueError("We only support images hosted in GCP or locally") - - def save_image(image: ImageType, path: str): with fsspec.open(path, "wb") as fd: image.save(fd, format="png") + + +def is_parquet_file(path): + """Verify parquet file""" + try: + pq.read_schema(path) + return True + except (IOError, ValueError): + return False + diff --git a/tests/test_curator.py b/tests/test_curator.py index 885cb05..183bec5 100644 --- a/tests/test_curator.py +++ b/tests/test_curator.py @@ -1,5 +1,7 @@ import os +from pandas.core.api import DataFrame as DataFrame + from auroris.curation import Curator from auroris.curation.actions import Discretization, MoleculeCuration, OutlierDetection from auroris.report.broadcaster import HTMLBroadcaster, LoggerBroadcaster From 4b2bae893a9214369adade948eb9a53813576ecc Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 22 May 2024 16:47:23 -0400 Subject: [PATCH 36/39] remove gcp --- auroris/curation/_curator.py | 1 + auroris/report/broadcaster/_html.py | 8 +++----- auroris/utils.py | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index faef993..7e07413 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -12,6 +12,7 @@ from auroris.types import VerbosityLevel from auroris.utils import is_parquet_file + class Curator(BaseModel): """ A curator is a collection of actions that are applied to a dataset. 
diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 73962df..3ca4e1a 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -98,12 +98,10 @@ def broadcast(self): def _img_to_html_src(self, path: str): """ Convert a path to a corresponding `src` attribute for an `` tag. - Currently only supports GCP and local paths. + Currently only supports local paths. """ protocol = dm.utils.fs.get_protocol(path) - if protocol == "gs": - return path.replace("gs://", "https://storage.googleapis.com/") - elif protocol == "file": + if protocol == "file": return os.path.relpath(path, self._destination) else: - raise ValueError("We only support images hosted in GCP or locally") + raise ValueError("We only support images hosted locally") diff --git a/auroris/utils.py b/auroris/utils.py index 329b857..095f3c3 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -59,4 +59,3 @@ def is_parquet_file(path): return True except (IOError, ValueError): return False - From 5ec838a045764521b7d43e12a17e9a0f2ce34e8b Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 23 May 2024 01:06:41 -0400 Subject: [PATCH 37/39] update tutorial --- README.md | 4 + auroris/curation/actions/_discretize.py | 2 +- auroris/report/broadcaster/_html.py | 2 +- docs/tutorials/getting_started.ipynb | 129 +++++++++++------------- 4 files changed, 66 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 2d9308e..829d913 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,10 @@ curator = Curator( # Run the curation dataset, report = curator(dataset) ``` +### Run curation with command line +``` +auroris [config_file] [destination] --dataset-path [data_path] +``` ## Documentation diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index 548692d..8a619e7 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -83,7 +83,7 @@ class 
Discretization(BaseAction): """ input_column: str = Field(..., description="Column to be discretized.") - prefix: str = "CLS_" + prefix: str = Field(default="CLS_", description="Prefix for added column names.") thresholds: List[float] = Field(..., description="Interval boundaries that include the right bin edge.") inplace: bool = Field( default=False, diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index 3ca4e1a..d8f97ed 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -104,4 +104,4 @@ def _img_to_html_src(self, path: str): if protocol == "file": return os.path.relpath(path, self._destination) else: - raise ValueError("We only support images hosted locally") + raise ValueError("We only support images hosted locally.") diff --git a/docs/tutorials/getting_started.ipynb b/docs/tutorials/getting_started.ipynb index d3f1e09..28fcbc0 100644 --- a/docs/tutorials/getting_started.ipynb +++ b/docs/tutorials/getting_started.ipynb @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "id": "976c5af4-2819-4cd6-8a57-072441eb9305", "metadata": { "editable": true, @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "a8f88963-f561-4cdc-8bfb-0a3045743e98", "metadata": { "editable": true, @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "id": "5483a1fd-b22e-43e5-92f4-e6e8f4f3e629", "metadata": { "editable": true, @@ -142,7 +142,7 @@ " \n", " \n", " 0\n", - " <rdkit.Chem.rdchem.Mol object at 0x288ce21f0>\n", + " <rdkit.Chem.rdchem.Mol object at 0x169be1fc0>\n", " 1\n", " n-pentane\n", " -3.18\n", @@ -152,7 +152,7 @@ " \n", " \n", " 1\n", - " <rdkit.Chem.rdchem.Mol object at 0x288ce11c0>\n", + " <rdkit.Chem.rdchem.Mol object at 0x169be2110>\n", " 2\n", " cyclopentane\n", " -2.64\n", @@ -162,7 +162,7 @@ " \n", " \n", " 2\n", - " <rdkit.Chem.rdchem.Mol object at 
0x288ce2c00>\n", + " <rdkit.Chem.rdchem.Mol object at 0x169be2180>\n", " 3\n", " n-hexane\n", " -3.84\n", @@ -172,7 +172,7 @@ " \n", " \n", " 3\n", - " <rdkit.Chem.rdchem.Mol object at 0x288ce2340>\n", + " <rdkit.Chem.rdchem.Mol object at 0x169be21f0>\n", " 4\n", " 2-methylpentane\n", " -3.74\n", @@ -182,7 +182,7 @@ " \n", " \n", " 4\n", - " <rdkit.Chem.rdchem.Mol object at 0x288ce1ee0>\n", + " <rdkit.Chem.rdchem.Mol object at 0x169be2260>\n", " 6\n", " 2,2-dimethylbutane\n", " -3.55\n", @@ -196,11 +196,11 @@ ], "text/plain": [ " mol ID NAME \\\n", - "0 1 n-pentane \n", - "1 2 cyclopentane \n", - "2 3 n-hexane \n", - "3 4 2-methylpentane \n", - "4 6 2,2-dimethylbutane \n", + "0 1 n-pentane \n", + "1 2 cyclopentane \n", + "2 3 n-hexane \n", + "3 4 2-methylpentane \n", + "4 6 2,2-dimethylbutane \n", "\n", " SOL SOL_classification smiles split \n", "0 -3.18 (A) low CCCCC train \n", @@ -210,7 +210,7 @@ "4 -3.55 (A) low CCC(C)(C)C train " ] }, - "execution_count": 13, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -218,8 +218,8 @@ "source": [ "# Load your data set\n", "# See more details of the dataset at https://docs.datamol.io/stable/api/datamol.data.html\n", - "dataset = dm.data.solubility()\n", - "dataset.head(5)" + "data = dm.data.solubility()\n", + "data.head(5)" ] }, { @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "id": "f9dbba41-2bd2-4321-b948-13877cee5b13", "metadata": {}, "outputs": [ @@ -250,9 +250,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-05-16 17:56:13.689\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mPerforming step: MoleculeCuration\u001b[0m\n", - "\u001b[32m2024-05-16 17:56:17.051\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mPerforming step: 
OutlierDetection\u001b[0m\n", - "\u001b[32m2024-05-16 17:56:17.079\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mPerforming step: Discretization\u001b[0m\n" + "\u001b[32m2024-05-23 00:53:40.139\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m88\u001b[0m - \u001b[1mPerforming step: mol_curation\u001b[0m\n", + "\u001b[32m2024-05-23 00:53:55.266\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m88\u001b[0m - \u001b[1mPerforming step: outlier_detection\u001b[0m\n", + "\u001b[32m2024-05-23 00:53:55.338\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mauroris.curation._curator\u001b[0m:\u001b[36mtransform\u001b[0m:\u001b[36m88\u001b[0m - \u001b[1mPerforming step: discretize\u001b[0m\n" ] } ], @@ -271,7 +271,7 @@ ")\n", "\n", "# Run the curation\n", - "dataset, report = curator(dataset)" + "dataset, report = curator(data)" ] }, { @@ -284,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "id": "98aaadd9-8c0f-412e-ab74-79b8f9a3d75a", "metadata": {}, "outputs": [ @@ -293,31 +293,31 @@ "output_type": "stream", "text": [ "\u001b[31;1m===== Curation Report =====\u001b[0m\n", - "\u001b[34;1mTime: 2024-05-16 17:56:13\u001b[0m\n", - "\u001b[34;1mVersion: dev\u001b[0m\n", - "\u001b[34;1m===== MoleculeCuration =====\u001b[0m\n", - "\u001b[34;1m[LOG]: Couldn't preprocess 18 / 1282 molecules.\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_smiles\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_molhash_id\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_molhash_id_no_stereo\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_num_stereoisomers\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_num_undefined_stereoisomers\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: 
MOL_num_defined_stereo_center\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_num_undefined_stereo_center\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_num_stereo_center\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_undefined_E_D\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: MOL_undefined_E/Z\u001b[0m\n", - "\u001b[34;1m[LOG]: Default `ecfp` fingerprint is used to compute the distributionin chemical space.\u001b[0m\n", - "\u001b[34;1m[LOG]: Molecules with undefined stereocenter detected: 253.\u001b[0m\n", - "\u001b[34;1m[IMG]: Dimensions 1200 x 600\u001b[0m\n", - "\u001b[34;1m[IMG]: Dimensions 1200 x 2400\u001b[0m\n", - "\u001b[34;1m===== OutlierDetection =====\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: OUTLIER_SOL\u001b[0m\n", - "\u001b[34;1m[LOG]: Found 7 potential outliers with respect to the SOL column for review.\u001b[0m\n", - "\u001b[34;1m[IMG]: Dimensions 640 x 480\u001b[0m\n", - "\u001b[34;1m===== Discretization =====\u001b[0m\n", - "\u001b[34;1m[LOG]: New column added: CLS_SOL\u001b[0m\n", - "\u001b[34;1m[IMG]: Dimensions 1200 x 600\u001b[0m\n", + "\u001b[38;20mTime: 2024-05-23 00:53:40\u001b[0m\n", + "\u001b[38;20mVersion: dev\u001b[0m\n", + "\u001b[38;20m===== mol_curation =====\u001b[0m\n", + "\u001b[38;20m[LOG]: Couldn't preprocess 18 / 1282 molecules.\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_smiles\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_molhash_id\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_molhash_id_no_stereo\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_num_stereoisomers\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_num_undefined_stereoisomers\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_num_defined_stereo_center\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_num_undefined_stereo_center\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_num_stereo_center\u001b[0m\n", + "\u001b[38;20m[LOG]: New 
column added: MOL_undefined_E_D\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: MOL_undefined_E/Z\u001b[0m\n", + "\u001b[38;20m[LOG]: Default `ecfp` fingerprint is used to compute the distributionin chemical space.\u001b[0m\n", + "\u001b[38;20m[LOG]: Molecules with undefined stereocenter detected: 253.\u001b[0m\n", + "\u001b[38;20m[IMG]: Dimensions 1200 x 600\u001b[0m\n", + "\u001b[38;20m[IMG]: Dimensions 1200 x 600\u001b[0m\n", + "\u001b[38;20m===== outlier_detection =====\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: OUTLIER_SOL\u001b[0m\n", + "\u001b[38;20m[LOG]: Found 7 potential outliers with respect to the SOL column for review.\u001b[0m\n", + "\u001b[38;20m[IMG]: Dimensions 1200 x 600\u001b[0m\n", + "\u001b[38;20m===== discretize =====\u001b[0m\n", + "\u001b[38;20m[LOG]: New column added: CLS_SOL\u001b[0m\n", + "\u001b[38;20m[IMG]: Dimensions 1200 x 600\u001b[0m\n", "\u001b[31;1m===== Curation Report END =====\u001b[0m\n" ] } @@ -339,26 +339,29 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "id": "9b896b12-fbae-4b7b-b62a-f2d2d15075c1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/Users/lu.zhu/Documents/Codebase/ValenceLab/auroris/docs/tutorials/test/index.html'" + "'/var/folders/_7/ffxc1f251dbb5msn977xl4sm0000gr/T/tmpp9qm656e/index.html'" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from auroris.report.broadcaster import HTMLBroadcaster\n", + "import tempfile\n", + "\n", + "temp_dir = tempfile.TemporaryDirectory().name\n", "\n", "broadcaster = HTMLBroadcaster(report= report, \n", - " destination=\"/Users/lu.zhu/Documents/Codebase/ValenceLab/auroris/docs/tutorials/test\", \n", + " destination=temp_dir, \n", " embed_images=True)\n", "broadcaster.broadcast()" ] @@ -373,14 +376,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "id": "97be9d29-03eb-4eb7-b9c0-ac84413f6dca", "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ - "mol \n", + "mol \n", "ID 1\n", "NAME n-pentane\n", "SOL -3.18\n", @@ -402,7 +405,7 @@ "Name: 0, dtype: object" ] }, - "execution_count": 17, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -428,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 9, "id": "f5044a09-c34f-4888-a5ac-65fb62225129", "metadata": { "editable": true, @@ -440,9 +443,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -465,18 +468,6 @@ "source": [ "The End. " ] - }, - { - "cell_type": "markdown", - "id": "e5b79da1", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "b020ee56", - "metadata": {}, - "source": [] } ], "metadata": { From 6d836ffd8815835136159d0d483e2fe04c07c4c2 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 23 May 2024 01:10:24 -0400 Subject: [PATCH 38/39] add pyarrow dep --- env.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/env.yml b/env.yml index e2c8206..3243d1c 100644 --- a/env.yml +++ b/env.yml @@ -10,6 +10,7 @@ dependencies: - pydantic >=2 - fsspec - pillow + - pyarrow # Scientific - numpy From becae34ff5569f03e6b0de8eab0a70d1aa3938ab Mon Sep 17 00:00:00 2001 From: Cas Wognum Date: Thu, 23 May 2024 16:43:29 -0400 Subject: [PATCH 39/39] Update the documentation (#5) * Docs pass * Changed extension to parquet in CLI --- README.md | 2 + auroris/cli.py | 21 +++-- auroris/curation/_curator.py | 90 +++++++++++++------- auroris/curation/actions/_ac_stereoisomer.py | 46 +++++----- auroris/curation/actions/_base.py | 22 +++-- auroris/curation/actions/_deduplicate.py | 19 ++--- auroris/curation/actions/_discretize.py | 46 ++++------ auroris/curation/actions/_distribution.py | 27 +++--- auroris/curation/actions/_mol.py | 73 ++++++++-------- auroris/curation/actions/_outlier.py | 11 ++- auroris/report/_report.py | 8 +- auroris/report/broadcaster/_html.py | 8 +- auroris/report/broadcaster/_logger.py | 4 +- auroris/types.py | 2 + auroris/utils.py | 8 +- docs/api/actions.md | 13 +++ docs/api/actions/deduplication.md | 4 - docs/api/actions/discretization.md | 4 - docs/api/actions/mol.md | 4 - docs/api/actions/outlier_detection.md | 4 - docs/api/actions/stereo_ac.md | 4 - docs/api/functional.md | 4 +- docs/api/utils.md | 5 ++ docs/api/visualization.md | 4 +- env.yml | 6 +- mkdocs.yml | 21 ++--- pyproject.toml | 1 + 27 files changed, 246 insertions(+), 215 deletions(-) create mode 100644 docs/api/actions.md delete 
mode 100644 docs/api/actions/deduplication.md delete mode 100644 docs/api/actions/discretization.md delete mode 100644 docs/api/actions/mol.md delete mode 100644 docs/api/actions/outlier_detection.md delete mode 100644 docs/api/actions/stereo_ac.md create mode 100644 docs/api/utils.md diff --git a/README.md b/README.md index 829d913..48ba4cb 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ curator = Curator( dataset, report = curator(dataset) ``` ### Run curation with command line +A `Curator` object is serializable, so you can save it to and load it from a JSON file to reproduce the curation. + ``` auroris [config_file] [destination] --dataset-path [data_path] ``` diff --git a/auroris/cli.py b/auroris/cli.py index 0bc3301..a3ccef8 100644 --- a/auroris/cli.py +++ b/auroris/cli.py @@ -1,8 +1,8 @@ +from typing import Optional + import datamol as dm -import pandas as pd import typer -from typing import Optional from auroris.curation import Curator from auroris.report.broadcaster import HTMLBroadcaster @@ -11,17 +11,24 @@ @app.command() def curate(config_path: str, destination: str, dataset_path: Optional[str] = None, overwrite: bool = False): - # Load data - dataset = pd.read_csv(dataset_path) if dataset_path else None + # Create the curator curator = Curator.from_json(config_path) + # Overwrite the source dataset if it is set + if dataset_path is not None: + curator.src_dataset_path = dataset_path + # Run curation - dataset, report = curator(dataset) + dataset, report = curator.transform() # Save dataset dm.fs.mkdir(destination, exist_ok=overwrite) - path = dm.fs.join(destination, "curated.csv") - dataset.to_csv(path, index=False) + path = dm.fs.join(destination, "curated.parquet") + dataset.to_parquet(path, index=False) + + # Save a copy of the curation config + config_destination = dm.fs.join(destination, "config.json") + curator.to_json(config_destination) # Save report as HTML report_destination = dm.fs.join(destination, "report") diff --git 
a/auroris/curation/_curator.py b/auroris/curation/_curator.py index 7e07413..d90a23c 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -1,11 +1,11 @@ import json -from typing import List, Tuple, Union, Optional, Annotated +from typing import Annotated, List, Optional, Tuple, Union -from os import PathLike +import datamol as dm import fsspec +import pandas as pd from loguru import logger from pydantic import BaseModel, Field, field_serializer, field_validator -import pandas as pd from auroris.curation.actions import BaseAction from auroris.report import CurationReport @@ -15,30 +15,27 @@ class Curator(BaseModel): """ - A curator is a collection of actions that are applied to a dataset. - Can be serialized. + A curator is a serializable collection of actions that are applied to a dataset. + Attributes: + steps (List[BaseAction]): Ordered list of curation actions to apply to the dataset. + src_dataset_path: An optional path to load the source dataset from. Can be used to specify a reproducible workflow. + verbosity: Verbosity level for logging. + parallelized_kwargs: Keyword arguments to affect parallelization in the steps. """ # To know which Action object to create, we need a discriminated union. # This is the recommended way to add all subclasses in the type. # See e.g. https://github.com/pydantic/pydantic/issues/2200 # and https://github.com/pydantic/pydantic/issues/2036 - src_dataset_path: Optional[Union[str, PathLike]] = Field( - default=None, - description="Data path. The data must be loadable by `pd.read_csv` with default parameters.", - ) - steps: List[ Annotated[ - Union[tuple(BaseAction.__subclasses__())], - Field( - ..., - discriminator="name", - description="List of curation actions. 
Check all the available action .", - ), + Union[tuple(BaseAction.__subclasses__())], # type: ignore + Field(..., discriminator="name"), ] ] + + src_dataset_path: Optional[str] = None verbosity: VerbosityLevel = VerbosityLevel.NORMAL parallelized_kwargs: dict = Field(default_factory=dict) @@ -52,35 +49,56 @@ def _validate_verbosity(cls, v): def _serialize_verbosity(self, value: VerbosityLevel): return value.name - @field_validator("src_dataset_path", mode="before") - def _validate_src_dataset_path(cls, value: Union[str, PathLike]): - # Use pyarrow reader interface for parquet validation which is also used in `pandas.read_parquet` + @field_validator("src_dataset_path") + def _validate_src_dataset_path(cls, value: Optional[str]): + # If not set, no need to validate + if value is None: + return value + + # Efficient check to see if it's a valid path to a supported file if not is_parquet_file(value): try: pd.read_csv(value, nrows=5) - return value except Exception: raise ValueError( f"Dataset can't be loaded by `pandas.read_csv('{value}')` nor `pandas.read_parquet('{value}')`." f"Consider passing the DataFrame directly to `Curator.curate(dataset=...)`." ) + + # If it's set, but local, warn the user that this hinders reproducibility. + if dm.utils.fs.is_local_path(value): + logger.warning( + "Using a local path for `src_dataset_path` hinders reproducibility. " + "Consider uploading the file to a public cloud storage service." + ) return value def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, CurationReport]: + """Runs the curation process. + + Args: + dataset: The dataset to be curated. If `src_dataset_path` is set, this parameter is ignored. + + Returns: + A tuple of the curated dataset and a report summarizing the changes made. + """ + if self.src_dataset_path is not None: if dataset is not None: logger.warning( - "`self.scr_dataset_path` takes precedence over the `dataset` parameter. Overwriting the dataset!" 
+ "Both `self.scr_dataset_path` and the `dataset` parameter are specified. " + "Ignoring the `dataset` parameter." ) - if is_parquet_file(self.src_dataset_path): - dataset = pd.read_csv(self.src_dataset_path) - else: - dataset = pd.read_parquet(self.src_dataset_path) + + dataset = self.load_dataset(self.src_dataset_path) if dataset is None: raise ValueError("Running the curator requires a source dataset.") + # The report summarizes the changes made to the dataset report = CurationReport() + + # Changes are not made in place dataset = dataset.copy(deep=True) action: BaseAction @@ -97,6 +115,20 @@ def transform(self, dataset: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFram return dataset, report + @staticmethod + def load_dataset(path: str): + """ + Loads a dataset, to be curated, from a path. + + Info: File-format support + This currently only supports CSV and Parquet files and uses the default + parameters for `pd.read_csv` and `pd.read_parquet`. If you need more flexibility, + consider loading the data yourself and passing it directly to `Curator.transform(dataset=...)`. + """ + if not is_parquet_file(path): + return pd.read_csv(path) + return pd.read_parquet(path) + def __call__(self, dataset): return self.transform(dataset) @@ -109,7 +141,6 @@ def from_json(cls, path: str): """ with fsspec.open(path, "r") as f: data = json.load(f) - return cls(**data) def to_json(self, path: str): @@ -118,10 +149,5 @@ def to_json(self, path: str): Args: path: The destination to save to. 
""" - serialization = self.model_dump() - # remove src_dataset_path if unavailable - if self.src_dataset_path is None: - serialization.pop("src_dataset_path") with fsspec.open(path, "w") as f: - json.dump(serialization, f) - return path + json.dump(self.model_dump(), f) diff --git a/auroris/curation/actions/_ac_stereoisomer.py b/auroris/curation/actions/_ac_stereoisomer.py index 4a88cde..10161a6 100644 --- a/auroris/curation/actions/_ac_stereoisomer.py +++ b/auroris/curation/actions/_ac_stereoisomer.py @@ -1,10 +1,9 @@ -from typing import Dict, List, Optional, Literal -from pydantic import Field +from typing import Dict, List, Literal, Optional import datamol as dm import numpy as np import pandas as pd - +from pydantic import Field from auroris.curation.actions._base import BaseAction from auroris.curation.actions._outlier import modified_zscore @@ -64,19 +63,22 @@ def detect_streoisomer_activity_cliff( class StereoIsomerACDetection(BaseAction): """ Automatic detection of activity shift between stereoisomers. + + See [`auroris.curation.functional.detect_streoisomer_activity_cliff`][] for the docs of the + `stereoisomer_id_col`, `y_cols` and `threshold` attributes + + Attributes: + mol_col: Column with the SMILES or RDKit Molecule objects. + If specified, will be used to render an image for the activity cliffs. """ - stereoisomer_id_col: str = Field( - default="MOL_molhash_id_no_stereo", description="Column which identifies the stereoisomers." - ) - y_cols: List[str] = Field(..., description="List of columns for bioactivities.") - threshold: float = Field( - default=2.0, - description=" Threshold to identify the activity cliff. 
Currently, the difference of zscores between isomers are used for identification.", - ) - prefix: str = Field(default="AC_", description="Prefix for the adding columns.") - mol_col: str = Field(default="MOL_smiles", description="Column for molecule strings.") name: Literal["ac_stereoisomer"] = "ac_stereoisomer" + prefix: str = "AC_" + + stereoisomer_id_col: str = "MOL_molhash_id_no_stereo" + y_cols: List[str] = Field(default_factory=list) + threshold: float = 2.0 + mol_col: Optional[str] = "MOL_smiles" def transform( self, @@ -93,6 +95,11 @@ def transform( prefix=self.prefix, ) + # Log the following information to the report: + # - Newly added columns + # - Number of activity cliffs found + # - Image of the activity cliffs + if report is not None: for col in self.y_cols: col_with_prefix = self.get_column_name(col) @@ -106,13 +113,14 @@ def transform( f"Found {num_cliff} activity cliffs among stereoisomers " f"with respect to the {col} column." ) - to_plot = dataset.loc[has_cliff, self.mol_col] - legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist() - image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False) - report.log_image( - image_or_figure=image, title="Detection of activity shifts among stereoisomers" - ) + if self.mol_col is not None: + to_plot = dataset.loc[has_cliff, self.mol_col] + legends = (col + dataset.loc[has_cliff, col].astype(str)).tolist() + image = dm.to_image([dm.to_mol(s) for s in to_plot], legends=legends, use_svg=False) + report.log_image( + image_or_figure=image, title="Detection of activity shifts among stereoisomers" + ) else: report.log( diff --git a/auroris/curation/actions/_base.py b/auroris/curation/actions/_base.py index f90fb37..9afaafc 100644 --- a/auroris/curation/actions/_base.py +++ b/auroris/curation/actions/_base.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict, Optional import pandas as pd -from pydantic import BaseModel, model_validator, Field +from pydantic import BaseModel, 
model_validator from auroris.types import VerbosityLevel @@ -10,15 +10,23 @@ from auroris.report import CurationReport -ACTION_REGISTRY = [] - - class BaseAction(BaseModel, abc.ABC): """ An action in the curation process. + + Info: The importance of reproducibility + One of the main goals in designing `auroris` is to make it easy to reproduce the curation process. + Reproducibility is key to scientific research. This is why a BaseAction needs to be serializable and + uniquely identified by a `name`. + + Attributes: + name: The name that uniquely identifies the action. This is used to serialize and deserialize the action. + prefix: This prefix is used when an action adds columns to a dataset. + If not set, it defaults to the name in uppercase. """ - prefix: str = Field(default=None, description="If the action adds columns, use this prefix.") + name: str + prefix: str = None @model_validator(mode="after") @classmethod @@ -42,7 +50,3 @@ def transform( def __call__(self, dataset: pd.DataFrame): return self.transform(dataset) - - def __init_subclass__(cls, **kwargs): - super().__init_subclass__(**kwargs) - ACTION_REGISTRY.append(cls) diff --git a/auroris/curation/actions/_deduplicate.py b/auroris/curation/actions/_deduplicate.py index 75e4681..3abfe1c 100644 --- a/auroris/curation/actions/_deduplicate.py +++ b/auroris/curation/actions/_deduplicate.py @@ -1,5 +1,4 @@ from typing import Dict, List, Literal, Optional, Union -from pydantic import Field import pandas as pd @@ -54,20 +53,18 @@ def deduplicate( class Deduplication(BaseAction): """ Automatic detection of outliers. + + See [`auroris.curation.functional.deduplicate`][] for the docs of the + `deduplicate_on`, `y_cols`, `keep` and `method` attributes """ - deduplicate_on: Optional[Union[str, List[str]]] = Field( - default=None, description="A subset of the columns to deduplicate on (can be default)." 
- ) - y_cols: Optional[Union[str, List[str]]] = Field(default=None, description="The columns to aggregate.") - keep: Literal["first", "last"] = Field( - default="first", description="Whether to keep the first or last copy of the duplicates." - ) - method: Literal["mean", "median"] = Field( - default="median", description="The method to aggregate the data." - ) name: Literal["deduplicate"] = "deduplicate" + deduplicate_on: Optional[Union[str, List[str]]] = None + y_cols: Optional[Union[str, List[str]]] = None + keep: Literal["first", "last"] = "first" + method: Literal["mean", "median"] = "median" + def transform( self, dataset: pd.DataFrame, diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index 8a619e7..7c4e9f5 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -1,5 +1,4 @@ from typing import Dict, List, Literal, Optional, Union -from pydantic import Field import numpy as np import pandas as pd @@ -80,38 +79,25 @@ def discretize( class Discretization(BaseAction): """ Thresholding bioactivity columns to binary or multiclass labels. + + See [`auroris.curation.functional.discretize`][] for the docs of the + `thresholds`, `inplace`, `allow_nan` and `label_order` attributes + + Attributes: + input_column: The column to discretize. + log_scale: Whether a visual depiction of the discretization should be on a log scale. 
""" - input_column: str = Field(..., description="Column to be discretized.") - prefix: str = Field(default="CLS_", description="Prefix for added column names.") - thresholds: List[float] = Field(..., description="Interval boundaries that include the right bin edge.") - inplace: bool = Field( - default=False, - description="""Set to True to perform inplace discretization and avoid a copy - (if the input is already a numpy array or a scipy.sparse CSR / CSC - matrix and if axis is 1).""", - ) - allow_nan: bool = Field( - default=True, - description="Set to True to allow nans in the array for discretization. Otherwise, an error will be raised instead.", - ) - label_order: Literal["ascending", "descending"] = Field( - default="ascending", - description="""The continuous values are discretized to labels 0, 1, 2, .., N with respect to given - threshold bins [threshold_1, threshold_2,.., threshould_n]. - When set to 'ascending', the class label is in ascending order with the threshold - bins that `0` represents negative class or lower class, while 1, 2, 3 are for higher classes. - When set to 'descending' the class label is in ascending order with the threshold bins. - Sometimes the positive labels are on the left side of provided threshold. - E.g. For binarization with threshold [0.5], the positive label is defined - by`X < 0.5`. In this case, `label_order` should be `descending`.""", - ) - log_scale: bool = Field( - default=False, - description="""Whether visualize distribution in log scale. 
- See more in """, - ) name: Literal["discretize"] = "discretize" + prefix: str = "CLS_" + + input_column: str + thresholds: List[float] + + inplace: bool = False + allow_nan: bool = True + label_order: Literal["ascending", "descending"] = "ascending" + log_scale: bool = False def transform( self, diff --git a/auroris/curation/actions/_distribution.py b/auroris/curation/actions/_distribution.py index 4422f73..20dd3b1 100644 --- a/auroris/curation/actions/_distribution.py +++ b/auroris/curation/actions/_distribution.py @@ -1,6 +1,7 @@ -from typing import Dict, List, Optional, Sequence, Literal -from pydantic import Field +from typing import Dict, List, Literal, Optional, Sequence + import pandas as pd +from loguru import logger from auroris.curation.actions._base import BaseAction from auroris.report import CurationReport @@ -10,18 +11,21 @@ class ContinuousDistributionVisualization(BaseAction): """ - Visualize a continuous distribution. + Visualize one or more continuous distribution(s). + + See [`auroris.visualization.visualize_continuous_distribution`][] for the docs of the + `log_scale` and `bins` attributes + + Attributes: + y_cols: The columns whose distributions should be visualized. """ - y_cols: Optional[List[str]] = Field( - default=None, description="List of columns for bioactivity for visualization." - ) - log_scale: bool = Field(default=False, description="Whether visualize distribution in log scale.") - bins: Optional[Sequence[float]] = Field( - default=None, description="The bin boundaries to color the area under the KDE curve." - ) name: Literal["distribution"] = "distribution" + y_cols: List[str] + log_scale: bool = False + bins: Optional[Sequence[float]] = None + def transform( self, dataset: pd.DataFrame, @@ -29,6 +33,9 @@ def transform( verbosity: VerbosityLevel = VerbosityLevel.NORMAL, parallelized_kwargs: Optional[Dict] = None, ): + if report is None: + logger.warning("No report provided. 
Skipping visualization.") + if report is not None: for y_col in self.y_cols: fig = visualize_continuous_distribution( diff --git a/auroris/curation/actions/_mol.py b/auroris/curation/actions/_mol.py index 1552f6a..495b7f0 100644 --- a/auroris/curation/actions/_mol.py +++ b/auroris/curation/actions/_mol.py @@ -1,6 +1,5 @@ from functools import partial from typing import Dict, List, Literal, Optional, Tuple, Union -from pydantic import Field import datamol as dm import numpy as np @@ -233,31 +232,27 @@ def _num_stereo_centers(mol: dm.Mol) -> Tuple[int]: class MoleculeCuration(BaseAction): """ Automated molecule curation and chemistry space distribution. + + See [`auroris.curation.functional.curate_molecules`][] for the docs of the + `remove_salt_solvent`, `remove_stereo`, `count_stereoisomers`, and `count_stereocenters` attributes + + Attributes: + input_column: The name of the column that has the molecules (either `dm.Mol` objects or SMILES). + X_col: Column with custom features for each of the molecules. If None, will use ECFP. + y_cols: Column names for bioactivities, which will be used to colorcode the chemical space visualization. """ - input_column: str = Field( - ..., description="The name of the column that has the molecules (either `dm.Mol` objects or SMILES)." - ) - prefix: str = Field(default="MOL_", description="Prefix for added column names") - remove_salt_solvent: bool = Field( - default=True, description="When set to 'True', all disconnected salts and solvents" - ) - remove_stereo: bool = Field( - default=False, - description="Whether remove stereochemistry information from molecule. If it's known that the stereochemistry do not contribute to the bioactivity of interest, the stereochemistry information can be removed.", - ) - count_stereoisomers: bool = Field( - default=True, description="Whether count the number of stereoisomers of molecule." 
- ) - count_stereocenters: bool = Field( - default=True, description="Whether count the number of stereocenter of molecule." - ) - y_cols: Optional[List[str]] = Field(default=None, description="Column names for bioactivities") - fast: Optional[bool] = Field( - default=True, - description="Whether compute molecule features with default ECFP for visualizing distribution in chemical space.", - ) name: Literal["mol_curation"] = "mol_curation" + prefix: str = "MOL_" + + input_column: str + remove_salt_solvent: bool = True + remove_stereo: bool = False + count_stereoisomers: bool = True + count_stereocenters: bool = True + + X_col: Optional[str] = None + y_cols: Optional[Union[str, List[str]]] = None def transform( self, @@ -266,8 +261,8 @@ def transform( verbosity: VerbosityLevel = VerbosityLevel.NORMAL, parallelized_kwargs: Optional[Dict] = None, ) -> pd.DataFrame: + # Run the curation mols = dataset[self.input_column].values - parallelized_kwargs = parallelized_kwargs or {} mol_dict, num_invalid = curate_molecules( mols, @@ -287,6 +282,9 @@ def transform( dataset = pd.concat([dataset, df], axis=1) + # Log information to the report + # - New columns with the curated molecule information + if report is not None: for col in df.columns: report.log_new_column(col) @@ -294,12 +292,15 @@ def transform( smiles_col = self.get_column_name("smiles") smiles = dataset[smiles_col].dropna().values - # Lu: User can call visulize_chemspace for the customized molecular features. 
- featurizer = "ECFP" - with dm.without_rdkit_log(): - # Temporary disable logs because of deprecation warning - X = np.array([dm.to_fp(smi) for smi in smiles]) - report.log("Default `ecfp` fingerprint is used to compute the distributionin chemical space.") + if self.X_col is None: + featurizer = "ECFP" + with dm.without_rdkit_log(): + X = np.array([dm.to_fp(smi) for smi in smiles]) + report.log("Default `ecfp` fingerprint is used to visualize the chemical space.") + + else: + featurizer = self.X_col + X = dataset[self.X_col].values # list of data per column y = dataset[self.y_cols].T.values.tolist() if self.y_cols else None @@ -309,7 +310,6 @@ def transform( if self.count_stereocenters: # Plot all compounds with undefined stereocenters for visual inspection - undefined_col = self.get_column_name("num_undefined_stereo_center") defined_col = self.get_column_name("num_defined_stereo_center") @@ -324,10 +324,7 @@ def transform( defined = row[defined_col] legends.append(f"Undefined:{undefined}\n Definded:{defined}") - with create_figure( - n_plots=1, - n_cols=1, - ) as (image, _): + with create_figure(n_plots=1, n_cols=1) as (image, _): dm.to_image( to_plot[smiles_col].tolist(), legends=legends, use_svg=False, returnPNG=True ) @@ -335,9 +332,9 @@ def transform( report.log_image( image, title="Molecules with undefined stereocenters", - description=f"There are {num_mol_undefined} molecules with undefined stereocenter(s)." - f"It's recommanded to use and" - f"check the stereoisomers and activity cliffs in the dataset.", + description=f"There are {num_mol_undefined} molecules with undefined stereocenter(s). 
" + "It's recommended to use and " + "check the stereoisomers and activity cliffs in the dataset.", ) return dataset diff --git a/auroris/curation/actions/_outlier.py b/auroris/curation/actions/_outlier.py index 398169d..3f15ab0 100644 --- a/auroris/curation/actions/_outlier.py +++ b/auroris/curation/actions/_outlier.py @@ -134,13 +134,20 @@ def modified_zscore(data: np.ndarray, consistency_correction: float = 1.4826): class OutlierDetection(BaseAction): """ Automatic detection of outliers. + + See [`auroris.curation.functional.detect_outliers`][] for the docs of the + `method` and `kwargs` attributes + + Attributes: + columns: The columns for which to detect outliers. """ + name: Literal["outlier_detection"] = "outlier_detection" + prefix: str = Field(default="OUTLIER_", description="Prefix for added column names.") + method: OutlierDetectionMethod = Field(..., description="Method name for outlier detection.") columns: List[str] = Field(..., description="Column names to detect outliers.") - prefix: str = Field(default="OUTLIER_", description="Prefix for added column names.") kwargs: Dict = Field(default_factory=dict) - name: Literal["outlier_detection"] = "outlier_detection" def transform( self, diff --git a/auroris/report/_report.py b/auroris/report/_report.py index 526147c..56504a0 100644 --- a/auroris/report/_report.py +++ b/auroris/report/_report.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, ConfigDict, Field, PrivateAttr from auroris import __version__ -from auroris.utils import fig2img, bytes2img +from auroris.utils import bytes2img, fig2img class AnnotatedImage(BaseModel): @@ -71,11 +71,7 @@ def log_new_column(self, name: str): def log_image( self, - image_or_figure: Union[ - ImageType, - Figure, - ByteString, - ], + image_or_figure: Union[ImageType, Figure, ByteString], title: Optional[str] = None, description: Optional[str] = None, ): diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index d8f97ed..4694db8 100644 
--- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -1,7 +1,6 @@ import base64 -import re import os -import pathlib +import re from copy import deepcopy from importlib import resources @@ -56,8 +55,6 @@ def broadcast(self): if not self._embed_images: dm.fs.mkdir(self._image_dir, exist_ok=True) - pathlib.Path(__file__).parent.resolve() / "templates" - # Save all images image_counter = 0 for section in report.sections: @@ -100,8 +97,7 @@ def _img_to_html_src(self, path: str): Convert a path to a corresponding `src` attribute for an `` tag. Currently only supports local paths. """ - protocol = dm.utils.fs.get_protocol(path) - if protocol == "file": + if dm.utils.fs.is_local_path(path): return os.path.relpath(path, self._destination) else: raise ValueError("We only support images hosted locally.") diff --git a/auroris/report/broadcaster/_logger.py b/auroris/report/broadcaster/_logger.py index 0cad0ee..27dc124 100644 --- a/auroris/report/broadcaster/_logger.py +++ b/auroris/report/broadcaster/_logger.py @@ -17,7 +17,7 @@ class ColoredFormatter(logging.Formatter): FORMATS = { logging.DEBUG: grey + format + reset, - logging.INFO: grey + format + reset, + logging.INFO: blue + format + reset, logging.WARNING: yellow + format + reset, logging.ERROR: red + format + reset, logging.CRITICAL: bold_red + format + reset, @@ -61,7 +61,7 @@ def render_image(self, image: AnnotatedImage): self.logger.debug(f"[IMG]: Dimensions {width} x {height}") def on_section_start(self, section: Section): - self.logger.debug(f"===== {section.title} =====") + self.logger.info(f"===== {section.title} =====") def on_report_start(self, report: CurationReport): self.logger.critical("===== Curation Report =====") diff --git a/auroris/types.py b/auroris/types.py index 013135c..cfcaf55 100644 --- a/auroris/types.py +++ b/auroris/types.py @@ -2,6 +2,8 @@ class VerbosityLevel(IntEnum): + """The different verbosity levels""" + SILENT = 0 NORMAL = 1 VERBOSE = 2 diff --git 
a/auroris/utils.py b/auroris/utils.py index 095f3c3..ce345bb 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -1,12 +1,12 @@ from io import BytesIO from typing import ByteString + import fsspec -import pyarrow.parquet as pq import numpy as np +import pyarrow.parquet as pq from matplotlib.figure import Figure from PIL import Image from PIL.Image import Image as ImageType - from sklearn.utils.multiclass import type_of_target @@ -42,18 +42,18 @@ def img2bytes(image: ImageType): def bytes2img(image_bytes: ByteString): """Convert bytes to PIL image""" image_stream = BytesIO(image_bytes) - # Open the image using PIL image = Image.open(image_stream) return image def save_image(image: ImageType, path: str): + """Save an image to a fsspec-compatible path""" with fsspec.open(path, "wb") as fd: image.save(fd, format="png") def is_parquet_file(path): - """Verify parquet file""" + """Verify parquet file without actually loading it.""" try: pq.read_schema(path) return True diff --git a/docs/api/actions.md b/docs/api/actions.md new file mode 100644 index 0000000..714a080 --- /dev/null +++ b/docs/api/actions.md @@ -0,0 +1,13 @@ +::: auroris.curation.actions.BaseAction + options: + filters: ["!^_"] + +--- + +::: auroris.curation.actions + options: + show_root_heading: False + show_root_toc_entry: False + filters: ["!BaseAction", "!^_"] + +--- \ No newline at end of file diff --git a/docs/api/actions/deduplication.md b/docs/api/actions/deduplication.md deleted file mode 100644 index 1e6a0a5..0000000 --- a/docs/api/actions/deduplication.md +++ /dev/null @@ -1,4 +0,0 @@ - -::: auroris.curation.actions.Deduplication - options: - filters: ["!^_"] \ No newline at end of file diff --git a/docs/api/actions/discretization.md b/docs/api/actions/discretization.md deleted file mode 100644 index 3de0d20..0000000 --- a/docs/api/actions/discretization.md +++ /dev/null @@ -1,4 +0,0 @@ - -::: auroris.curation.actions.Discretization - options: - filters: ["!^_"] \ No newline at end of file 
diff --git a/docs/api/actions/mol.md b/docs/api/actions/mol.md deleted file mode 100644 index 8d95039..0000000 --- a/docs/api/actions/mol.md +++ /dev/null @@ -1,4 +0,0 @@ - -::: auroris.curation.actions.MoleculeCuration - options: - filters: ["!^_"] \ No newline at end of file diff --git a/docs/api/actions/outlier_detection.md b/docs/api/actions/outlier_detection.md deleted file mode 100644 index 5a7b0b3..0000000 --- a/docs/api/actions/outlier_detection.md +++ /dev/null @@ -1,4 +0,0 @@ - -::: auroris.curation.actions.OutlierDetection - options: - filters: ["!^_"] \ No newline at end of file diff --git a/docs/api/actions/stereo_ac.md b/docs/api/actions/stereo_ac.md deleted file mode 100644 index 97f93a1..0000000 --- a/docs/api/actions/stereo_ac.md +++ /dev/null @@ -1,4 +0,0 @@ - -::: auroris.curation.actions.StereoIsomerACDetection - options: - filters: ["!^_"] \ No newline at end of file diff --git a/docs/api/functional.md b/docs/api/functional.md index 3518fa4..b0d1a1b 100644 --- a/docs/api/functional.md +++ b/docs/api/functional.md @@ -1,4 +1,6 @@ ::: auroris.curation.functional options: - filters: ["!^_"] \ No newline at end of file + filters: ["!^_"] + show_root_heading: False + show_root_toc_entry: False \ No newline at end of file diff --git a/docs/api/utils.md b/docs/api/utils.md new file mode 100644 index 0000000..3338f39 --- /dev/null +++ b/docs/api/utils.md @@ -0,0 +1,5 @@ +::: auroris.utils + options: + filters: ["!^_"] + show_root_heading: False + show_root_toc_entry: False \ No newline at end of file diff --git a/docs/api/visualization.md b/docs/api/visualization.md index 7dafb14..279d852 100644 --- a/docs/api/visualization.md +++ b/docs/api/visualization.md @@ -1,3 +1,5 @@ ::: auroris.visualization options: - filters: ["!^_"] \ No newline at end of file + filters: ["!^_"] + show_root_heading: False + show_root_toc_entry: False \ No newline at end of file diff --git a/env.yml b/env.yml index 3243d1c..0bab54f 100644 --- a/env.yml +++ b/env.yml @@ -45,8 
+45,4 @@ dependencies: - markdown-include - mdx_truly_sane_lists - nbconvert - - mike >=1.0.0 - - - pip - - pip: - - griffe_fieldz \ No newline at end of file + - mike >=1.0.0 \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 52aec8a..9ca3ffc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -18,16 +18,15 @@ nav: - Tutorials: - Getting Started: tutorials/getting_started.ipynb - API Reference: - - Curator: api/curator.md - - Actions: - - Deduplication: api/actions/deduplication.md - - Discretization: api/actions/discretization.md - - Molecule Curation: api/actions/mol.md - - Outlier Detection: api/actions/outlier_detection.md - - Stereoisomer AC: api/actions/stereo_ac.md - - Functional: api/functional.md - - Visualization: api/visualization.md - - Types: api/types.md + - Core: + - Curator: api/curator.md + - Actions: api/actions.md + - Functional API: + - Curation: api/functional.md + - Visualization: api/visualization.md + - Misc: + - Types: api/types.md + - Utils: api/utils.md - Community: https://discord.gg/vBFd8p6H7u - Polaris Hub: https://polarishub.io/ @@ -99,8 +98,6 @@ plugins: separate_signature: true show_signature_annotations: true line_length: 80 - extensions: - - griffe_fieldz: {include_inherited: true} # support pydantic data-class - mkdocs-jupyter: execute: False remove_tag_config: diff --git a/pyproject.toml b/pyproject.toml index 3810c7f..3e02eff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "datamol >=0.12.1", "pillow", "fsspec", + "pyarrow", ] [project.optional-dependencies]