Skip to content

Commit

Permalink
Reviewed @zhu0619's changes. Switched to jinja2 for HTML broadcaster
Browse files Browse the repository at this point in the history
  • Loading branch information
cwognum committed May 7, 2024
1 parent f71d10f commit 8c074be
Show file tree
Hide file tree
Showing 21 changed files with 318 additions and 481 deletions.
10 changes: 5 additions & 5 deletions .github/ISSUE_TEMPLATE/bug-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ body:
value: |
Please provide the following information.
- type: input
id: alchemy-version
id: auroris-version
attributes:
label: Alchemy version
description: Value of ``alchemy.__version__``
label: Auroris version
description: Value of ``auroris.__version__``
placeholder: 0.2.5, 0.3.0, 0.3.1, etc.
validations:
required: true
Expand All @@ -34,7 +34,7 @@ body:
id: installation
attributes:
label: Installation
description: How was Alchemy installed?
description: How was Auroris installed?
placeholder: e.g., "using pip into virtual environment", or "using conda"
validations:
required: true
Expand All @@ -56,4 +56,4 @@ body:
id: additional-output
attributes:
label: Additional output
description: If you think it might be relevant, please provide the output from ``pip freeze`` or ``conda env export`` depending on which was used to install Alchemy.
description: If you think it might be relevant, please provide the output from ``pip freeze`` or ``conda env export`` depending on which was used to install Auroris.
27 changes: 6 additions & 21 deletions auroris/curation/_curator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import json
from typing import List, Tuple, Union, Dict
from typing import List, Tuple, Union

from loguru import logger
import fsspec
import pandas as pd
from loguru import logger
from pydantic import BaseModel, Field, field_serializer, field_validator

from auroris.curation.actions._base import ACTION_REGISTRY
from auroris.curation.actions._base import ACTION_REGISTRY, BaseAction
from auroris.report import CurationReport
from auroris.types import VerbosityLevel
from auroris.curation.actions._discretize import Discretization


class Curator(BaseModel):
Expand All @@ -27,9 +26,6 @@ class Curator(BaseModel):
verbosity: VerbosityLevel = VerbosityLevel.NORMAL
parallelized_kwargs: dict = Field(default_factory=dict)

state: List[str] = []
_discretizers: Dict[str, Discretization] = {}

@field_validator("verbosity", mode="before")
def _validate_verbosity(cls, v):
if not isinstance(v, VerbosityLevel):
Expand All @@ -42,30 +38,19 @@ def _serialize_verbosity(self, value: VerbosityLevel):

def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport]:
report = CurationReport()

dataset = dataset.copy(deep=True)

action: BaseAction
for action in self.steps:
logger.info(f"Performing step: {action.name}")
if action._dep_action and action._dep_action not in self.state:
raise RuntimeError(f"{action._dep_action} should be called before {action.name}.")
with report.section(action.name):
kwargs = {}

if action.name == "Discretization":
self._discretizers[action.input_column] = action

if action.name == "DataDistribution":
kwargs = {"discretizers": self._discretizers}

with report.section(action.name):
dataset = action.transform(
dataset,
report=report,
verbosity=self.verbosity,
parallelized_kwargs=self.parallelized_kwargs,
**kwargs,
)
action.completed = True
self.state.append(action.name)

return dataset, report

Expand Down
4 changes: 2 additions & 2 deletions auroris/curation/actions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from ._base import BaseAction
from ._deduplicate import Deduplication
from ._discretize import Discretization
from ._distribution import ContinuousDistributionVisualization
from ._mol import MoleculeCuration
from ._outlier import OutlierDetection
from ._distribution import DataDistribution

__all__ = [
"BaseAction",
Expand All @@ -13,5 +13,5 @@
"Deduplication",
"Discretization",
"StereoIsomerACDetection",
"DataDistribution",
"ContinuousDistributionVisualization",
]
10 changes: 4 additions & 6 deletions auroris/curation/actions/_ac_stereoisomer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,8 @@ def detect_streoisomer_activity_cliff(
ac_cols[y_col].extend([ac] * len(group))

for y_col in y_cols:
dataset_ori.loc[group_index_list.flatten(), f"{prefix}{y_col}"] = np.array(ac_cols[y_col]).astype(
bool
)
rows = group_index_list.flatten()
dataset_ori.loc[rows, f"{prefix}{y_col}"] = np.array(ac_cols[y_col]).astype(bool)

return dataset_ori

Expand All @@ -55,12 +54,11 @@ class StereoIsomerACDetection(BaseAction):
Automatic detection of activity cliffs between stereoisomers.
"""

stereoisomer_id_col: Optional[str] = "MOL_molhash_id_no_stereo"
y_cols: Optional[List[str]] = None
stereoisomer_id_col: str
y_cols: List[str]
threshold: float = 2.0
prefix: str = "AC_"
mol_col: str = "MOL_smiles"
_dep_action = "MoleculeCuration"

def transform(
self,
Expand Down
13 changes: 1 addition & 12 deletions auroris/curation/actions/_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import abc
from typing import TYPE_CHECKING, Dict, Optional, Literal
from typing import TYPE_CHECKING, Dict, Optional

import pandas as pd
from pydantic import BaseModel, model_validator
Expand All @@ -24,17 +24,6 @@ class BaseAction(BaseModel, abc.ABC):
"""

prefix: str = None
completed: bool = False
_dep_action: Optional[
Literal[
"MoleculeCuration",
"OutlierDetection",
"Deduplication",
"Discretization",
"StereoIsomerACDetection",
"DataDistribution",
]
] = None

@property
def name(self) -> str:
Expand Down
14 changes: 11 additions & 3 deletions auroris/curation/actions/_discretize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Literal, Optional, Union, List
from typing import Dict, List, Literal, Optional, Union

import numpy as np
import pandas as pd
Expand All @@ -7,6 +7,7 @@
from auroris.curation.actions._base import BaseAction
from auroris.report import CurationReport
from auroris.types import VerbosityLevel
from auroris.visualization import visualize_continuous_distribution


def discretize(
Expand All @@ -23,8 +24,7 @@ def discretize(
scipy.sparse matrices should be in CSR or CSC format to avoid an
unnecessary copy.
thresholds: Feature values below or equal to this are replaced by 0, above it by 1.
Threshold may not be less than 0 for operations on sparse matrices.
thresholds: Interval boundaries that include the right bin edge.
inplace: Set to True to perform inplace discretization and avoid a copy
(if the input is already a numpy array or a scipy.sparse CSR / CSC
Expand Down Expand Up @@ -82,6 +82,7 @@ class Discretization(BaseAction):
inplace: bool = False
allow_nan: bool = True
label_order: Literal["ascending", "descending"] = "ascending"
log_scale: bool = True

def transform(
self,
Expand All @@ -99,6 +100,13 @@ def transform(
label_order=self.label_order,
)

fig = visualize_continuous_distribution(
data=dataset[self.input_column].values,
log_scale=self.log_scale,
bins=self.thresholds,
)
report.log_image(fig, title=self.input_column)

column_name = self.get_column_name(self.input_column)
dataset[column_name] = X

Expand Down
32 changes: 6 additions & 26 deletions auroris/curation/actions/_distribution.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
from typing import Dict, List, Optional

import pandas as pd
from pydantic import Field
import numpy as np

from auroris.curation.actions._base import BaseAction
from auroris.report import CurationReport
from auroris.types import VerbosityLevel
from auroris.visualization import detailed_distributions_plots
from auroris.visualization import visualize_continuous_distribution


class DataDistribution(BaseAction):
class ContinuousDistributionVisualization(BaseAction):
"""
Access the data distribution
Visualize a continuous distribution
"""

y_cols: Optional[List[str]] = None
Expand All @@ -21,34 +21,14 @@ class DataDistribution(BaseAction):
def transform(
self,
dataset: pd.DataFrame,
discretizers: Optional[callable] = None,
report: Optional[CurationReport] = None,
verbosity: VerbosityLevel = VerbosityLevel.NORMAL,
parallelized_kwargs: Optional[Dict] = None,
):
if report is not None:
for y_col in self.y_cols:
discretizer = discretizers.get(y_col)
sections = []
if discretizer is not None:
low = -np.inf
high = np.inf
for i, threshold in enumerate(discretizer.thresholds + [high]):
X = dataset[f"{discretizer.prefix}{y_col}"].values
if discretizer.label_order == "descending":
i = len(discretizer.thresholds) - i
pct = 100 * sum(X == i) / len(X)
sections.append(
{
"label": f"{discretizer.prefix}{y_col} = {i}: {pct:.1f} %",
"start": low,
"end": threshold,
"pct": pct,
}
)
low = threshold
fig = detailed_distributions_plots(
data=dataset[y_col], label_name=y_col, sections=sections, log_scale=self.log_scale
fig = visualize_continuous_distribution(
data=dataset[y_col], label_name=y_col, log_scale=self.log_scale
)
report.log_image(fig, title=f"Data distribution - {y_col}")

Expand Down
2 changes: 1 addition & 1 deletion auroris/curation/actions/_mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def transform(
X = np.array([dm.to_fp(smi) for smi in smiles])

fig = visualize_chemspace(X=X)
report.log_image(fig, "Distribution in Chemical Space", "This is a test")
report.log_image(fig, "Distribution in Chemical Space")

if self.count_stereocenters:
# Plot all compounds with undefined stereocenters for visual inspection
Expand Down
2 changes: 0 additions & 2 deletions auroris/curation/actions/_outlier.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,9 @@ def detect_outliers(X: np.ndarray, method: OutlierDetectionMethod = "zscore", **
in_ = X[indices].reshape(-1, 1)
out_ = detector.fit_predict(in_)

# is_inlier = np.zeros_like(X, dtype=int)
is_inlier = np.full_like(X, np.nan)
is_inlier[indices] = out_.flatten()

# is_outlier = 1 - ((is_inlier + 1) / 2)
is_outlier = is_inlier == -1
return is_outlier

Expand Down
4 changes: 2 additions & 2 deletions auroris/report/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from ._report import CurationReport, Section
from ._report import AnnotatedImage, CurationReport, Section

__all__ = ["CurationReport", "Section"]
__all__ = ["CurationReport", "Section", "AnnotatedImage"]
48 changes: 25 additions & 23 deletions auroris/report/_report.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
from contextlib import contextmanager
from datetime import datetime
from typing import List, Optional, Union, ByteString
from typing import ByteString, List, Optional, Union

from matplotlib import pyplot as plt
from matplotlib.figure import Figure
from PIL.Image import Image as ImageType
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from IPython.core.display import Image as IPy_Image
from PIL.PngImagePlugin import PngImageFile

from auroris import __version__
from auroris.utils import fig2bytes, png2bytes
from auroris.utils import fig2img


class Image(BaseModel):
class AnnotatedImage(BaseModel):
"""
A image subsection in a report
Image data, potentially with a title and / or description.
"""

image_data: bytes
title: str
description: str
image: ImageType
title: Optional[str] = ""
description: Optional[str] = ""

model_config = ConfigDict(arbitrary_types_allowed=True)


class Section(BaseModel):
Expand All @@ -30,16 +30,15 @@ class Section(BaseModel):

title: str
logs: List[str] = Field(default_factory=list)
images: List[Image] = Field(default_factory=list)

model_config = ConfigDict(arbitrary_types_allowed=True)
images: List[AnnotatedImage] = Field(default_factory=list)


class CurationReport(BaseModel):
"""
A report that summarizes the changes of the curation process.
"""

title: str = "Curation Report"
sections: List[Section] = Field(default_factory=list)
auroris_version: str = Field(default=__version__)
time_stamp: datetime = Field(default_factory=datetime.now)
Expand All @@ -63,27 +62,30 @@ def section(self, name: str):

def log(self, message: str):
"""Log a message to the report"""
self._check_active_section()
self._active_section.logs.append(message)

def log_new_column(self, name: str):
"""Log that a new column has been added to the dataset"""
self.log(f"New column added: {name}")

def log_image(
self, image_or_figure: Union[ImageType, Figure, ByteString], title: str = "", description: str = ""
self,
image_or_figure: Union[ImageType, Figure, ByteString],
title: Optional[str] = None,
description: Optional[str] = None,
):
"""Logs an image. Also accepts Matplotlib figures, which will be converted to images."""

if isinstance(image_or_figure, IPy_Image):
image_data = image_or_figure.data

elif isinstance(image_or_figure, Figure):
image_data = fig2bytes(image_or_figure)
self._check_active_section()
if isinstance(image_or_figure, Figure):
image = fig2img(image_or_figure)
plt.close(image_or_figure)
elif isinstance(image_or_figure, PngImageFile):
image_data = png2bytes(image_or_figure)
else:
image_data = image_or_figure
image = image_or_figure

image = Image(image_data=image_data, title=title, description=description)
image = AnnotatedImage(image=image, title=title, description=description)
self._active_section.images.append(image)

def _check_active_section(self):
if self._active_section is None:
raise RuntimeError("No active section. Use `with report.section(name):`")
Loading

0 comments on commit 8c074be

Please sign in to comment.