Skip to content

Commit

Permalink
update to community and impute
Browse files Browse the repository at this point in the history
  • Loading branch information
GemmaTuron committed Feb 13, 2025
1 parent 2e30fb9 commit ccd0469
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 15 deletions.
1 change: 0 additions & 1 deletion .gitattributes

This file was deleted.

10 changes: 4 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
FROM bentoml/model-server:0.11.0-py38
FROM bentoml/model-server:0.11.0-py312
MAINTAINER ersilia

RUN pip install mordredcommunity==2.0.6
RUN pip install networkx==3.2.1
RUN pip install numpy==1.26.4
RUN pip install pandas==1.3.5
RUN pip install rdkit==2023.3.2
RUN pip install mordredcommunity[full]==2.0.6
RUN pip install timeout-decorator==0.5.0
RUN pip install scikit-learn==1.6.1
RUN pip install joblib==1.4.2

WORKDIR /repo
COPY . /repo
Binary file added model/checkpoints/cols_to_drop.pkl
Binary file not shown.
Binary file added model/checkpoints/imputer.pkl
Binary file not shown.
27 changes: 23 additions & 4 deletions model/framework/code/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
import os
import csv
import sys
from rdkit import Chem
import pandas as pd
from mordred import Calculator, descriptors
from timeout_decorator import timeout
import pandas as pd
import joblib

infile = sys.argv[1]
outfile = sys.argv[2]

ROOT = os.path.abspath(os.path.dirname(__file__))
checkpoints_dir = os.path.join(ROOT, "..","..","checkpoints")

with open(infile, "r") as f:
reader = csv.reader(f)
next(reader)
Expand All @@ -23,7 +28,7 @@
def one_molecule(mol):
return calc(mol)


#columns = calc._name_dict.keys()
columns = list(calc.pandas([Chem.MolFromSmiles("CCCC")]).columns)

R = []
Expand All @@ -34,5 +39,19 @@ def one_molecule(mol):
r = [None for _ in range(len(columns))]
R += [r]

df = pd.DataFrame(R, columns=columns)
df.to_csv(outfile, index=False)
# Post-process the raw descriptor rows: drop the columns listed in the
# checkpoint, impute the remaining values, and write the result as CSV.
# NOTE(review): joblib.load unpickles these files; they are read from the
# checkpoints directory next to this script, so they must stay trusted.
cols_to_drop = joblib.load(os.path.join(checkpoints_dir, "cols_to_drop.pkl"))
# presumably a fitted scikit-learn imputer -- confirm against the training code
imputer = joblib.load(os.path.join(checkpoints_dir, "imputer.pkl"))

R = pd.DataFrame(R, columns=columns)
R = R.drop(columns=cols_to_drop)
R = imputer.transform(R)

# Header for the output: the original descriptor names, in order, minus the
# dropped ones.
cols = [c for c in columns if c not in cols_to_drop]

# TODO: molecules whose descriptors were all missing (rows filled with None
# upstream) are imputed as well instead of being reported as empty.

# newline="" lets the csv module control line endings itself; without it,
# extra blank rows can appear on platforms that translate "\n".
with open(outfile, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(cols)
    writer.writerows(R)
8 changes: 4 additions & 4 deletions model/framework/examples/output.csv

Large diffs are not rendered by default.

0 comments on commit ccd0469

Please sign in to comment.