update to community and impute

ersilia-os · Feb 13, 2025 · ccd0469 · ccd0469
1 parent 2e30fb9
commit ccd0469
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 15 deletions.
diff --git a/.gitattributes b/.gitattributes
diff --git a/Dockerfile b/Dockerfile
@@ -1,12 +1,10 @@
-FROM bentoml/model-server:0.11.0-py38
+FROM bentoml/model-server:0.11.0-py312
 MAINTAINER ersilia
 
-RUN pip install mordredcommunity==2.0.6
-RUN pip install networkx==3.2.1
-RUN pip install numpy==1.26.4
-RUN pip install pandas==1.3.5
-RUN pip install rdkit==2023.3.2
+RUN pip install mordredcommunity[full]==2.0.6
 RUN pip install timeout-decorator==0.5.0
+RUN pip install scikit-learn==1.6.1
+RUN pip install joblib==1.4.2
 
 WORKDIR /repo
 COPY . /repo
diff --git a/model/checkpoints/cols_to_drop.pkl b/model/checkpoints/cols_to_drop.pkl
diff --git a/model/checkpoints/imputer.pkl b/model/checkpoints/imputer.pkl
diff --git a/model/framework/code/main.py b/model/framework/code/main.py
@@ -1,13 +1,18 @@
+import os
 import csv
 import sys
 from rdkit import Chem
+import pandas as pd
 from mordred import Calculator, descriptors
 from timeout_decorator import timeout
-import pandas as pd
+import joblib
 
 infile = sys.argv[1]
 outfile = sys.argv[2]
 
+ROOT = os.path.abspath(os.path.dirname(__file__))
+checkpoints_dir = os.path.join(ROOT, "..","..","checkpoints")
+
 with open(infile, "r") as f:
     reader = csv.reader(f)
     next(reader)
@@ -23,7 +28,7 @@
 def one_molecule(mol):
     return calc(mol)
 
-
+#columns = calc._name_dict.keys()
 columns = list(calc.pandas([Chem.MolFromSmiles("CCCC")]).columns)
 
 R = []
@@ -34,5 +39,19 @@ def one_molecule(mol):
         r = [None for _ in range(len(columns))]
     R += [r]
 
-df = pd.DataFrame(R, columns=columns)
-df.to_csv(outfile, index=False)
+cols_to_drop = joblib.load(os.path.join(checkpoints_dir, "cols_to_drop.pkl"))
+imputer = joblib.load(os.path.join(checkpoints_dir,"imputer.pkl"))
+
+R = pd.DataFrame(R, columns=columns)
+R = R.drop(columns=cols_to_drop)
+R = imputer.transform(R)
+
+cols = [c for c in columns if c not in cols_to_drop]
+
+#TODO: we are now imputing molecules that might be all NaNs originally
+
+with open(outfile, "w") as f:
+    writer = csv.writer(f)
+    writer.writerow(cols)
+    for r in R:
+        writer.writerow(r)
diff --git a/model/framework/examples/output.csv b/model/framework/examples/output.csv