Minor code to build atomic pretraining dataset

BojarLab · Aug 15, 2024 · e74c2fb · e74c2fb
1 parent 4e9327f
commit e74c2fb
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 2 deletions.
diff --git a/gifflar/acquisition/__init__.py b/gifflar/acquisition/__init__.py
diff --git a/gifflar/acquisition/candycrunsh.py b/gifflar/acquisition/candycrunsh.py
@@ -0,0 +1,23 @@
+import pickle
+
+from gifflar.data import GlycanStorage
+# Script: query a GlycanStorage for every entry of collected.pkl's glycan list and pickle the "smiles" of each hit.
+with open("collected.pkl", "rb") as f:  # relative path: resolved against the current working directory
+    _, unique_glycans, _ = pickle.load(f)  # payload must unpack into 3 items; only the middle one is used
+# NOTE(review): pickle.load can execute arbitrary code, so collected.pkl must come from a trusted source.
+gs = GlycanStorage("C:/Users/joere/Desktop")  # NOTE(review): hard-coded, machine-specific path
+print("Loaded GlycanStorage:", len(gs.data))
+# Build {glycan: res["smiles"]}; presumably the glycans are IUPAC strings (loop variable name) - confirm.
+data = {}
+for i, iupac in enumerate(unique_glycans):
+    try:
+        print(f"\r{i}", end="")  # "\r" with end="" rewrites the progress counter on the same console line
+        res = gs.query(iupac)
+        if res:  # falsy query results are skipped, so such glycans are absent from the output
+            data[iupac] = res["smiles"]
+    except Exception as e:  # best-effort: print the error and continue with the next glycan
+        print(e)
+
+print(len(data))  # number of glycans for which a "smiles" entry was stored
+with open("glycan_smiles.pkl", "wb") as f:
+    pickle.dump(data, f)  # written to the current working directory
diff --git a/gifflar/data.py b/gifflar/data.py
@@ -224,7 +224,7 @@ def hetero_collate(data: Optional[Union[List[List[HeteroData]], List[HeteroData]
 
 
 class GlycanStorage:
-    def __init__(self, path: Optional[Path] = None):
+    def __init__(self, path: Optional[Path | str] = None):
         """
         Initialize the wrapper around a dict.
 

diff --git a/requirements.txt b/requirements.txt
@@ -5,7 +5,7 @@ scikit-learn
 numpy
 pandas
 git+https://github.com/KalininaLab/glyles.git
-git+https://github.com/BojarLab/glycowork.git@ab73b62d7d500d321592cca20cdf3b8507f60026
+git+https://github.com/BojarLab/glycowork.git@3b8a6619ecc4b256cb28b43a47f17ef7df0ecaf3
 jsonargparse
 rich
 pytorch-lightning