Updated notebooks and example for Stratification

kalininalab · Mar 26, 2024 · 6390166 · 6390166
1 parent 206e367
commit 6390166
Show file tree

Hide file tree

Showing 7 changed files with 340 additions and 468 deletions.
diff --git a/docs/examples/tox21.nblink b/docs/examples/tox21.nblink
@@ -0,0 +1,3 @@
+{
+  "path": "../../examples/tox21.ipynb"
+}
diff --git a/docs/index.rst b/docs/index.rst
@@ -83,6 +83,7 @@ arguments are mostly the same.
     examples/bace
     examples/pdbbind
     examples/rna
+    examples/tox21
 
 .. toctree::
     :maxdepth: 1

diff --git a/examples/bace.ipynb b/examples/bace.ipynb
@@ -115,7 +115,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## The output\n",
+    "### The output\n",
     "\n",
     "Finally, we inspect the e_split object, as it holds all the assignments of the datapoints to the splits for each run and each technique. First, the overall architecture is described; then, we look at the first five assignments of the C1 run 0."
    ]

diff --git a/examples/pdbbind.ipynb b/examples/pdbbind.ipynb
@@ -80,45 +80,6 @@
    "cell_type": "code",
    "execution_count": 3,
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[10:05:12] Explicit valence for atom # 50 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 287\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 50 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 0 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 48\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 0 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 26 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 119\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 26 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 26 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 116\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 26 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 4 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 108\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 4 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 23 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 146\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 23 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 2 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 45\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 2 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 42 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 172\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 42 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 6 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 94\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 6 C greater than permitted\n",
-      "[10:05:12] Explicit valence for atom # 28 C greater than permitted\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 159\n",
-      "[10:05:12] ERROR: Explicit valence for atom # 28 C greater than permitted\n",
-      "[10:05:12] Can't kekulize mol.  Unkekulized atoms: 2 6 7 17 23 24 25\n",
-      "[10:05:12] ERROR: Could not sanitize molecule ending on line 114\n",
-      "[10:05:12] ERROR: Can't kekulize mol.  Unkekulized atoms: 2 6 7 17 23 24 25\n"
-     ]
-    },
     {
      "data": {
       "text/plain": "      ids                                             Ligand  \\\n0    2d3u  Cc1ccccc1S(=O)(=O)Nc1cc(-c2ccc(C#N)cc2)sc1C(=O...   \n1    3cyx  CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2C[N@H+...   \n2    3uo4   O=C([O-])c1ccc(Nc2nccc(Nc3ccccc3-c3ccccc3)n2)cc1   \n3    1p1q             Cc1o[nH]c(=O)c1C[C@H]([NH3+])C(=O)[O-]   \n5    2wtv  O=C([O-])c1ccc(Nc2ncc3c(n2)-c2ccc(Cl)cc2C(c2c(...   \n..    ...                                                ...   \n188  2x0y               Cn1c(=O)c2c(ncn2C[C@H](O)CO)n(C)c1=O   \n189  3uex                         CCCCCCCCCCCCCCCCCC(=O)[O-]   \n190  2pq9  O=C([O-])C1=C[C@@H](OP(=O)([O-])[O-])[C@@H](O)...   \n191  1u1b  Cc1cn([C@H]2C[C@H](O[P@](=O)([O-])O[P@](=O)([O...   \n192  4gqq                        CCOC(=O)/C=C/c1ccc(O)c(O)c1   \n\n                                   Target         y  \n0    /tmp/v2013-core/2d3u/2d3u_pocket.pdb  0.268375  \n1    /tmp/v2013-core/3cyx/3cyx_pocket.pdb  0.749538  \n2    /tmp/v2013-core/3uo4/3uo4_pocket.pdb  0.090166  \n3    /tmp/v2013-core/1p1q/1p1q_pocket.pdb -0.636034  \n5    /tmp/v2013-core/2wtv/2wtv_pocket.pdb  1.079223  \n..                                    ...       ...  \n188  /tmp/v2013-core/2x0y/2x0y_pocket.pdb -0.765235  \n189  /tmp/v2013-core/3uex/3uex_pocket.pdb  0.268375  \n190  /tmp/v2013-core/2pq9/2pq9_pocket.pdb  0.798545  \n191  /tmp/v2013-core/1u1b/1u1b_pocket.pdb  0.660433  \n192  /tmp/v2013-core/4gqq/4gqq_pocket.pdb -1.527076  \n\n[182 rows x 4 columns]",
@@ -130,6 +91,9 @@
     }
    ],
    "source": [
+    "from rdkit import rdBase\n",
+    "blocker = rdBase.BlockLogs()\n",
+    "\n",
     "def sdf2smiles(x):\n",
     "    mols = Chem.SDMolSupplier(x)\n",
     "    if len(mols) != 1:\n",
@@ -227,7 +191,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## The output\n",
+    "### The output\n",
     "\n",
     "Finally, we inspect the returned split assignments, as they hold all the assignments of the datapoints to the splits for each run and each technique. First, the overall architecture is described; then, we look at the first five assignments of the C1 run 0."
    ]
@@ -259,9 +223,9 @@
       "\tRun 2 - Type: <class 'dict'> - 182 assignments\n",
       "\tRun 3 - Type: <class 'dict'> - 182 assignments\n",
       "\n",
-      "ID: 2d3u - Split: val\n",
-      "ID: 3cyx - Split: val\n",
-      "ID: 3pww - Split: val\n",
+      "ID: 2d3u - Split: train\n",
+      "ID: 3cyx - Split: train\n",
+      "ID: 3pww - Split: train\n",
       "ID: 3uo4 - Split: train\n",
       "ID: 1p1q - Split: train\n"
      ]
@@ -302,11 +266,11 @@
       "\tRun 2 - Type: <class 'dict'> - 182 assignments\n",
       "\tRun 3 - Type: <class 'dict'> - 182 assignments\n",
       "\n",
-      "ID: 2d3u - Split: train\n",
-      "ID: 3cyx - Split: val\n",
-      "ID: 3uo4 - Split: train\n",
-      "ID: 1p1q - Split: train\n",
-      "ID: 2wtv - Split: train\n"
+      "ID: 2d3u - Split: val\n",
+      "ID: 3cyx - Split: train\n",
+      "ID: 3uo4 - Split: val\n",
+      "ID: 1p1q - Split: test\n",
+      "ID: 2wtv - Split: val\n"
      ]
     }
    ],
@@ -361,10 +325,10 @@
       "\tRun 3 - Type: <class 'dict'> - 182 assignments\n",
       "\n",
       "ID: ('2d3u', '2d3u') - Split: not selected\n",
-      "ID: ('3cyx', '3cyx') - Split: val\n",
-      "ID: ('3uo4', '3uo4') - Split: train\n",
-      "ID: ('1p1q', '1p1q') - Split: train\n",
-      "ID: ('2wtv', '2wtv') - Split: train\n"
+      "ID: ('3cyx', '3cyx') - Split: train\n",
+      "ID: ('3uo4', '3uo4') - Split: not selected\n",
+      "ID: ('1p1q', '1p1q') - Split: not selected\n",
+      "ID: ('2wtv', '2wtv') - Split: not selected\n"
      ]
     }
    ],