Merge pull request #16 from ginkgobioworks/zulko/download_tracks

Adds a way to see the available Borzoi tracks.
ginkgobioworks · Dec 17, 2024 · 6f0499b · 6f0499b
2 parents 2ff9531 + de2d290
commit 6f0499b
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 1 deletion.
diff --git a/examples/promoter_activity.py b/examples/promoter_activity.py
@@ -22,7 +22,7 @@
 )
 
 response = client.send_request(query)
-print(response)
+print("Single-query response:", response)
 
 
 # In this next example we pull the promoter files from a fasta file and send them
@@ -39,6 +39,8 @@
     },
 )
 
+print("Now sending 100 requests, by batches of 10")
+print("Writing results to promoter_activity.jsonl...")
 output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"
 for batch_result in client.send_requests_by_batches(queries, batch_size=10):
     for query_result in batch_result:

diff --git a/ginkgo_ai_client/queries.py b/ginkgo_ai_client/queries.py
@@ -3,12 +3,14 @@
 from typing import Dict, Optional, Any, List, Literal, Union
 from abc import ABC, abstractmethod
 from pathlib import Path
+from functools import lru_cache
 import json
 import yaml
 import tempfile
 
 import pydantic
 import requests
+import pandas
 
 from ginkgo_ai_client.utils import (
     fasta_sequence_iterator,
@@ -405,6 +407,34 @@ def list_with_promoter_from_fasta(
         )
         return list(iterator)
 
+    @classmethod
+    @lru_cache(maxsize=1)
+    def _get_full_tissue_dataframe(cls):
+        """Download the complete track table and return it as a DataFrame.
+
+        The CSV is read straight from a Google Drive download URL, so the
+        first call needs network access. The result is memoized through
+        ``lru_cache(maxsize=1)``: later calls return the very same DataFrame
+        object instead of downloading the file again.
+        """
+        file_id = "13eQTxjqW3KMCzbaRYUSbZiyzXCaNYTIg"
+        csv_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+        return pandas.read_csv(csv_url)
+
+    @classmethod
+    def get_tissue_track_dataframe(
+        cls, tissue: Optional[str] = None, assay: Optional[str] = None
+    ) -> pandas.DataFrame:
+        """Return a pandas DataFrame with the tissues and their corresponding tracks.
+
+        Parameters
+        ----------
+        tissue: str, optional
+            If provided, only rows whose ``sample`` value contains this pattern
+            are returned. The match is case-insensitive.
+        assay: str, optional
+            If provided, only rows whose ``assay`` value contains this pattern
+            are returned. The match is case-sensitive.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A copy of the (optionally filtered) track table. Modifying it does
+            not affect the table cached by ``_get_full_tissue_dataframe``.
+
+        Notes
+        -----
+        Both patterns are passed to ``Series.str.contains`` and are therefore
+        interpreted as regular expressions (pandas' default). Rows with a
+        missing ``sample`` or ``assay`` value never match a filter.
+        """
+        df = cls._get_full_tissue_dataframe()
+        if tissue is not None:
+            # na=False: a missing cell counts as "no match" instead of putting
+            # NaN in the mask, which boolean indexing would reject.
+            df = df[df["sample"].str.contains(tissue, case=False, na=False)]
+        if assay is not None:
+            df = df[df["assay"].str.contains(assay, na=False)]
+        # The full table is shared through an lru_cache; hand out a copy so
+        # that callers editing the result cannot corrupt the cached object.
+        return df.copy()
+
 
 ## ---- DIFFUSION QUERIES ---------------------------------------------------------
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,7 @@ dependencies = [
     "requests",
     "tqdm",
     "pyyaml",
+    "pandas",
 ]
 
 [project.optional-dependencies]

diff --git a/requirements.txt b/requirements.txt
@@ -3,6 +3,7 @@ requests==2.32.3
 biopython==1.82.0
 pytest-xdist==3.6.1
 pytest-cov==4.0.0
+pandas==2.1.2
 
 sphinx==8.1.3
 docutils==0.21.2

diff --git a/test/test_query_creation.py b/test/test_query_creation.py
@@ -44,6 +44,11 @@ def test_promoter_activity_iteration():
     assert len(queries) == 50
 
 
+def test_get_tissue_tracks():
+    """Filtering on tissue "heart" and assay "DNASE" should give 22 rows."""
+    heart_dnase_tracks = PromoterActivityQuery.get_tissue_track_dataframe(
+        tissue="heart", assay="DNASE"
+    )
+    assert len(heart_dnase_tracks) == 22
+
+
 @pytest.mark.parametrize(
     "filename, expected_sequences",
     [