Skip to content

Commit

Permalink
Merge pull request #16 from ginkgobioworks/zulko/download_tracks
Browse files Browse the repository at this point in the history
adds a way to see available borzoi tracks
  • Loading branch information
Zulko authored Dec 17, 2024
2 parents 2ff9531 + de2d290 commit 6f0499b
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 1 deletion.
4 changes: 3 additions & 1 deletion examples/promoter_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
)

response = client.send_request(query)
print(response)
print("Single-query response:", response)


# In this next example we pull the promoter files from a fasta file and send them
Expand All @@ -39,6 +39,8 @@
},
)

print("Now sending 100 requests, by batches of 10")
print("Writing results to promoter_activity.jsonl...")
output_file = Path(__file__).parent / "outputs" / "promoter_activity.jsonl"
for batch_result in client.send_requests_by_batches(queries, batch_size=10):
for query_result in batch_result:
Expand Down
30 changes: 30 additions & 0 deletions ginkgo_ai_client/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from typing import Dict, Optional, Any, List, Literal, Union
from abc import ABC, abstractmethod
from pathlib import Path
from functools import lru_cache
import json
import yaml
import tempfile

import pydantic
import requests
import pandas

from ginkgo_ai_client.utils import (
fasta_sequence_iterator,
Expand Down Expand Up @@ -405,6 +407,34 @@ def list_with_promoter_from_fasta(
)
return list(iterator)

@classmethod
@lru_cache(maxsize=1)
def _get_full_tissue_dataframe(cls):
    """Download and return the complete, unfiltered table of tracks.

    The table is read as CSV from a Google Drive download link. The result is
    memoized with ``lru_cache(maxsize=1)``, so repeated calls on the same
    class reuse the DataFrame instead of downloading it again. The returned
    object is the cached one: callers should not modify it in place.
    """
    file_id = "13eQTxjqW3KMCzbaRYUSbZiyzXCaNYTIg"
    return pandas.read_csv(
        f"https://drive.google.com/uc?export=download&id={file_id}"
    )

@classmethod
def get_tissue_track_dataframe(
    cls, tissue: Optional[str] = None, assay: Optional[str] = None
) -> pandas.DataFrame:
    """Return a pandas DataFrame with the tissues and their corresponding tracks.

    Parameters
    ----------
    tissue: str, optional
        If provided, only rows whose "sample" column contains this pattern
        are returned. The match is case-insensitive.
    assay: str, optional
        If provided, only rows whose "assay" column contains this pattern
        are returned. The match is case-sensitive.

    Returns
    -------
    pandas.DataFrame
        A DataFrame that is independent from the internally cached table, so
        it can be modified freely. Rows with a missing value in a filtered
        column are excluded.

    Notes
    -----
    Patterns are passed to ``pandas.Series.str.contains``, which treats them
    as regular expressions by default.
    """
    df = cls._get_full_tissue_dataframe()
    if tissue is None and assay is None:
        # Without filtering we would hand out the cached object itself; copy
        # it so that in-place edits by the caller cannot corrupt the cache.
        return df.copy()
    if tissue is not None:
        # na=False: rows with a missing sample name count as non-matching
        # instead of producing a NaN mask that cannot be used for indexing.
        df = df[df["sample"].str.contains(tissue, case=False, na=False)]
    if assay is not None:
        df = df[df["assay"].str.contains(assay, na=False)]
    return df


## ---- DIFFUSION QUERIES ---------------------------------------------------------

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies = [
"requests",
"tqdm",
"pyyaml",
"pandas",
]

[project.optional-dependencies]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ requests==2.32.3
biopython==1.82.0
pytest-xdist==3.6.1
pytest-cov==4.0.0
pandas==2.1.2

sphinx==8.1.3
docutils==0.21.2
Expand Down
5 changes: 5 additions & 0 deletions test/test_query_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ def test_promoter_activity_iteration():
assert len(queries) == 50


def test_get_tissue_tracks():
    """Filtering the track table on heart tissue and DNASE assay gives 22 rows."""
    heart_dnase_tracks = PromoterActivityQuery.get_tissue_track_dataframe(
        tissue="heart", assay="DNASE"
    )
    assert len(heart_dnase_tracks) == 22


@pytest.mark.parametrize(
"filename, expected_sequences",
[
Expand Down

0 comments on commit 6f0499b

Please sign in to comment.