-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/standardized output rebase (#109)
* move everything to src * enable plugin loading from top-level working directory * add 1000G test files * draft CAF output validation * update readme * complete docstring on worked example method * update code and fixtures to get exact chr field from vrsix index * improve pip install // make worked example tsv path relative * bump version * pin va-spec-python dependency to particular commit
- Loading branch information
Showing
26 changed files
with
435 additions
and
211 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import pysam | ||
|
||
from plugin_system.plugins.base_plugin import BasePlugin | ||
from plugin_system.utils import ( | ||
load_dict, | ||
csv_to_dataframe, | ||
terra_data_table_to_dataframe, | ||
) | ||
|
||
|
||
class ThousandGenomesPlugin(BasePlugin):
    """Plugin for AnVIL 1000G PRIMED data release on Terra.

    Link: https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model

    The "phenotypes" indexed by this plugin are the distinct values of the
    ``country_of_recruitment`` column, keyed by ``subject_id``.

    Note that get_phenotype_index is inherited from the parent BasePlugin class.
    """

    def __init__(
        self, phenotype_table_path: str | None = None, index_path: str | None = None
    ):
        """Set the phenotype index, either from an existing index file or from a table.

        If ``index_path`` is given, the index is loaded from that file. Otherwise the
        index is built from the csv/tsv at ``phenotype_table_path`` or, when no path
        is given either, from the Terra data table "population_descriptor".

        Index example: {"sample_A": ["USA"], "sample_B": ["USA", "GBR"]}

        Note that we actively do not use super() to invoke the BasePlugin's
        constructor to create custom functionality.

        Args:
            phenotype_table_path (str, optional): Path to a csv/tsv holding
                "subject_id" and "country_of_recruitment" columns. When not
                specified, defaults to loading the Terra data table titled
                "population_descriptor". Defaults to None.
            index_path (str, optional): Path to existing phenotype index.
                Takes precedence over ``phenotype_table_path``. Defaults to None.
        """

        self.phenotype_index = self.__create_phenotype_index(
            phenotype_table_path=phenotype_table_path, index_path=index_path
        )

    def __create_phenotype_index(
        self, phenotype_table_path: str | None = None, index_path: str | None = None
    ) -> dict[str, list[str]]:
        """[private method] Build a mapping from each subject to its phenotypes.

        Args:
            phenotype_table_path (str, optional): Path to a csv/tsv holding
                "subject_id" and "country_of_recruitment" columns. When not
                specified, defaults to loading the Terra data table titled
                "population_descriptor". Defaults to None.
            index_path (str, optional): Path to pre-computed index. When given,
                the index is loaded from it and no table is read. Defaults to None.

        Returns:
            dict[str, list[str]]: index of a subject id to the distinct
                "country_of_recruitment" values recorded for that subject.
        """

        # load index from file if already created
        if index_path is not None:
            return load_dict(index_path)

        # if no path specified, load phenotype table from Terra Data Table by
        # default (must be in Terra workspace)
        if phenotype_table_path is None:
            phenotype_df = terra_data_table_to_dataframe(
                table_name="population_descriptor"
            )
        else:  # otherwise load phenotype data table from file
            phenotype_df = csv_to_dataframe(phenotype_table_path)

        # Group once instead of re-filtering the whole table for every subject.
        # sort=False keeps subjects in order of first appearance; rows whose
        # subject_id is missing are skipped by groupby.
        countries_by_subject = phenotype_df.groupby("subject_id", sort=False)[
            "country_of_recruitment"
        ]

        return {
            subject_id: list(countries.unique())
            for subject_id, countries in countries_by_subject
        }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import json | ||
import os | ||
import subprocess | ||
|
||
from ga4gh.vrs.extras.translator import AlleleTranslator | ||
from ga4gh.vrs.dataproxy import create_dataproxy | ||
from plugin_system.plugin_manager import PluginManager | ||
from vrs_anvil.evidence import get_cohort_allele_frequency | ||
|
||
|
||
# Worked example: compute a cohort allele frequency (CAF) for one variant using
# the 1000G plugin and a vrsix index built from a VRS-annotated VCF.

# run this in 1000g directory
# (explicit check instead of `assert`, which is stripped when running with -O)
if not os.getcwd().endswith("1000g"):
    raise RuntimeError(
        "to ensure the plugin can be located, please run this in the 1000g directory"
    )

# set variable for variant data input
variant_id = "chr1-20094-TAA-T"
vcf_path = "../tests/fixtures/1kGP.chr1.1000.vrs.vcf.gz"

# set path to write VCF index to
vcf_index_path = "1000g_chr1_index.db"

# set phenotype-specific inputs
phenotype = "USA"  # to create subcohorts
phenotype_table = "population_descriptor.tsv"  # downloaded from https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model/data

# create vcf index from vcf at the specified path using vrsix
command = ["vrsix", "load", f"--db-location={vcf_index_path}", vcf_path]
try:
    subprocess.run(command, check=True, text=True, capture_output=True)
    print("vrsix command executed successfully!")
except subprocess.CalledProcessError as e:
    # A failed load is reported but is not fatal: the remaining steps still run.
    # NOTE(review): presumably so an index left by an earlier run can be reused
    # -- confirm.
    print("Error executing vrsix command:", e.stderr)

# get VRS ID from variant of interest
seqrepo_rest_service_url = "seqrepo+https://services.genomicmedlab.org/seqrepo"
seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)
allele_translator = AlleleTranslator(seqrepo_dataproxy)
allele = allele_translator.translate_from(variant_id)
vrs_id = allele.id

# instantiate 1000G plugin class with phenotype table
plugin = PluginManager().load_plugin("ThousandGenomesPlugin")
simple_plugin = plugin(phenotype_table)

# generating cohort allele frequency using 1000G plugin
caf = get_cohort_allele_frequency(
    variant_id=vrs_id,
    vcf_path=vcf_path,
    vcf_index_path=vcf_index_path,
    plugin=simple_plugin,
    phenotype=phenotype,
)

print("CAF:")
print(json.dumps(caf.model_dump(exclude_none=True), indent=2))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# plugin_system/plugin_manager.py | ||
import pkgutil | ||
import importlib | ||
import os | ||
|
||
from plugin_system.plugins.base_plugin import BasePlugin | ||
from vrs_anvil.evidence import PLUGIN_MODULE_PATH | ||
|
||
|
||
class PluginManager:
    """Finds plugin classes by name in modules whose names end with ``_plugin``."""

    def load_plugin(self, plugin_name: str) -> type[BasePlugin]:
        """Return the first plugin class named `plugin_name`.

        Modules are searched first in the default plugin directory (the directory
        of the `PLUGIN_MODULE_PATH` package) and then among the top-level modules
        reported by `pkgutil.iter_modules()`. Only modules whose name ends with
        "_plugin" are imported and inspected. A match must be a class that has
        an `__is_plugin__` attribute.

        Args:
            plugin_name (str): name of the plugin class

        Raises:
            OSError: unable to find class named `plugin_name`
            ImportError: a candidate "_plugin" module could not be imported

        Returns:
            type[BasePlugin]: non-instantiated Plugin class
        """

        # get full path for the default plugin directory
        default_plugin_dir = os.path.dirname(
            importlib.import_module(PLUGIN_MODULE_PATH).__file__
        )

        # look for plugin by name first in default directory then in top-level
        # directory; modules in the default directory need the package prefix,
        # top-level modules are imported by their bare name
        search_locations = (
            (f"{PLUGIN_MODULE_PATH}.", pkgutil.iter_modules([default_plugin_dir])),
            ("", pkgutil.iter_modules()),
        )

        for module_prefix, module_infos in search_locations:
            for _, name, _ in module_infos:
                # only look for specific file names
                if not name.endswith("_plugin"):
                    continue

                try:
                    # dynamically import the module
                    module = importlib.import_module(f"{module_prefix}{name}")
                except ImportError as e:
                    # plugin module found but unable to import it
                    print(f"Error loading plugin {name}: {e}")
                    raise

                # grab the attribute named <plugin_name>, if the module has one
                candidate = getattr(module, plugin_name, None)
                if isinstance(candidate, type) and hasattr(
                    candidate, "__is_plugin__"
                ):
                    return candidate

        # if no plugin found, raise error
        raise OSError(
            f"Plugin {plugin_name} not found. Make sure the path is stored in the top-level"
            " directory, in a module whose name ends with '_plugin'."
        )
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Oops, something went wrong.