Skip to content

Commit

Permalink
Feature/standardized output rebase (#109)
Browse files Browse the repository at this point in the history
* move everything to src

* enable plugin loading from top-level working directory

* add 1000G test files

* draft CAF output validation

* update readme

* complete docstring on worked example method

* update code and fixtures to get exact chr field from vrsix index

* improve pip install // make worked example tsv path relative

* bump version

* pin va-spec-python depedency to particular commit
  • Loading branch information
quinnwai authored Jan 31, 2025
1 parent 055ee0d commit 0d9c362
Show file tree
Hide file tree
Showing 26 changed files with 435 additions and 211 deletions.
77 changes: 77 additions & 0 deletions 1000g/1000g_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import pysam

from plugin_system.plugins.base_plugin import BasePlugin
from plugin_system.utils import (
load_dict,
csv_to_dataframe,
terra_data_table_to_dataframe,
)


class ThousandGenomesPlugin(BasePlugin):
"""
Plugin for AnVIL 1000G PRIMED data release on Terra
Link: https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model
Note that get_phenotype_index is inherited from the parent BasePlugin class.
"""

def __init__(
self, phenotype_table_path: str | None = None, index_path: str | None = None
):
"""constructor used to set a phenotype index if provided a file path for the index (index_path).
Otherwise create a phenotype index using a Terra data table (no path specified) or with a csv/tsv filepath.
Index example: {"sample_A": ["HP:0001263", "HP:0000002"], "sample_B": ["HP:0001263"]}
Note that we actively do not use super() to invoke the BasePlugin's constructor to create custom functionality.
Args:
phenotype_table_path (str, optional): Path to csv/tsv of phenotype data specified by the GREGoR data model.
When not specified, defaults to loading from Terra data table in existing workspace titled "phenotypes".
For more info on the data model, see https://gregorconsortium.org/data-model. Defaults to None.
index_path (str, optional): Path to existing phenotype index. Defaults to None.
"""

self.phenotype_index = self.__create_phenotype_index(
phenotype_table_path=phenotype_table_path, index_path=index_path
)

def __create_phenotype_index(
self, phenotype_table_path: str | None = None, index_path: str | None = None
) -> dict[str, list[str]]:
"""[private method] given phenotypical data input specified by the GREGoR Data model (in either tsv/csv/Terra data table),
return a dictionary mapping from each sample to its list of phenotypes
Args:
phenotype_table_path (str, optional): Path to csv/tsv of phenotype data specified by the GREGoR data model.
When not specified, defaults to loading from Terra data table in existing workspace titled "phenotypes".
For more info on the data model, see https://gregorconsortium.org/data-model
index_path (str, optional): Path to pre-computed index. Defaults to None.
Returns:
dict[str, list[str]]: index of a sample id to sample's phenotypes.
"""

# load index from file if already created
if index_path is not None:
return load_dict(index_path)

# if no path specified, load phenotype table from Terra Data Table by default (must be in Terra workspace)
if phenotype_table_path is None:
phenotype_df = terra_data_table_to_dataframe(
table_name="population_descriptor"
)
else: # otherwise load phenotype data table from file
phenotype_df = csv_to_dataframe(phenotype_table_path)

# create participant to phenotypes mapping
phenotype_index = {}
for subject_id in phenotype_df["subject_id"].unique():
all_phenotypes = phenotype_df[phenotype_df["subject_id"] == subject_id][
"country_of_recruitment"
]

phenotype_index[subject_id] = list(all_phenotypes.unique())

return phenotype_index
56 changes: 56 additions & 0 deletions 1000g/worked_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json
import os
import subprocess

from ga4gh.vrs.extras.translator import AlleleTranslator
from ga4gh.vrs.dataproxy import create_dataproxy
from plugin_system.plugin_manager import PluginManager
from vrs_anvil.evidence import get_cohort_allele_frequency


# run this in 1000g directory
assert os.getcwd().endswith(
"1000g"
), "to ensure the plugin can be located, please run this in the 1000g directory"

# set varaible for variant data input
variant_id = "chr1-20094-TAA-T"
vcf_path = "../tests/fixtures/1kGP.chr1.1000.vrs.vcf.gz"

# set path to write VCF index to
vcf_index_path = "1000g_chr1_index.db"

# set phenotype-specific inputs
phenotype = "USA" # to create subcohorts
phenotype_table = "population_descriptor.tsv" # downloaded from https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model/data

# create vcf index from vcf at the specified path using vrsix
command = ["vrsix", "load", f"--db-location={vcf_index_path}", vcf_path]
try:
result = subprocess.run(command, check=True, text=True, capture_output=True)
print("vrsix command executed successfully!")
except subprocess.CalledProcessError as e:
print("Error executing vrsix command:", e.stderr)

# # get VRS ID from variant of interest
seqrepo_rest_service_url = "seqrepo+https://services.genomicmedlab.org/seqrepo"
seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)
allele_translator = AlleleTranslator(seqrepo_dataproxy)
allele = allele_translator.translate_from(variant_id)
vrs_id = allele.id

# instantiate 1000G plugin class with phenotype table
plugin = PluginManager().load_plugin("ThousandGenomesPlugin")
simple_plugin = plugin(phenotype_table)

# generating cohort allele frequency using 1000G plugin
caf = get_cohort_allele_frequency(
variant_id=vrs_id,
vcf_path=vcf_path,
vcf_index_path=vcf_index_path,
plugin=simple_plugin,
phenotype=phenotype,
)

print(f"CAF:")
print(json.dumps(caf.model_dump(exclude_none=True), indent=2))
42 changes: 19 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,14 @@ The command line utility supports Google Cloud URIs and running commands in the
## Cohort Allele Frequency Generation

### Description
Given a variant and an optional phenotype of interest, get aggregated allele frequency info for a cohort of interest as a cohort allele frequency object (CAF).
Given a variant and an optional phenotype of interest, get aggregated allele frequency info for a cohort of interest as a cohort allele frequency object ([CAF](https://va-ga4gh.readthedocs.io/en/1.0.0-ballot.2024-11/base-profiles/study-result-profiles.html#cohort-allele-frequency-study-result)).

### General Prerequisites
- Variant of interest
- Valid VRS-annotated joint VCF
- Assumes chr field is prepended with chr (eg `chr1`)
- genotyping laid out per-sample
- Precomputed VRS-VCF index (created using [vrsix](https://github.com/gks-anvil/vrsix))
- This maps VCF coordinates to VRS IDs
- Enables efficient retrieval of VCF row by VRS ID
- this enables efficient retrieval of VCF row by VRS ID
- [Optional] Phenotype of interest to specify subcohort
- [Optional] Plugins for project-specific transformations (see [here](README.md#plugins-for-unique-data-inputs) for more info)

Expand Down Expand Up @@ -153,33 +151,26 @@ These three problems above map to three different methods necessary in implement
1. This also makes use of a `pysam.VariantRecord` as input
2. An `alt_index` is also passed in as an input, which is the index representing the allele of interest within the VCF row. The alt index matches the genotype according to [VCF specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf). For instance, a sample with the 2nd alt might have a genotype containing a 2, ie `(2,1)`, `(2,0)`, `(2,2)`, etc.

**Existing Plugins**
- For the methods signatures and default implementations, take a look at the [`BasePlugin`](plugin_system/plugins/base_plugin.py) class
- For a simple plugin implementation that inherits all methods from the `BasePlugin`, take a look at the [`SimplePlugin`](plugin_system/plugins/base_plugin.py) implementation.
- For an example plugin, take a look at the [`GregorPlugin`](plugin_system/plugins/gregor_plugin.py)

To implement your own plugin....

### Getting Started

**Plugin Implementer**
1. Copy [`gregor_plugin.py`](plugin_system/plugins/gregor_plugin.py) to the same directory
2. Rename the plugin class and name (eg `MyProjectPlugin` and `my_project_plugin.py`)
3. Implement the three methods mentioned above, calling any default implementations or plugin [utilities](plugin_system/utils.py) as necessary.
1. [`BasePlugin`](plugin_system/base_plugin.py) is by default the parent class, so you can use the `BasePlugin`'s implementations by calling `super().<method_to_invoke>`
2. [`GregorPlugin`](plugin_system/plugins/gregor_plugin.py) is a worked example of specific real-world implementation, refer to that for alternative ways to customize allele frequency generation.
3. See [`SimplePlugin`](plugin_system/plugins/base_plugin.py) for the simplest possible implementation of a plugin using only default methods inherited from `BasePlugin`.
There are two types of user stories for plugins: one will be implementing the project-specific plugin, while the other is using an already-built plugin. The former user will be called a plugin author, while the latter will be called a plugin user.

**Plugin Author**
1. Read through the default implementations defined in the [`BasePlugin`](src/plugin_system/plugins/base_plugin.py).
2. Copy [`simple_plugin.py`](src/plugin_system/plugins/gregor_plugin.py) to your working directory. This has to be in the **top-level directory** where you will do your CAF generation.
3. Rename the plugin class and name (eg `MyProjectPlugin` and `my_project_plugin.py`). The file name must end in `_plugin.py` to be a valid module.
4. Customize any of the methods described by the [`BasePlugin`](src/plugin_system/plugins/base_plugin.py) by implementing them in your own plugin class. For reference material...
1. [`BasePlugin`](src/plugin_system/plugins/base_plugin.py) is by default the parent class, so any methods that aren't defined in your plugin will inherit these methods. You can also define the `BasePlugin`'s implementations by calling `super().<method_to_invoke>` if you want to do additional work after using the default methods.
2. [`GregorPlugin`](src/plugin_system/plugins/gregor_plugin.py) is a worked example of specific real-world implementation, refer to that for alternative ways to customize allele frequency generation.
3. Plugin [utilities](src/plugin_system/utils.py) are a set of data transformation util methods that might be useful. For example, if your phenotypical data lives in a Terra data table, you could use `terra_data_table_to_dataframe` to rapidly retrieve the columns most relevant in creating a phenotype index.

**Plugin User**
1. Confirm that you have the variant of interest, [optional] phenotype of interest, VCF path of interest at your disposal, and name of implemented plugin class
2. See the `test_plugin_worked_example` function in [test_plugin.py](tests/unit/test_plugin.py) for a worked example on how to use plugin. The two main components of using the plugin are...
2. See the `test_plugin_worked_example` function in [test_plugin.py](tests/unit/test_plugin.py) for a worked example on how to use plugin. Generally, the two main components related to using the plugin are...
1. For your `MyProjectPlugin` plugin, instantiate it with the `PluginManager` and any input parameters specified.
2. Call `get_cohort_allele_frequency` with `plugin="MyProjectPlugin"` as a parameter.

# GREGoR-specific Details

### Work in Progress
- For chromosomes with ploidy of 1 (mitochondrial calling or sex chromosomes), focus allele counts (AC) and locus allele counts (AN) can have a maximum value of 1. Focus allele counts are 1 when the genotype has at least a single allele match (0/1, 1/1, or 1) otherwise it is none.
2. Call `get_cohort_allele_frequency` passing in the instantiated `MyProjectPlugin` object with the "plugin" parameter.


## Processing VCF Files ([vrs-python](https://github.com/ga4gh/vrs-python))
Expand All @@ -197,6 +188,11 @@ The above is an example using an example vcf. Replace the `--vcf_out` and `vrs_p

Also, see the [VRS Annotator](https://dockstore.org/workflows/github.com/gks-anvil/vrs-annotator/VRSAnnotator:main?tab=info) workflow on Dockstore for a way to do this on Terra.

## GREGoR-specific Details

### Work in Progress
- For chromosomes with ploidy of 1 (mitochondrial calling or sex chromosomes), focus allele counts (AC) and locus allele counts (AN) can have a maximum value of 1. Focus allele counts are 1 when the genotype has at least a single allele match (0/1, 1/1, or 1) otherwise it is none.

## Contributing

This project is open to contributions from the research community. If you are interested in contributing to the project, please contact the project team.
Expand Down
56 changes: 0 additions & 56 deletions plugin_system/plugin_manager.py

This file was deleted.

7 changes: 5 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ga4gh.vrs[extras]==2.0.0a10
ga4gh.vrs[extras]==2.0.0a13
diskcache
biocommons.seqrepo
glom
Expand All @@ -15,4 +15,7 @@ setuptools
# for cohort allele frequency generation
firecloud
pandas
vrsix==0.0.1
vrsix==0.1.1

# TODO: use actual version with merge of PR
ga4gh.va_spec @ git+https://github.com/ga4gh/va-spec-python.git@2ea827ee9179262094e9f4cdba0876a6a389a54c#egg=ga4gh.va_spec
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

# Versions should comply with PEP 440:
# https://packaging.python.org/en/latest/single_source_version.html
version='0.2.0rc4', # Required
version='0.2.0rc5', # Required

# This is a one-line description or tagline of what your project does. This
# corresponds to the "Summary" metadata field:
Expand Down Expand Up @@ -90,7 +90,9 @@
# You can just specify package directories manually here if your project is
# simple. Or you can use find_packages().

packages=find_packages(exclude=['contrib', 'docs', 'tests']), # Required
packages=find_packages(where="src"), # Required

package_dir={"": "src"},

# Specify which Python versions you support. In contrast to the
# 'Programming Language' classifiers above, 'pip install' will check this
Expand Down
File renamed without changes.
62 changes: 62 additions & 0 deletions src/plugin_system/plugin_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# plugin_system/plugin_manager.py
import pkgutil
import importlib
import os

from plugin_system.plugins.base_plugin import BasePlugin
from vrs_anvil.evidence import PLUGIN_MODULE_PATH


class PluginManager:
def load_plugin(self, plugin_name: str) -> BasePlugin:
"""grab first plugin class matching name `plugin_name` within the `plugin_package` directory
Args:
plugin_name (str): name of the plugin class
Raises:
OSError: unable to find class named `plugin_name`
ImportError: unable to import package
Returns:
BasePlugin: non-instantiated Plugin class
"""

# get full path for the default plugin directory
default_plugin_dir = os.path.dirname(
importlib.import_module(PLUGIN_MODULE_PATH).__file__
)

# look for plugin by name first in default directory then in top-level directory
for i, iter in enumerate(
[pkgutil.iter_modules([default_plugin_dir]), pkgutil.iter_modules()]
):
for _, name, _ in iter:
# only look for specific file names
if not name.endswith("_plugin"):
continue

# get full module name, use only plugin name if iterating across top-level dirs
module_name = f"{PLUGIN_MODULE_PATH}.{name}" if i == 0 else name

try:
# dynamically import the module
module = importlib.import_module(module_name)

# grab first plugin class matching name <plugin_name>
for attribute_name in dir(module):
if attribute_name == plugin_name:
attribute = getattr(module, attribute_name)
if isinstance(attribute, type) and hasattr(
attribute, "__is_plugin__"
):
return attribute
except ImportError as e:
# plugin modules found but unable to load specified plugin
print(f"Error loading plugin {name}: {e}")
raise

# if no plugin found, raise error
raise OSError(
f"Plugin {plugin_name} not found. Make sure the path is stored in the top-level"
)
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@

class SimplePlugin(BasePlugin):
"""
Simple example plugin using all default methods from the BasePlugin.
Simple example plugin that automatically inherits all default methods from the BasePlugin.
"""
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 0d9c362

Please sign in to comment.