Feature/standardized output rebase (#109)

* move everything to src * enable plugin loading from top-level working directory * add 1000G test files * draft CAF output validation * update readme * complete docstring on worked example method * update code and fixtures to get exact chr field from vrsix index * improve pip install // make worked example tsv path relative * bump version * pin va-spec-python depedency to particular commit
gks-anvil · Jan 31, 2025 · 0d9c362 · 0d9c362
1 parent 055ee0d
commit 0d9c362
Show file tree

Hide file tree

Showing 26 changed files with 435 additions and 211 deletions.
diff --git a/1000g/1000g_plugin.py b/1000g/1000g_plugin.py
@@ -0,0 +1,77 @@
+import pysam
+
+from plugin_system.plugins.base_plugin import BasePlugin
+from plugin_system.utils import (
+    load_dict,
+    csv_to_dataframe,
+    terra_data_table_to_dataframe,
+)
+
+
+class ThousandGenomesPlugin(BasePlugin):
+    """
+    Plugin for AnVIL 1000G PRIMED data release on Terra
+
+    Link: https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model
+
+    Note that get_phenotype_index is inherited from the parent BasePlugin class.
+    """
+
+    def __init__(
+        self, phenotype_table_path: str | None = None, index_path: str | None = None
+    ):
+        """constructor used to set a phenotype index if provided a file path for the index (index_path).
+        Otherwise create a phenotype index using a Terra data table (no path specified) or with a csv/tsv filepath.
+
+        Index example: {"sample_A": ["HP:0001263", "HP:0000002"], "sample_B": ["HP:0001263"]}
+        Note that we actively do not use super() to invoke the BasePlugin's constructor to create custom functionality.
+
+        Args:
+            phenotype_table_path (str, optional): Path to csv/tsv of phenotype data specified by the GREGoR data model.
+                When not specified, defaults to loading from Terra data table in existing workspace titled "phenotypes".
+                For more info on the data model, see https://gregorconsortium.org/data-model. Defaults to None.
+            index_path (str, optional): Path to existing phenotype index. Defaults to None.
+        """
+
+        self.phenotype_index = self.__create_phenotype_index(
+            phenotype_table_path=phenotype_table_path, index_path=index_path
+        )
+
+    def __create_phenotype_index(
+        self, phenotype_table_path: str | None = None, index_path: str | None = None
+    ) -> dict[str, list[str]]:
+        """[private method] given phenotypical data input specified by the GREGoR Data model (in either tsv/csv/Terra data table),
+        return a dictionary mapping from each sample to its list of phenotypes
+
+        Args:
+            phenotype_table_path (str, optional): Path to csv/tsv of phenotype data specified by the GREGoR data model.
+                    When not specified, defaults to loading from Terra data table in existing workspace titled "phenotypes".
+                    For more info on the data model, see https://gregorconsortium.org/data-model
+            index_path (str, optional): Path to pre-computed index. Defaults to None.
+
+        Returns:
+            dict[str, list[str]]: index of a sample id to sample's phenotypes.
+        """
+
+        # load index from file if already created
+        if index_path is not None:
+            return load_dict(index_path)
+
+        # if no path specified, load phenotype table from Terra Data Table by default (must be in Terra workspace)
+        if phenotype_table_path is None:
+            phenotype_df = terra_data_table_to_dataframe(
+                table_name="population_descriptor"
+            )
+        else:  # otherwise load phenotype data table from file
+            phenotype_df = csv_to_dataframe(phenotype_table_path)
+
+        # create participant to phenotypes mapping
+        phenotype_index = {}
+        for subject_id in phenotype_df["subject_id"].unique():
+            all_phenotypes = phenotype_df[phenotype_df["subject_id"] == subject_id][
+                "country_of_recruitment"
+            ]
+
+            phenotype_index[subject_id] = list(all_phenotypes.unique())
+
+        return phenotype_index
diff --git a/1000g/worked_example.py b/1000g/worked_example.py
@@ -0,0 +1,56 @@
+import json
+import os
+import subprocess
+
+from ga4gh.vrs.extras.translator import AlleleTranslator
+from ga4gh.vrs.dataproxy import create_dataproxy
+from plugin_system.plugin_manager import PluginManager
+from vrs_anvil.evidence import get_cohort_allele_frequency
+
+
+# run this in 1000g directory
+assert os.getcwd().endswith(
+    "1000g"
+), "to ensure the plugin can be located, please run this in the 1000g directory"
+
+# set varaible for variant data input
+variant_id = "chr1-20094-TAA-T"
+vcf_path = "../tests/fixtures/1kGP.chr1.1000.vrs.vcf.gz"
+
+# set path to write VCF index to
+vcf_index_path = "1000g_chr1_index.db"
+
+# set phenotype-specific inputs
+phenotype = "USA"  # to create subcohorts
+phenotype_table = "population_descriptor.tsv"  # downloaded from https://anvil.terra.bio/#workspaces/anvil-datastorage/AnVIL_1000G_PRIMED-data-model/data
+
+# create vcf index from vcf at the specified path using vrsix
+command = ["vrsix", "load", f"--db-location={vcf_index_path}", vcf_path]
+try:
+    result = subprocess.run(command, check=True, text=True, capture_output=True)
+    print("vrsix command executed successfully!")
+except subprocess.CalledProcessError as e:
+    print("Error executing vrsix command:", e.stderr)
+
+# # get VRS ID from variant of interest
+seqrepo_rest_service_url = "seqrepo+https://services.genomicmedlab.org/seqrepo"
+seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)
+allele_translator = AlleleTranslator(seqrepo_dataproxy)
+allele = allele_translator.translate_from(variant_id)
+vrs_id = allele.id
+
+# instantiate 1000G plugin class with phenotype table
+plugin = PluginManager().load_plugin("ThousandGenomesPlugin")
+simple_plugin = plugin(phenotype_table)
+
+# generating cohort allele frequency using 1000G plugin
+caf = get_cohort_allele_frequency(
+    variant_id=vrs_id,
+    vcf_path=vcf_path,
+    vcf_index_path=vcf_index_path,
+    plugin=simple_plugin,
+    phenotype=phenotype,
+)
+
+print(f"CAF:")
+print(json.dumps(caf.model_dump(exclude_none=True), indent=2))
diff --git a/README.md b/README.md
@@ -104,16 +104,14 @@ The command line utility supports Google Cloud URIs and running commands in the
 ## Cohort Allele Frequency Generation
 
 ### Description
-Given a variant and an optional phenotype of interest, get aggregated allele frequency info for a cohort of interest as a cohort allele frequency object (CAF).
+Given a variant and an optional phenotype of interest, get aggregated allele frequency info for a cohort of interest as a cohort allele frequency object ([CAF](https://va-ga4gh.readthedocs.io/en/1.0.0-ballot.2024-11/base-profiles/study-result-profiles.html#cohort-allele-frequency-study-result)).
 
 ### General Prerequisites
 - Variant of interest
 - Valid VRS-annotated joint VCF
-  - Assumes chr field is prepended with chr (eg `chr1`)
   - genotyping laid out per-sample
 - Precomputed VRS-VCF index (created using [vrsix](https://github.com/gks-anvil/vrsix))
-  - This maps VCF coordinates to VRS IDs
-  - Enables efficient retrieval of VCF row by VRS ID
+  - this enables efficient retrieval of VCF row by VRS ID
 - [Optional] Phenotype of interest to specify subcohort
 - [Optional] Plugins for project-specific transformations (see [here](README.md#plugins-for-unique-data-inputs) for more info)
 
@@ -153,33 +151,26 @@ These three problems above map to three different methods necessary in implement
    1. This also makes use of a `pysam.VariantRecord` as input
    2. An `alt_index` is also passed in as an input, which is the index representing the allele of interest within the VCF row. The alt index matches the genotype according to [VCF specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf). For instance, a sample with the 2nd alt might have a genotype containing a 2, ie `(2,1)`, `(2,0)`, `(2,2)`, etc.
 
-**Existing Plugins**
-- For the methods signatures and default implementations, take a look at the [`BasePlugin`](plugin_system/plugins/base_plugin.py) class
-- For a simple plugin implementation that inherits all methods from the `BasePlugin`, take a look at the [`SimplePlugin`](plugin_system/plugins/base_plugin.py) implementation.
-- For an example plugin, take a look at the [`GregorPlugin`](plugin_system/plugins/gregor_plugin.py)
-
 To implement your own plugin....
 
 ### Getting Started
 
-**Plugin Implementer**
-1. Copy [`gregor_plugin.py`](plugin_system/plugins/gregor_plugin.py) to the same directory
-2. Rename the plugin class and name (eg `MyProjectPlugin` and `my_project_plugin.py`)
-3. Implement the three methods mentioned above, calling any default implementations or plugin [utilities](plugin_system/utils.py) as necessary.
-   1. [`BasePlugin`](plugin_system/base_plugin.py) is by default the parent class, so you can use the `BasePlugin`'s implementations by calling `super().<method_to_invoke>`
-   2. [`GregorPlugin`](plugin_system/plugins/gregor_plugin.py) is a worked example of specific real-world implementation, refer to that for alternative ways to customize allele frequency generation.
-   3. See [`SimplePlugin`](plugin_system/plugins/base_plugin.py) for the simplest possible implementation of a plugin using only default methods inherited from `BasePlugin`.
+There are two types of user stories for plugins: one will be implementing the project-specific plugin, while the other is using an already-built plugin. The former user will be called a plugin author, while the latter will be called a plugin user.
+
+**Plugin Author**
+1. Read through the default implementations defined in the [`BasePlugin`](src/plugin_system/plugins/base_plugin.py).
+2. Copy [`simple_plugin.py`](src/plugin_system/plugins/gregor_plugin.py) to your working directory. This has to be in the **top-level directory** where you will do your CAF generation.
+3. Rename the plugin class and name (eg `MyProjectPlugin` and `my_project_plugin.py`). The file name must end in `_plugin.py` to be a valid module.
+4. Customize any of the methods described by the [`BasePlugin`](src/plugin_system/plugins/base_plugin.py) by implementing them in your own plugin class. For reference material...
+   1. [`BasePlugin`](src/plugin_system/plugins/base_plugin.py) is by default the parent class, so any methods that aren't defined in your plugin will inherit these methods. You can also define the `BasePlugin`'s implementations by calling `super().<method_to_invoke>` if you want to do additional work after using the default methods.
+   2. [`GregorPlugin`](src/plugin_system/plugins/gregor_plugin.py) is a worked example of specific real-world implementation, refer to that for alternative ways to customize allele frequency generation.
+   3. Plugin [utilities](src/plugin_system/utils.py) are a set of data transformation util methods that might be useful. For example, if your phenotypical data lives in a Terra data table, you could use `terra_data_table_to_dataframe` to rapidly retrieve the columns most relevant in creating a phenotype index.
 
 **Plugin User**
 1. Confirm that you have the variant of interest, [optional] phenotype of interest, VCF path of interest at your disposal, and name of implemented plugin class
-2. See the `test_plugin_worked_example` function in [test_plugin.py](tests/unit/test_plugin.py) for a worked example on how to use plugin. The two main components of using the plugin are...
+2. See the `test_plugin_worked_example` function in [test_plugin.py](tests/unit/test_plugin.py) for a worked example on how to use plugin. Generally, the two main components related to using the plugin are...
    1. For your `MyProjectPlugin` plugin, instantiate it with the `PluginManager` and any input parameters specified.
-   2. Call `get_cohort_allele_frequency` with `plugin="MyProjectPlugin"` as a parameter.
-
-# GREGoR-specific Details
-
-### Work in Progress
-- For chromosomes with ploidy of 1 (mitochondrial calling or sex chromosomes), focus allele counts (AC) and locus allele counts (AN) can have a maximum value of 1. Focus allele counts are 1 when the genotype has at least a single allele match (0/1, 1/1, or 1) otherwise it is none.
+   2. Call `get_cohort_allele_frequency` passing in the instantiated `MyProjectPlugin` object with the "plugin" parameter.
 
 
 ## Processing VCF Files ([vrs-python](https://github.com/ga4gh/vrs-python))
@@ -197,6 +188,11 @@ The above is an example using an example vcf. Replace the `--vcf_out` and `vrs_p
 
 Also, see the [VRS Annotator](https://dockstore.org/workflows/github.com/gks-anvil/vrs-annotator/VRSAnnotator:main?tab=info) workflow on Dockstore for a way to do this on Terra.
 
+## GREGoR-specific Details
+
+### Work in Progress
+- For chromosomes with ploidy of 1 (mitochondrial calling or sex chromosomes), focus allele counts (AC) and locus allele counts (AN) can have a maximum value of 1. Focus allele counts are 1 when the genotype has at least a single allele match (0/1, 1/1, or 1) otherwise it is none.
+
 ## Contributing
 
 This project is open to contributions from the research community. If you are interested in contributing to the project, please contact the project team.

diff --git a/plugin_system/plugin_manager.py b/plugin_system/plugin_manager.py
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-ga4gh.vrs[extras]==2.0.0a10
+ga4gh.vrs[extras]==2.0.0a13
 diskcache
 biocommons.seqrepo
 glom
@@ -15,4 +15,7 @@ setuptools
 # for cohort allele frequency generation
 firecloud
 pandas
-vrsix==0.0.1
+vrsix==0.1.1
+
+# TODO: use actual version with merge of PR
+ga4gh.va_spec @ git+https://github.com/ga4gh/va-spec-python.git@2ea827ee9179262094e9f4cdba0876a6a389a54c#egg=ga4gh.va_spec
diff --git a/setup.py b/setup.py
@@ -41,7 +41,7 @@
 
     # Versions should comply with PEP 440:
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.2.0rc4',  # Required
+    version='0.2.0rc5',  # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:
@@ -90,7 +90,9 @@
     # You can just specify package directories manually here if your project is
     # simple. Or you can use find_packages().
 
-    packages=find_packages(exclude=['contrib', 'docs', 'tests']),  # Required
+    packages=find_packages(where="src"),  # Required
+
+    package_dir={"": "src"},
 
     # Specify which Python versions you support. In contrast to the
     # 'Programming Language' classifiers above, 'pip install' will check this

diff --git a/plugin_system/__init__.py → src/plugin_system/__init__.py b/plugin_system/__init__.py → src/plugin_system/__init__.py
diff --git a/src/plugin_system/plugin_manager.py b/src/plugin_system/plugin_manager.py
@@ -0,0 +1,62 @@
+# plugin_system/plugin_manager.py
+import pkgutil
+import importlib
+import os
+
+from plugin_system.plugins.base_plugin import BasePlugin
+from vrs_anvil.evidence import PLUGIN_MODULE_PATH
+
+
+class PluginManager:
+    def load_plugin(self, plugin_name: str) -> BasePlugin:
+        """grab first plugin class matching name `plugin_name` within the `plugin_package` directory
+
+        Args:
+            plugin_name (str): name of the plugin class
+
+        Raises:
+            OSError: unable to find class named `plugin_name`
+            ImportError: unable to import package
+
+        Returns:
+            BasePlugin: non-instantiated Plugin class
+        """
+
+        # get full path for the default plugin directory
+        default_plugin_dir = os.path.dirname(
+            importlib.import_module(PLUGIN_MODULE_PATH).__file__
+        )
+
+        # look for plugin by name first in default directory then in top-level directory
+        for i, iter in enumerate(
+            [pkgutil.iter_modules([default_plugin_dir]), pkgutil.iter_modules()]
+        ):
+            for _, name, _ in iter:
+                # only look for specific file names
+                if not name.endswith("_plugin"):
+                    continue
+
+                # get full module name, use only plugin name if iterating across top-level dirs
+                module_name = f"{PLUGIN_MODULE_PATH}.{name}" if i == 0 else name
+
+                try:
+                    # dynamically import the module
+                    module = importlib.import_module(module_name)
+
+                    # grab first plugin class matching name <plugin_name>
+                    for attribute_name in dir(module):
+                        if attribute_name == plugin_name:
+                            attribute = getattr(module, attribute_name)
+                            if isinstance(attribute, type) and hasattr(
+                                attribute, "__is_plugin__"
+                            ):
+                                return attribute
+                except ImportError as e:
+                    # plugin modules found but unable to load specified plugin
+                    print(f"Error loading plugin {name}: {e}")
+                    raise
+
+        # if no plugin found, raise error
+        raise OSError(
+            f"Plugin {plugin_name} not found. Make sure the path is stored in the top-level"
+        )
diff --git a/plugin_system/plugins/__init__.py → src/plugin_system/plugins/__init__.py b/plugin_system/plugins/__init__.py → src/plugin_system/plugins/__init__.py
diff --git a/plugin_system/plugins/base_plugin.py → src/plugin_system/plugins/base_plugin.py b/plugin_system/plugins/base_plugin.py → src/plugin_system/plugins/base_plugin.py
diff --git a/plugin_system/plugins/gregor_plugin.py → src/plugin_system/plugins/gregor_plugin.py b/plugin_system/plugins/gregor_plugin.py → src/plugin_system/plugins/gregor_plugin.py
diff --git a/plugin_system/plugins/simple_plugin.py → src/plugin_system/plugins/simple_plugin.py b/plugin_system/plugins/simple_plugin.py → src/plugin_system/plugins/simple_plugin.py
@@ -3,5 +3,5 @@
 
 class SimplePlugin(BasePlugin):
     """
-    Simple example plugin using all default methods from the BasePlugin.
+    Simple example plugin that automatically inherits all default methods from the BasePlugin.
     """
diff --git a/plugin_system/utils.py → src/plugin_system/utils.py b/plugin_system/utils.py → src/plugin_system/utils.py
diff --git a/vrs_anvil/__init__.py → src/vrs_anvil/__init__.py b/vrs_anvil/__init__.py → src/vrs_anvil/__init__.py
diff --git a/vrs_anvil/annotator.py → src/vrs_anvil/annotator.py b/vrs_anvil/annotator.py → src/vrs_anvil/annotator.py
diff --git a/vrs_anvil/cli.py → src/vrs_anvil/cli.py b/vrs_anvil/cli.py → src/vrs_anvil/cli.py
diff --git a/vrs_anvil/collector.py → src/vrs_anvil/collector.py b/vrs_anvil/collector.py → src/vrs_anvil/collector.py