From c584227d7a814eae4853e496722b139393ea9c28 Mon Sep 17 00:00:00 2001 From: Zeyun Date: Tue, 16 Apr 2024 15:25:40 -0700 Subject: [PATCH 1/2] convert README file from rst to md --- README.md | 140 ++++++++++++++++++++++++++++++++++++++++ README.rst | 184 ----------------------------------------------------- 2 files changed, 140 insertions(+), 184 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/README.md b/README.md new file mode 100644 index 0000000..92da14b --- /dev/null +++ b/README.md @@ -0,0 +1,140 @@ +[![Documentation-webpage](https://img.shields.io/badge/Docs-Available-brightgreen)](https://mancusolab.github.io/sushie/) +[![Github](https://img.shields.io/github/stars/mancusolab/sushie?style=social)](https://github.com/mancusolab/sushie) +[![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/) + +# SuShiE🍣 + +SuShiE (Sum of Shared Single Effect) is a Python software to fine-map +causal SNPs, compute prediction weights, and infer effect size +correlation for molecular data (e.g., mRNA levels and protein levels +etc.) across multiple ancestries. **The manuscript is in progress.** + +``` diff +- We detest usage of our software or scientific outcome to promote racial discrimination. +``` + +Check [here](https://mancusolab.github.io/sushie/) for full +documentation. + + [**Installation**](#installation) + | [**Example**](#get-started-with-example) + | [**Notes**](#notes) + | [**Version History**](#version) + | [**Support**](#support) + | [**Other Software**](#other-software) + +## Installation + +Users can download the latest repository and then use `pip`: + +``` bash +git clone https://github.com/mancusolab/sushie.git +cd sushie +pip install . 
+``` + +*We currently only support Python3.8+.* + +Before installation, we recommend to create a new environment using +[conda](https://docs.conda.io/en/latest/) so that it will not affect the +software versions of the other projects. + +## Get Started with Example + +SuShiE software is very easy to use: + +``` bash +cd ./data/ +sushie finemap --pheno EUR.pheno AFR.pheno --vcf vcf/EUR.vcf vcf/AFR.vcf --covar EUR.covar AFR.covar --output ./test_result +``` + +It can perform: + +- SuShiE: multi-ancestry fine-mapping accounting for ancestral + correlation +- Single-ancestry SuSiE (Sum of Single Effect) +- Independent SuShiE: multi-ancestry SuShiE without accounting for + correlation +- Meta-SuSiE: single-ancestry SuSiE followed by meta-analysis +- Mega-SuSiE: single-ancestry SuSiE on row-wise stacked data across + ancestries +- QTL effect size correlation estimation +- cis-SNP heritability estimation +- Cross-validation for SuShiE prediction weights +- Convert prediction results to + [FUSION](http://gusevlab.org/projects/fusion/) format, thus can be + used in [TWAS](https://www.nature.com/articles/ng.3506) + +See [here](https://mancusolab.github.io/sushie/) for more details on how +to use SuShiE. + +If you want to use in-software SuShiE inference function, you can use +following code as an example: + +``` python +from sushie.infer import infer_sushie +# Xs is for genotype data, and it should be a list of numpy array whose length is the number of ancestry. +# ys is for phenotype data, and it should also be a list of numpy array whose length is the number of ancestry. +infer_sushie(Xs=X, ys=y) +``` + +You can play it with your own ideas! + +## Notes + +- SuShiE currently only supports **continuous** phenotype + fine-mapping. +- SuShiE currently only supports fine-mapping on + [autosomes](https://en.wikipedia.org/wiki/Autosome). 
+- SuShiE uses [JAX](https://github.com/google/jax) with [Just In + Time](https://jax.readthedocs.io/en/latest/jax-101/02-jitting.html) + compilation to achieve high-speed computation. However, there are + some [issues](https://github.com/google/jax/issues/5501) for JAX + with Mac M1 chip. To solve this, users need to initiate conda using + [miniforge](https://github.com/conda-forge/miniforge), and then + install SuShiE using `pip` in the desired environment. + +## Version History + +| Version | Description | +| --------- | --------- | +| 0.1 | Initial Release | +| 0.11 | Fix the bug for OLS to compute adjusted r squared. | +| 0.12 | Update io.corr function so that report all the correlation results no matter cs is pruned or not. | +| 0.13 | Add `--keep` command to enable user to specify a file that contains the subjects ID SuShiE will perform on. Add `--ancestry_index` command to enable user to specify a file that contains the ancestry index for fine-mapping. With this, user can input single phenotype, genotype, and covariate file that contains all the subjects across ancestries. Implement padding to increase inference time. Record elbo at each iteration and can access it in the `infer.SuShiEResult` object. The alphas table now outputs the average purity and KL divergence for each `L`. Change `--kl_threshold` to `--divergence`. Add `--maf` command to remove SNPs that less than minor allele frequency threshold within each ancestry. Add `--max_select` command to randomly select maximum number of SNPs to compute purity to avoid unnecessary memory spending. Add a QC function to remove duplicated SNPs. | +| 0.14 | Remove KL-Divergence pruning. Enhance command line appearance and improve the output files contents. Fix small bugs on multivariate KL. | + +## Support + +Please report any bugs or feature requests in the [Issue +Tracker](https://github.com/mancusolab/sushie/issues). 
If users have any +questions or comments, please contact Zeyun Lu (<zeyunlu@usc.edu>) and +Nicholas Mancuso (<nmancuso@usc.edu>). + +## Other Software + +Feel free to use other software developed by [Mancuso +Lab](https://www.mancusolab.com/): + +- [MA-FOCUS](https://github.com/mancusolab/ma-focus): a Bayesian + fine-mapping framework using + [TWAS](https://www.nature.com/articles/ng.3506) statistics across + multiple ancestries to identify the causal genes for complex traits. +- [SuSiE-PCA](https://github.com/mancusolab/susiepca): a scalable + Bayesian variable selection technique for sparse principal component + analysis +- [twas_sim](https://github.com/mancusolab/twas_sim): a Python + software to simulate [TWAS](https://www.nature.com/articles/ng.3506) + statistics. +- [FactorGo](https://github.com/mancusolab/factorgo): a scalable + variational factor analysis model that learns pleiotropic factors + from GWAS summary statistics. +- [HAMSTA](https://github.com/tszfungc/hamsta): a Python software to + estimate heritability explained by local ancestry data from + admixture mapping summary statistics. + +------------------------------------------------------------------------ + +This project has been set up using PyScaffold 4.1.1. For details and +usage information on PyScaffold see <https://pyscaffold.org/>. diff --git a/README.rst b/README.rst deleted file mode 100644 index bb83d8c..0000000 --- a/README.rst +++ /dev/null @@ -1,184 +0,0 @@ -.. These are examples of badges you might want to add to your README: - please update the URLs accordingly - - - .. image:: https://readthedocs.org/projects/sushie/badge/?version=latest - :alt: ReadTheDocs - :target: https://sushie.readthedocs.io/en/stable/ - .. image:: https://img.shields.io/coveralls/github//sushie/main.svg - :alt: Coveralls - :target: https://coveralls.io/r//sushie - - .. image:: https://img.shields.io/conda/vn/conda-forge/sushie.svg - :alt: Conda-Forge - :target: https://anaconda.org/conda-forge/sushie - .. 
image:: https://pepy.tech/badge/sushie/month - :alt: Monthly Downloads - :target: https://pepy.tech/project/sushie - - - -.. image:: https://img.shields.io/badge/Docs-Available-brightgreen - :alt: Documentation-webpage - :target: https://mancusolab.github.io/sushie/ - -.. image:: https://img.shields.io/pypi/v/sushie.svg - :alt: PyPI-Server - :target: https://pypi.org/project/sushie/ - -.. image:: https://img.shields.io/github/stars/mancusolab/sushie?style=social - :alt: Github - :target: https://github.com/mancusolab/sushie - -.. image:: https://img.shields.io/badge/License-MIT-yellow.svg - :alt: License - :target: https://opensource.org/licenses/MIT - -.. image:: https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold - :alt: Project generated with PyScaffold - :target: https://pyscaffold.org/ - - -======== -SuShiE🍣 -======== -SuShiE (Sum of Shared Single Effect) is a Python software to fine-map causal SNPs, compute prediction weights, and infer effect size correlation for molecular data (e.g., mRNA levels and protein levels etc.) across multiple ancestries. **The manuscript is in progress.** - -.. code:: diff - - - We detest usage of our software or scientific outcome to promote racial discrimination. - -Check `here `_ for full documentation. - - -|Installation|_ | |Example|_ | |Notes|_ | |Version|_ | |Support|_ | |Other Software|_ - -================= - -.. _Installation: -.. |Installation| replace:: **Installation** - -Installation -============ - -.. - The easiest way to install is with ``pip``: - .. code:: bash - pip install sushie - - Alternatively, - -Users can download the latest repository and then use ``pip``: - -.. code:: bash - - git clone https://github.com/mancusolab/sushie.git - cd sushie - pip install . - -*We currently only support Python3.8+.* - -Before installation, we recommend to create a new environment using `conda `_ so that it will not affect the software versions of the other projects. - -.. _Example: -.. 
|Example| replace:: **Example** - -Get Started with Example -======================== -SuShiE software is very easy to use: - -.. code:: bash - - cd ./data/ - sushie finemap --pheno EUR.pheno AFR.pheno --vcf vcf/EUR.vcf vcf/AFR.vcf --covar EUR.covar AFR.covar --output ./test_result - -It can perform: - -* SuShiE: multi-ancestry fine-mapping accounting for ancestral correlation -* Single-ancestry SuSiE (Sum of Single Effect) -* Independent SuShiE: multi-ancestry SuShiE without accounting for correlation -* Meta-SuSiE: single-ancestry SuSiE followed by meta-analysis -* Mega-SuSiE: single-ancestry SuSiE on row-wise stacked data across ancestries -* QTL effect size correlation estimation -* cis-SNP heritability estimation -* Cross-validation for SuShiE prediction weights -* Convert prediction results to `FUSION `_ format, thus can be used in `TWAS `_ - -See `here `_ for more details on how to use SuShiE. - -If you want to use in-software SuShiE inference function, you can use following code as an example: - -.. code:: python - - from sushie.infer import infer_sushie - # Xs is for genotype data, and it should be a list of numpy array whose length is the number of ancestry. - # ys is for phenotype data, and it should also be a list of numpy array whose length is the number of ancestry. - infer_sushie(Xs=X, ys=y) - -You can play it with your own ideas! - -.. _Notes: -.. |Notes| replace:: **Notes** - -Notes -===== - -* SuShiE currently only supports **continuous** phenotype fine-mapping. -* SuShiE currently only supports fine-mapping on `autosomes `_. -* SuShiE uses `JAX `_ with `Just In Time `_ compilation to achieve high-speed computation. However, there are some `issues `_ for JAX with Mac M1 chip. To solve this, users need to initiate conda using `miniforge `_, and then install SuShiE using ``pip`` in the desired environment. - -.. _Version: -.. |Version| replace:: **Version** - -Version History -=============== - -.. 
list-table:: - :header-rows: 1 - - * - Version - - Description - * - 0.1 - - Initial Release - * - 0.11 - - Fix the bug for OLS to compute adjusted r squared. - * - 0.12 - - Update io.corr function so that report all the correlation results no matter cs is pruned or not. - * - 0.13 - - Add ``--keep`` command to enable user to specify a file that contains the subjects ID SuShiE will perform on. Add ``--ancestry_index`` command to enable user to specify a file that contains the ancestry index for fine-mapping. With this, user can input single phenotype, genotype, and covariate file that contains all the subjects across ancestries. Implement padding to increase inference time. Record elbo at each iteration and can access it in the ``infer.SuShiEResult`` object. The alphas table now outputs the average purity and KL divergence for each ``L``. Change ``--kl_threshold`` to ``--divergence``. Add ``--maf`` command to remove SNPs that less than minor allele frequency threshold within each ancestry. Add ``--max_select`` command to randomly select maximum number of SNPs to compute purity to avoid unnecessary memory spending. Add a QC function to remove duplicated SNPs. - * - 0.14 - - Remove KL-Divergence pruning. Enhance command line appearance and improve the output files contents. Fix small bugs on multivariate KL. - -.. _Support: -.. |Support| replace:: **Support** - -Support -======== - -Please report any bugs or feature requests in the `Issue Tracker `_. If users have any -questions or comments, please contact Zeyun Lu (zeyunlu@usc.edu) and Nicholas Mancuso (nmancuso@usc.edu). - -.. _OtherSoftware: -.. |Other Software| replace:: **Other Software** - -Other Software -============== - -Feel free to use other software developed by `Mancuso Lab `_: - -* `MA-FOCUS `_: a Bayesian fine-mapping framework using `TWAS `_ statistics across multiple ancestries to identify the causal genes for complex traits. 
- -* `SuSiE-PCA `_: a scalable Bayesian variable selection technique for sparse principal component analysis - -* `twas_sim `_: a Python software to simulate `TWAS `_ statistics. - -* `FactorGo `_: a scalable variational factor analysis model that learns pleiotropic factors from GWAS summary statistics. - -* `HAMSTA `_: a Python software to estimate heritability explained by local ancestry data from admixture mapping summary statistics. - ---------------------- - -.. _pyscaffold-notes: - -This project has been set up using PyScaffold 4.1.1. For details and usage -information on PyScaffold see https://pyscaffold.org/. From b004c55688792aa4e53261401f1e077c6b373bee Mon Sep 17 00:00:00 2001 From: Zeyun Date: Tue, 16 Apr 2024 15:36:25 -0700 Subject: [PATCH 2/2] fix typos --- README.md | 2 +- docs/files.rst | 4 ++-- sushie/cli.py | 4 ++-- sushie/io.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 92da14b..6f05c90 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ documentation. [**Installation**](#installation) | [**Example**](#get-started-with-example) | [**Notes**](#notes) - | [**Version History**](#version) + | [**Version History**](#version-history) | [**Support**](#support) | [**Other Software**](#other-software) diff --git a/docs/files.rst b/docs/files.rst index 8d01714..a81023d 100644 --- a/docs/files.rst +++ b/docs/files.rst @@ -263,11 +263,11 @@ SuShiE by default outputs a ``*.corr.tsv`` file that contains the estimated effe - Float - 1.34 - The inferred effect size variance (the posterior estimate for :math:`\sigma^2_{i,b}` in :ref:`Model`) for ancestry 1. It depends on the number of ancestry. One estimate for each credible set. - * - ancestry1_est_covar + * - ancestry1_ancestry2_est_covar - Float - 2.56 - The inferred effect size covariance between ancestry 1 and ancestry 2. It depends on the number of pairs of ancestries. One estimate for each credible set. 
- * - ancestry1_est_corr + * - ancestry1_ancestry2_est_corr - Float - 0.8 - The inferred effect size correlation (the posterior estimate for :math:`\rho` in :ref:`Model`) between ancestry 1 and ancestry 2. It depends on the number of pairs of ancestries. One estimate for each credible set. diff --git a/sushie/cli.py b/sushie/cli.py index f5b8556..8a6c9cc 100755 --- a/sushie/cli.py +++ b/sushie/cli.py @@ -256,7 +256,7 @@ def _prepare_cv( train_pheno = [] valid_geno = [] valid_pheno = [] - train_index = jnp.delete(jnp.arange(5), cv).tolist() + train_index = jnp.delete(jnp.arange(cv_num), cv).tolist() # make the training and test for each population separately # because sample size may be different @@ -1111,7 +1111,7 @@ def build_finemap_parser(subp): default=None, help=( "Genotype data in vcf format. Use 'space' to separate ancestries if more than two.", - " Keep the same ancestry order as phenotype's.", + " Keep the same ancestry order as phenotype's. The software will count REF allele.", ), ) diff --git a/sushie/io.py b/sushie/io.py index ceade52..fc8e0e4 100644 --- a/sushie/io.py +++ b/sushie/io.py @@ -198,7 +198,7 @@ def read_vcf(path: str) -> Tuple[pd.DataFrame, pd.DataFrame, Array]: """Read in genotype data in `vcf `_ format. Args: - path: The path for vcf genotype data (full file name). + path: The path for vcf genotype data (full file name). It will count REF allele. Returns: :py:obj:`Tuple[pd.DataFrame, pd.DataFrame, Array]`: A tuple of