Skip to content

Commit

Permalink
Add the correct nomenclature for genes in anopheles.py to enable CNVs…
Browse files Browse the repository at this point in the history
… for Af (#500)

* Tried to correct the nomenclature for genes for Af

* Used Alistair's solution

* Added a parameter

* Needed to update some tests with the new param

* Changed 'gff_gene_name' to 'gff_gene_name_attribute'
  • Loading branch information
jonbrenas authored Jan 11, 2024
1 parent c560a96 commit ffe1db0
Show file tree
Hide file tree
Showing 16 changed files with 191 additions and 4 deletions.
1 change: 1 addition & 0 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def __init__(
major_version_number=MAJOR_VERSION_NUMBER,
major_version_path=MAJOR_VERSION_PATH,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
storage_options=storage_options, # used by fsspec via init_filesystem()
tqdm_class=tqdm_class,
Expand Down
1 change: 1 addition & 0 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def __init__(
major_version_number=MAJOR_VERSION_NUMBER,
major_version_path=MAJOR_VERSION_PATH,
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
storage_options=storage_options, # used by fsspec via init_filesystem()
tqdm_class=tqdm_class,
Expand Down
2 changes: 2 additions & 0 deletions malariagen_data/anoph/genome_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
self,
*,
gff_gene_type: str,
gff_gene_name_attribute: str,
gff_default_attributes: Tuple[str, ...],
gene_names: Optional[Mapping[str, str]] = None,
**kwargs,
Expand All @@ -37,6 +38,7 @@ def __init__(
# TODO Consider moving these parameters to configuration, as they could
# change if the GFF ever changed.
self._gff_gene_type = gff_gene_type
self._gff_gene_name_attribute = gff_gene_name_attribute
self._gff_default_attributes = gff_default_attributes

# Allow manual override of gene names.
Expand Down
9 changes: 7 additions & 2 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def __init__(
major_version_number: int,
major_version_path: str,
gff_gene_type: str,
gff_gene_name_attribute: str,
gff_default_attributes: Tuple[str, ...],
tqdm_class,
storage_options: Mapping, # used by fsspec via init_filesystem(url, **kwargs)
Expand All @@ -160,6 +161,7 @@ def __init__(
major_version_path=major_version_path,
storage_options=storage_options,
gff_gene_type=gff_gene_type,
gff_gene_name_attribute=gff_gene_name_attribute,
gff_default_attributes=gff_default_attributes,
cohorts_analysis=cohorts_analysis,
aim_analysis=aim_analysis,
Expand Down Expand Up @@ -877,7 +879,7 @@ def _gene_cnv(self, *, region, sample_sets, sample_query, max_coverage_variance)

debug("access genes")
df_genome_features = self.genome_features(region=region)
df_genes = df_genome_features.query("type == 'gene'")
df_genes = df_genome_features.query(f"type == '{self._gff_gene_type}'")

debug("setup intermediates")
windows = []
Expand Down Expand Up @@ -921,7 +923,10 @@ def _gene_cnv(self, *, region, sample_sets, sample_query, max_coverage_variance)
"gene_start": (["genes"], df_genes["start"].values),
"gene_end": (["genes"], df_genes["end"].values),
"gene_windows": (["genes"], windows),
"gene_name": (["genes"], df_genes["Name"].values),
"gene_name": (
["genes"],
df_genes[self._gff_gene_name_attribute].values,
),
"gene_strand": (["genes"], df_genes["strand"].values),
"gene_description": (["genes"], df_genes["description"].values),
"CN_mode": (["genes", "samples"], modes),
Expand Down
72 changes: 70 additions & 2 deletions notebooks/plot_cnv.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"metadata": {},
"outputs": [],
"source": [
"import malariagen_data"
"import malariagen_data\n",
"import dask"
]
},
{
Expand All @@ -24,6 +25,19 @@
"ag3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"af1 = malariagen_data.Af1(\n",
" \"simplecache::gs://vo_afun_release\",\n",
" simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
")\n",
"af1"
]
},
{
"cell_type": "markdown",
"id": "74c46a04",
Expand All @@ -32,6 +46,13 @@
"## CNV HMM coverage"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ag3"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -116,6 +137,26 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Af1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sample with a small CNV\n",
"af1.plot_cnv_hmm_coverage_track(\n",
" sample=\"VBS24196\",\n",
" region=\"2RL:28,460,000-28,580,000\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "a302d86c",
Expand All @@ -124,6 +165,13 @@
"## CNV HMM heatmap"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ag3"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -234,12 +282,32 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Af1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c4dcf97",
"metadata": {},
"outputs": [],
"source": [
"af1.plot_cnv_hmm_heatmap(\n",
" sample_sets=\"1230-VO-GA-CF-AYALA-VMF00045\",\n",
" region=\"2RL:28,520,000-28,570,000\",\n",
" max_coverage_variance=None,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
Expand All @@ -259,7 +327,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.10.11"
}
},
"nbformat": 4,
Expand Down
1 change: 1 addition & 0 deletions tests/anoph/test_aim_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def ag3_sim_api(ag3_sim_fixture):
aim_ids=("gambcolu_vs_arab", "gamb_vs_colu"),
aim_palettes=_ag3.AIM_PALETTES,
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
)

Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_cnv_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
default_coverage_calls_analysis="gamb_colu",
Expand All @@ -48,6 +49,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
default_coverage_calls_analysis="funestus",
Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_fst.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
default_site_mask="gamb_colu_arab",
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
Expand All @@ -48,6 +49,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
default_site_mask="funestus",
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_g123.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
default_site_mask="gamb_colu_arab",
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
Expand All @@ -46,6 +47,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
default_site_mask="funestus",
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
Expand Down
3 changes: 3 additions & 0 deletions tests/anoph/test_genome_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def ag3_sim_api(ag3_sim_fixture):
major_version_path=_ag3.MAJOR_VERSION_PATH,
pre=True,
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
virtual_contigs=_ag3.VIRTUAL_CONTIGS,
)
Expand All @@ -36,6 +37,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
)

Expand Down Expand Up @@ -152,6 +154,7 @@ def gh334_api(fixture_dir):
major_version_path="v1.0",
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent"),
)

Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_hap_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
default_phasing_analysis="gamb_colu_arab",
Expand All @@ -47,6 +48,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
default_phasing_analysis="funestus",
Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_igv.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
default_site_mask="gamb_colu_arab",
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
Expand All @@ -43,6 +44,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
default_site_mask="funestus",
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
default_site_mask="gamb_colu_arab",
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
Expand All @@ -47,6 +48,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
default_site_mask="funestus",
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_snp_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
default_site_mask="gamb_colu_arab",
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
Expand All @@ -52,6 +53,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
default_site_mask="funestus",
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
Expand Down
2 changes: 2 additions & 0 deletions tests/anoph/test_snp_frq.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def ag3_sim_api(ag3_sim_fixture):
"aim_species": object,
},
gff_gene_type="gene",
gff_gene_name_attribute="Name",
gff_default_attributes=("ID", "Parent", "Name", "description"),
default_site_mask="gamb_colu_arab",
results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
Expand All @@ -50,6 +51,7 @@ def af1_sim_api(af1_sim_fixture):
major_version_path=_af1.MAJOR_VERSION_PATH,
pre=False,
gff_gene_type="protein_coding_gene",
gff_gene_name_attribute="Note",
gff_default_attributes=("ID", "Parent", "Note", "description"),
default_site_mask="funestus",
results_cache=af1_sim_fixture.results_cache_path.as_posix(),
Expand Down
Loading

0 comments on commit ffe1db0

Please sign in to comment.