Merge pull request #128 from nextstrain/docs-v3

Dataset creation guide
nextstrain · Jan 14, 2024 · ed6b6a7 · ed6b6a7
2 parents bc6a3f1 + 39ce2c4
commit ed6b6a7
Show file tree

Hide file tree

Showing 27 changed files with 27,524 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 \.DS_Store
 \.env
 \.vscode
+/docs/tmp/
diff --git a/docs/README.md b/docs/README.md
@@ -0,0 +1,14 @@
+# Nextclade dataset documentation
+
+## Dataset migration from version 2 to 3
+Version 2 dataset are not compatible with Nextclade v3 and need migrating.
+See our [dataset migration guide](migration-guide-v3.md) for tips and tools on how to create v3 dataset from your existing v2 datasets.
+
+## Create new datasets
+You can create your own NextClade datasets!
+We provide a [dataset creation tutorial](dataset-creation-guide.md) to help you assemble datasets for your virus of interest. To help you to get going, there is a [minimal dataset](minimal-dataset) and a [script](example-workflow/scripts/generate_from_genbank.py) to walk you through the creation of a suitable annotation using data from RefSeq.
+
+## Contribute datasets to a collection on NextClade
+
+If you want your dataset to be included in those served by the Nextclade Web application, you can create a pull request in this repository.
+The [dataset curation guide](dataset-curation-guide.md) describes the relevant steps.
diff --git a/docs/dataset-creation-guide.md b/docs/dataset-creation-guide.md
diff --git a/docs/example-workflow/.gitignore b/docs/example-workflow/.gitignore
@@ -0,0 +1,7 @@
+.snakemake/
+data/
+results/
+bin/
+test_out/
+dataset.zip
+tmp/
diff --git a/docs/example-workflow/README.md b/docs/example-workflow/README.md
@@ -0,0 +1,23 @@
+# Nextclade dataset creation example workflow
+
+This repository contains a simple example workflow for creating a Nextclade dataset, in this case for Zika virus.
+
+For a more detailed guide on dataset creation, see the [dataset creation tutorial](../dataset-creation-guide.md).
+
+The quickest way to run this workflow is to [install the Nextstrain toolchain](https://docs.nextstrain.org/en/latest/install.html) and then run the following commands (the target `test` generates the dataset and tests it):
+
+```bash
+nextstrain build . test
+```
+Alternatively, you can run the workflow via
+```bash
+snakemake --cores 4 assemble_dataset
+```
+
+## Customization
+Basic configuration is supported by editing the constants at the top of the `Snakefile`.
+You can adapt this workflow to a different virus by changing the NCBI taxon ID and the reference sequence. You will need to provide a reference sequence and annotation analogous to what is provided in `minimal-dataset` for Zika virus.
+
+You may also want to edit `resources/auspice_config.json` and `resources/pathogen.json`.
+
+If you struggle with customizing the workflow, please post your questions in the [Nextstrain discussion forum](https://discussion.nextstrain.org/).
diff --git a/docs/example-workflow/Snakefile b/docs/example-workflow/Snakefile
@@ -0,0 +1,293 @@
+REFERENCE_ACCESSION = "NC_035889.1"
+TAXON_ID = 64320
+GENES = ["CA", "PRO", "MP", "ENV", "NS1", "NS2A", "NS2B", "NS3", "NS4A", "NS4B", "NS5"]
+ALLOWED_DIVERGENCE = "500"
+MIN_DATE = "2010-01-01"
+MIN_LENGTH = "8000"
+MAX_SEQS = "100"
+GFF_PATH = "../minimal-dataset/genome_annotation.gff3"
+GENBANK_PATH = "resources/reference.gb"
+REFERENCE_PATH = "../minimal-dataset/reference.fasta"
+README_PATH = "../minimal-dataset/README.md"
+CHANGELOG_PATH = "../minimal-dataset/CHANGELOG.md"
+ROOTING = "mid_point"  # alternative root using outgroup, e.g. the reference "NC_035889.1"
+
+FETCH_SEQUENCES = True
+
+if FETCH_SEQUENCES == True:
+
+    include: "rules/fetch_from_ncbi.smk"
+
+
+rule add_reference_to_include:
+    """
+    Create an include file for augur filter
+    """
+    input:
+        include="resources/include.txt",
+    output:
+        "results/include.txt",
+    shell:
+        """
+        echo "{REFERENCE_ACCESSION}" >> results/include.txt
+        """
+
+
+rule filter:
+    """
+    Exclude sequences from before {MIN_DATE} and subsample to {MAX_SEQS} sequences.
+    Only take sequences longer than {MIN_LENGTH}
+    """
+    input:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+        include="results/include.txt",
+    output:
+        filtered_sequences="results/filtered_sequences_raw.fasta",
+        filtered_metadata="results/filtered_metadata_raw.tsv",
+    params:
+        min_date="" if MIN_DATE == "" else "--min-date " + MIN_DATE,
+        min_length="" if MIN_LENGTH == "" else "--min-length " + MIN_LENGTH,
+        max_seqs=MAX_SEQS,
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            {params.min_length} \
+            {params.min_date} \
+            --include {input.include} \
+            --subsample-max-sequences {params.max_seqs} \
+            --output {output.filtered_sequences} \
+            --output-metadata {output.filtered_metadata}
+        """
+
+
+rule align:
+    input:
+        sequences="results/filtered_sequences_raw.fasta",
+        reference=REFERENCE_PATH,
+        annotation=GFF_PATH,
+    output:
+        alignment="results/aligned.fasta",
+        tsv="results/nextclade.tsv",
+    params:
+        translation_template=lambda w: "results/translations/cds_{cds}.translation.fasta",
+    shell:
+        """
+        nextclade3 run \
+            {input.sequences} \
+            --input-ref {input.reference} \
+            --input-annotation {input.annotation} \
+            --output-translations {params.translation_template} \
+            --output-tsv {output.tsv} \
+            --output-fasta {output.alignment}
+        """
+
+
+rule get_outliers:
+    """
+    Automatically identify sequences with >{ALLOWED_DIVERGENCE} substitutions
+    (likely to be sequencing errors or low quality/misannotated sequences) and put them in outliers.txt
+    """
+    input:
+        nextclade="results/nextclade.tsv",
+    output:
+        outliers="results/outliers.txt",
+        tmp="tmp/outliers.txt",
+    params:
+        allowed_divergence=lambda w: ALLOWED_DIVERGENCE,
+    shell:
+        """
+        tsv-filter -H -v --is-numeric totalSubstitutions {input.nextclade} \
+        > {output.tmp}
+        tsv-filter -H \
+            --is-numeric totalSubstitutions \
+            --gt totalSubstitutions:{params.allowed_divergence} \
+            {input.nextclade} \
+        | tail -n +2 >> {output.tmp}
+        cat {output.tmp} \
+        | tsv-select -H -f seqName \
+        | tail -n +2 > {output.outliers}
+        """
+
+
+rule exclude:
+    """
+    Rule to allow for manual and automatic exclusion of sequences
+    without triggering a new subsampling that could
+    surface new bad sequences resulting in an infinite loop
+    """
+    input:
+        sequences="results/aligned.fasta",
+        metadata="data/metadata.tsv",
+        exclude="resources/exclude.txt",
+        outliers="results/outliers.txt",
+    output:
+        filtered_sequences="results/filtered_aligned.fasta",
+        filtered_metadata="results/filtered_metadata.tsv",
+        strains="results/tree_strains.txt",
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --exclude {input.exclude} {input.outliers} \
+            --output {output.filtered_sequences} \
+            --output-metadata {output.filtered_metadata} \
+            --output-strains {output.strains}
+        """
+
+
+rule tree:
+    input:
+        alignment="results/filtered_aligned.fasta",
+    output:
+        tree="results/tree_raw.nwk",
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree} \
+        """
+
+
+rule refine:
+    input:
+        tree="results/tree_raw.nwk",
+        alignment="results/filtered_aligned.fasta",
+    output:
+        tree="results/tree.nwk",
+        node_data="results/branch_lengths.json",
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --root {ROOTING} \
+            --keep-polytomies \
+            --divergence-units mutations \
+            --output-node-data {output.node_data} \
+            --output-tree {output.tree}
+        """
+
+
+rule ancestral:
+    input:
+        tree="results/tree.nwk",
+        alignment="results/filtered_aligned.fasta",
+        annotation=GENBANK_PATH,
+    output:
+        node_data="results/muts.json",
+    params:
+        translation_template=r"results/translations/cds_%GENE.translation.fasta",
+        output_translation_template=r"results/translations/cds_%GENE.ancestral.fasta",
+        genes=" ".join(GENES),
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --annotation {input.annotation} \
+            --root-sequence {input.annotation} \
+            --genes {params.genes} \
+            --translations {params.translation_template} \
+            --output-node-data {output.node_data} \
+            --output-translations {params.output_translation_template}
+        """
+
+
+rule dummy_clades:
+    """
+    Nextclade requires clade membership to be specified for each node
+    in the tree. This rule creates a dummy clade membership for each node
+    """
+    input:
+        "results/branch_lengths.json",
+    output:
+        "results/dummy_clades.json",
+    shell:
+        """
+        jq '.nodes |= map_values({{"clade_membership": "dummy"}})' {input} > {output}
+        """
+
+
+rule export:
+    input:
+        tree="results/tree.nwk",
+        metadata="results/filtered_metadata.tsv",
+        mutations="results/muts.json",
+        branch_lengths="results/branch_lengths.json",
+        clades="results/dummy_clades.json",
+        auspice_config="resources/auspice_config.json",
+    output:
+        auspice="results/auspice.json",
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --auspice-config {input.auspice_config} \
+            --node-data {input.mutations} {input.branch_lengths} {input.clades} \
+            --output {output.auspice}
+        """
+
+
+rule subsample_example_sequences:
+    input:
+        all_sequences="data/sequences.fasta",
+        tree_strains="results/tree_strains.txt",
+    output:
+        example_sequences="results/example_sequences.fasta",
+    shell:
+        """
+        # Exclude tree sequences from all sequences
+        seqkit grep -v -f {input.tree_strains} {input.all_sequences} \
+        | seqkit sample -n 100 -s 42 > {output.example_sequences}
+        """
+
+
+rule assemble_dataset:
+    input:
+        tree="results/auspice.json",
+        reference=REFERENCE_PATH,
+        annotation=GFF_PATH,
+        sequences="results/example_sequences.fasta",
+        pathogen="resources/pathogen.json",
+        readme=README_PATH,
+        changelog=CHANGELOG_PATH,
+    output:
+        tree="dataset/tree.json",
+        reference="dataset/reference.fasta",
+        annotation="dataset/genome_annotation.gff3",
+        sequences="dataset/sequences.fasta",
+        pathogen="dataset/pathogen.json",
+        readme="dataset/README.md",
+        changelog="dataset/CHANGELOG.md",
+        dataset_zip="dataset.zip",
+    shell:
+        """
+        cp {input.tree} {output.tree}
+        cp {input.reference} {output.reference}
+        cp {input.annotation} {output.annotation}
+        cp {input.sequences} {output.sequences}
+        cp {input.pathogen} {output.pathogen}
+        cp {input.readme} {output.readme}
+        cp {input.changelog} {output.changelog}
+        zip -rj dataset.zip  dataset/*
+        """
+
+
+rule test:
+    input:
+        dataset="dataset.zip",
+        sequences="dataset/sequences.fasta",
+    output:
+        output=directory("test_out"),
+    shell:
+        """
+        nextclade3 run \
+            --input-dataset {input.dataset} \
+            --output-all {output.output} \
+            {input.sequences}
+        """
diff --git a/docs/example-workflow/dataset/CHANGELOG.md b/docs/example-workflow/dataset/CHANGELOG.md
@@ -0,0 +1,3 @@
+## Unreleased
+
+Initial release.
diff --git a/docs/example-workflow/dataset/README.md b/docs/example-workflow/dataset/README.md
@@ -0,0 +1,3 @@
+# Example dataset for Zika virus
+
+This is a minimal example dataset for demonstration purposes.
diff --git a/docs/example-workflow/dataset/genome_annotation.gff3 b/docs/example-workflow/dataset/genome_annotation.gff3
@@ -0,0 +1,19 @@
+##gff-version 3
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
+##sequence-region NC_035889.1 1 10808
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=64320
+NC_035889.1	RefSeq	region	1	10808	.	+	.	ID=NC_035889.1:1..10808;Dbxref=taxon:64320;collection-date=2015;country=Brazil: Rio Grande do Norte%2C Natal;gbkey=Src;genome=genomic;isolation-source=fetus' brain autopsy;mol_type=genomic RNA;nat-host=Homo sapiens;strain=Natal RGN
+NC_035889.1	RefSeq	region	1	10808	.	+	.	ID=id-NC_035889.1:1..10808;Note=Mature peptides were annotated by RefSeq staff using homoloy to NC_012532.1 and the cleavage sites reported in Kuno and Chang%2C 2007 (PMID 17195954). Questions about the annotation of this sequence should be directed to info@ncbi.nlm.nih.gov.;gbkey=Comment
+NC_035889.1	RefSeq	CDS	108	473	.	+	.	Name=CA;gbkey=Prot;protein_id=YP_009430295.1;ID=id-YP_009428568.1:1..122;product=anchored capsid protein C
+NC_035889.1	RefSeq	CDS	474	752	.	+	.	Name=PRO;gbkey=Prot;product=protein pr;protein_id=YP_009430298.1;ID=id-YP_009428568.1:123..215
+NC_035889.1	RefSeq	CDS	753	977	.	+	.	Name=MP;gbkey=Prot;protein_id=YP_009430299.1;product=membrane glycoprotein M;ID=id-YP_009428568.1:216..290
+NC_035889.1	RefSeq	CDS	978	2489	.	+	.	Name=ENV;gbkey=Prot;protein_id=YP_009430300.1;product=envelope protein E;ID=id-YP_009428568.1:291..794
+NC_035889.1	RefSeq	CDS	2490	3545	.	+	.	Name=NS1;gbkey=Prot;protein_id=YP_009430301.1;product=nonstructural protein NS1;ID=id-YP_009428568.1:795..1146
+NC_035889.1	RefSeq	CDS	3546	4223	.	+	.	Name=NS2A;gbkey=Prot;protein_id=YP_009430302.1;product=nonstructural protein NS2A;ID=id-YP_009428568.1:1147..1372
+NC_035889.1	RefSeq	CDS	4224	4613	.	+	.	Name=NS2B;gbkey=Prot;protein_id=YP_009430303.1;product=nonstructural protein NS2B;ID=id-YP_009428568.1:1373..1502
+NC_035889.1	RefSeq	CDS	4614	6464	.	+	.	Name=NS3;gbkey=Prot;protein_id=YP_009430304.1;product=nonstructural protein NS3;ID=id-YP_009428568.1:1503..2119
+NC_035889.1	RefSeq	CDS	6465	6845	.	+	.	Name=NS4A;gbkey=Prot;protein_id=YP_009430305.1;product=nonstructural protein NS4A;ID=id-YP_009428568.1:2120..2246
+NC_035889.1	RefSeq	CDS	6846	6914	.	+	.	Name=2K;gbkey=Prot;product=protein 2K;protein_id=YP_009430306.1;ID=id-YP_009428568.1:2247..2269
+NC_035889.1	RefSeq	CDS	6915	7667	.	+	.	Name=NS4B;gbkey=Prot;protein_id=YP_009430307.1;product=nonstructural protein NS4B;ID=id-YP_009428568.1:2270..2520
+NC_035889.1	RefSeq	CDS	7668	10376	.	+	.	Name=NS5;gbkey=Prot;protein_id=YP_009430308.1;ID=id-YP_009428568.1:2521..3423;product=RNA-dependent RNA polymerase NS5
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Example dataset for Zika virus

		This is a minimal example dataset for demonstration purposes.