diff --git a/README.md b/README.md index 6467382..90f87ba 100644 --- a/README.md +++ b/README.md @@ -81,3 +81,7 @@ For additional information and source code for each pipeline see the following g [https://github.com/UCSC-Treehouse/fusion] [https://github.com/UCSC-Treehouse/mini-var-call] + + +## Treehouse Internal Use +See [the Treeshop document](treeshop.md) for running on VMs via docker-machine and fabric. diff --git a/fabfile.py b/fabfile.py index cf12ad9..e4a619e 100644 --- a/fabfile.py +++ b/fabfile.py @@ -384,6 +384,11 @@ def process(manifest="manifest.tsv", base=".", checksum_only="False"): run("rm *.tar.gz") run("mv *.sorted.bam sorted.bam") + # Temporarily move sorted.bam to parent dir so we don't download it + # Still pretty hacky but prevents temporary exposure of sequence data to downstream dir + with cd("/mnt/outputs/expression"): + run("mv sorted.bam ..") + # Update methods.json and copy output back dest = "{}/ucsc_cgl-rnaseq-cgl-pipeline-3.3.4-785eee9".format(output) local("mkdir -p {}".format(dest)) @@ -402,14 +407,9 @@ def process(manifest="manifest.tsv", base=".", checksum_only="False"): with open("{}/methods.json".format(dest), "w") as f: f.write(json.dumps(methods, indent=4)) - """ - FIXME HACK: Delete the RNASeq BAM favoring the bam output from umend QC. - Ideally would delete before back hauling but fabric doesn't support exclude - and the alternative would require moving it away and back before QC - or listing all files that should be backhauled from rnaseq. - """ - print("Deleting local copy of RNASeq BAM in favor of UMEND BAM") - local("rm {}/*.bam".format(dest)) + # Move sorted.bam back to the expression dir so that QC can find it. + with cd("/mnt/outputs/expression"): + run("mv ../sorted.bam .") # Calculate qc (bam-umend-qc) methods["start"] = datetime.datetime.utcnow().isoformat() @@ -419,6 +419,14 @@ def process(manifest="manifest.tsv", base=".", checksum_only="False"): _log_error("{} Failed qc: {}".format(sample_id, result)) continue + # Store sortedByCoord.md.bam and .bai in primary/derived + # First, move it out of the way momentarily so it won't get + # downloaded into downstream + bamdest = "{}/primary/derived/{}".format(base, sample_id) + local("mkdir -p {}".format(bamdest)) + with cd("/mnt/outputs/qc"): + run("mv sortedByCoord.md.bam* ..") + # Update methods.json and copy output back dest = "{}/ucsctreehouse-bam-umend-qc-1.1.1-5f286d7".format(output) local("mkdir -p {}".format(dest)) @@ -426,6 +434,8 @@ def process(manifest="manifest.tsv", base=".", checksum_only="False"): os.path.relpath(output, base))] methods["outputs"] = [ os.path.relpath(p, base) for p in get("/mnt/outputs/qc/*", dest)] + methods["outputs"] += [ + os.path.relpath(p, base) for p in get("/mnt/outputs/sortedByCoord.md.bam*", bamdest)] methods["end"] = datetime.datetime.utcnow().isoformat() methods["pipeline"] = { "source": "https://github.com/UCSC-Treehouse/bam-umend-qc", @@ -438,6 +448,10 @@ def process(manifest="manifest.tsv", base=".", checksum_only="False"): with open("{}/methods.json".format(dest), "w") as f: f.write(json.dumps(methods, indent=4)) + # And move the QC bam back so it's available to the variant caller + with cd("/mnt/outputs/qc"): + run("mv ../sortedByCoord.md.bam* .") + # Calculate fusion methods["start"] = datetime.datetime.utcnow().isoformat() with settings(warn_only=True): @@ -475,8 +489,7 @@ def process(manifest="manifest.tsv", base=".", checksum_only="False"): # Update methods.json and copy output back dest = "{}/ucsctreehouse-mini-var-call-0.0.1-1976429".format(output) local("mkdir -p {}".format(dest)) - methods["inputs"] = ["{}/ucsc_cgl-rnaseq-cgl-pipeline-3.3.4-785eee9/sortedByCoord.md.bam".format( # NOQA - os.path.relpath(output, base))] + methods["inputs"] = ["{}/sortedByCoord.md.bam".format(bamdest)] methods["outputs"] = [ os.path.relpath(p, base) for p in get("/mnt/outputs/variants/*", dest)] methods["end"] = datetime.datetime.utcnow().isoformat() diff --git a/treeshop.md b/treeshop.md index 7221dde..4e5afc9 100644 --- a/treeshop.md +++ b/treeshop.md @@ -136,13 +136,12 @@ After this you should have the following under downstream: │   │   └── STAR │   │   ├── Log.final.out │   │   └── SJ.out.tab - │   ├── RSEM - │   │   ├── Hugo - │   │   │   ├── rsem_genes.hugo.results - │   │   │   └── rsem_isoforms.hugo.results - │   │   ├── rsem_genes.results - │   │   └── rsem_isoforms.results - │   └── sorted.bam + │   └── RSEM + │      ├── Hugo + │      │   ├── rsem_genes.hugo.results + │      │   └── rsem_isoforms.hugo.results + │      ├── rsem_genes.results + │      └── rsem_isoforms.results ├── ucsctreehouse-bam-umend-qc-1.1.0-cc481e4 │   ├── bam_umend_qc.json │   ├── bam_umend_qc.tsv @@ -157,6 +156,14 @@ After this you should have the following under downstream: ├── methods.json └── mini.ann.vcf +The bam file generated by `qc` will be placed in `primary/derived`. +Note that the bam file generated by `expression` (`sorted.bam`) is not downloaded at all. + + treeshop/primary/derived + └── TEST + ├── sortedByCoord.md.bam + └── sortedByCoord.md.bam.bai + ### Shut Down After confirming that you successfully processed your data, you may want to shut down your docker machine.