-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathaggregateGregor
executable file
·95 lines (68 loc) · 2.47 KB
/
aggregateGregor
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#! /usr/bin/env python3
#
# Vivek Rai
# vivekrai@umich.edu
# (c) Parker Lab
#
# August 20, 2018
#
import os
import sys
from argh import *
import pandas as pd
from numpy import log2
def exists(path):
return os.path.exists(path)
@arg("in_dir", nargs="+", help="Path to GREGOR output directory")
@arg("--traits", nargs="+", help="Ordered list of traits")
def aggregate_output(in_dir, traits=None):
""" Compute Z-score and Standard Deviation from the GREGOR output.
The standard deviation is calculated from the columns of `neighborhoodFile`
in the output of GREGOR using the formula below:
SUM [Col-1 * Col-2 * (1 - Col-2)] ^ 1/2
The Z-Score thus is:
(InBed_Index_SNP - ExpectNum_of_InBed_SNP)/Std-Dev
"""
if (traits is not None) and len(traits) != len(in_dir):
print("Number of traits must match the number of GREGOR directories")
sys.exit(1)
if traits is None:
iter_ = zip(in_dir, in_dir)
else:
iter_ = zip(in_dir, traits)
dfs = []
for f, t in iter_:
stats_summary_path = os.path.join(f, "StatisticSummaryFile.txt")
if not exists(stats_summary_path):
print("Error: StatisticSummaryFile not found.")
return None
df = pd.read_csv(stats_summary_path, sep="\t")
def get_stdev(fname, root=f):
neighborhood_file = os.path.join(root, fname, "neighborhoodFile.txt")
if not exists(neighborhood_file):
print("Error: 'neighborhoodFile not found.'")
return None
nf = pd.read_csv(
neighborhood_file,
sep="\t",
header=None,
skiprows=1,
)
stdev = sum(nf[0] * nf[1] * (1 - nf[1])) ** 0.5
return stdev
df["log2_foldEnrichment"] = log2(
df["InBed_Index_SNP"] / df["ExpectNum_of_InBed_SNP"]
)
df["Stdev"] = df["Bed_File"].apply(get_stdev)
df["Zscore"] = (
(df["InBed_Index_SNP"] - df["ExpectNum_of_InBed_SNP"]) / df["Stdev"]
)
df["Bed_File"] = df["Bed_File"].apply(lambda x: os.path.splitext(x)[0])
df["Trait"] = t
dfs.append(df)
print(pd.concat(dfs).to_string(index=False))
if __name__ == "__main__":
""" Aggregate GREGOR's output into a dataframe. Further, calculate the zscore,
stdev, and log2 fold enrichment from the metadata files.
Prints to STDOUT."""
dispatch_command(aggregate_output)