aggregateGregor

#! /usr/bin/env python3
#
# Vivek Rai
# vivekrai@umich.edu
# (c) Parker Lab
#
# August 20, 2018
#


import os
import sys
from argh import *

import pandas as pd
from numpy import log2


def exists(path):
    return os.path.exists(path)


@arg("in_dir", nargs="+", help="Path to GREGOR output directory")
@arg("--traits", nargs="+", help="Ordered list of traits")
def aggregate_output(in_dir, traits=None):
    """ Compute Z-score and Standard Deviation from the GREGOR output.

    The standard deviation is calculated from the columns of `neighborhoodFile`
    in the output of GREGOR using the formula below:

        SUM [Col-1 * Col-2 * (1 - Col-2)] ^ 1/2

    The Z-Score thus is:

        (InBed_Index_SNP - ExpectNum_of_InBed_SNP)/Std-Dev

    """

    if (traits is not None) and len(traits) != len(in_dir):
        print("Number of traits must match the number of GREGOR directories")
        sys.exit(1)

    if traits is None:
        iter_ = zip(in_dir, in_dir)
    else:
        iter_ = zip(in_dir, traits)

    dfs = []

    for f, t in iter_:
        stats_summary_path = os.path.join(f, "StatisticSummaryFile.txt")
        if not exists(stats_summary_path):
            print("Error: StatisticSummaryFile not found.")
            return None

        df = pd.read_csv(stats_summary_path, sep="\t")

        def get_stdev(fname, root=f):
            neighborhood_file = os.path.join(root, fname, "neighborhoodFile.txt")
            if not exists(neighborhood_file):
                print("Error: 'neighborhoodFile not found.'")
                return None

            nf = pd.read_csv(
                neighborhood_file,
                sep="\t",
                header=None,
                skiprows=1,
            )
            stdev = sum(nf[0] * nf[1] * (1 - nf[1])) ** 0.5
            return stdev

        df["log2_foldEnrichment"] = log2(
            df["InBed_Index_SNP"] / df["ExpectNum_of_InBed_SNP"]
        )
        df["Stdev"] = df["Bed_File"].apply(get_stdev)
        df["Zscore"] = (
            (df["InBed_Index_SNP"] - df["ExpectNum_of_InBed_SNP"]) / df["Stdev"]
        )
        df["Bed_File"] = df["Bed_File"].apply(lambda x: os.path.splitext(x)[0])
        df["Trait"] = t

        dfs.append(df)

    print(pd.concat(dfs).to_string(index=False))


if __name__ == "__main__":
    """ Aggregate GREGOR's output into a dataframe. Further, calculate the zscore,
    stdev, and log2 fold enrichment from the metadata files.

    Prints to STDOUT."""

    dispatch_command(aggregate_output)