-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_ldsc_inputs_total.py
85 lines (69 loc) · 2.75 KB
/
get_ldsc_inputs_total.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
import os
import pickle
import sys
import numpy as np
import pandas as pd
def load_gene(data, clusters, gene, genes_dir):
gene_dir = os.path.join(genes_dir, gene)
gene_path = os.path.join(gene_dir, "gene_data.pickle")
try:
with open(gene_path, "rb") as gene_file:
gene_data = pickle.load(gene_file)
except Exception as e:
print(e) ####
return
try:
total_counts = gene_data["total_counts"]
contig, tss_pos = gene_data["tss"]
except Exception as e:
print(e)
return
expression_dct = {}
for c in clusters:
try:
# print(total_counts[c].keys()) ####
counts = total_counts[c]["c"]
except Exception as e:
print(e)
continue
for sample, phen in counts.items():
expression_dct.setdefault(sample, {})[c] = phen
expression_lst = []
for sample_name, cluster_data in expression_dct.items():
expression_lst.append([cluster_data.get(c, np.nan) for c in clusters])
if len(expression_lst) == 0:
return
# print(expression_lst) ####
expression_arr = np.array(expression_lst)
exp_sum = np.nansum(expression_arr, axis=1, keepdims=True)
exp_valid = ~np.isnan(expression_arr)
num_valid = np.sum(exp_valid, axis=1, keepdims=True)
exp_rest = (exp_sum - expression_arr) / (num_valid - 1)
exp_diff = expression_arr - exp_rest
diff_mean = np.nanmean(exp_diff, axis=0)
diff_std = np.nanstd(exp_diff, axis=0)
num_samps = np.sum(exp_valid, axis=0)
t_scores = diff_mean / (diff_std / np.sqrt(num_samps))
for i, c in enumerate(clusters):
data[c].append([contig, tss_pos - 100000, tss_pos + 100000, gene, t_scores[i]])
def write_bed(data, out_path):
with open(out_path, "w") as out_file:
out_file.writelines(f"{' '.join(map(str, i))}\n" for i in sorted(data))
def get_ldsc_inputs_total(clusters, genes_dir, out_dir, gene_list_path):
data = {c: [] for c in clusters}
with open(gene_list_path, "rb") as gene_list_file:
gene_list = pickle.load(gene_list_file)
# gene_list = gene_list[:100] ####
for gene in gene_list:
load_gene(data, clusters, gene, genes_dir)
for c in clusters:
out_path = os.path.join(out_dir, f"{c}_t.bed")
write_bed(data[c], out_path)
if __name__ == '__main__':
data_path_kellis = "/agusevlab/awang/sc_kellis"
genes_dir = os.path.join(data_path_kellis, "genes_429")
out_dir = os.path.join(data_path_kellis, "ldsc_429_exp")
gene_list_path = os.path.join(data_path_kellis, "list_429_all.pickle")
clusters = ["Ex", "Oligo", "Astro", "In", "Endo", "Microglia", "OPC", "Per"]
get_ldsc_inputs_total(clusters, genes_dir, out_dir, gene_list_path)