-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsingle_cell_tools.py
executable file
·113 lines (103 loc) · 4.68 KB
/
single_cell_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import numpy as np
import pylab as plt
import seaborn as sns
import pandas as pd
from scipy import sparse
def cluster_acc(y_true, y_pred):
"""
Calculate clustering accuracy. Require scikit-learn installed
# Arguments
y: true labels, numpy.array with shape `(n_samples,)`
y_pred: predicted labels, numpy.array with shape `(n_samples,)`
# Return
accuracy, in [0,1]
"""
y_true = y_true.astype(np.int64)
assert y_pred.size == y_true.size
D = max(y_pred.max(), y_true.max()) + 1
w = np.zeros((D, D), dtype=np.int64)
for i in range(y_pred.size):
w[y_pred[i], y_true[i]] += 1
from sklearn.utils.linear_assignment_ import linear_assignment
ind = linear_assignment(w.max() - w)
return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size
def geneSelection(data, threshold=0, atleast=10,
yoffset=.02, xoffset=5, decay=1.5, n=None,
plot=True, markers=None, genes=None, figsize=(6,3.5),
markeroffsets=None, labelsize=10, alpha=1, verbose=1):
if sparse.issparse(data):
zeroRate = 1 - np.squeeze(np.array((data>threshold).mean(axis=0)))
A = data.multiply(data>threshold)
A.data = np.log2(A.data)
meanExpr = np.zeros_like(zeroRate) * np.nan
detected = zeroRate < 1
meanExpr[detected] = np.squeeze(np.array(A[:,detected].mean(axis=0))) / (1-zeroRate[detected])
else:
zeroRate = 1 - np.mean(data>threshold, axis=0)
meanExpr = np.zeros_like(zeroRate) * np.nan
detected = zeroRate < 1
mask = data[:,detected]>threshold
logs = np.zeros_like(data[:,detected]) * np.nan
logs[mask] = np.log2(data[:,detected][mask])
meanExpr[detected] = np.nanmean(logs, axis=0)
lowDetection = np.array(np.sum(data>threshold, axis=0)).squeeze() < atleast
zeroRate[lowDetection] = np.nan
meanExpr[lowDetection] = np.nan
if n is not None:
up = 10
low = 0
for t in range(100):
nonan = ~np.isnan(zeroRate)
selected = np.zeros_like(zeroRate).astype(bool)
selected[nonan] = zeroRate[nonan] > np.exp(-decay*(meanExpr[nonan] - xoffset)) + yoffset
if np.sum(selected) == n:
break
elif np.sum(selected) < n:
up = xoffset
xoffset = (xoffset + low)/2
else:
low = xoffset
xoffset = (xoffset + up)/2
if verbose>0:
print('Chosen offset: {:.2f}'.format(xoffset))
else:
nonan = ~np.isnan(zeroRate)
selected = np.zeros_like(zeroRate).astype(bool)
selected[nonan] = zeroRate[nonan] > np.exp(-decay*(meanExpr[nonan] - xoffset)) + yoffset
if plot:
if figsize is not None:
plt.figure(figsize=figsize)
plt.ylim([0, 1])
if threshold>0:
plt.xlim([np.log2(threshold), np.ceil(np.nanmax(meanExpr))])
else:
plt.xlim([0, np.ceil(np.nanmax(meanExpr))])
x = np.arange(plt.xlim()[0], plt.xlim()[1]+.1,.1)
y = np.exp(-decay*(x - xoffset)) + yoffset
if decay==1:
plt.text(.4, 0.2, '{} genes selected\ny = exp(-x+{:.2f})+{:.2f}'.format(np.sum(selected),xoffset, yoffset),
color='k', fontsize=labelsize, transform=plt.gca().transAxes)
else:
plt.text(.4, 0.2, '{} genes selected\ny = exp(-{:.1f}*(x-{:.2f}))+{:.2f}'.format(np.sum(selected),decay,xoffset, yoffset),
color='k', fontsize=labelsize, transform=plt.gca().transAxes)
plt.plot(x, y, color=sns.color_palette()[1], linewidth=2)
xy = np.concatenate((np.concatenate((x[:,None],y[:,None]),axis=1), np.array([[plt.xlim()[1], 1]])))
t = plt.matplotlib.patches.Polygon(xy, color=sns.color_palette()[1], alpha=.4)
plt.gca().add_patch(t)
plt.scatter(meanExpr, zeroRate, s=1, alpha=alpha, rasterized=True)
if threshold==0:
plt.xlabel('Mean log2 nonzero expression')
plt.ylabel('Frequency of zero expression')
else:
plt.xlabel('Mean log2 nonzero expression')
plt.ylabel('Frequency of near-zero expression')
plt.tight_layout()
if markers is not None and genes is not None:
if markeroffsets is None:
markeroffsets = [(0, 0) for g in markers]
for num,g in enumerate(markers):
i = np.where(genes==g)[0]
plt.scatter(meanExpr[i], zeroRate[i], s=10, color='k')
dx, dy = markeroffsets[num]
plt.text(meanExpr[i]+dx+.1, zeroRate[i]+dy, g, color='k', fontsize=labelsize)
return selected