-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtable_wrappers.py
157 lines (127 loc) · 5.49 KB
/
table_wrappers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 8 10:10:27 2021
@author: brian
"""
import pandas as pd
import numpy as np
import pdb
def groupby_concat_mean(df, gbcol, datcol):
    """
    Take the grouped mean of a column of arrays in a pandas DataFrame.

    For every group defined by ``gbcol``, the arrays stored in ``datcol`` are
    stacked vertically with ``np.vstack`` and averaged along axis 0, giving one
    element-wise mean array per group. Every other numeric column is reduced
    with the ordinary grouped mean; non-numeric columns are left out of the
    result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``gbcol`` and ``datcol`` columns.
    gbcol : String
        Column of ``df`` to group by.
    datcol : String
        Column holding arrays of the same length, to be stacked vertically
        and averaged within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group. ``gbcol`` comes first (restored from the index),
        followed by the grouped means of the numeric columns, with ``datcol``
        (the per-group mean arrays) as the last column.

    Example
    -------
    With ``gbcol='anid'`` and ``datcol='speed'``::

        df =  'anid'  'speed'
              'AB1'   [0, 1, 1]
              'AB1'   [0, 2, 1]

        returns:
              'anid'  'speed'
              'AB1'   [0, 1.5, 1]
    """
    # Element-wise mean of the stacked arrays inside each group.
    array_means = df.groupby(by=gbcol)[datcol].apply(
        lambda arrays: np.mean(np.vstack(arrays.tolist()), axis=0)
    )

    # The array column is handled above, so keep it out of the scalar mean.
    # numeric_only=True makes the dropping of non-numeric columns explicit;
    # recent pandas versions raise on them instead of silently skipping them.
    out = df.drop(columns=datcol).groupby(by=gbcol).mean(numeric_only=True)

    # Assigning a Series aligns on the group index rather than on position.
    out[datcol] = array_means
    return out.reset_index()
def consolidate_columns_to_labels(df, label_columns,
                                  value_column_name='value',
                                  label_column_name='label'):
    """
    Unpivot several columns of a dataframe into label/value pairs.

    Each column named in ``label_columns`` becomes a row whose label is the
    column name and whose value is the cell content. Columns not listed in
    ``label_columns`` are repeated once per label column so that every output
    row keeps its identifying fields.

    Note: this overlaps heavily with ``pd.melt()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    label_columns : list
        Column names to convert into label values.
    value_column_name : String
        Name of the output column holding the cell values.
    label_column_name : String
        Name of the output column holding the original column names.

    Returns
    -------
    pandas.DataFrame
        Columns are, in order: the value column, the label column, then the
        columns of ``df`` that are not in ``label_columns``.

    Example
    -------
    ::

        df =  'a' 'b' 'c' 'id'
               1   2   3  'apple'
               3   4   5  'dog'

        with label_columns=['a', 'b', 'c'] returns:

              'value' 'label' 'id'
                 1      a     'apple'
                 2      b     'apple'
                 3      c     'apple'
                 3      a     'dog'
                 4      b     'dog'
                 5      c     'dog'
    """
    repeats = len(label_columns)

    # Output accumulators: value and label first, then every passthrough column.
    collected = {value_column_name: [], label_column_name: []}
    collected.update(
        {name: [] for name in df.columns if name not in label_columns}
    )

    # Long form: one row per (original index, column name) pair, where
    # 'level_1' carries the column name and 'val' the cell content.
    long_form = df.stack().reset_index(level=1, name='val')

    for row_key in np.unique(df.index):
        # All (column name, value) pairs that belong to this original row.
        block = long_form.loc[row_key].reset_index(drop=True)
        for pos, name in enumerate(block.loc[:, 'level_1']):
            if name not in label_columns:
                # Passthrough column: repeat its value once per label column
                # so it lines up with the label/value rows of this chunk.
                collected[name].extend([block.iloc[pos, 1]] * repeats)
                continue
            collected[value_column_name].append(block.loc[pos, 'val'])
            collected[label_column_name].append(name)

    return pd.DataFrame(collected)
def df_melt_stack(dfs, df_conds, label_columns, var_name, value_name,
                  static_columns, sort_column=None):
    """
    Melt several dataframes from different conditions and concatenate them.

    Each dataframe is tagged with its condition label in a ``'cond'`` column,
    unpivoted with ``pd.melt`` and the melted pieces are stacked vertically.
    Neither the input dataframes nor ``static_columns`` are modified.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Dataframes with identical column variables.
    df_conds : list of String
        Label describing the condition of each dataframe, e.g.
        ``['control', 'experimental']``. Paired with ``dfs`` by position;
        extra entries on either side are ignored.
    label_columns : list
        Column names to unpivot (``pd.melt(value_vars=...)``).
    var_name : String
        Name of the output column holding the original column names.
    value_name : String
        Name of the output column holding the cell values.
    static_columns : list
        Column names repeated on every melted row
        (``pd.melt(id_vars=...)``). ``'cond'`` is added automatically.
    sort_column : String, optional
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    pandas.DataFrame
        Melted, stacked data with columns ``static_columns + ['cond']``,
        then ``var_name``, then ``value_name``. Each piece keeps the index
        produced by ``pd.melt``. An empty DataFrame is returned when ``dfs``
        is empty.

    Example
    -------
    ::

        dfs[0] = 'a' 'b' 'c' 'id'
                  1   2   3  'apple'
                  3   4   5  'dog'
        dfs[1] = 'a' 'b' 'c' 'id'
                  0   0   0  'apple'
                  1   3   2  'dog'

        with df_conds=['saline', 'cno'], static_columns=['id'],
        label_columns=['a', 'b', 'c'] returns:

            'id'     'cond'    var_name  value_name
            'apple'  'saline'     a          1
            'dog'    'saline'     a          3
            'apple'  'saline'     b          2
            'dog'    'saline'     b          4
            'apple'  'saline'     c          3
            'dog'    'saline'     c          5
            'apple'  'cno'        a          0
            'dog'    'cno'        a          1
            'apple'  'cno'        b          0
            'dog'    'cno'        b          3
            'apple'  'cno'        c          0
            'dog'    'cno'        c          2
    """
    # Work on a copy so the caller's list is not extended on every call.
    if isinstance(static_columns, str):
        id_vars = [static_columns]
    else:
        id_vars = list(static_columns)
    if 'cond' not in id_vars:
        id_vars.append('cond')

    # df.assign returns a new frame, so the inputs do not gain a 'cond' column.
    melted = [
        pd.melt(df.assign(cond=cond_label),
                value_vars=label_columns,
                id_vars=id_vars,
                value_name=value_name,
                var_name=var_name)
        for cond_label, df in zip(df_conds, dfs)
    ]

    if not melted:
        return pd.DataFrame()
    # Concatenate once instead of repeatedly growing an initially empty frame.
    return pd.concat(melted)