@@ -13,15 +13,15 @@
    if st.button("Re-do the data preparation step now."):
        reset_dataframes()
        st.session_state["data_preparation_done"] = False
-         st.experimental_rerun()
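+         # st.experimental_rerun() has been replaced by st.rerun() in current Streamlit releases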
+         st.rerun()
    show_table(pd.concat([st.session_state.md, st.session_state.data], axis=1), title="FeatureMatrix-scaled-centered")
else:
    st.info(
        """💡 Once you are happy with the results, don't forget to click the **Submit Data for Statistics!** button."""
    )
    ft, md = pd.DataFrame(), pd.DataFrame()

-     file_origin = st.selectbox("File upload ", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
+     file_origin = st.radio("File origin ", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
    # b661d12ba88745639664988329c1363e
    if file_origin == "Small example dataset for testing":
        ft, md = load_example()
@@ -36,10 +36,9 @@
        task_id_default = ""
        disabled = False
        task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
-         c1, c2 = st.columns(2)
-         merge_annotations = c1.checkbox("Annotate metabolites", True, help="Merge annotations from GNPS FBMN and analog search if available.")
+         _, c2, _ = st.columns(3)
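+         # only the middle of three equal columns is used, so the load button renders centered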
        if c2.button("Load files from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
-             st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, merge_annotations)
+             st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id)

        if "ft_gnps" in st.session_state:
            if not st.session_state["ft_gnps"].empty:
@@ -70,13 +69,11 @@
        if not st.session_state["md_uploaded"].empty:
            md = st.session_state["md_uploaded"]

-     v_space(2)
    if not ft.empty or not md.empty:
-         t1, t2 = st.tabs(["Quantification Table", "Meta Data"])
+         t1, t2 = st.tabs(["**Quantification Table**", "**Meta Data**"])
        t1.dataframe(ft)
        t2.dataframe(md)

-
        if not ft.index.is_unique:
            st.error("Please upload a feature matrix with unique metabolite names.")

@@ -102,125 +99,125 @@
        # check if ft column names and md row names are the same
        md, ft = check_columns(md, ft)

-         st.markdown("## Blank removal")
-
-         blank_removal = st.checkbox("Remove blank features?", False)
-         if blank_removal:
-             # Select true sample files (excluding blank and pools)
-             st.markdown("#### Samples")
-             st.markdown(
-                 "Select samples (excluding blank and pools) based on the following table."
-             )
-             df = inside_levels(md)
-             mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
-             df = df[~mask]
-             st.dataframe(df)
-             c1, c2 = st.columns(2)
-             sample_column = c1.selectbox(
-                 "attribute for sample selection",
-                 md.columns,
-             )
-             sample_options = list(set(md[sample_column].dropna()))
-             sample_rows = c2.multiselect("sample selection", sample_options, sample_options[0])
-             samples = ft[md[md[sample_column].isin(sample_rows)].index]
-             samples_md = md.loc[samples.columns]
-
-             with st.expander(f"Selected samples {samples.shape}"):
-                 st.dataframe(samples)
-
-             if samples.shape[1] == ft.shape[1]:
-                 st.warning("You selected everything as sample type. Blank removal not possible.")
-             else:
-                 v_space(1)
-                 # Ask if blank removal should be done
-                 st.markdown("#### Blanks")
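+         # the preparation steps below are grouped into tabs; the summary plots get their own tab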
+         tabs = st.tabs(["**Blank Removal**", "**Imputation**", "**Normalization**", "📊 **Summary**"])
+         with tabs[0]:
+             blank_removal = st.checkbox("Remove blank features?", False)
+             if blank_removal:
+                 # Select true sample files (excluding blank and pools)
+                 st.markdown("#### Samples")
                st.markdown(
-                     "Select blanks (excluding samples and pools) based on the following table."
+                     "Select samples (excluding blank and pools) based on the following table."
                )
-                 non_samples_md = md.loc[
-                     [index for index in md.index if index not in samples.columns]
-                 ]
-                 df = inside_levels(non_samples_md)
+                 df = inside_levels(md)
                mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
                df = df[~mask]
                st.dataframe(df)
                c1, c2 = st.columns(2)
-
-                 blank_column = c1.selectbox(
-                     "attribute for blank selection", non_samples_md.columns
+                 sample_column = c1.selectbox(
+                     "attribute for sample selection",
+                     md.columns,
                )
-                 blank_options = list(set(non_samples_md[blank_column].dropna()))
-                 blank_rows = c2.multiselect("blank selection", blank_options, blank_options[0])
-                 blanks = ft[non_samples_md[non_samples_md[blank_column].isin(blank_rows)].index]
-                 with st.expander(f"Selected blanks {blanks.shape}"):
-                     st.dataframe(blanks)
+                 sample_options = list(set(md[sample_column].dropna()))
+                 sample_rows = c2.multiselect("sample selection", sample_options, sample_options[0])
+                 samples = ft[md[md[sample_column].isin(sample_rows)].index]
+                 samples_md = md.loc[samples.columns]

-                 # define a cutoff value for blank removal (ratio blank/avg(samples))
-                 c1, c2 = st.columns(2)
-                 cutoff = c1.number_input(
-                     "cutoff threshold for blank removal",
-                     0.1,
-                     1.0,
-                     0.3,
-                     0.05,
-                     help="""The recommended cutoff range is between 0.1 and 0.3.
-
-                     Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
-                     """,
-                 )
-                 (
-                     ft,
-                     n_background_features,
-                     n_real_features,
-                 ) = remove_blank_features(blanks, samples, cutoff)
-                 c2.metric("background or noise features", n_background_features)
-                 with st.expander(f"Feature table after removing blanks {ft.shape}"):
-                     show_table(ft, "blank-features-removed")
-
-         if not ft.empty:
-             cutoff_LOD = get_cutoff_LOD(ft)
+                 with st.expander(f"Selected samples preview (n={samples.shape[1]})"):
+                     st.dataframe(samples.head())

-             st.markdown("## Imputation")
+                 if samples.shape[1] == ft.shape[1]:
+                     st.warning("You selected everything as sample type. Blank removal not possible.")
+                 else:
+                     v_space(1)
+                     # Ask if blank removal should be done
+                     st.markdown("#### Blanks")
+                     st.markdown(
+                         "Select blanks (excluding samples and pools) based on the following table."
+                     )
+                     non_samples_md = md.loc[
+                         [index for index in md.index if index not in samples.columns]
+                     ]
+                     df = inside_levels(non_samples_md)
+                     mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
+                     df = df[~mask]
+                     st.dataframe(df)
+                     c1, c2 = st.columns(2)

-             c1, c2 = st.columns(2)
-             c2.metric(
-                 f"total missing values",
-                 str((ft == 0).to_numpy().mean() * 100)[:4] + " %",
-             )
-             imputation = c1.checkbox("Impute missing values?", False, help=f"These values will be filled with random number between 1 and {cutoff_LOD} (Limit of Detection) during imputation.")
-             if imputation:
-                 if cutoff_LOD > 1:
+                     blank_column = c1.selectbox(
+                         "attribute for blank selection", non_samples_md.columns
+                     )
+                     blank_options = list(set(non_samples_md[blank_column].dropna()))
+                     blank_rows = c2.multiselect("blank selection", blank_options, blank_options[0])
+                     blanks = ft[non_samples_md[non_samples_md[blank_column].isin(blank_rows)].index]
+                     with st.expander(f"Selected blanks preview (n={blanks.shape[1]})"):
+                         st.dataframe(blanks.head())
+
+                     # define a cutoff value for blank removal (ratio blank/avg(samples))
                    c1, c2 = st.columns(2)
-                     ft = impute_missing_values(ft, cutoff_LOD)
-                     with st.expander(f"Imputed data {ft.shape}"):
-                         show_table(ft, "imputed")
-                 else:
-                     st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
-
-             st.markdown("## Normalization")
-             normalization_method = st.selectbox("data normalization method", ["Center-Scaling",
-                 # "Probabilistic Quotient Normalization (PQN)",
-                 "Total Ion Current (TIC) or sample-centric normalization",
-                 "None"])
-             v_space(2)
-             _, c1, _ = st.columns(3)
-             if c1.button("**Submit Data for Statistics!**", type="primary"):
-                 st.session_state["md"], st.session_state["data"] = normalization(
-                     ft, md, normalization_method
-                 )
-                 st.session_state["data_preparation_done"] = True
-                 st.experimental_rerun()
-             v_space(2)
+                     cutoff = c1.number_input(
+                         "cutoff threshold for blank removal",
+                         0.1,
+                         1.0,
+                         0.3,
+                         0.05,
+                         help="""The recommended cutoff range is between 0.1 and 0.3.
+
+                         Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
+                         """,
+                     )
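+                     # drop features whose (blank mean)/(sample mean) ratio exceeds the cutoff as background noise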
+                     (
+                         ft,
+                         n_background_features,
+                         n_real_features,
+                     ) = remove_blank_features(blanks, samples, cutoff)
+                     c2.metric("background or noise features", n_background_features)
+                     with st.expander(f"Feature table after removing blanks - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
+                         show_table(ft, "blank-features-removed")
+
+         if not ft.empty:
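+             # cutoff_LOD is the rounded lowest intensity value; imputation fills zeros with random values between 1 and this limit of detection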
+             cutoff_LOD = get_cutoff_LOD(ft)
+
+             with tabs[1]:

-             tab1, tab2 = st.tabs(
-                 ["📊 Feature intensity frequency", "📊 Missing values per feature"]
+                 c1, c2 = st.columns(2)
+                 c2.metric(
+                     f"total missing values",
+                     str((ft == 0).to_numpy().mean() * 100)[:4] + " %",
+                 )
+                 imputation = c1.checkbox("Impute missing values?", False, help=f"These values will be filled with random number between 1 and {cutoff_LOD} (Limit of Detection) during imputation.")
+                 if imputation:
+                     if cutoff_LOD > 1:
+                         c1, c2 = st.columns(2)
+                         ft = impute_missing_values(ft, cutoff_LOD)
+                         with st.expander(f"Imputed data - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
+                             show_table(ft.head(), "imputed")
+                     else:
+                         st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
+
+             with tabs[2]:
+                 normalization_method = st.radio("data normalization method", ["None",
+                     "Center-Scaling",
+                     # "Probabilistic Quotient Normalization (PQN)",
+                     "Total Ion Current (TIC) or sample-centric normalization"])
+             with tabs[3]:
+                 tab1, tab2 = st.tabs(
+                     ["📊 Feature intensity frequency", "📊 Missing values per feature"]
+                 )
+                 with tab1:
+                     fig = get_feature_frequency_fig(ft)
+                     show_fig(fig, "feature-intensity-frequency")
+                 with tab2:
+                     fig = get_missing_values_per_feature_fig(ft, cutoff_LOD)
+                     show_fig(fig, "missing-values")
+
+
+         else:
+             st.error("No features left after blank removal!")
+
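+         # submitting normalizes the prepared tables and stores them in session state for the statistics pages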
+         _, c1, _ = st.columns(3)
+         if c1.button("**Submit Data for Statistics!**", type="primary"):
+             st.session_state["md"], st.session_state["data"] = normalization(
+                 ft, md, normalization_method
            )
-             with tab1:
-                 fig = get_feature_frequency_fig(ft)
-                 show_fig(fig, "feature-intensity-frequency")
-             with tab2:
-                 fig = get_missing_values_per_feature_fig(ft, cutoff_LOD)
-                 show_fig(fig, "missing-values")
-
-         else:
-             st.error("No features left after blank removal!")
+             st.session_state["data_preparation_done"] = True
+             st.rerun()