
Commit 421d456

Merge pull request #16 from axelwalter/annotations

data preparation page update and gnps annotations

2 parents: 9d3dcda + c1c441e

6 files changed: +131 −133 lines

pages/1_📁_Data_Preparation.py

+114 −117
@@ -13,15 +13,15 @@
     if st.button("Re-do the data preparation step now."):
         reset_dataframes()
         st.session_state["data_preparation_done"] = False
-        st.experimental_rerun()
+        st.rerun()
     show_table(pd.concat([st.session_state.md, st.session_state.data], axis=1), title="FeatureMatrix-scaled-centered")
 else:
     st.info(
         """💡 Once you are happy with the results, don't forget to click the **Submit Data for Statistics!** button."""
     )
     ft, md = pd.DataFrame(), pd.DataFrame()

-    file_origin = st.selectbox("File upload", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
+    file_origin = st.radio("File origin", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
     # b661d12ba88745639664988329c1363e
     if file_origin == "Small example dataset for testing":
         ft, md = load_example()
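Note: `st.experimental_rerun()` is deprecated in favor of the stable `st.rerun()`, and newer Streamlit releases drop the experimental alias entirely; this commit applies the rename across all pages. A version-tolerant sketch, in case the app ever needs to run on older Streamlit as well (the `rerun` helper name here is hypothetical, not part of this commit):

```python
import streamlit as st

def rerun():
    # Hypothetical compatibility shim: prefer the stable API,
    # fall back to the deprecated alias on old Streamlit versions.
    if hasattr(st, "rerun"):
        st.rerun()
    else:
        st.experimental_rerun()
```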
@@ -36,10 +36,9 @@
         task_id_default = ""
         disabled = False
     task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
-    c1, c2 = st.columns(2)
-    merge_annotations = c1.checkbox("Annotate metabolites", True, help="Merge annotations from GNPS FBMN and analog search if available.")
+    _, c2, _ = st.columns(3)
     if c2.button("Load filed from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
-        st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, merge_annotations)
+        st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id)

     if "ft_gnps" in st.session_state:
         if not st.session_state["ft_gnps"].empty:
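With the annotation checkbox gone, `_, c2, _ = st.columns(3)` centers the remaining button by giving it only the middle of three equal-width columns. A minimal illustration of the layout trick:

```python
import streamlit as st

# Only the middle of three equal-width columns receives the widget,
# which renders it horizontally centered on the page.
_, center, _ = st.columns(3)
center.button("Load files from GNPS", type="primary", use_container_width=True)
```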
@@ -70,13 +69,11 @@
         if not st.session_state["md_uploaded"].empty:
             md = st.session_state["md_uploaded"]

-    v_space(2)
     if not ft.empty or not md.empty:
-        t1, t2 = st.tabs(["Quantification Table", "Meta Data"])
+        t1, t2 = st.tabs(["**Quantification Table**", "**Meta Data**"])
         t1.dataframe(ft)
         t2.dataframe(md)

-
     if not ft.index.is_unique:
         st.error("Please upload a feature matrix with unique metabolite names.")

@@ -102,125 +99,125 @@
     # # check if ft column names and md row names are the same
     md, ft = check_columns(md, ft)

-    st.markdown("## Blank removal")
-
-    blank_removal = st.checkbox("Remove blank features?", False)
-    if blank_removal:
-        # Select true sample files (excluding blank and pools)
-        st.markdown("#### Samples")
-        st.markdown(
-            "Select samples (excluding blank and pools) based on the following table."
-        )
-        df = inside_levels(md)
-        mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
-        df = df[~mask]
-        st.dataframe(df)
-        c1, c2 = st.columns(2)
-        sample_column = c1.selectbox(
-            "attribute for sample selection",
-            md.columns,
-        )
-        sample_options = list(set(md[sample_column].dropna()))
-        sample_rows = c2.multiselect("sample selection", sample_options, sample_options[0])
-        samples = ft[md[md[sample_column].isin(sample_rows)].index]
-        samples_md = md.loc[samples.columns]
-
-        with st.expander(f"Selected samples {samples.shape}"):
-            st.dataframe(samples)
-
-        if samples.shape[1] == ft.shape[1]:
-            st.warning("You selected everything as sample type. Blank removal not possible.")
-        else:
-            v_space(1)
-            # Ask if blank removal should be done
-            st.markdown("#### Blanks")
+    tabs = st.tabs(["**Blank Removal**", "**Imputation**", "**Normalization**", "📊 **Summary**"])
+    with tabs[0]:
+        blank_removal = st.checkbox("Remove blank features?", False)
+        if blank_removal:
+            # Select true sample files (excluding blank and pools)
+            st.markdown("#### Samples")
             st.markdown(
-                "Select blanks (excluding samples and pools) based on the following table."
+                "Select samples (excluding blank and pools) based on the following table."
             )
-            non_samples_md = md.loc[
-                [index for index in md.index if index not in samples.columns]
-            ]
-            df = inside_levels(non_samples_md)
+            df = inside_levels(md)
             mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
             df = df[~mask]
             st.dataframe(df)
             c1, c2 = st.columns(2)
-
-            blank_column = c1.selectbox(
-                "attribute for blank selection", non_samples_md.columns
+            sample_column = c1.selectbox(
+                "attribute for sample selection",
+                md.columns,
             )
-            blank_options = list(set(non_samples_md[blank_column].dropna()))
-            blank_rows = c2.multiselect("blank selection", blank_options, blank_options[0])
-            blanks = ft[non_samples_md[non_samples_md[blank_column].isin(blank_rows)].index]
-            with st.expander(f"Selected blanks {blanks.shape}"):
-                st.dataframe(blanks)
+            sample_options = list(set(md[sample_column].dropna()))
+            sample_rows = c2.multiselect("sample selection", sample_options, sample_options[0])
+            samples = ft[md[md[sample_column].isin(sample_rows)].index]
+            samples_md = md.loc[samples.columns]

-            # define a cutoff value for blank removal (ratio blank/avg(samples))
-            c1, c2 = st.columns(2)
-            cutoff = c1.number_input(
-                "cutoff threshold for blank removal",
-                0.1,
-                1.0,
-                0.3,
-                0.05,
-                help="""The recommended cutoff range is between 0.1 and 0.3.
-
-Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
-""",
-            )
-            (
-                ft,
-                n_background_features,
-                n_real_features,
-            ) = remove_blank_features(blanks, samples, cutoff)
-            c2.metric("background or noise features", n_background_features)
-            with st.expander(f"Feature table after removing blanks {ft.shape}"):
-                show_table(ft, "blank-features-removed")
-
-    if not ft.empty:
-        cutoff_LOD = get_cutoff_LOD(ft)
+            with st.expander(f"Selected samples preview (n={samples.shape[1]})"):
+                st.dataframe(samples.head())

-        st.markdown("## Imputation")
+            if samples.shape[1] == ft.shape[1]:
+                st.warning("You selected everything as sample type. Blank removal not possible.")
+            else:
+                v_space(1)
+                # Ask if blank removal should be done
+                st.markdown("#### Blanks")
+                st.markdown(
+                    "Select blanks (excluding samples and pools) based on the following table."
+                )
+                non_samples_md = md.loc[
+                    [index for index in md.index if index not in samples.columns]
+                ]
+                df = inside_levels(non_samples_md)
+                mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
+                df = df[~mask]
+                st.dataframe(df)
+                c1, c2 = st.columns(2)

-        c1, c2 = st.columns(2)
-        c2.metric(
-            f"total missing values",
-            str((ft == 0).to_numpy().mean() * 100)[:4] + " %",
-        )
-        imputation = c1.checkbox("Impute missing values?", False, help=f"These values will be filled with random number between 1 and {cutoff_LOD} (Limit of Detection) during imputation.")
-        if imputation:
-            if cutoff_LOD > 1:
+                blank_column = c1.selectbox(
+                    "attribute for blank selection", non_samples_md.columns
+                )
+                blank_options = list(set(non_samples_md[blank_column].dropna()))
+                blank_rows = c2.multiselect("blank selection", blank_options, blank_options[0])
+                blanks = ft[non_samples_md[non_samples_md[blank_column].isin(blank_rows)].index]
+                with st.expander(f"Selected blanks preview (n={blanks.shape[1]})"):
+                    st.dataframe(blanks.head())
+
+                # define a cutoff value for blank removal (ratio blank/avg(samples))
                 c1, c2 = st.columns(2)
-                ft = impute_missing_values(ft, cutoff_LOD)
-                with st.expander(f"Imputed data {ft.shape}"):
-                    show_table(ft, "imputed")
-            else:
-                st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
-
-        st.markdown("## Normalization")
-        normalization_method = st.selectbox("data normalization method", ["Center-Scaling",
-                                                                          # "Probabilistic Quotient Normalization (PQN)",
-                                                                          "Total Ion Current (TIC) or sample-centric normalization",
-                                                                          "None"])
-        v_space(2)
-        _, c1, _ = st.columns(3)
-        if c1.button("**Submit Data for Statistics!**", type="primary"):
-            st.session_state["md"], st.session_state["data"] = normalization(
-                ft, md, normalization_method
-            )
-            st.session_state["data_preparation_done"] = True
-            st.experimental_rerun()
-        v_space(2)
+                cutoff = c1.number_input(
+                    "cutoff threshold for blank removal",
+                    0.1,
+                    1.0,
+                    0.3,
+                    0.05,
+                    help="""The recommended cutoff range is between 0.1 and 0.3.
+
+Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
+""",
+                )
+                (
+                    ft,
+                    n_background_features,
+                    n_real_features,
+                ) = remove_blank_features(blanks, samples, cutoff)
+                c2.metric("background or noise features", n_background_features)
+                with st.expander(f"Feature table after removing blanks - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
+                    show_table(ft, "blank-features-removed")
+
+    if not ft.empty:
+        cutoff_LOD = get_cutoff_LOD(ft)
+
+        with tabs[1]:

-        tab1, tab2 = st.tabs(
-            ["📊 Feature intensity frequency", "📊 Missing values per feature"]
+            c1, c2 = st.columns(2)
+            c2.metric(
+                f"total missing values",
+                str((ft == 0).to_numpy().mean() * 100)[:4] + " %",
+            )
+            imputation = c1.checkbox("Impute missing values?", False, help=f"These values will be filled with random number between 1 and {cutoff_LOD} (Limit of Detection) during imputation.")
+            if imputation:
+                if cutoff_LOD > 1:
+                    c1, c2 = st.columns(2)
+                    ft = impute_missing_values(ft, cutoff_LOD)
+                    with st.expander(f"Imputed data - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
+                        show_table(ft.head(), "imputed")
+                else:
+                    st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
+
+        with tabs[2]:
+            normalization_method = st.radio("data normalization method", ["None",
+                                                                          "Center-Scaling",
+                                                                          # "Probabilistic Quotient Normalization (PQN)",
+                                                                          "Total Ion Current (TIC) or sample-centric normalization"])
+        with tabs[3]:
+            tab1, tab2 = st.tabs(
+                ["📊 Feature intensity frequency", "📊 Missing values per feature"]
+            )
+            with tab1:
+                fig = get_feature_frequency_fig(ft)
+                show_fig(fig, "feature-intensity-frequency")
+            with tab2:
+                fig = get_missing_values_per_feature_fig(ft, cutoff_LOD)
+                show_fig(fig, "missing-values")
+
+
+    else:
+        st.error("No features left after blank removal!")
+
+    _, c1, _ = st.columns(3)
+    if c1.button("**Submit Data for Statistics!**", type="primary"):
+        st.session_state["md"], st.session_state["data"] = normalization(
+            ft, md, normalization_method
         )
-        with tab1:
-            fig = get_feature_frequency_fig(ft)
-            show_fig(fig, "feature-intensity-frequency")
-        with tab2:
-            fig = get_missing_values_per_feature_fig(ft, cutoff_LOD)
-            show_fig(fig, "missing-values")
-
-    else:
-        st.error("No features left after blank removal!")
+        st.session_state["data_preparation_done"] = True
+        st.rerun()
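For context: the cutoff widget above feeds `remove_blank_features`, whose implementation is not part of this diff. A minimal sketch of the logic it plausibly applies, assuming features are rows and samples are columns as in the page above:

```python
import pandas as pd

def remove_blank_features(blanks: pd.DataFrame, samples: pd.DataFrame, cutoff: float):
    # Sketch only: flag features whose (blank mean)/(sample mean) ratio
    # exceeds the cutoff as background/noise, keep the rest.
    ratio = blanks.mean(axis=1) / samples.mean(axis=1)
    is_background = ratio > cutoff
    n_background_features = int(is_background.sum())
    n_real_features = int((~is_background).sum())
    return samples[~is_background], n_background_features, n_real_features
```

Likewise, `impute_missing_values` is only called here; per the checkbox help text it fills zero intensities with random values between 1 and the limit of detection. A sketch under that assumption (note the page only calls it when `cutoff_LOD > 1`):

```python
import numpy as np
import pandas as pd

def impute_missing_values(ft: pd.DataFrame, cutoff_LOD: int) -> pd.DataFrame:
    # Sketch only: replace zeros (missing values) with random integers
    # drawn from [1, cutoff_LOD); requires cutoff_LOD > 1.
    rng = np.random.default_rng()
    fill = pd.DataFrame(rng.integers(1, cutoff_LOD, size=ft.shape),
                        index=ft.index, columns=ft.columns)
    return ft.mask(ft == 0, fill)
```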

pages/7_One-way_ANOVA_&_Tukey's.py

+2 −2
@@ -29,7 +29,7 @@
         st.session_state.anova_attribute,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 if not st.session_state.df_anova.empty:
     attribute_options = list(
@@ -58,7 +58,7 @@
         st.session_state.tukey_elements,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 tab_options = [
     "📈 ANOVA: plot",

pages/8_Kruskal-Wallis_&_Dunn's.py

+2 −2
@@ -28,7 +28,7 @@
         st.session_state.data, st.session_state.kruskal_attribute,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 if not st.session_state.df_kruskal.empty:
     if any(st.session_state.df_kruskal["significant"]):
@@ -58,7 +58,7 @@
             st.session_state.dunn_elements,
             corrections_map[st.session_state.p_value_correction]
         )
-        st.experimental_rerun()
+        st.rerun()
     else:
         st.warning("No significant metabolites found in Kruskal Wallis test after p-value correction.")

pages/9_Student's_t-test.py

+1 −1
@@ -53,7 +53,7 @@
         st.session_state.ttest_correction,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 if not st.session_state.df_ttest.empty:
     tabs = st.tabs(

src/common.py

+2 −1
@@ -14,7 +14,8 @@
     "df_important_features",
     "df_oob",
     "ft_gnps",
-    "md_gnps")
+    "md_gnps",
+    "df_gnps_annotations")

 corrections_map = {"Bonferroni": "bonf",
                    "Sidak": "sidak",

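`df_gnps_annotations` extends the tuple of dataframe names the app tracks in session state. The surrounding code is not shown in this hunk, but such a tuple is typically iterated to initialize and reset empty DataFrames in `st.session_state`; a sketch under that assumption, with the tuple truncated and the helper matching the `reset_dataframes()` call seen in the Data Preparation page above:

```python
import pandas as pd
import streamlit as st

# Truncated for the sketch; the real tuple holds all tracked dataframe names.
dataframe_names = ("ft_gnps", "md_gnps", "df_gnps_annotations")

def reset_dataframes():
    # Plausible reset helper: re-create each tracked dataframe as empty.
    for name in dataframe_names:
        st.session_state[name] = pd.DataFrame()
```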
src/fileselection.py

+10 −10
@@ -54,24 +54,24 @@ def load_example():
     return ft, md

 @st.cache_data
-def load_from_gnps(task_id, merge_annotations):
+def load_from_gnps(task_id):
     try: # GNPS2 will run here
         ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
         md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
-        if merge_annotations:
-            an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+        an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
     except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
         ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
         md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
         ft = pd.read_csv(ft_url)
         md = pd.read_csv(md_url, sep = "\t", index_col="filename")
-        if merge_annotations:
-            an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
-            an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
-    if merge_annotations:
-        ft.index = pd.Index(ft.apply(lambda x: f'{an.loc[x["row ID"], "Compound_Name"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}' if x["row ID"] in an.index else f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
-    else:
-        ft.index = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+        an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
+        an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+
+    index_with_annotations = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+    ft.index = index_with_annotations
+    st.session_state["df_gnps_annotations"].index = index_with_annotations
+    st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
+    st.session_state["df_gnps_annotations"].dropna(inplace=True)
     return ft, md

 def load_ft(ft_file):
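The rewritten tail of `load_from_gnps` decouples annotations from the feature index: every feature now gets a uniform `{row ID}_{m/z}_{RT}` index regardless of whether it was annotated, and GNPS library hits move into the separate `st.session_state["df_gnps_annotations"]` table. The same indexing and lookup pattern, demonstrated self-contained on toy data:

```python
import pandas as pd

# Toy quantification table (ft) and library annotations (an), mirroring
# the column names used in load_from_gnps.
ft = pd.DataFrame({
    "row ID": [1, 2],
    "row m/z": [181.07123, 212.11845],
    "row retention time": [1.2345, 3.4567],
})
an = pd.DataFrame({"Compound_Name": ["Caffeine"]},
                  index=pd.Index([1], name="#Scan#"))

# Uniform feature index: {row ID}_{m/z rounded to 4}_{RT rounded to 2}.
index = pd.Index(ft.apply(
    lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}',
    axis=1))
ft.index = index

# Annotations live in their own table on the same index; features
# without a library hit are dropped from it.
annotations = pd.DataFrame(index=index)
annotations["GNPS annotation"] = ft["row ID"].apply(
    lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
annotations.dropna(inplace=True)
print(annotations)  # only feature 1 ("Caffeine") remains
```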
