
Commit 421d456

Merge pull request #16 from axelwalter/annotations

data preparation page update and gnps annotations

2 parents: 9d3dcda + c1c441e

6 files changed: +131 −133 lines

pages/1_📁_Data_Preparation.py

+114 −117
@@ -13,15 +13,15 @@
     if st.button("Re-do the data preparation step now."):
         reset_dataframes()
         st.session_state["data_preparation_done"] = False
-        st.experimental_rerun()
+        st.rerun()
     show_table(pd.concat([st.session_state.md, st.session_state.data], axis=1), title="FeatureMatrix-scaled-centered")
 else:
     st.info(
         """💡 Once you are happy with the results, don't forget to click the **Submit Data for Statistics!** button."""
     )
     ft, md = pd.DataFrame(), pd.DataFrame()

-    file_origin = st.selectbox("File upload", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
+    file_origin = st.radio("File origin", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
     # b661d12ba88745639664988329c1363e
     if file_origin == "Small example dataset for testing":
         ft, md = load_example()
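Note: `st.experimental_rerun()` is deprecated in favor of the stable `st.rerun()`, and newer Streamlit releases drop the experimental alias entirely; this commit applies the rename across all pages. A version-tolerant sketch, in case the app ever needs to run on older Streamlit as well (the `rerun` helper name here is hypothetical, not part of this commit):

```python
import streamlit as st

def rerun():
    # Hypothetical compatibility shim: prefer the stable API,
    # fall back to the deprecated alias on old Streamlit versions.
    if hasattr(st, "rerun"):
        st.rerun()
    else:
        st.experimental_rerun()
```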
@@ -36,10 +36,9 @@
         task_id_default = ""
         disabled = False
     task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
-    c1, c2 = st.columns(2)
-    merge_annotations = c1.checkbox("Annotate metabolites", True, help="Merge annotations from GNPS FBMN and analog search if available.")
+    _, c2, _ = st.columns(3)
     if c2.button("Load filed from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
-        st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, merge_annotations)
+        st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id)

     if "ft_gnps" in st.session_state:
         if not st.session_state["ft_gnps"].empty:
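With the annotation checkbox gone, `_, c2, _ = st.columns(3)` centers the remaining button by giving it only the middle of three equal-width columns. A minimal illustration of the layout trick:

```python
import streamlit as st

# Only the middle of three equal-width columns receives the widget,
# which renders it horizontally centered on the page.
_, center, _ = st.columns(3)
center.button("Load files from GNPS", type="primary", use_container_width=True)
```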
@@ -70,13 +69,11 @@
         if not st.session_state["md_uploaded"].empty:
             md = st.session_state["md_uploaded"]

-    v_space(2)
     if not ft.empty or not md.empty:
-        t1, t2 = st.tabs(["Quantification Table", "Meta Data"])
+        t1, t2 = st.tabs(["**Quantification Table**", "**Meta Data**"])
         t1.dataframe(ft)
         t2.dataframe(md)

-
     if not ft.index.is_unique:
         st.error("Please upload a feature matrix with unique metabolite names.")

@@ -102,125 +99,125 @@
     # # check if ft column names and md row names are the same
     md, ft = check_columns(md, ft)

-    st.markdown("## Blank removal")
-
-    blank_removal = st.checkbox("Remove blank features?", False)
-    if blank_removal:
-        # Select true sample files (excluding blank and pools)
-        st.markdown("#### Samples")
-        st.markdown(
-            "Select samples (excluding blank and pools) based on the following table."
-        )
-        df = inside_levels(md)
-        mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
-        df = df[~mask]
-        st.dataframe(df)
-        c1, c2 = st.columns(2)
-        sample_column = c1.selectbox(
-            "attribute for sample selection",
-            md.columns,
-        )
-        sample_options = list(set(md[sample_column].dropna()))
-        sample_rows = c2.multiselect("sample selection", sample_options, sample_options[0])
-        samples = ft[md[md[sample_column].isin(sample_rows)].index]
-        samples_md = md.loc[samples.columns]
-
-        with st.expander(f"Selected samples {samples.shape}"):
-            st.dataframe(samples)
-
-        if samples.shape[1] == ft.shape[1]:
-            st.warning("You selected everything as sample type. Blank removal not possible.")
-        else:
-            v_space(1)
-            # Ask if blank removal should be done
-            st.markdown("#### Blanks")
+    tabs = st.tabs(["**Blank Removal**", "**Imputation**", "**Normalization**", "📊 **Summary**"])
+    with tabs[0]:
+        blank_removal = st.checkbox("Remove blank features?", False)
+        if blank_removal:
+            # Select true sample files (excluding blank and pools)
+            st.markdown("#### Samples")
             st.markdown(
-                "Select blanks (excluding samples and pools) based on the following table."
+                "Select samples (excluding blank and pools) based on the following table."
             )
-            non_samples_md = md.loc[
-                [index for index in md.index if index not in samples.columns]
-            ]
-            df = inside_levels(non_samples_md)
+            df = inside_levels(md)
             mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
             df = df[~mask]
             st.dataframe(df)
             c1, c2 = st.columns(2)
-
-            blank_column = c1.selectbox(
-                "attribute for blank selection", non_samples_md.columns
+            sample_column = c1.selectbox(
+                "attribute for sample selection",
+                md.columns,
             )
-            blank_options = list(set(non_samples_md[blank_column].dropna()))
-            blank_rows = c2.multiselect("blank selection", blank_options, blank_options[0])
-            blanks = ft[non_samples_md[non_samples_md[blank_column].isin(blank_rows)].index]
-            with st.expander(f"Selected blanks {blanks.shape}"):
-                st.dataframe(blanks)
+            sample_options = list(set(md[sample_column].dropna()))
+            sample_rows = c2.multiselect("sample selection", sample_options, sample_options[0])
+            samples = ft[md[md[sample_column].isin(sample_rows)].index]
+            samples_md = md.loc[samples.columns]

-            # define a cutoff value for blank removal (ratio blank/avg(samples))
-            c1, c2 = st.columns(2)
-            cutoff = c1.number_input(
-                "cutoff threshold for blank removal",
-                0.1,
-                1.0,
-                0.3,
-                0.05,
-                help="""The recommended cutoff range is between 0.1 and 0.3.
-
-Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
-""",
-            )
-            (
-                ft,
-                n_background_features,
-                n_real_features,
-            ) = remove_blank_features(blanks, samples, cutoff)
-            c2.metric("background or noise features", n_background_features)
-            with st.expander(f"Feature table after removing blanks {ft.shape}"):
-                show_table(ft, "blank-features-removed")
-
-    if not ft.empty:
-        cutoff_LOD = get_cutoff_LOD(ft)
+            with st.expander(f"Selected samples preview (n={samples.shape[1]})"):
+                st.dataframe(samples.head())

-        st.markdown("## Imputation")
+            if samples.shape[1] == ft.shape[1]:
+                st.warning("You selected everything as sample type. Blank removal not possible.")
+            else:
+                v_space(1)
+                # Ask if blank removal should be done
+                st.markdown("#### Blanks")
+                st.markdown(
+                    "Select blanks (excluding samples and pools) based on the following table."
+                )
+                non_samples_md = md.loc[
+                    [index for index in md.index if index not in samples.columns]
+                ]
+                df = inside_levels(non_samples_md)
+                mask = df.apply(lambda row: len(row['LEVELS']) == 0, axis=1)
+                df = df[~mask]
+                st.dataframe(df)
+                c1, c2 = st.columns(2)

-        c1, c2 = st.columns(2)
-        c2.metric(
-            f"total missing values",
-            str((ft == 0).to_numpy().mean() * 100)[:4] + " %",
-        )
-        imputation = c1.checkbox("Impute missing values?", False, help=f"These values will be filled with random number between 1 and {cutoff_LOD} (Limit of Detection) during imputation.")
-        if imputation:
-            if cutoff_LOD > 1:
+                blank_column = c1.selectbox(
+                    "attribute for blank selection", non_samples_md.columns
+                )
+                blank_options = list(set(non_samples_md[blank_column].dropna()))
+                blank_rows = c2.multiselect("blank selection", blank_options, blank_options[0])
+                blanks = ft[non_samples_md[non_samples_md[blank_column].isin(blank_rows)].index]
+                with st.expander(f"Selected blanks preview (n={blanks.shape[1]})"):
+                    st.dataframe(blanks.head())
+
+                # define a cutoff value for blank removal (ratio blank/avg(samples))
                 c1, c2 = st.columns(2)
-                ft = impute_missing_values(ft, cutoff_LOD)
-                with st.expander(f"Imputed data {ft.shape}"):
-                    show_table(ft, "imputed")
-            else:
-                st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
-
-        st.markdown("## Normalization")
-        normalization_method = st.selectbox("data normalization method", ["Center-Scaling",
-                                                                          # "Probabilistic Quotient Normalization (PQN)",
-                                                                          "Total Ion Current (TIC) or sample-centric normalization",
-                                                                          "None"])
-        v_space(2)
-        _, c1, _ = st.columns(3)
-        if c1.button("**Submit Data for Statistics!**", type="primary"):
-            st.session_state["md"], st.session_state["data"] = normalization(
-                ft, md, normalization_method
-            )
-            st.session_state["data_preparation_done"] = True
-            st.experimental_rerun()
-        v_space(2)
+                cutoff = c1.number_input(
+                    "cutoff threshold for blank removal",
+                    0.1,
+                    1.0,
+                    0.3,
+                    0.05,
+                    help="""The recommended cutoff range is between 0.1 and 0.3.
+
+Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
+""",
+                )
+                (
+                    ft,
+                    n_background_features,
+                    n_real_features,
+                ) = remove_blank_features(blanks, samples, cutoff)
+                c2.metric("background or noise features", n_background_features)
+                with st.expander(f"Feature table after removing blanks - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
+                    show_table(ft, "blank-features-removed")
+
+    if not ft.empty:
+        cutoff_LOD = get_cutoff_LOD(ft)
+
+        with tabs[1]:

-        tab1, tab2 = st.tabs(
-            ["📊 Feature intensity frequency", "📊 Missing values per feature"]
+            c1, c2 = st.columns(2)
+            c2.metric(
+                f"total missing values",
+                str((ft == 0).to_numpy().mean() * 100)[:4] + " %",
+            )
+            imputation = c1.checkbox("Impute missing values?", False, help=f"These values will be filled with random number between 1 and {cutoff_LOD} (Limit of Detection) during imputation.")
+            if imputation:
+                if cutoff_LOD > 1:
+                    c1, c2 = st.columns(2)
+                    ft = impute_missing_values(ft, cutoff_LOD)
+                    with st.expander(f"Imputed data - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
+                        show_table(ft.head(), "imputed")
+                else:
+                    st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
+
+        with tabs[2]:
+            normalization_method = st.radio("data normalization method", ["None",
+                                                                          "Center-Scaling",
+                                                                          # "Probabilistic Quotient Normalization (PQN)",
+                                                                          "Total Ion Current (TIC) or sample-centric normalization"])
+        with tabs[3]:
+            tab1, tab2 = st.tabs(
+                ["📊 Feature intensity frequency", "📊 Missing values per feature"]
+            )
+            with tab1:
+                fig = get_feature_frequency_fig(ft)
+                show_fig(fig, "feature-intensity-frequency")
+            with tab2:
+                fig = get_missing_values_per_feature_fig(ft, cutoff_LOD)
+                show_fig(fig, "missing-values")
+
+
+    else:
+        st.error("No features left after blank removal!")
+
+    _, c1, _ = st.columns(3)
+    if c1.button("**Submit Data for Statistics!**", type="primary"):
+        st.session_state["md"], st.session_state["data"] = normalization(
+            ft, md, normalization_method
         )
-        with tab1:
-            fig = get_feature_frequency_fig(ft)
-            show_fig(fig, "feature-intensity-frequency")
-        with tab2:
-            fig = get_missing_values_per_feature_fig(ft, cutoff_LOD)
-            show_fig(fig, "missing-values")
-
-    else:
-        st.error("No features left after blank removal!")
+        st.session_state["data_preparation_done"] = True
+        st.rerun()
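For context: the cutoff widget above feeds `remove_blank_features`, whose implementation is not part of this diff. A minimal sketch of the logic it plausibly applies, assuming features are rows and samples are columns as in the page above:

```python
import pandas as pd

def remove_blank_features(blanks: pd.DataFrame, samples: pd.DataFrame, cutoff: float):
    # Sketch only: flag features whose (blank mean)/(sample mean) ratio
    # exceeds the cutoff as background/noise, keep the rest.
    ratio = blanks.mean(axis=1) / samples.mean(axis=1)
    is_background = ratio > cutoff
    n_background_features = int(is_background.sum())
    n_real_features = int((~is_background).sum())
    return samples[~is_background], n_background_features, n_real_features
```

Likewise, `impute_missing_values` is only called here; per the checkbox help text it fills zero intensities with random values between 1 and the limit of detection. A sketch under that assumption (note the page only calls it when `cutoff_LOD > 1`):

```python
import numpy as np
import pandas as pd

def impute_missing_values(ft: pd.DataFrame, cutoff_LOD: int) -> pd.DataFrame:
    # Sketch only: replace zeros (missing values) with random integers
    # drawn from [1, cutoff_LOD); requires cutoff_LOD > 1.
    rng = np.random.default_rng()
    fill = pd.DataFrame(rng.integers(1, cutoff_LOD, size=ft.shape),
                        index=ft.index, columns=ft.columns)
    return ft.mask(ft == 0, fill)
```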

pages/7_One-way_ANOVA_&_Tukey's.py

+2 −2
@@ -29,7 +29,7 @@
         st.session_state.anova_attribute,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 if not st.session_state.df_anova.empty:
     attribute_options = list(
@@ -58,7 +58,7 @@
         st.session_state.tukey_elements,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 tab_options = [
     "📈 ANOVA: plot",

pages/8_Kruskal-Wallis_&_Dunn's.py

+2 −2
@@ -28,7 +28,7 @@
         st.session_state.data, st.session_state.kruskal_attribute,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 if not st.session_state.df_kruskal.empty:
     if any(st.session_state.df_kruskal["significant"]):
@@ -58,7 +58,7 @@
             st.session_state.dunn_elements,
             corrections_map[st.session_state.p_value_correction]
         )
-        st.experimental_rerun()
+        st.rerun()
     else:
         st.warning("No significant metabolites found in Kruskal Wallis test after p-value correction.")

pages/9_Student's_t-test.py

+1 −1
@@ -53,7 +53,7 @@
         st.session_state.ttest_correction,
         corrections_map[st.session_state.p_value_correction]
     )
-    st.experimental_rerun()
+    st.rerun()

 if not st.session_state.df_ttest.empty:
     tabs = st.tabs(

src/common.py

+2 −1
@@ -14,7 +14,8 @@
     "df_important_features",
     "df_oob",
     "ft_gnps",
-    "md_gnps")
+    "md_gnps",
+    "df_gnps_annotations")

 corrections_map = {"Bonferroni": "bonf",
                    "Sidak": "sidak",

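`df_gnps_annotations` extends the tuple of dataframe names the app tracks in session state. The surrounding code is not shown in this hunk, but such a tuple is typically iterated to initialize and reset empty DataFrames in `st.session_state`; a sketch under that assumption, with the tuple truncated and the helper matching the `reset_dataframes()` call seen in the Data Preparation page above:

```python
import pandas as pd
import streamlit as st

# Truncated for the sketch; the real tuple holds all tracked dataframe names.
dataframe_names = ("ft_gnps", "md_gnps", "df_gnps_annotations")

def reset_dataframes():
    # Plausible reset helper: re-create each tracked dataframe as empty.
    for name in dataframe_names:
        st.session_state[name] = pd.DataFrame()
```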
src/fileselection.py

+10 −10
@@ -54,24 +54,24 @@ def load_example():
     return ft, md

 @st.cache_data
-def load_from_gnps(task_id, merge_annotations):
+def load_from_gnps(task_id):
     try: # GNPS2 will run here
         ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
         md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
-        if merge_annotations:
-            an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+        an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
     except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
         ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
         md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
         ft = pd.read_csv(ft_url)
         md = pd.read_csv(md_url, sep = "\t", index_col="filename")
-        if merge_annotations:
-            an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
-            an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
-    if merge_annotations:
-        ft.index = pd.Index(ft.apply(lambda x: f'{an.loc[x["row ID"], "Compound_Name"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}' if x["row ID"] in an.index else f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
-    else:
-        ft.index = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+        an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
+        an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+
+    index_with_annotations = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+    ft.index = index_with_annotations
+    st.session_state["df_gnps_annotations"].index = index_with_annotations
+    st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
+    st.session_state["df_gnps_annotations"].dropna(inplace=True)
     return ft, md

 def load_ft(ft_file):
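The rewritten tail of `load_from_gnps` decouples annotations from the feature index: every feature now gets a uniform `{row ID}_{m/z}_{RT}` index regardless of whether it was annotated, and GNPS library hits move into the separate `st.session_state["df_gnps_annotations"]` table. The same indexing and lookup pattern, demonstrated self-contained on toy data:

```python
import pandas as pd

# Toy quantification table (ft) and library annotations (an), mirroring
# the column names used in load_from_gnps.
ft = pd.DataFrame({
    "row ID": [1, 2],
    "row m/z": [181.07123, 212.11845],
    "row retention time": [1.2345, 3.4567],
})
an = pd.DataFrame({"Compound_Name": ["Caffeine"]},
                  index=pd.Index([1], name="#Scan#"))

# Uniform feature index: {row ID}_{m/z rounded to 4}_{RT rounded to 2}.
index = pd.Index(ft.apply(
    lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}',
    axis=1))
ft.index = index

# Annotations live in their own table on the same index; features
# without a library hit are dropped from it.
annotations = pd.DataFrame(index=index)
annotations["GNPS annotation"] = ft["row ID"].apply(
    lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
annotations.dropna(inplace=True)
print(annotations)  # only feature 1 ("Caffeine") remains
```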
