
Commit c12e722 (parent 421d456)

Inputs added for classical molecular networking

2 files changed (+45 −16)

pages/1_📁_Data_Preparation.py (+21 −4)
@@ -21,24 +21,41 @@
 )
 ft, md = pd.DataFrame(), pd.DataFrame()

-file_origin = st.radio("File origin", ["Quantification table and meta data files", "GNPS(2) task ID", "Example dataset from publication", "Small example dataset for testing"])
+file_origin = st.radio("File origin",
+                       ["Quantification table and meta data files",
+                        "GNPS(2) task ID",
+                        "Example dataset from publication",
+                        "Small example dataset for testing",
+                        "GNPS2 classical molecular networking (CMN)"])
+
 # b661d12ba88745639664988329c1363e
 if file_origin == "Small example dataset for testing":
     ft, md = load_example()

-if file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication":
+if file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)":
     st.warning("💡 This tool only supports task IDs from GNPS1 and GNPS2, not from Quickstart GNPS1.")
     if file_origin == "Example dataset from publication":
         task_id_default = "b661d12ba88745639664988329c1363e" # 63e8b3da08df41fe95031e4710e0476b
-
+        disabled = True
+    elif file_origin == "GNPS2 classical molecular networking (CMN)":
+        task_id_default = "2a65f90094654235a4c8d337fdca11e1" # 63e8b3da08df41fe95031e4710e0476b
         disabled = True
     else:
         task_id_default = ""
         disabled = False
     task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
     _, c2, _ = st.columns(3)
+
     if c2.button("Load files from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
-        st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id)
+        st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn=True)
+
+    if md.empty:
+        st.warning("Meta data is empty. Please upload one.")
+
+        md_file = st.file_uploader("Meta Data Table")
+        if md_file:
+            md = load_md(md_file)
+            st.success("Meta data was loaded successfully!")

 if "ft_gnps" in st.session_state:
     if not st.session_state["ft_gnps"].empty:
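One thing worth flagging in the hunk above: the button handler now passes cmn=True for every file origin, including plain FBMN task IDs. A possible follow-up, shown only as a sketch (the is_cmn name and the conditional are a suggestion, not part of this commit), would derive the flag from the selected radio option, reusing the page's existing st, c2, task_id, file_origin, and load_from_gnps names:

```python
# Sketch only (not in this commit): derive the CMN flag from the radio selection
# so FBMN/GNPS1 task IDs keep the original loader behaviour.
is_cmn = file_origin == "GNPS2 classical molecular networking (CMN)"

if c2.button("Load files from GNPS", type="primary",
             disabled=len(task_id) == 0, use_container_width=True):
    st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn=is_cmn)
```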

src/fileselection.py (+24 −12)
@@ -54,26 +54,38 @@ def load_example():
     return ft, md

 @st.cache_data
-def load_from_gnps(task_id):
+def load_from_gnps(task_id, cmn=False):
     try: # GNPS2 will run here
         ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
         md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
         an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
     except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
-        ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
-        md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
+        if cmn:
+            ft_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/clustering/featuretable_reformatted_precursorintensity.csv"
+            md_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/metadata/merged_metadata.tsv"
+        else:
+            ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
+            md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
         ft = pd.read_csv(ft_url)
-        md = pd.read_csv(md_url, sep = "\t", index_col="filename")
-        an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
-        an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
-
-    index_with_annotations = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
-    ft.index = index_with_annotations
-    st.session_state["df_gnps_annotations"].index = index_with_annotations
-    st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
-    st.session_state["df_gnps_annotations"].dropna(inplace=True)
+        try:
+            md = pd.read_csv(md_url, sep = "\t", index_col="filename")
+        except pd.errors.EmptyDataError:
+            md = pd.DataFrame()
+        if not cmn:
+            an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
+            an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+    if cmn:
+        ft.index = ft["row ID"]
+        ft = ft.drop(columns=["row m/z", "row retention time", "row ID"])
+    else:
+        index_with_mz_RT = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+        ft.index = index_with_mz_RT
+        st.session_state["df_gnps_annotations"].index = index_with_mz_RT
+        st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
+        st.session_state["df_gnps_annotations"].dropna(inplace=True)
     return ft, md

+
 def load_ft(ft_file):
     ft = open_df(ft_file)
     ft = ft.dropna(axis=1)
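For reference, the CMN download path added in this file can be exercised outside Streamlit with plain pandas. The sketch below is not part of the commit; it assumes the example CMN task ID used as the page default (2a65f90094654235a4c8d337fdca11e1) is still available on gnps2.org, and it mirrors the cmn=True branch of load_from_gnps, including the empty-metadata guard:

```python
import pandas as pd

# Example CMN task ID used as the default in the page diff above (assumed still available).
task_id = "2a65f90094654235a4c8d337fdca11e1"

# Result-file URLs as used in load_from_gnps(..., cmn=True).
ft_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/clustering/featuretable_reformatted_precursorintensity.csv"
md_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/metadata/merged_metadata.tsv"

# Feature table: indexed by "row ID"; m/z and RT columns are dropped in the CMN branch.
ft = pd.read_csv(ft_url)
ft.index = ft["row ID"]
ft = ft.drop(columns=["row m/z", "row retention time", "row ID"])

# Metadata: the merged metadata file may be empty for CMN tasks, hence the EmptyDataError guard.
try:
    md = pd.read_csv(md_url, sep="\t", index_col="filename")
except pd.errors.EmptyDataError:
    md = pd.DataFrame()

print(ft.shape, md.shape)
```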
