
Commit 9a264ab

Merge pull request #20 from abzer005/Abzer-branch
Fixed issues
2 parents: 4f0edae + 877a43c

6 files changed: +70 −34 lines

pages/1_📁_Data_Preparation.py (+9 −2)

@@ -32,22 +32,25 @@
     if file_origin == "Small example dataset for testing":
         ft, md = load_example()
 
-    elif file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)":
+    elif file_origin in ["GNPS(2) task ID", "Example dataset from publication", "GNPS2 classical molecular networking (CMN)"]:
         st.warning("💡 This tool only supports task ID from GNPS1 and 2 not from Quickstart GNPS1.")
         if file_origin == "Example dataset from publication":
             task_id_default = "b661d12ba88745639664988329c1363e" # 63e8b3da08df41fe95031e4710e0476b
             disabled = True
+            cmn_flag = False
         elif file_origin == "GNPS2 classical molecular networking (CMN)":
             task_id_default = "" # 2a65f90094654235a4c8d337fdca11e1
             disabled = False
+            cmn_flag = True
         else:
             task_id_default = ""
             disabled = False
+            cmn_flag = False
         task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
         _, c2, _ = st.columns(3)
 
         if c2.button("Load files from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
-            st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn=True)
+            st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn= cmn_flag)
 
         if not st.session_state["ft_gnps"].empty and st.session_state["md_gnps"].empty:
             st.warning("Meta data is empty. Please upload one.")

@@ -202,6 +205,8 @@
             show_table(ft, "blank-features-removed")
 
         st.session_state['blank_removal_done'] = True
+    else:
+        st.session_state['blank_removal_done'] = False
 
     if not ft.empty:
         cutoff_LOD = get_cutoff_LOD(ft)

@@ -224,6 +229,8 @@
             st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
 
         st.session_state['imputation_done'] = True
+    else:
+        st.session_state['imputation_done'] = False
 
     with tabs[2]:
         normalization_method = st.radio("data normalization method", ["None",
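
The new cmn_flag simply mirrors which file origin was selected, so load_from_gnps only switches to the classical-molecular-networking download path for CMN task IDs. A minimal sketch of that routing (resolve_gnps_inputs is a hypothetical helper, not part of the commit; the page sets the three values inline as shown in the hunk above):

def resolve_gnps_inputs(file_origin: str):
    """Return (task_id_default, disabled, cmn_flag) for a given file origin."""
    # Hypothetical helper mirroring the branch added in the hunk above.
    if file_origin == "Example dataset from publication":
        return "b661d12ba88745639664988329c1363e", True, False
    if file_origin == "GNPS2 classical molecular networking (CMN)":
        return "", False, True
    return "", False, False

task_id_default, disabled, cmn_flag = resolve_gnps_inputs("GNPS(2) task ID")
assert (task_id_default, disabled, cmn_flag) == ("", False, False)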

pages/5_Random_Forest.py (+13 −10)

@@ -28,16 +28,19 @@
     random_seed = 123 if use_random_seed else None
 
     if c2.button("Run supervised learning", type="primary"):
-        df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
-        st.session_state['df_oob'] = df_oob
-        st.session_state['df_important_features'] = df_important_features
-        st.session_state['log'] = log
-        st.session_state['class_report'] = class_report
-        st.session_state['label_mapping'] = label_mapping
-        st.session_state['test_confusion_df'] = test_confusion_df
-        st.session_state['train_confusion_df'] = train_confusion_df
-        st.session_state['test_accuracy'] = test_accuracy
-        st.session_state['train_accuracy'] = train_accuracy
+        try:
+            df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
+            st.session_state['df_oob'] = df_oob
+            st.session_state['df_important_features'] = df_important_features
+            st.session_state['log'] = log
+            st.session_state['class_report'] = class_report
+            st.session_state['label_mapping'] = label_mapping
+            st.session_state['test_confusion_df'] = test_confusion_df
+            st.session_state['train_confusion_df'] = train_confusion_df
+            st.session_state['test_accuracy'] = test_accuracy
+            st.session_state['train_accuracy'] = train_accuracy
+        except Exception as e:
+            st.error(f"Failed to run model due to: {str(e)}")
 
     if 'df_important_features' in st.session_state and not st.session_state.df_important_features.empty:
         tabs = st.tabs(["📈 Analyze optimum number of trees",
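
The page now wraps the training call in try/except so a failure (for example, a class with too few samples for the stratified split) surfaces as a Streamlit error instead of a raw traceback. A small sketch of the same guard pattern under assumed names (run_training stands in for run_random_forest):

import streamlit as st

def run_training_safely(run_training, *args):
    # Call the (possibly long-running) training function and report failures
    # with st.error rather than letting the exception crash the page.
    try:
        return run_training(*args)
    except Exception as e:
        st.error(f"Failed to run model due to: {str(e)}")
        return None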

src/anova.py (+2)

@@ -80,6 +80,8 @@ def get_anova_plot(anova):
         yaxis_title="-log(p)",
         showlegend=False
     )
+    fig.update_yaxes(title_standoff=10)
+
     # fig.update_yaxes(title_font_size=20)
     # fig.update_xaxes(title_font_size=20)

src/fileselection.py (+25 −11)

@@ -55,34 +55,48 @@ def load_example():
 
 @st.cache_data
 def load_from_gnps(task_id, cmn=False):
+
     try: # GNPS2 will run here
         ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
         md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
         an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
-    except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error encountered: {e}") # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
         if cmn:
             ft_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/clustering/featuretable_reformatted_precursorintensity.csv"
             md_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/metadata/merged_metadata.tsv"
+
+            ft = pd.read_csv(ft_url)
+            try:
+                md = pd.read_csv(md_url, sep = "\t", index_col="filename")
+            except pd.errors.EmptyDataError:
+                md = pd.DataFrame()
+
         else:
             ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
             md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
-        ft = pd.read_csv(ft_url)
-        try:
-            md = pd.read_csv(md_url, sep = "\t", index_col="filename")
-        except pd.errors.EmptyDataError:
-            md = pd.DataFrame()
-        if not cmn:
             an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
+
+            ft = pd.read_csv(ft_url)
+            md = pd.read_csv(md_url, sep="\t", index_col="filename")
             an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+
+    if md.empty: # Handle empty metadata
+        md = pd.DataFrame()
+
     if cmn:
         ft.index = ft["row ID"].astype(str)
         ft = ft.drop(columns=["row m/z", "row retention time", "row ID"])
+
     else:
-        index_with_mz_RT = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+        index_with_mz_RT = ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1)
         ft.index = index_with_mz_RT
-        st.session_state["df_gnps_annotations"].index = index_with_mz_RT
-        st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
-        st.session_state["df_gnps_annotations"].dropna(inplace=True)
+        if 'df_gnps_annotations' in st.session_state:
+            st.session_state["df_gnps_annotations"].index = index_with_mz_RT
+            st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
+            st.session_state["df_gnps_annotations"].dropna(inplace=True)
+
+    ft.index.name = 'metabolite'
     return ft, md
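load_from_gnps now downloads the feature table and metadata inside each branch (GNPS2 CMN vs. the proteomics2 FBMN results), tolerates an empty metadata file, and only touches df_gnps_annotations when that key already exists in session state. A standalone sketch of the metadata fallback, assuming a hypothetical read_optional_metadata helper:

import pandas as pd

def read_optional_metadata(path_or_url):
    # An empty metadata file raises EmptyDataError; return an empty DataFrame
    # instead so the page can ask the user to upload metadata separately.
    try:
        return pd.read_csv(path_or_url, sep="\t", index_col="filename")
    except pd.errors.EmptyDataError:
        return pd.DataFrame()
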
src/pca.py (+8 −1)

@@ -1,18 +1,25 @@
 import streamlit as st
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 import pandas as pd
 import plotly.express as px
 import numpy as np
 
+
 @st.cache_data
 def get_pca_df(scaled, n=5):
+<<<<<<< HEAD
+    # calculating Principal components
+    pca = PCA(n_components=n)
+    pca_df = pd.DataFrame(
+        data=pca.fit_transform(scaled), columns=[f"PC{x}" for x in range(1, n + 1)]
+=======
 
     # calculating Principal components
     pca = PCA(n_components=n)
     pca_df = pd.DataFrame(
         data=pca.fit_transform(scaled),
         columns=[f"PC{x}" for x in range(1, n + 1)]
+>>>>>>> efdd76467755ddb96598832b0740cb7149a9cefb
     )
     pca_df.index = scaled.index
     return pca.explained_variance_ratio_, pca_df
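
As committed, this hunk leaves the Git conflict markers (<<<<<<<, =======, >>>>>>>) in src/pca.py, which Python will reject at import time; the two sides differ only in line wrapping. A resolved get_pca_df would read roughly as follows (sketch, not part of the commit):

import pandas as pd
import streamlit as st
from sklearn.decomposition import PCA

@st.cache_data
def get_pca_df(scaled, n=5):
    # calculating Principal components
    pca = PCA(n_components=n)
    pca_df = pd.DataFrame(
        data=pca.fit_transform(scaled),
        columns=[f"PC{x}" for x in range(1, n + 1)],
    )
    pca_df.index = scaled.index
    return pca.explained_variance_ratio_, pca_df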

src/randomforest.py (+13 −10)

@@ -3,12 +3,11 @@
 import numpy as np
 import plotly.express as px
 from sklearn.preprocessing import OrdinalEncoder
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils import class_weight
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix, accuracy_score
-from io import StringIO
 
 @st.cache_data
 def run_random_forest(attribute, n_trees, random_seed=None):

@@ -36,13 +35,16 @@ def run_random_forest(attribute, n_trees, random_seed=None):
     # Extract the feature intensities as np 2D array
     features = np.array(st.session_state.data)
 
+    # Determine the smallest class size and adjust test_size accordingly
+    unique, counts = np.unique(labels, return_counts=True)
+    min_class_count = min(counts)
+    min_test_size = float(len(unique)) / len(labels)
 
-    # Split the data into training and test sets
-    train_features, test_features, train_labels, test_labels = train_test_split(features,
-                                                                                labels,
-                                                                                test_size=0.25,
-                                                                                random_state= random_seed,
-                                                                                stratify=labels)
+    # Adjust test size to be larger of the calculated min_test_size or the initial_test_size
+    adjusted_test_size = max(min_test_size, 0.25)
+
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        features, labels, test_size= adjusted_test_size, random_state=random_seed, stratify=labels)
 
     # Collecting info about feature and label shapes for logging
     log += f"Training Features Shape: {train_features.shape}\n"

@@ -61,7 +63,7 @@
         weights[w] = sklearn_weights[i]
 
     # Set up the random forest classifier with 100 tress, balanded weights, and a random state to make it reproducible
-    rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
+    rf = RandomForestClassifier(n_estimators=n_trees, class_weight= weights, random_state=random_seed)
 
     # Fit the classifier to the training set
     rf.fit(train_features, train_labels)

@@ -106,9 +108,10 @@
     df_important_features = pd.DataFrame(rf.feature_importances_,
                                          index=st.session_state.data.columns).sort_values(by=0, ascending=False)
     df_important_features.columns = ["importance"]
-
+
     return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy
 
+
 def get_oob_fig(df):
     return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")