
Commit efdd764

Merge pull request #19 from abzer005/Abzer-branch
2 parents 0e4df32 + 8a4807e

File tree

5 files changed (+188 -35 lines)


pages/1_📁_Data_Preparation.py (+38 -4)

@@ -32,7 +32,7 @@
 if file_origin == "Small example dataset for testing":
     ft, md = load_example()
 
-if file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or "GNPS2 classical molecular networking (CMN)":
+elif file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)":
     st.warning("💡 This tool only supports task ID from GNPS1 and 2 not from Quickstart GNPS1.")
     if file_origin == "Example dataset from publication":
         task_id_default = "b661d12ba88745639664988329c1363e"  # 63e8b3da08df41fe95031e4710e0476b
@@ -65,7 +65,7 @@
     if not st.session_state["md_gnps"].empty:
         md = st.session_state["md_gnps"]
 
-elif file_origin == "Quantification table and meta data files":
+if file_origin == "Quantification table and meta data files":
     st.info("💡 Upload tables in txt (tab separated), tsv, csv or xlsx (Excel) format.")
     c1, c2 = st.columns(2)
     # Feature Quantification Table
@@ -116,6 +116,17 @@
 # # check if ft column names and md row names are the same
 md, ft = check_columns(md, ft)
 
+# Initialize the process flags at the start of your Streamlit app if they don't already exist
+if 'blank_removal_done' not in st.session_state:
+    st.session_state['blank_removal_done'] = False
+
+if 'imputation_done' not in st.session_state:
+    st.session_state['imputation_done'] = False
+
+# Use a string to track the normalization method used; 'None' indicates no normalization done
+if 'normalization_method_used' not in st.session_state:
+    st.session_state['normalization_method_used'] = 'None'
+
 tabs = st.tabs(["**Blank Removal**", "**Imputation**", "**Normalization**", "📊 **Summary**"])
 with tabs[0]:
     blank_removal = st.checkbox("Remove blank features?", False)
@@ -178,8 +189,7 @@
     0.3,
     0.05,
     help="""The recommended cutoff range is between 0.1 and 0.3.
-
-    Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
+    Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
     """,
 )
 (
@@ -191,6 +201,8 @@
 with st.expander(f"Feature table after removing blanks - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
     show_table(ft, "blank-features-removed")
 
+st.session_state['blank_removal_done'] = True
+
 if not ft.empty:
     cutoff_LOD = get_cutoff_LOD(ft)
 
@@ -210,13 +222,35 @@
         show_table(ft.head(), "imputed")
     else:
         st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
+
+    st.session_state['imputation_done'] = True
 
 with tabs[2]:
     normalization_method = st.radio("data normalization method", ["None",
                                     "Center-Scaling",
                                     # "Probabilistic Quotient Normalization (PQN)",
                                     "Total Ion Current (TIC) or sample-centric normalization"])
+    st.session_state['normalization_method_used'] = normalization_method
+
 with tabs[3]:
+    # Summary tab content
+    st.markdown("## Process Summary")
+    if st.session_state['blank_removal_done']:
+        st.success("Blank removal done.")
+    else:
+        st.warning("Blank removal not done.")
+
+    if st.session_state['imputation_done']:
+        st.success("Imputation done.")
+    else:
+        st.warning("Imputation not done.")
+
+    # Check which normalization method was used
+    if st.session_state['normalization_method_used'] != 'None':
+        st.success(f"Normalization done using {st.session_state['normalization_method_used']} method.")
+    else:
+        st.warning("Normalization not done.")
+
 tab1, tab2 = st.tabs(
     ["📊 Feature intensity frequency", "📊 Missing values per feature"]
 )
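
The summary flags added above follow a common Streamlit pattern: initialize a st.session_state entry once per session, flip it when a processing step completes, and read it from any later tab. A minimal standalone sketch of that pattern (the widget label and flag name are illustrative, not the app's full logic):

import streamlit as st

# st.session_state survives the script rerun Streamlit triggers on every
# widget interaction, so this guard initializes the flag only once.
if "blank_removal_done" not in st.session_state:
    st.session_state["blank_removal_done"] = False

if st.checkbox("Remove blank features?"):
    # ... the actual blank-removal work would happen here ...
    st.session_state["blank_removal_done"] = True

# A later tab (or a later rerun) can read the flag for a process summary.
if st.session_state["blank_removal_done"]:
    st.success("Blank removal done.")
else:
    st.warning("Blank removal not done.")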

pages/5_Random_Forest.py (+49 -4)

@@ -12,6 +12,8 @@
 )
 st.image("assets/figures/random-forest.png")
 
+use_random_seed = st.checkbox('Use a fixed random seed for reproducibility', True)
+
 if not st.session_state.data.empty:
     c1, c2 = st.columns(2)
     c1.selectbox(
@@ -23,13 +25,56 @@
         key = "rf_n_trees",
         help="number of trees for random forest, check the OOB error plot and select a number of trees where the error rate is low and flat")
 
+    random_seed = 123 if use_random_seed else None
+
     if c2.button("Run supervised learning", type="primary"):
-        st.session_state.df_oob, st.session_state.df_important_features = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees)
+        df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
+        st.session_state['df_oob'] = df_oob
+        st.session_state['df_important_features'] = df_important_features
+        st.session_state['log'] = log
+        st.session_state['class_report'] = class_report
+        st.session_state['label_mapping'] = label_mapping
+        st.session_state['test_confusion_df'] = test_confusion_df
+        st.session_state['train_confusion_df'] = train_confusion_df
+        st.session_state['test_accuracy'] = test_accuracy
+        st.session_state['train_accuracy'] = train_accuracy
 
-    if not st.session_state.df_important_features.empty:
-        tabs = st.tabs(["📈 Analyze optimum number of trees", "📁 Feature ranked by importance"])
+    if 'df_important_features' in st.session_state and not st.session_state.df_important_features.empty:
+        tabs = st.tabs(["📈 Analyze optimum number of trees",
+                        "📁 Feature ranked by importance",
+                        "📋 Classification Report",
+                        "🔍 Confusion Matrix"])
         with tabs[0]:
             fig = get_oob_fig(st.session_state.df_oob)
             show_fig(fig, "oob-error")
         with tabs[1]:
-            show_table(st.session_state.df_important_features)
+            show_table(st.session_state.df_important_features)
+        with tabs[2]:  # Classification Report
+            if 'log' in st.session_state:
+                st.subheader("Log Messages")
+                st.text(st.session_state.log)
+
+            if 'class_report' in st.session_state and 'label_mapping' in st.session_state:
+                st.subheader("Classification Report")
+
+                # Convert the classification report string to DataFrame
+                class_report_df = classification_report_to_df(st.session_state.class_report)
+
+                # Convert the label mapping string to DataFrame
+                label_mapping_df = label_mapping_to_df(st.session_state.label_mapping)
+
+                # Ensure class_report_df's index is set correctly for merging
+                class_report_df['class'] = class_report_df['class'].astype(str)
+
+                # Merge the DataFrames on 'Class Index'
+                merged_df = pd.merge(class_report_df, label_mapping_df, on='class')
+                merged_df.set_index('Label', inplace=True)
+                st.dataframe(merged_df)
+        with tabs[3]:
+            st.subheader("Test Set Confusion Matrix")
+            st.dataframe(st.session_state.test_confusion_df)
+            st.write(f"Test Set Accuracy: {st.session_state.test_accuracy:.2%}")
+
+            st.subheader("Train Set Confusion Matrix")
+            st.dataframe(st.session_state.train_confusion_df)
+            st.write(f"Train Set Accuracy: {st.session_state.train_accuracy:.2%}")

src/clustering.py (+4 -3)

@@ -37,7 +37,8 @@ def get_heatmap(data):
     ord_ft = ord_samp.T.reset_index()
     ord_ft = ord_ft.reindex(cluster_ft["leaves"])
 
-    ord_ft.drop(columns=["row ID"], inplace=True)
+    ord_ft.drop(columns=["metabolite"], inplace=True)
+
     # Append string prefix to numeric indeces
     ord_ft.index = pd.Index(["m_"+x if x.isnumeric() else x for x in ord_ft.index.astype(str)])
 
@@ -48,8 +49,8 @@ def get_heatmap(data):
         x=list(ord_ft.columns),
         text_auto=False,
         aspect="auto",
-        color_continuous_scale="PuOr_r",
-        range_color=[ord_ft.min().min(), ord_ft.max().max()],
+        color_continuous_scale="PuOr_r"
+        #range_color=[ord_ft.min().min(), ord_ft.max().max()],
     )
 
     fig.update_layout(
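
With range_color commented out, Plotly Express falls back to its default of spanning the color axis from the data's own minimum to maximum, which is what the removed line computed by hand. A tiny illustration of that default (toy values, not the app's feature table):

import pandas as pd
import plotly.express as px

ord_ft = pd.DataFrame([[0.1, 0.9], [0.4, 0.2]],
                      index=["m_1", "m_2"], columns=["sample1", "sample2"])

# No range_color argument: the color axis spans the data's min/max
# (here 0.1 to 0.9) automatically.
fig = px.imshow(ord_ft, color_continuous_scale="PuOr_r",
                aspect="auto", text_auto=False)
fig.show()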

src/pca.py (+4 -2)

@@ -1,16 +1,18 @@
 import streamlit as st
 from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
 import pandas as pd
 import plotly.express as px
 import numpy as np
 
-
 @st.cache_data
 def get_pca_df(scaled, n=5):
+
     # calculating Principal components
     pca = PCA(n_components=n)
     pca_df = pd.DataFrame(
-        data=pca.fit_transform(scaled), columns=[f"PC{x}" for x in range(1, n + 1)]
+        data=pca.fit_transform(scaled),
+        columns=[f"PC{x}" for x in range(1, n + 1)]
     )
     pca_df.index = scaled.index
     return pca.explained_variance_ratio_, pca_df
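
The new StandardScaler import suggests the feature table is standardized before it reaches get_pca_df; the wiring is not shown in this hunk, so the sketch below is an assumption about the intended use. get_pca_df expects a DataFrame whose index it can copy onto the principal-component scores:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Hypothetical feature table: rows = samples, columns = metabolite features
ft = pd.DataFrame(np.random.rand(8, 4),
                  index=[f"sample_{i}" for i in range(8)],
                  columns=[f"m_{i}" for i in range(4)])

# Standardize each feature to zero mean and unit variance, keeping the
# DataFrame index so get_pca_df can reattach sample names to the scores.
scaled = pd.DataFrame(StandardScaler().fit_transform(ft),
                      index=ft.index, columns=ft.columns)

# variance_ratio, pca_df = get_pca_df(scaled, n=3)  # as defined above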

src/randomforest.py (+93 -22)

@@ -7,12 +7,21 @@
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils import class_weight
 from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix, accuracy_score
+from io import StringIO
 
 @st.cache_data
-def run_random_forest(attribute, n_trees):
+def run_random_forest(attribute, n_trees, random_seed=None):
     # initialize a log to print out in the app later
     log = ""
 
+    df_oob = pd.DataFrame()  # Placeholder
+    df_important_features = pd.DataFrame()  # Placeholder
+
+    # Placeholder for classification report and label mapping
+    class_report = "Classification report here"
+    label_mapping = "Label mapping here"
+
     labels = st.session_state.md[[attribute]]
     rf_data = pd.concat([st.session_state.data, labels], axis=1)
 
@@ -22,63 +31,125 @@ def run_random_forest(attribute, n_trees):
     labels = enc.fit_transform(labels)
     labels = np.array([x[0] + 1 for x in labels])
 
+    class_names = enc.categories_[0]  # getting the class names
+
     # Extract the feature intensities as np 2D array
     features = np.array(st.session_state.data)
 
 
     # Split the data into training and test sets
-    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=123)
-
-    print(f'Training Features Shape: {train_features.shape}')
-    print(f'Training Labels Shape: {train_labels.shape}')
-    print(f'Testing Features Shape: {test_features.shape}')
-    print(f'Testing Labels Shape: {test_labels.shape}')
+    train_features, test_features, train_labels, test_labels = train_test_split(features,
+                                                                                labels,
+                                                                                test_size=0.25,
+                                                                                random_state=random_seed,
+                                                                                stratify=labels)
+
+    # Collecting info about feature and label shapes for logging
+    log += f"Training Features Shape: {train_features.shape}\n"
+    log += f"Training Labels Shape: {train_labels.shape}\n"
+    log += f"Testing Features Shape: {test_features.shape}\n"
+    log += f"Testing Labels Shape: {test_labels.shape}\n"
 
     # Balance the weights of the attribute of interest to account for unbalanced sample sizes per group
     sklearn_weights = class_weight.compute_class_weight(
         class_weight='balanced',
         classes=np.unique(train_labels),
         y=train_labels)
+
     weights = {}
     for i,w in enumerate(np.unique(train_labels)):
         weights[w] = sklearn_weights[i]
 
     # Set up the random forest classifier with 100 tress, balanded weights, and a random state to make it reproducible
-    rf = RandomForestClassifier(n_estimators=n_trees, class_weight=weights, random_state=123)
+    rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
+
     # Fit the classifier to the training set
     rf.fit(train_features, train_labels)
 
     # Use the random forest classifier to predict the sample areas in the test set
-    predictions = rf.predict(test_features)
-    print(f'Classifier mean accuracy score: {round(rf.score(test_features, test_labels)*100, 2)}%.')
+    predictions_test = rf.predict(test_features)
+    predictions_train = rf.predict(train_features)
+
+    classifier_accuracy = round(rf.score(test_features, test_labels)*100, 2)
+    log += f"Classifier mean accuracy score: {classifier_accuracy}%.\n"
 
+    # Calculate confusion matrices
+    test_confusion_matrix = confusion_matrix(test_labels, predictions_test, labels=range(len(class_names)))
+    train_confusion_matrix = confusion_matrix(train_labels, predictions_train, labels=range(len(class_names)))
+
+    test_confusion_df = pd.DataFrame(test_confusion_matrix, index=class_names, columns=class_names)
+    train_confusion_df = pd.DataFrame(train_confusion_matrix, index=class_names, columns=class_names)
+
+    test_accuracy = accuracy_score(test_labels, predictions_test)
+    train_accuracy = accuracy_score(train_labels, predictions_train)
+
     # Report of the accuracy of predictions on the test set
-    print(classification_report(test_labels, predictions))
+    class_report = classification_report(test_labels, predictions_test)
 
     # Print the sample areas corresponding to the numbers in the report
-    print("Sample areas corresponding to the numbers:")
-    for i,cat in enumerate(enc.categories_[0]):
-        print(f"{i+1.0} ,{cat}")
+    label_mapping = "\n".join([f"{i+1.0} ,{cat}" for i, cat in enumerate(enc.categories_[0])])
 
     # Most important model quality plot
     # OOB error lines should flatline. If it doesn't flatline add more trees
-    rf = RandomForestClassifier(class_weight=weights, warm_start=True, oob_score=True, random_state=123)
+    rf_oob = RandomForestClassifier(class_weight=weights, warm_start=True, oob_score=True, random_state=123)
     errors = []
     tree_range = np.arange(1,500, 10)
     for i in tree_range:
-        rf.set_params(n_estimators=i)
-        rf.fit(train_features, train_labels)
-        errors.append(1-rf.oob_score_)
+        rf_oob.set_params(n_estimators=i)
+        rf_oob.fit(train_features, train_labels)
+        errors.append(1-rf_oob.oob_score_)
 
 
     df_oob = pd.DataFrame({"n trees": tree_range, "error rate": errors})
 
     # Extract the important features in the model
-    df_important_features = pd.DataFrame(rf.feature_importances_, index=st.session_state.data.columns).sort_values(by=0, ascending=False)
+    df_important_features = pd.DataFrame(rf.feature_importances_,
+                                         index=st.session_state.data.columns).sort_values(by=0, ascending=False)
     df_important_features.columns = ["importance"]
 
-
-    return df_oob, df_important_features
+    return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy
 
 def get_oob_fig(df):
-    return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
+    return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
+
+def classification_report_to_df(report):
+
+    # Split the report into lines
+    lines = report.split("\n")
+
+    # Prepare a dictionary to hold the data
+    report_data = {"class": [], "precision": [], "recall": [], "f1-score": [], "support": []}
+
+    for line in lines[2:-3]:  # Skip the header and summary lines
+        parts = line.split()
+        # Ensure that the line contains the expected number of parts
+        if len(parts) == 5:
+            report_data["class"].append(parts[0])
+            report_data["precision"].append(parts[1])
+            report_data["recall"].append(parts[2])
+            report_data["f1-score"].append(parts[3])
+            report_data["support"].append(parts[4])
+
+    # Convert the dictionary to a DataFrame
+    report_df = pd.DataFrame(report_data)
+
+    # Convert numeric columns from strings to floats
+    report_df[["precision", "recall", "f1-score"]] = report_df[["precision", "recall", "f1-score"]].astype(float)
+    report_df["support"] = report_df["support"].astype(int)
+
+    return report_df
+
+def label_mapping_to_df(label_mapping_str):
+
+    # Split the string into lines
+    lines = label_mapping_str.split("\n")
+
+    # Split each line into index and label, then collect into a list of tuples
+    mapping = [line.split(" ,") for line in lines if line]  # Ensure the line is not empty
+
+    # Convert the list of tuples into a DataFrame
+    mapping_df = pd.DataFrame(mapping, columns=['class', 'Label'])
+    mapping_df['class'] = mapping_df['class'].astype(str)
    return mapping_df
0 commit comments

Comments
 (0)