
Commit efdd764

Merge pull request #19 from abzer005/Abzer-branch
2 parents 0e4df32 + 8a4807e

File tree

5 files changed (+188 -35 lines)


pages/1_📁_Data_Preparation.py (+38 -4)

@@ -32,7 +32,7 @@
 if file_origin == "Small example dataset for testing":
     ft, md = load_example()
 
-if file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or "GNPS2 classical molecular networking (CMN)":
+elif file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)":
     st.warning("💡 This tool only supports task ID from GNPS1 and 2 not from Quickstart GNPS1.")
     if file_origin == "Example dataset from publication":
         task_id_default = "b661d12ba88745639664988329c1363e"  # 63e8b3da08df41fe95031e4710e0476b
@@ -65,7 +65,7 @@
     if not st.session_state["md_gnps"].empty:
         md = st.session_state["md_gnps"]
 
-elif file_origin == "Quantification table and meta data files":
+if file_origin == "Quantification table and meta data files":
     st.info("💡 Upload tables in txt (tab separated), tsv, csv or xlsx (Excel) format.")
     c1, c2 = st.columns(2)
     # Feature Quantification Table
@@ -116,6 +116,17 @@
 # # check if ft column names and md row names are the same
 md, ft = check_columns(md, ft)
 
+# Initialize the process flags at the start of your Streamlit app if they don't already exist
+if 'blank_removal_done' not in st.session_state:
+    st.session_state['blank_removal_done'] = False
+
+if 'imputation_done' not in st.session_state:
+    st.session_state['imputation_done'] = False
+
+# Use a string to track the normalization method used; 'None' indicates no normalization done
+if 'normalization_method_used' not in st.session_state:
+    st.session_state['normalization_method_used'] = 'None'
+
 tabs = st.tabs(["**Blank Removal**", "**Imputation**", "**Normalization**", "📊 **Summary**"])
 with tabs[0]:
     blank_removal = st.checkbox("Remove blank features?", False)
@@ -178,8 +189,7 @@
     0.3,
     0.05,
     help="""The recommended cutoff range is between 0.1 and 0.3.
-
-    Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
+    Features with intensity ratio of (blank mean)/(sample mean) above the threshold (e.g. 30%) are considered noise/background features.
     """,
 )
 (
@@ -191,6 +201,8 @@
 with st.expander(f"Feature table after removing blanks - features: {ft.shape[0]}, samples: {ft.shape[1]}"):
     show_table(ft, "blank-features-removed")
 
+st.session_state['blank_removal_done'] = True
+
 if not ft.empty:
     cutoff_LOD = get_cutoff_LOD(ft)
 
@@ -210,13 +222,35 @@
         show_table(ft.head(), "imputed")
     else:
         st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
+
+    st.session_state['imputation_done'] = True
 
 with tabs[2]:
     normalization_method = st.radio("data normalization method", ["None",
                                     "Center-Scaling",
                                     # "Probabilistic Quotient Normalization (PQN)",
                                     "Total Ion Current (TIC) or sample-centric normalization"])
+    st.session_state['normalization_method_used'] = normalization_method
+
 with tabs[3]:
+    # Summary tab content
+    st.markdown("## Process Summary")
+    if st.session_state['blank_removal_done']:
+        st.success("Blank removal done.")
+    else:
+        st.warning("Blank removal not done.")
+
+    if st.session_state['imputation_done']:
+        st.success("Imputation done.")
+    else:
+        st.warning("Imputation not done.")
+
+    # Check which normalization method was used
+    if st.session_state['normalization_method_used'] != 'None':
+        st.success(f"Normalization done using {st.session_state['normalization_method_used']} method.")
+    else:
+        st.warning("Normalization not done.")
+
 tab1, tab2 = st.tabs(
     ["📊 Feature intensity frequency", "📊 Missing values per feature"]
 )
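
The summary flags added above follow a common Streamlit pattern: initialize a st.session_state entry once per session, flip it when a processing step completes, and read it from any later tab. A minimal standalone sketch of that pattern (the widget label and flag name are illustrative, not the app's full logic):

import streamlit as st

# st.session_state survives the script rerun Streamlit triggers on every
# widget interaction, so this guard initializes the flag only once.
if "blank_removal_done" not in st.session_state:
    st.session_state["blank_removal_done"] = False

if st.checkbox("Remove blank features?"):
    # ... the actual blank-removal work would happen here ...
    st.session_state["blank_removal_done"] = True

# A later tab (or a later rerun) can read the flag for a process summary.
if st.session_state["blank_removal_done"]:
    st.success("Blank removal done.")
else:
    st.warning("Blank removal not done.")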

pages/5_Random_Forest.py (+49 -4)

@@ -12,6 +12,8 @@
 )
 st.image("assets/figures/random-forest.png")
 
+use_random_seed = st.checkbox('Use a fixed random seed for reproducibility', True)
+
 if not st.session_state.data.empty:
     c1, c2 = st.columns(2)
     c1.selectbox(
@@ -23,13 +25,56 @@
         key = "rf_n_trees",
         help="number of trees for random forest, check the OOB error plot and select a number of trees where the error rate is low and flat")
 
+    random_seed = 123 if use_random_seed else None
+
     if c2.button("Run supervised learning", type="primary"):
-        st.session_state.df_oob, st.session_state.df_important_features = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees)
+        df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
+        st.session_state['df_oob'] = df_oob
+        st.session_state['df_important_features'] = df_important_features
+        st.session_state['log'] = log
+        st.session_state['class_report'] = class_report
+        st.session_state['label_mapping'] = label_mapping
+        st.session_state['test_confusion_df'] = test_confusion_df
+        st.session_state['train_confusion_df'] = train_confusion_df
+        st.session_state['test_accuracy'] = test_accuracy
+        st.session_state['train_accuracy'] = train_accuracy
 
-    if not st.session_state.df_important_features.empty:
-        tabs = st.tabs(["📈 Analyze optimum number of trees", "📁 Feature ranked by importance"])
+    if 'df_important_features' in st.session_state and not st.session_state.df_important_features.empty:
+        tabs = st.tabs(["📈 Analyze optimum number of trees",
+                        "📁 Feature ranked by importance",
+                        "📋 Classification Report",
+                        "🔍 Confusion Matrix"])
         with tabs[0]:
             fig = get_oob_fig(st.session_state.df_oob)
             show_fig(fig, "oob-error")
         with tabs[1]:
-            show_table(st.session_state.df_important_features)
+            show_table(st.session_state.df_important_features)
+        with tabs[2]:  # Classification Report
+            if 'log' in st.session_state:
+                st.subheader("Log Messages")
+                st.text(st.session_state.log)
+
+            if 'class_report' in st.session_state and 'label_mapping' in st.session_state:
+                st.subheader("Classification Report")
+
+                # Convert the classification report string to DataFrame
+                class_report_df = classification_report_to_df(st.session_state.class_report)
+
+                # Convert the label mapping string to DataFrame
+                label_mapping_df = label_mapping_to_df(st.session_state.label_mapping)
+
+                # Ensure class_report_df's index is set correctly for merging
+                class_report_df['class'] = class_report_df['class'].astype(str)
+
+                # Merge the DataFrames on 'Class Index'
+                merged_df = pd.merge(class_report_df, label_mapping_df, on='class')
+                merged_df.set_index('Label', inplace=True)
+                st.dataframe(merged_df)
+        with tabs[3]:
+            st.subheader("Test Set Confusion Matrix")
+            st.dataframe(st.session_state.test_confusion_df)
+            st.write(f"Test Set Accuracy: {st.session_state.test_accuracy:.2%}")
+
+            st.subheader("Train Set Confusion Matrix")
+            st.dataframe(st.session_state.train_confusion_df)
+            st.write(f"Train Set Accuracy: {st.session_state.train_accuracy:.2%}")

src/clustering.py (+4 -3)

@@ -37,7 +37,8 @@ def get_heatmap(data):
     ord_ft = ord_samp.T.reset_index()
     ord_ft = ord_ft.reindex(cluster_ft["leaves"])
 
-    ord_ft.drop(columns=["row ID"], inplace=True)
+    ord_ft.drop(columns=["metabolite"], inplace=True)
+
     # Append string prefix to numeric indeces
     ord_ft.index = pd.Index(["m_"+x if x.isnumeric() else x for x in ord_ft.index.astype(str)])
 
@@ -48,8 +49,8 @@ def get_heatmap(data):
         x=list(ord_ft.columns),
         text_auto=False,
         aspect="auto",
-        color_continuous_scale="PuOr_r",
-        range_color=[ord_ft.min().min(), ord_ft.max().max()],
+        color_continuous_scale="PuOr_r"
+        #range_color=[ord_ft.min().min(), ord_ft.max().max()],
     )
 
     fig.update_layout(
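
With range_color commented out, Plotly Express falls back to its default of spanning the color axis from the data's own minimum to maximum, which is what the removed line computed by hand. A tiny illustration of that default (toy values, not the app's feature table):

import pandas as pd
import plotly.express as px

ord_ft = pd.DataFrame([[0.1, 0.9], [0.4, 0.2]],
                      index=["m_1", "m_2"], columns=["sample1", "sample2"])

# No range_color argument: the color axis spans the data's min/max
# (here 0.1 to 0.9) automatically.
fig = px.imshow(ord_ft, color_continuous_scale="PuOr_r",
                aspect="auto", text_auto=False)
fig.show()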

src/pca.py (+4 -2)

@@ -1,16 +1,18 @@
 import streamlit as st
 from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
 import pandas as pd
 import plotly.express as px
 import numpy as np
 
-
 @st.cache_data
 def get_pca_df(scaled, n=5):
+
     # calculating Principal components
     pca = PCA(n_components=n)
     pca_df = pd.DataFrame(
-        data=pca.fit_transform(scaled), columns=[f"PC{x}" for x in range(1, n + 1)]
+        data=pca.fit_transform(scaled),
+        columns=[f"PC{x}" for x in range(1, n + 1)]
     )
     pca_df.index = scaled.index
     return pca.explained_variance_ratio_, pca_df
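
The new StandardScaler import suggests the feature table is standardized before it reaches get_pca_df; the wiring is not shown in this hunk, so the sketch below is an assumption about the intended use. get_pca_df expects a DataFrame whose index it can copy onto the principal-component scores:

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Hypothetical feature table: rows = samples, columns = metabolite features
ft = pd.DataFrame(np.random.rand(8, 4),
                  index=[f"sample_{i}" for i in range(8)],
                  columns=[f"m_{i}" for i in range(4)])

# Standardize each feature to zero mean and unit variance, keeping the
# DataFrame index so get_pca_df can reattach sample names to the scores.
scaled = pd.DataFrame(StandardScaler().fit_transform(ft),
                      index=ft.index, columns=ft.columns)

# variance_ratio, pca_df = get_pca_df(scaled, n=3)  # as defined above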

src/randomforest.py (+93 -22)

@@ -7,12 +7,21 @@
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils import class_weight
 from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix, accuracy_score
+from io import StringIO
 
 @st.cache_data
-def run_random_forest(attribute, n_trees):
+def run_random_forest(attribute, n_trees, random_seed=None):
     # initialize a log to print out in the app later
     log = ""
 
+    df_oob = pd.DataFrame()  # Placeholder
+    df_important_features = pd.DataFrame()  # Placeholder
+
+    # Placeholder for classification report and label mapping
+    class_report = "Classification report here"
+    label_mapping = "Label mapping here"
+
     labels = st.session_state.md[[attribute]]
     rf_data = pd.concat([st.session_state.data, labels], axis=1)
 
@@ -22,63 +31,125 @@ def run_random_forest(attribute, n_trees):
     labels = enc.fit_transform(labels)
     labels = np.array([x[0] + 1 for x in labels])
 
+    class_names = enc.categories_[0]  # getting the class names
+
     # Extract the feature intensities as np 2D array
     features = np.array(st.session_state.data)
 
 
     # Split the data into training and test sets
-    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=123)
-
-    print(f'Training Features Shape: {train_features.shape}')
-    print(f'Training Labels Shape: {train_labels.shape}')
-    print(f'Testing Features Shape: {test_features.shape}')
-    print(f'Testing Labels Shape: {test_labels.shape}')
+    train_features, test_features, train_labels, test_labels = train_test_split(features,
+                                                                                labels,
+                                                                                test_size=0.25,
+                                                                                random_state=random_seed,
+                                                                                stratify=labels)
+
+    # Collecting info about feature and label shapes for logging
+    log += f"Training Features Shape: {train_features.shape}\n"
+    log += f"Training Labels Shape: {train_labels.shape}\n"
+    log += f"Testing Features Shape: {test_features.shape}\n"
+    log += f"Testing Labels Shape: {test_labels.shape}\n"
 
     # Balance the weights of the attribute of interest to account for unbalanced sample sizes per group
     sklearn_weights = class_weight.compute_class_weight(
         class_weight='balanced',
         classes=np.unique(train_labels),
         y=train_labels)
+
     weights = {}
     for i,w in enumerate(np.unique(train_labels)):
         weights[w] = sklearn_weights[i]
 
     # Set up the random forest classifier with 100 tress, balanded weights, and a random state to make it reproducible
-    rf = RandomForestClassifier(n_estimators=n_trees, class_weight=weights, random_state=123)
+    rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
+
     # Fit the classifier to the training set
     rf.fit(train_features, train_labels)
 
     # Use the random forest classifier to predict the sample areas in the test set
-    predictions = rf.predict(test_features)
-    print(f'Classifier mean accuracy score: {round(rf.score(test_features, test_labels)*100, 2)}%.')
+    predictions_test = rf.predict(test_features)
+    predictions_train = rf.predict(train_features)
+
+    classifier_accuracy = round(rf.score(test_features, test_labels)*100, 2)
+    log += f"Classifier mean accuracy score: {classifier_accuracy}%.\n"
 
+    # Calculate confusion matrices
+    test_confusion_matrix = confusion_matrix(test_labels, predictions_test, labels=range(len(class_names)))
+    train_confusion_matrix = confusion_matrix(train_labels, predictions_train, labels=range(len(class_names)))
+
+    test_confusion_df = pd.DataFrame(test_confusion_matrix, index=class_names, columns=class_names)
+    train_confusion_df = pd.DataFrame(train_confusion_matrix, index=class_names, columns=class_names)
+
+    test_accuracy = accuracy_score(test_labels, predictions_test)
+    train_accuracy = accuracy_score(train_labels, predictions_train)
+
     # Report of the accuracy of predictions on the test set
-    print(classification_report(test_labels, predictions))
+    class_report = classification_report(test_labels, predictions_test)
 
     # Print the sample areas corresponding to the numbers in the report
-    print("Sample areas corresponding to the numbers:")
-    for i,cat in enumerate(enc.categories_[0]):
-        print(f"{i+1.0} ,{cat}")
+    label_mapping = "\n".join([f"{i+1.0} ,{cat}" for i, cat in enumerate(enc.categories_[0])])
 
     # Most important model quality plot
     # OOB error lines should flatline. If it doesn't flatline add more trees
-    rf = RandomForestClassifier(class_weight=weights, warm_start=True, oob_score=True, random_state=123)
+    rf_oob = RandomForestClassifier(class_weight=weights, warm_start=True, oob_score=True, random_state=123)
     errors = []
     tree_range = np.arange(1,500, 10)
     for i in tree_range:
-        rf.set_params(n_estimators=i)
-        rf.fit(train_features, train_labels)
-        errors.append(1-rf.oob_score_)
+        rf_oob.set_params(n_estimators=i)
+        rf_oob.fit(train_features, train_labels)
+        errors.append(1-rf_oob.oob_score_)
 
 
     df_oob = pd.DataFrame({"n trees": tree_range, "error rate": errors})
 
     # Extract the important features in the model
-    df_important_features = pd.DataFrame(rf.feature_importances_, index=st.session_state.data.columns).sort_values(by=0, ascending=False)
+    df_important_features = pd.DataFrame(rf.feature_importances_,
+                                         index=st.session_state.data.columns).sort_values(by=0, ascending=False)
     df_important_features.columns = ["importance"]
 
-
-    return df_oob, df_important_features
+    return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy
 
 def get_oob_fig(df):
-    return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
+    return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
+
+def classification_report_to_df(report):
+
+    # Split the report into lines
+    lines = report.split("\n")
+
+    # Prepare a dictionary to hold the data
+    report_data = {"class": [], "precision": [], "recall": [], "f1-score": [], "support": []}
+
+    for line in lines[2:-3]:  # Skip the header and summary lines
+        parts = line.split()
+        # Ensure that the line contains the expected number of parts
+        if len(parts) == 5:
+            report_data["class"].append(parts[0])
+            report_data["precision"].append(parts[1])
+            report_data["recall"].append(parts[2])
+            report_data["f1-score"].append(parts[3])
+            report_data["support"].append(parts[4])
+
+    # Convert the dictionary to a DataFrame
+    report_df = pd.DataFrame(report_data)
+
+    # Convert numeric columns from strings to floats
+    report_df[["precision", "recall", "f1-score"]] = report_df[["precision", "recall", "f1-score"]].astype(float)
+    report_df["support"] = report_df["support"].astype(int)
+
+    return report_df
+
+def label_mapping_to_df(label_mapping_str):
+
+    # Split the string into lines
+    lines = label_mapping_str.split("\n")
+
+    # Split each line into index and label, then collect into a list of tuples
+    mapping = [line.split(" ,") for line in lines if line]  # Ensure the line is not empty
+
+    # Convert the list of tuples into a DataFrame
+    mapping_df = pd.DataFrame(mapping, columns=['class', 'Label'])
+    mapping_df['class'] = mapping_df['class'].astype(str)
    return mapping_df
0 commit comments

Comments
 (0)