
Commit 9a264ab

Merge pull request #20 from abzer005/Abzer-branch
Fixed issues
2 parents: 4f0edae + 877a43c

6 files changed: +70 −34 lines

pages/1_📁_Data_Preparation.py (+9 −2)

@@ -32,22 +32,25 @@
     if file_origin == "Small example dataset for testing":
         ft, md = load_example()
 
-    elif file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)":
+    elif file_origin in ["GNPS(2) task ID", "Example dataset from publication", "GNPS2 classical molecular networking (CMN)"]:
         st.warning("💡 This tool only supports task ID from GNPS1 and 2 not from Quickstart GNPS1.")
         if file_origin == "Example dataset from publication":
             task_id_default = "b661d12ba88745639664988329c1363e" # 63e8b3da08df41fe95031e4710e0476b
             disabled = True
+            cmn_flag = False
         elif file_origin == "GNPS2 classical molecular networking (CMN)":
             task_id_default = "" # 2a65f90094654235a4c8d337fdca11e1
             disabled = False
+            cmn_flag = True
         else:
             task_id_default = ""
             disabled = False
+            cmn_flag = False
         task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
         _, c2, _ = st.columns(3)
 
         if c2.button("Load files from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
-            st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn=True)
+            st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn= cmn_flag)
 
         if not st.session_state["ft_gnps"].empty and st.session_state["md_gnps"].empty:
             st.warning("Meta data is empty. Please upload one.")

@@ -202,6 +205,8 @@
             show_table(ft, "blank-features-removed")
 
         st.session_state['blank_removal_done'] = True
+    else:
+        st.session_state['blank_removal_done'] = False
 
     if not ft.empty:
         cutoff_LOD = get_cutoff_LOD(ft)

@@ -224,6 +229,8 @@
             st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")
 
         st.session_state['imputation_done'] = True
+    else:
+        st.session_state['imputation_done'] = False
 
     with tabs[2]:
         normalization_method = st.radio("data normalization method", ["None",
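
The new cmn_flag simply mirrors which file origin was selected, so load_from_gnps only switches to the classical-molecular-networking download path for CMN task IDs. A minimal sketch of that routing (resolve_gnps_inputs is a hypothetical helper, not part of the commit; the page sets the three values inline as shown in the hunk above):

def resolve_gnps_inputs(file_origin: str):
    """Return (task_id_default, disabled, cmn_flag) for a given file origin."""
    # Hypothetical helper mirroring the branch added in the hunk above.
    if file_origin == "Example dataset from publication":
        return "b661d12ba88745639664988329c1363e", True, False
    if file_origin == "GNPS2 classical molecular networking (CMN)":
        return "", False, True
    return "", False, False

task_id_default, disabled, cmn_flag = resolve_gnps_inputs("GNPS(2) task ID")
assert (task_id_default, disabled, cmn_flag) == ("", False, False)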

pages/5_Random_Forest.py (+13 −10)

@@ -28,16 +28,19 @@
     random_seed = 123 if use_random_seed else None
 
     if c2.button("Run supervised learning", type="primary"):
-        df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
-        st.session_state['df_oob'] = df_oob
-        st.session_state['df_important_features'] = df_important_features
-        st.session_state['log'] = log
-        st.session_state['class_report'] = class_report
-        st.session_state['label_mapping'] = label_mapping
-        st.session_state['test_confusion_df'] = test_confusion_df
-        st.session_state['train_confusion_df'] = train_confusion_df
-        st.session_state['test_accuracy'] = test_accuracy
-        st.session_state['train_accuracy'] = train_accuracy
+        try:
+            df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
+            st.session_state['df_oob'] = df_oob
+            st.session_state['df_important_features'] = df_important_features
+            st.session_state['log'] = log
+            st.session_state['class_report'] = class_report
+            st.session_state['label_mapping'] = label_mapping
+            st.session_state['test_confusion_df'] = test_confusion_df
+            st.session_state['train_confusion_df'] = train_confusion_df
+            st.session_state['test_accuracy'] = test_accuracy
+            st.session_state['train_accuracy'] = train_accuracy
+        except Exception as e:
+            st.error(f"Failed to run model due to: {str(e)}")
 
     if 'df_important_features' in st.session_state and not st.session_state.df_important_features.empty:
         tabs = st.tabs(["📈 Analyze optimum number of trees",
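
The page now wraps the training call in try/except so a failure (for example, a class with too few samples for the stratified split) surfaces as a Streamlit error instead of a raw traceback. A small sketch of the same guard pattern under assumed names (run_training stands in for run_random_forest):

import streamlit as st

def run_training_safely(run_training, *args):
    # Call the (possibly long-running) training function and report failures
    # with st.error rather than letting the exception crash the page.
    try:
        return run_training(*args)
    except Exception as e:
        st.error(f"Failed to run model due to: {str(e)}")
        return None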

src/anova.py (+2)

@@ -80,6 +80,8 @@ def get_anova_plot(anova):
         yaxis_title="-log(p)",
         showlegend=False
     )
+    fig.update_yaxes(title_standoff=10)
+
     # fig.update_yaxes(title_font_size=20)
     # fig.update_xaxes(title_font_size=20)

src/fileselection.py (+25 −11)

@@ -55,34 +55,48 @@ def load_example():
 
 @st.cache_data
 def load_from_gnps(task_id, cmn=False):
+
     try: # GNPS2 will run here
         ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
         md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
         an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
-    except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error encountered: {e}") # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
         if cmn:
             ft_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/clustering/featuretable_reformatted_precursorintensity.csv"
             md_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/metadata/merged_metadata.tsv"
+
+            ft = pd.read_csv(ft_url)
+            try:
+                md = pd.read_csv(md_url, sep = "\t", index_col="filename")
+            except pd.errors.EmptyDataError:
+                md = pd.DataFrame()
+
         else:
             ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
             md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
-        ft = pd.read_csv(ft_url)
-        try:
-            md = pd.read_csv(md_url, sep = "\t", index_col="filename")
-        except pd.errors.EmptyDataError:
-            md = pd.DataFrame()
-        if not cmn:
             an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
+
+            ft = pd.read_csv(ft_url)
+            md = pd.read_csv(md_url, sep="\t", index_col="filename")
             an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+
+    if md.empty: # Handle empty metadata
+        md = pd.DataFrame()
+
     if cmn:
         ft.index = ft["row ID"].astype(str)
         ft = ft.drop(columns=["row m/z", "row retention time", "row ID"])
+
     else:
-        index_with_mz_RT = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+        index_with_mz_RT = ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1)
         ft.index = index_with_mz_RT
-        st.session_state["df_gnps_annotations"].index = index_with_mz_RT
-        st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
-        st.session_state["df_gnps_annotations"].dropna(inplace=True)
+        if 'df_gnps_annotations' in st.session_state:
+            st.session_state["df_gnps_annotations"].index = index_with_mz_RT
+            st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
+            st.session_state["df_gnps_annotations"].dropna(inplace=True)
+
+    ft.index.name = 'metabolite'
     return ft, md
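load_from_gnps now downloads the feature table and metadata inside each branch (GNPS2 CMN vs. the proteomics2 FBMN results), tolerates an empty metadata file, and only touches df_gnps_annotations when that key already exists in session state. A standalone sketch of the metadata fallback, assuming a hypothetical read_optional_metadata helper:

import pandas as pd

def read_optional_metadata(path_or_url):
    # An empty metadata file raises EmptyDataError; return an empty DataFrame
    # instead so the page can ask the user to upload metadata separately.
    try:
        return pd.read_csv(path_or_url, sep="\t", index_col="filename")
    except pd.errors.EmptyDataError:
        return pd.DataFrame()
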
src/pca.py (+8 −1)

@@ -1,18 +1,25 @@
 import streamlit as st
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 import pandas as pd
 import plotly.express as px
 import numpy as np
 
+
 @st.cache_data
 def get_pca_df(scaled, n=5):
+<<<<<<< HEAD
+    # calculating Principal components
+    pca = PCA(n_components=n)
+    pca_df = pd.DataFrame(
+        data=pca.fit_transform(scaled), columns=[f"PC{x}" for x in range(1, n + 1)]
+=======
 
     # calculating Principal components
     pca = PCA(n_components=n)
     pca_df = pd.DataFrame(
         data=pca.fit_transform(scaled),
         columns=[f"PC{x}" for x in range(1, n + 1)]
+>>>>>>> efdd76467755ddb96598832b0740cb7149a9cefb
     )
     pca_df.index = scaled.index
     return pca.explained_variance_ratio_, pca_df
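
As committed, this hunk leaves the Git conflict markers (<<<<<<<, =======, >>>>>>>) in src/pca.py, which Python will reject at import time; the two sides differ only in line wrapping. A resolved get_pca_df would read roughly as follows (sketch, not part of the commit):

import pandas as pd
import streamlit as st
from sklearn.decomposition import PCA

@st.cache_data
def get_pca_df(scaled, n=5):
    # calculating Principal components
    pca = PCA(n_components=n)
    pca_df = pd.DataFrame(
        data=pca.fit_transform(scaled),
        columns=[f"PC{x}" for x in range(1, n + 1)],
    )
    pca_df.index = scaled.index
    return pca.explained_variance_ratio_, pca_df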

src/randomforest.py (+13 −10)

@@ -3,12 +3,11 @@
 import numpy as np
 import plotly.express as px
 from sklearn.preprocessing import OrdinalEncoder
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils import class_weight
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix, accuracy_score
-from io import StringIO
 
 @st.cache_data
 def run_random_forest(attribute, n_trees, random_seed=None):

@@ -36,13 +35,16 @@ def run_random_forest(attribute, n_trees, random_seed=None):
     # Extract the feature intensities as np 2D array
     features = np.array(st.session_state.data)
 
+    # Determine the smallest class size and adjust test_size accordingly
+    unique, counts = np.unique(labels, return_counts=True)
+    min_class_count = min(counts)
+    min_test_size = float(len(unique)) / len(labels)
 
-    # Split the data into training and test sets
-    train_features, test_features, train_labels, test_labels = train_test_split(features,
-                                                                                labels,
-                                                                                test_size=0.25,
-                                                                                random_state= random_seed,
-                                                                                stratify=labels)
+    # Adjust test size to be larger of the calculated min_test_size or the initial_test_size
+    adjusted_test_size = max(min_test_size, 0.25)
+
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        features, labels, test_size= adjusted_test_size, random_state=random_seed, stratify=labels)
 
     # Collecting info about feature and label shapes for logging
     log += f"Training Features Shape: {train_features.shape}\n"

@@ -61,7 +63,7 @@
         weights[w] = sklearn_weights[i]
 
     # Set up the random forest classifier with 100 tress, balanded weights, and a random state to make it reproducible
-    rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
+    rf = RandomForestClassifier(n_estimators=n_trees, class_weight= weights, random_state=random_seed)
 
     # Fit the classifier to the training set
     rf.fit(train_features, train_labels)

@@ -106,9 +108,10 @@
     df_important_features = pd.DataFrame(rf.feature_importances_,
                                          index=st.session_state.data.columns).sort_values(by=0, ascending=False)
     df_important_features.columns = ["importance"]
-
+
     return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy
 
+
 def get_oob_fig(df):
     return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")