@@ -7,12 +7,21 @@
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils import class_weight
 from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix, accuracy_score
+from io import StringIO
 
 @st.cache_data
-def run_random_forest(attribute, n_trees):
+def run_random_forest(attribute, n_trees, random_seed=None):
     # initialize a log to print out in the app later
     log = ""
 
+    df_oob = pd.DataFrame()  # Placeholder
+    df_important_features = pd.DataFrame()  # Placeholder
+
+    # Placeholder for classification report and label mapping
+    class_report = "Classification report here"
+    label_mapping = "Label mapping here"
+
     labels = st.session_state.md[[attribute]]
     rf_data = pd.concat([st.session_state.data, labels], axis=1)
 
@@ -22,63 +31,125 @@ def run_random_forest(attribute, n_trees):
     labels = enc.fit_transform(labels)
     labels = np.array([x[0] + 1 for x in labels])
 
+    class_names = enc.categories_[0]  # getting the class names
+
     # Extract the feature intensities as np 2D array
     features = np.array(st.session_state.data)
 
 
     # Split the data into training and test sets
-    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=123)
-
-    print(f'Training Features Shape: {train_features.shape}')
-    print(f'Training Labels Shape: {train_labels.shape}')
-    print(f'Testing Features Shape: {test_features.shape}')
-    print(f'Testing Labels Shape: {test_labels.shape}')
+    train_features, test_features, train_labels, test_labels = train_test_split(features,
+                                                                                labels,
+                                                                                test_size=0.25,
+                                                                                random_state=random_seed,
+                                                                                stratify=labels)
+
+    # Collecting info about feature and label shapes for logging
+    log += f"Training Features Shape: {train_features.shape}\n"
+    log += f"Training Labels Shape: {train_labels.shape}\n"
+    log += f"Testing Features Shape: {test_features.shape}\n"
+    log += f"Testing Labels Shape: {test_labels.shape}\n"
 
     # Balance the weights of the attribute of interest to account for unbalanced sample sizes per group
     sklearn_weights = class_weight.compute_class_weight(
         class_weight='balanced',
         classes=np.unique(train_labels),
         y=train_labels)
+
     weights = {}
     for i, w in enumerate(np.unique(train_labels)):
         weights[w] = sklearn_weights[i]
 
     # Set up the random forest classifier with n_trees trees, balanced weights, and a random state to make it reproducible
-    rf = RandomForestClassifier(n_estimators=n_trees, class_weight=weights, random_state=123)
+    rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
+
     # Fit the classifier to the training set
     rf.fit(train_features, train_labels)
 
     # Use the random forest classifier to predict the sample areas in the test set
-    predictions = rf.predict(test_features)
-    print(f'Classifier mean accuracy score: {round(rf.score(test_features, test_labels)*100, 2)}%.')
+    predictions_test = rf.predict(test_features)
+    predictions_train = rf.predict(train_features)
+
+    classifier_accuracy = round(rf.score(test_features, test_labels)*100, 2)
+    log += f"Classifier mean accuracy score: {classifier_accuracy}%.\n"
 
+    # Calculate confusion matrices
+    # Note: the encoded labels are 1-based (x[0] + 1 above), so pass matching label values
+    test_confusion_matrix = confusion_matrix(test_labels, predictions_test, labels=range(1, len(class_names) + 1))
+    train_confusion_matrix = confusion_matrix(train_labels, predictions_train, labels=range(1, len(class_names) + 1))
+
+    test_confusion_df = pd.DataFrame(test_confusion_matrix, index=class_names, columns=class_names)
+    train_confusion_df = pd.DataFrame(train_confusion_matrix, index=class_names, columns=class_names)
+
+    test_accuracy = accuracy_score(test_labels, predictions_test)
+    train_accuracy = accuracy_score(train_labels, predictions_train)
+
     # Report of the accuracy of predictions on the test set
-    print(classification_report(test_labels, predictions))
+    class_report = classification_report(test_labels, predictions_test)
 
     # Print the sample areas corresponding to the numbers in the report
-    print("Sample areas corresponding to the numbers:")
-    for i, cat in enumerate(enc.categories_[0]):
-        print(f"{i + 1.0},{cat}")
+    label_mapping = "\n".join([f"{i + 1.0},{cat}" for i, cat in enumerate(enc.categories_[0])])
 
     # Most important model quality plot
     # The OOB error line should flatline. If it doesn't flatline, add more trees
-    rf = RandomForestClassifier(class_weight=weights, warm_start=True, oob_score=True, random_state=123)
+    rf_oob = RandomForestClassifier(class_weight=weights, warm_start=True, oob_score=True, random_state=123)
     errors = []
     tree_range = np.arange(1, 500, 10)
     for i in tree_range:
-        rf.set_params(n_estimators=i)
-        rf.fit(train_features, train_labels)
-        errors.append(1 - rf.oob_score_)
+        rf_oob.set_params(n_estimators=i)
+        rf_oob.fit(train_features, train_labels)
+        errors.append(1 - rf_oob.oob_score_)
 
 
     df_oob = pd.DataFrame({"n trees": tree_range, "error rate": errors})
 
     # Extract the important features in the model
-    df_important_features = pd.DataFrame(rf.feature_importances_, index=st.session_state.data.columns).sort_values(by=0, ascending=False)
+    df_important_features = pd.DataFrame(rf.feature_importances_,
+                                         index=st.session_state.data.columns).sort_values(by=0, ascending=False)
     df_important_features.columns = ["importance"]
 
-
-    return df_oob, df_important_features
+    return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy
 
 
 def get_oob_fig(df):
-    return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
+    return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
+
+def classification_report_to_df(report):
+
+    # Split the report into lines
+    lines = report.split("\n")
+
+    # Prepare a dictionary to hold the data
+    report_data = {"class": [], "precision": [], "recall": [], "f1-score": [], "support": []}
+
+    for line in lines[2:-3]:  # Skip the header and summary lines
+        parts = line.split()
+        # Ensure that the line contains the expected number of parts
+        if len(parts) == 5:
+            report_data["class"].append(parts[0])
+            report_data["precision"].append(parts[1])
+            report_data["recall"].append(parts[2])
+            report_data["f1-score"].append(parts[3])
+            report_data["support"].append(parts[4])
+
+    # Convert the dictionary to a DataFrame
+    report_df = pd.DataFrame(report_data)
+
+    # Convert numeric columns from strings to floats
+    report_df[["precision", "recall", "f1-score"]] = report_df[["precision", "recall", "f1-score"]].astype(float)
+    report_df["support"] = report_df["support"].astype(int)
+
+    return report_df
+
+def label_mapping_to_df(label_mapping_str):
+
+    # Split the string into lines
+    lines = label_mapping_str.split("\n")
+
+    # Split each line into index and label, then collect into a list of tuples
+    mapping = [line.split(",") for line in lines if line]  # Ensure the line is not empty
+
+    # Convert the list of tuples into a DataFrame
+    mapping_df = pd.DataFrame(mapping, columns=['class', 'Label'])
+    mapping_df['class'] = mapping_df['class'].astype(str)
+    return mapping_df
+
+
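For reference, a minimal sketch of how an app page might consume the expanded return tuple of run_random_forest. This caller is not part of the commit: the attribute name "sample_area" and the numeric arguments are placeholder assumptions, while st.text, st.plotly_chart, st.dataframe, and st.write are standard Streamlit calls.

# Hypothetical caller -- not part of this commit. Assumes the app has already
# populated st.session_state.data and st.session_state.md.
(df_oob, df_important_features, log, class_report, label_mapping,
 test_confusion_df, train_confusion_df, test_accuracy, train_accuracy) = run_random_forest(
    attribute="sample_area",  # assumed metadata column name
    n_trees=100,              # example value
    random_seed=123)          # example value for reproducibility

st.text(log)                                  # shapes and accuracy log
st.plotly_chart(get_oob_fig(df_oob))          # OOB error vs. number of trees
st.dataframe(df_important_features.head(20))  # top feature importances
st.dataframe(classification_report_to_df(class_report))
st.dataframe(label_mapping_to_df(label_mapping))
st.dataframe(test_confusion_df)
st.write(f"Test accuracy: {test_accuracy:.2f}, train accuracy: {train_accuracy:.2f}")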