main.py
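"""Train and compare SVM classifiers with linear, rbf, and sigmoid kernels on the
Depression dataset: preprocess the data, grid-search C with balanced class weights
and 5-fold cross-validation, evaluate each kernel on a held-out validation split,
plot per-kernel confusion matrices plus a comparison chart, and write one
submission CSV per kernel."""
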
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from preprocessing import preprocess_data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.utils.class_weight import compute_class_weight


def main():
    # Ensure the output directory for the plots saved below exists
    os.makedirs("images", exist_ok=True)

    # Report whether the expected input files are present
    print("- data/train.csv:", os.path.exists("data/train.csv"))
    print("- data/test.csv:", os.path.exists("data/test.csv"))
    print("- preprocessing.py:", os.path.exists("preprocessing.py"))

    # Load the data
    print("Loading data...")
    try:
        train_data = pd.read_csv("data/train.csv")
        print(f"Successfully loaded train data with shape: {train_data.shape}")
    except FileNotFoundError:
        print("Error: Could not find data/train.csv")
        print("Current working directory:", os.getcwd())
        return
    except Exception as e:
        print(f"Error loading train data: {str(e)}")
        return

    try:
        test_data = pd.read_csv("data/test.csv")
        print(f"Successfully loaded test data with shape: {test_data.shape}")
    except FileNotFoundError:
        print("Error: Could not find data/test.csv")
        print("Current working directory:", os.getcwd())
        return
    except Exception as e:
        print(f"Error loading test data: {str(e)}")
        return

    # Preprocess the data
    print("Preprocessing data...")
    idList = test_data['id']
    y = train_data['Depression']
    train_data, test_data = preprocess_data(train_data, test_data)

    # Convert back to DataFrames if preprocessing returned plain NumPy arrays
    if isinstance(train_data, np.ndarray):
        train_data = pd.DataFrame(train_data)
    if isinstance(test_data, np.ndarray):
        test_data = pd.DataFrame(test_data)

    X = train_data

    # Split the dataset into a training set and a validation set
    try:
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    except ValueError as e:
        print(f"Error during train-test split: {str(e)}")
        return

    # Define the kernels to test and their corresponding parameters
    kernels = ['linear', 'rbf', 'sigmoid']
    # kernels = ['linear']
    results = []

    for kernel in kernels:
        print(f"\n{'='*50}")
        print(f"Starting SVM training with kernel = {kernel}")
        print(f"{'='*50}")

        # Compute class weights to compensate for class imbalance
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=np.unique(y_train),
            y=y_train
        )
        class_weights_dict = dict(enumerate(class_weights))
        print("Class Weights:", class_weights_dict)

        # Define the model and the parameter grid
        parameters = {'kernel': [kernel], 'C': [0.001, 0.1, 1], 'class_weight': [class_weights_dict]}
        if kernel in ['rbf', 'sigmoid']:
            parameters['gamma'] = ['scale']  # keep gamma fixed at 'scale'

        clSvm = SVC()
        grid_search = GridSearchCV(clSvm, parameters, cv=KFold(5), scoring='accuracy', verbose=1)
        grid_search.fit(X_train, y_train)

        # Report the best model
        print("\nBest Model Information:")
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")

        # Report the results for every parameter combination
        print("\nAll Parameter Combinations Results:")
        results_df = pd.DataFrame(grid_search.cv_results_)
        for mean_score, params in zip(results_df['mean_test_score'], results_df['params']):
            print(f"Parameters: {params}")
            print(f"Mean Score: {mean_score:.4f}\n")

        # Predict on the validation split and on the test set
        y_pred_for_acc = grid_search.predict(X_val)
        y_pred_for_submit = grid_search.predict(test_data)

        # Evaluation metrics on the validation split
        acc = accuracy_score(y_val, y_pred_for_acc)
        precision = precision_score(y_val, y_pred_for_acc)
        recall = recall_score(y_val, y_pred_for_acc)
        f1 = f1_score(y_val, y_pred_for_acc)

        # Store the results for this kernel
        results.append({
            'Kernel': kernel,
            'Best Params': grid_search.best_params_,
            'accuracy': acc,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1
        })

        # Print the classification report
        print(f"\nClassification Report for kernel = {kernel}:\n")
        print(classification_report(y_val, y_pred_for_acc))

        # Plot the confusion matrix
        disp = ConfusionMatrixDisplay.from_estimator(grid_search, X_val, y_val, cmap='Blues', values_format='d')
        disp.ax_.set_title(f"Confusion Matrix for kernel = {kernel}")
        plt.savefig(f"images/confusion_matrix_{kernel}.png")
        plt.show()

        # Create the submission file for this kernel
        submission = pd.DataFrame({'id': idList, 'Depression': y_pred_for_submit})
        submission.to_csv(f"submission_{kernel}.csv", index=False)
        print(f"Submission file created as 'submission_{kernel}.csv'")

    # Collect the per-kernel results in a DataFrame for easier comparison
    results_df = pd.DataFrame(results)
    print("\nComparison of SVM Kernels:\n")
    print(results_df)

    # Visualize the results for the different kernels
    metrics = ['accuracy', 'Precision', 'Recall', 'F1-Score']
    results_df.set_index('Kernel', inplace=True)
    # DataFrame.plot creates its own figure, so pass figsize here instead of plt.figure
    results_df[metrics].plot(kind='bar', colormap='viridis', figsize=(12, 8))
    plt.title('SVM Kernel Performance Comparison')
    plt.ylabel('Score')
    plt.xticks(rotation=0)
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig("images/svm_comparison.png")
    plt.show()

if __name__ == "__main__":
    main()