-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adds playground/gradient-boosted-trees/ and prune eVE with panda #61
- Loading branch information
1 parent
70ab899
commit fd93059
Showing
6 changed files
with
210 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Gradient boosted trees | ||
|
||
## Installing Dependencies | ||
Install [eVE](../../../pyVEs/eVE.yml) virtual environment | ||
``` | ||
mamba activate eVE | ||
``` | ||
|
||
## Data | ||
``` | ||
python datasets_diabetes.py | ||
``` | ||
|
||
## Voting regression predictions | ||
``` | ||
for script in *.py; do python "$script"; done | ||
``` | ||
|
||
## References | ||
https://github.com/benedekrozemberczki/awesome-gradient-boosting-papers | ||
https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_regressor.html#sphx-glr-auto-examples-ensemble-plot-voting-regressor-py | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#https://colab.research.google.com/gist/DeepakNair93/573cc1d52f497c685b7a96ce37838dd5/untitled0.ipynb#scrollTo=kdHS_7x78c0M | ||
|
||
import matplotlib.pyplot as plt | ||
import pandas as pd | ||
import seaborn as sns | ||
from sklearn.datasets import load_diabetes | ||
# Exploratory look at the scikit-learn diabetes dataset.
# Dataset summary (as noted by the original author):
#   samples: 442, dimensionality: 10
#   features: real, -.2 < x < .2
#   targets: integer, 25 - 346
diabetes = load_diabetes()

# Raw array overview. For more detail, `print(diabetes.DESCR)` shows the full
# dataset description and `print(diabetes.target)` dumps every target value.
print(diabetes.feature_names)  # names of the feature columns
print(diabetes.data.shape)  # shape of the feature matrix
print(diabetes.target.shape)  # shape of the target vector
print(diabetes.target[:3])  # first three target values

# Combine features and target in a single DataFrame so they can be summarised
# and correlated together. `db_df.isna().sum()` can be used to check for
# missing values.
db_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
db_df["Progression"] = diabetes.target  # target stored as column 'Progression'
print(db_df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
db_df.info()

# Annotated heatmap of the pairwise correlations between all columns.
corr = db_df.corr()
plt.subplots(figsize=(8, 8))
sns.heatmap(corr, cmap="RdYlGn", annot=True)
plt.show()

# The heatmap shows the linear correlation of the features with each other and
# of each feature with the target 'Progression'.
# This is a natural point at which to drop features that are multicollinear.
# https://medium.com/@hammad.ai/3-ways-to-detect-multicollinearity-in-your-dataset-6ee1776b7aa8
|
||
|
99 changes: 99 additions & 0 deletions
99
playground/gradient-boosted-trees/gradient-boosting-regression.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html | ||
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com> | ||
# Maria Telenczuk <https://github.com/maikia> | ||
# Katrina Ni <https://github.com/nilichen> | ||
# | ||
# License: BSD 3 clause | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
|
||
from sklearn import datasets, ensemble | ||
from sklearn.inspection import permutation_importance | ||
from sklearn.metrics import mean_squared_error | ||
from sklearn.model_selection import train_test_split | ||
|
||
# Gradient boosting regression on the scikit-learn diabetes dataset:
# fit a model, report the test MSE, then plot deviance and feature importances.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Hold out 10% of the samples for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

# Fit regression model
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

mse = mean_squared_error(y_test, reg.predict(X_test))
print(f"The mean squared error (MSE) on test set: {mse:.4f}")

# Plot training and test deviance per boosting iteration.
# staged_predict yields the test-set prediction after each boosting stage, so
# test_score[i] is the test MSE after i + 1 stages.
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

iterations = np.arange(params["n_estimators"]) + 1  # 1-based stage numbers
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(iterations, reg.train_score_, "b-", label="Training Set Deviance")
plt.plot(iterations, test_score, "r-", label="Test Set Deviance")
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()

# Plot feature importance
feature_names = np.array(diabetes.feature_names)

# Left panel: the model's own feature_importances_, sorted ascending so the
# most important feature ends up at the top of the horizontal bar chart.
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, feature_names[sorted_idx])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance measured on the held-out test set.
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(result.importances[sorted_idx].T, vert=False)
# The `labels` keyword of boxplot was deprecated in Matplotlib 3.9 (renamed to
# `tick_labels`). Labelling the default box positions 1..N through yticks gives
# the same axis labels on every Matplotlib version.
plt.yticks(np.arange(1, sorted_idx.shape[0] + 1), feature_names[sorted_idx])
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import matplotlib.pyplot as plt | ||
|
||
from sklearn.datasets import load_diabetes | ||
from sklearn.ensemble import ( | ||
GradientBoostingRegressor, | ||
RandomForestRegressor, | ||
VotingRegressor, | ||
) | ||
from sklearn.linear_model import LinearRegression | ||
|
||
|
||
# Compare three individual regressors with a VotingRegressor built from them,
# all fitted on the scikit-learn diabetes dataset.
X, y = load_diabetes(return_X_y=True)

# Train the individual regressors (fixed seeds where the estimator takes one).
gb_reg = GradientBoostingRegressor(random_state=1)
rf_reg = RandomForestRegressor(random_state=1)
lin_reg = LinearRegression()
for estimator in (gb_reg, rf_reg, lin_reg):
    estimator.fit(X, y)

# Ensemble built from the three regressors above.
ereg = VotingRegressor([("gb", gb_reg), ("rf", rf_reg), ("lr", lin_reg)])
ereg.fit(X, y)

# Make predictions for the first 100 samples of the training data.
head = X[:100]
series = [
    (gb_reg.predict(head), "gd", {"label": "GradientBoostingRegressor"}),
    (rf_reg.predict(head), "b^", {"label": "RandomForestRegressor"}),
    (lin_reg.predict(head), "ys", {"label": "LinearRegression"}),
    (ereg.predict(head), "r*", {"ms": 10, "label": "VotingRegressor"}),
]

# Plot the results: one marker style per model.
plt.figure()
for predictions, marker, options in series:
    plt.plot(predictions, marker, **options)

# The x axis is only a sample index, so its ticks and tick labels are hidden.
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.ylabel("predicted")
plt.xlabel("training samples")
plt.legend(loc="best")
plt.title("Regressor predictions and their average")

plt.show()
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters