Skip to content

Commit

Permalink
adds playground/gradient-boosted-trees/ and prune eVE with panda #61
Browse files Browse the repository at this point in the history
  • Loading branch information
mxochicale committed Mar 4, 2024
1 parent 70ab899 commit fd93059
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 4 deletions.
22 changes: 22 additions & 0 deletions playground/gradient-boosted-trees/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Gradient boosted trees

## Installing Dependencies
Install the [eVE](../../pyVEs/eVE.yml) virtual environment, then activate it:
```
mamba activate eVE
```

## Data
```
python datasets_diabetes.py
```

## Voting regression predictions
```
python voting-regressor.py
```

## References
- https://github.com/benedekrozemberczki/awesome-gradient-boosting-papers
- https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_regressor.html#sphx-glr-auto-examples-ensemble-plot-voting-regressor-py

38 changes: 38 additions & 0 deletions playground/gradient-boosted-trees/datasets_diabetes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#https://colab.research.google.com/gist/DeepakNair93/573cc1d52f497c685b7a96ce37838dd5/untitled0.ipynb#scrollTo=kdHS_7x78c0M

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_diabetes
# Load the diabetes dataset; the returned object exposes .data, .target
# and .feature_names, all of which are used below.
# Dataset facts: 442 samples in total, dimensionality 10,
# real-valued features with -.2 < x < .2, integer targets in 25 - 346.
# (print(diabetes.DESCR) shows the full dataset description.)
diabetes = load_diabetes()

# Quick sanity checks on what was loaded.
print(diabetes.feature_names)  # names of the feature columns
print(diabetes.data.shape)  # shape of the feature matrix
print(diabetes.target.shape)  # shape of the target vector
print(diabetes.target[:3])  # first three target values

# Put features and target into one DataFrame so that the summary statistics
# and the correlation matrix also cover the target column.
db_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
db_df['Progression'] = diabetes.target  # new column name 'Progression'
print(db_df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
db_df.info()

# Annotated heatmap of the pairwise correlations between all columns.
corr = db_df.corr()
plt.subplots(figsize=(8, 8))
sns.heatmap(corr, cmap='RdYlGn', annot=True)
plt.show()


# The heatmap shows the linear correlation among the features themselves and between each feature and the target 'Progression'.
# At this stage, features that are multicollinear (strongly correlated with one another) can be identified and eliminated.
# https://medium.com/@hammad.ai/3-ways-to-detect-multicollinearity-in-your-dataset-6ee1776b7aa8


99 changes: 99 additions & 0 deletions playground/gradient-boosted-trees/gradient-boosting-regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Maria Telenczuk <https://github.com/maikia>
# Katrina Ni <https://github.com/nilichen>
#
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the diabetes dataset and separate the feature matrix from the target.
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

# Hold out 10% of the samples for testing; the fixed seed keeps the split
# the same from run to run.
split = train_test_split(X, y, test_size=0.1, random_state=13)
X_train, X_test, y_train, y_test = split

# Hyper-parameters handed to the gradient boosting regressor.
params = dict(
    n_estimators=500,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss="squared_error",
)

# Build the regressor from the parameters above and fit it on the training part.
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Score the fitted model on the held-out test part.
y_test_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# Plot training deviance
# Test-set MSE per boosting stage: staged_predict yields one prediction array
# per stage (see the scikit-learn docs), and each one is scored against y_test.
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    # Loop body: must be indented under the `for`, otherwise the script
    # fails with an IndentationError.
    test_score[i] = mean_squared_error(y_test, y_pred)

# 1-based boosting iteration numbers, shared by both curves as x values.
iterations = np.arange(params["n_estimators"]) + 1

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
# Training curve: the train_score_ values stored on the fitted model.
plt.plot(iterations, reg.train_score_, "b-", label="Training Set Deviance")
# Test curve: the per-stage MSE computed above.
plt.plot(iterations, test_score, "r-", label="Test Set Deviance")
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()

# Plot feature importance
# Feature names as an array so they can be reordered with an index array.
feature_names = np.array(diabetes.feature_names)

# Left panel: the fitted model's feature_importances_, sorted ascending.
mdi = reg.feature_importances_
mdi_order = np.argsort(mdi)
bar_pos = np.arange(mdi_order.shape[0]) + 0.5

fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(bar_pos, mdi[mdi_order], align="center")
plt.yticks(bar_pos, feature_names[mdi_order])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance measured on the test set, with the
# features ordered by their mean importance over the 10 repeats.
perm = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
perm_order = np.argsort(perm.importances_mean)
plt.subplot(1, 2, 2)
plt.boxplot(
    perm.importances[perm_order].T,
    vert=False,
    labels=feature_names[perm_order],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()
















50 changes: 50 additions & 0 deletions playground/gradient-boosted-trees/voting-regressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import matplotlib.pyplot as plt

from sklearn.datasets import load_diabetes
from sklearn.ensemble import (
GradientBoostingRegressor,
RandomForestRegressor,
VotingRegressor,
)
from sklearn.linear_model import LinearRegression


# Load the diabetes data directly as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

# Create the three base regressors and fit each one on the full data set.
reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=1)
reg3 = LinearRegression()
for base_reg in (reg1, reg2, reg3):
    base_reg.fit(X, y)

# Combine the three regressors in a VotingRegressor and fit the ensemble too.
ereg = VotingRegressor([("gb", reg1), ("rf", reg2), ("lr", reg3)])
ereg.fit(X, y)

# Predict on the first 100 samples with every model; each entry carries the
# predictions, the matplotlib format string and the extra plot keywords.
xt = X[:100]
series = [
    (reg1.predict(xt), "gd", {"label": "GradientBoostingRegressor"}),
    (reg2.predict(xt), "b^", {"label": "RandomForestRegressor"}),
    (reg3.predict(xt), "ys", {"label": "LinearRegression"}),
    (ereg.predict(xt), "r*", {"ms": 10, "label": "VotingRegressor"}),
]

# Plot the results: one marker series per model on a single figure.
plt.figure()
for preds, fmt, plot_kwargs in series:
    plt.plot(preds, fmt, **plot_kwargs)

# The x axis is only a sample index, so its ticks and tick labels are hidden.
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.ylabel("predicted")
plt.xlabel("training samples")
plt.legend(loc="best")
plt.title("Regressor predictions and their average")

plt.show()


3 changes: 0 additions & 3 deletions playground/xgboost/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# xgboost.XGBClassifier

## Installing Dependencies
Install [eVE](../../../pyVEs/eVE.yml) virtual environment

## Running notebooks
```
mamba activate eVE
Expand Down
2 changes: 1 addition & 1 deletion pyVEs/eVE.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies:
- tqdm #https://github.com/tqdm/tqdm/tags
- pylint #https://github.com/pylint-dev/pylint/tags
- seaborn #https://github.com/mwaskom/seaborn/tags
- pandas
### VERSIONS of pyside6: https://pypi.org/project/PySide6/#history
#- pyside6>=6.4.2
### VERSIONS of VTK https://gitlab.kitware.com/vtk/vtk/-/tags
Expand All @@ -41,7 +42,6 @@ dependencies:
#- matplotlib
#- scikit-learn
#- notebook
#- pandas
#- seaborn
#- araviq6
#- civiq6
Expand Down

0 comments on commit fd93059

Please sign in to comment.