adds playground/gradient-boosted-trees/ and prune eVE with panda #61

mxochicale · Mar 4, 2024 · fd93059 · fd93059
1 parent 70ab899
commit fd93059
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 4 deletions.
diff --git a/playground/gradient-boosted-trees/README.md b/playground/gradient-boosted-trees/README.md
@@ -0,0 +1,22 @@
+# Gradient boosted trees
+
+## Installing Dependencies
+Install the [eVE](../../pyVEs/eVE.yml) virtual environment and activate it:
+```
+mamba activate eVE
+```
+
+## Data
+```
+python datasets_diabetes.py
+```
+
+## Voting regression predictions
+```
+python voting-regressor.py
+```
+
+## References
+https://github.com/benedekrozemberczki/awesome-gradient-boosting-papers 
+https://scikit-learn.org/stable/auto_examples/ensemble/plot_voting_regressor.html#sphx-glr-auto-examples-ensemble-plot-voting-regressor-py  
+
diff --git a/playground/gradient-boosted-trees/datasets_diabetes.py b/playground/gradient-boosted-trees/datasets_diabetes.py
@@ -0,0 +1,38 @@
+#https://colab.research.google.com/gist/DeepakNair93/573cc1d52f497c685b7a96ce37838dd5/untitled0.ipynb#scrollTo=kdHS_7x78c0M
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from sklearn.datasets import load_diabetes
+diabetes = load_diabetes()
+#X, y = load_diabetes(return_X_y=True)
+#Samples total 442
+#Dimensionality 10
+#Features real, -.2 < x < .2
+#Targets integer 25 - 346
+
+#print(diabetes.DESCR)
+print(diabetes.feature_names)   #checking the feature names
+print(diabetes.data.shape)  #checking the shape of data
+print(diabetes.target.shape)
+#print(diabetes.target)
+print(diabetes.target[:3])
+
+
+db_df = pd.DataFrame(diabetes.data,columns=diabetes.feature_names)
+db_df['Progression'] = diabetes.target #new column name 'Progression'
+#print(db_df.isna().sum())
+print(db_df.describe())
+print(db_df.info())
+
+corr = db_df.corr()
+plt.subplots(figsize=(8,8))
+sns.heatmap(corr,cmap= 'RdYlGn',annot=True)
+plt.show()
+
+
+#This plot shows the linear correlation between the variables within themselves & also variables with the target 'Progression'. 
+#This could be a phase where the variables which are multicollinear can be eliminated. 
+# https://medium.com/@hammad.ai/3-ways-to-detect-multicollinearity-in-your-dataset-6ee1776b7aa8
+
+
diff --git a/playground/gradient-boosted-trees/gradient-boosting-regression.py b/playground/gradient-boosted-trees/gradient-boosting-regression.py
@@ -0,0 +1,99 @@
+#https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
+# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#         Maria Telenczuk <https://github.com/maikia>
+#         Katrina Ni <https://github.com/nilichen>
+#
+# License: BSD 3 clause
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn import datasets, ensemble
+from sklearn.inspection import permutation_importance
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+diabetes = datasets.load_diabetes()
+X, y = diabetes.data, diabetes.target
+# Hold out 10% of the samples as a test set; random_state fixes the split.
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.1, random_state=13
+)
+# Hyperparameters passed to GradientBoostingRegressor below.
+params = {
+    "n_estimators": 500,
+    "max_depth": 4,
+    "min_samples_split": 5,
+    "learning_rate": 0.01,
+    "loss": "squared_error",
+}
+
+# Fit the regression model on the training split.
+reg = ensemble.GradientBoostingRegressor(**params)
+reg.fit(X_train, y_train)
+# Report the mean squared error of the fitted model on the held-out test split.
+mse = mean_squared_error(y_test, reg.predict(X_test))
+print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
+
+# Plot training and test deviance: the test MSE is recomputed for each staged prediction.
+test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
+for i, y_pred in enumerate(reg.staged_predict(X_test)):
+    test_score[i] = mean_squared_error(y_test, y_pred)
+
+fig = plt.figure(figsize=(6, 6))
+plt.subplot(1, 1, 1)
+plt.title("Deviance")
+plt.plot(
+    np.arange(params["n_estimators"]) + 1,
+    reg.train_score_,
+    "b-",
+    label="Training Set Deviance",
+)
+plt.plot(
+    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
+)
+plt.legend(loc="upper right")
+plt.xlabel("Boosting Iterations")
+plt.ylabel("Deviance")
+fig.tight_layout()
+plt.show()
+
+# Plot feature importance: MDI in the first subplot, permutation importance in the second.
+feature_importance = reg.feature_importances_
+sorted_idx = np.argsort(feature_importance)
+pos = np.arange(sorted_idx.shape[0]) + 0.5
+fig = plt.figure(figsize=(12, 6))
+plt.subplot(1, 2, 1)
+plt.barh(pos, feature_importance[sorted_idx], align="center")
+plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx])
+plt.title("Feature Importance (MDI)")
+# Permutation importance is computed on the test split with 10 repeats.
+result = permutation_importance(
+    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
+)
+sorted_idx = result.importances_mean.argsort()
+plt.subplot(1, 2, 2)
+plt.boxplot(
+    result.importances[sorted_idx].T,
+    vert=False,
+    labels=np.array(diabetes.feature_names)[sorted_idx],  # NOTE(review): Matplotlib 3.9 renames this to tick_labels - confirm against the pinned version
+)
+plt.title("Permutation Importance (test set)")
+fig.tight_layout()
+plt.show()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/playground/gradient-boosted-trees/voting-regressor.py b/playground/gradient-boosted-trees/voting-regressor.py
@@ -0,0 +1,50 @@
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_diabetes
+from sklearn.ensemble import (
+    GradientBoostingRegressor,
+    RandomForestRegressor,
+    VotingRegressor,
+)
+from sklearn.linear_model import LinearRegression
+
+
+X, y = load_diabetes(return_X_y=True)
+
+# Train the three base regressors individually on the full dataset.
+reg1 = GradientBoostingRegressor(random_state=1)
+reg2 = RandomForestRegressor(random_state=1)
+reg3 = LinearRegression()
+
+reg1.fit(X, y)
+reg2.fit(X, y)
+reg3.fit(X, y)
+# Combine the three regressors in a VotingRegressor and fit it on the same data.
+ereg = VotingRegressor([("gb", reg1), ("rf", reg2), ("lr", reg3)])
+ereg.fit(X, y)
+
+# Predict on the first 100 samples (these samples were also part of the training data).
+xt = X[:100]
+
+pred1 = reg1.predict(xt)
+pred2 = reg2.predict(xt)
+pred3 = reg3.predict(xt)
+pred4 = ereg.predict(xt)
+
+
+# Plot each model's predictions against the sample index.
+plt.figure()
+plt.plot(pred1, "gd", label="GradientBoostingRegressor")
+plt.plot(pred2, "b^", label="RandomForestRegressor")
+plt.plot(pred3, "ys", label="LinearRegression")
+plt.plot(pred4, "r*", ms=10, label="VotingRegressor")
+# Hide the x-axis ticks and tick labels.
+plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
+plt.ylabel("predicted")
+plt.xlabel("training samples")
+plt.legend(loc="best")
+plt.title("Regressor predictions and their average")
+
+plt.show()
+
+
diff --git a/playground/xgboost/README.md b/playground/xgboost/README.md
@@ -1,8 +1,5 @@
 # xgboost.XGBClassifier
 
-## Installing Dependencies
-Install [eVE](../../../pyVEs/eVE.yml) virtual environment
-
 ## Running notebooks
 ```
 mamba activate eVE

diff --git a/pyVEs/eVE.yml b/pyVEs/eVE.yml
@@ -27,6 +27,7 @@ dependencies:
      - tqdm #https://github.com/tqdm/tqdm/tags
      - pylint #https://github.com/pylint-dev/pylint/tags
      - seaborn #https://github.com/mwaskom/seaborn/tags
+     - pandas
      ### VERSIONS of pyside6: https://pypi.org/project/PySide6/#history
      #- pyside6>=6.4.2
      ### VERSIONS of VTK https://gitlab.kitware.com/vtk/vtk/-/tags
@@ -41,7 +42,6 @@ dependencies:
      #- matplotlib
      #- scikit-learn
      #- notebook
-     #- pandas
      #- seaborn
      #- araviq6
      #- civiq6