Update reqs (#13)

* feat: Update to work with dask==2022.12, scikit-learn==1.2
* test: Tidy tests
* feat: Bump version to 0.6.0
* docs: Update readme and examples
* chore: Update CI python version targets
* test: Correct import
* chore: Update setup.py, tidy
garethjns · Jan 1, 2023 · 8a23e7f · 8a23e7f
1 parent dbc6324
commit 8a23e7f
Show file tree

Hide file tree

Showing 44 changed files with 3,260 additions and 3,627 deletions.
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.8, 3.9]
 
     steps:
     - uses: actions/checkout@v2
@@ -23,7 +23,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install -r requirements.txt
+        pip install .
     - name: Lint with flake8
       run: |
         pip install flake8
@@ -33,5 +33,5 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        pip install pytest
+        pip install -r tests/requirements.txt
         pytest
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@ laptop_env/
 worker*/
 *.dirlock
 *.lock
+notes/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -39,16 +39,14 @@ from sklearn.datasets import make_blobs
 from incremental_trees.models.classification.streaming_rfc import StreamingRFC
 
 # Generate some data in memory
-x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
-                  centers=2, cluster_std=100)
+x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)
 
 srfc = StreamingRFC(n_estimators_per_chunk=3,
                     max_n_estimators=np.inf,
                     spf_n_fits=30,  # Number of calls to .partial_fit()
                     spf_sample_prop=0.3)  # Proportion of rows to sample on each call to .partial_fit()
 
-srfc.fit(x, y, 
-         sample_weight=np.ones_like(y))  # Optional, gets sampled along with the data
+srfc.fit(x, y, sample_weight=np.ones_like(y))  # Optional, gets sampled along with the data
 
 # Should be n_estimators_per_chunk * spf_n_fits
 print(len(srfc.estimators_))
@@ -96,7 +94,7 @@ For example, this can be used to feed .partial_fit() sequentially (although belo
 ````python
 import numpy as np
 from sklearn.datasets import make_blobs
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC
 
 srfc = StreamingRFC(n_estimators_per_chunk=20,
                     max_n_estimators=np.inf,
@@ -110,11 +108,11 @@ x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
 n_chunks = 30
 chunk_size = int(2e3)
 for i in range(n_chunks):
-    sample_idx = np.random.randint(0, x.shape[0], chunk_size)
-    # Call .partial_fit(), specifying expected classes, also supports other .fit args such as sample_weight
-    srfc.partial_fit(x[sample_idx, :], y[sample_idx],
-                     classes=np.unique(y))
-           
+   sample_idx = np.random.randint(0, x.shape[0], chunk_size)
+   # Call .partial_fit(), specifying expected classes, also supports other .fit args such as sample_weight
+   srfc.partial_fit(x[sample_idx, :], y[sample_idx],
+                    classes=np.unique(y))
+
 # Should be n_chunks * n_estimators_per_chunk             
 print(len(srfc.estimators_))
 print(srfc.score(x, y))
@@ -126,17 +124,17 @@ There are a couple of different model setups worth considering. No idea which wo
 #### "Incremental forest"
 For the number of chunks/fits, sample rows from X, then fit a number of single trees (with different column subsets), e.g.:
 ````python
-srfc = StreamingRFC(n_estimators_per_chunk=10,
-                    max_features='sqrt')    
+srfc = StreamingRFC(n_estimators_per_chunk=10, max_features='sqrt')    
 ````
 #### "Incremental decision trees"
 Single (or few) decision trees per data subset, with all features. 
 ````python
-srfc = StreamingRFC(n_estimators_per_chunk=1,
-                    max_features=x.shape[1])
+srfc = StreamingRFC(n_estimators_per_chunk=1, max_features=x.shape[1])
 ````
 
 # Version history
+## v0.6.0
+ - Update to work with scikit-learn==1.2, dask==2022.12, dask-glm==0.2.0, dask-ml==2022.5.27. Supports Python 3.8 and 3.9.
 ## v0.5.1
  - Add support for passing fit args/kwargs via `.fit` (specifically, `sample_weight`)
 ## v0.5.0

diff --git a/example_dask.py b/example_dask.py
diff --git a/example_fit.py b/example_fit.py
diff --git a/incremental_trees/__init__.py b/incremental_trees/__init__.py
@@ -1 +1 @@
-__version__ = '0.5.1'
+__version__ = '0.6.0'
diff --git a/incremental_trees/models/classification/streaming_extc.py b/incremental_trees/models/classification/streaming_extc.py
@@ -1,3 +1,5 @@
+from typing import Optional, Dict, Union
+
 import numpy as np
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.tree import ExtraTreeClassifier
@@ -10,52 +12,54 @@ class StreamingEXTC(ClassifierAdditions, ClassifierOverloads, ExtraTreesClassifi
     """Overload sklearn.ensemble.ExtraTreesClassifier to add partial fit method and new params."""
 
     def __init__(self,
+                 criterion: str = "gini",
+                 max_depth: Optional[int] = None,
+                 min_samples_split: int = 2,
+                 min_samples_leaf: int = 1,
+                 min_weight_fraction_leaf: float = 0.0,
+                 max_features: float = 1.0,
+                 max_leaf_nodes: Optional[int] = None,
+                 min_impurity_decrease: float = 0.0,
+                 bootstrap: bool = False,
+                 oob_score: bool = False,
+                 n_jobs: Optional[int] = None,
+                 random_state: Optional[int] = None,
+                 verbose: int = 0,
+                 warm_start: bool = True,
+                 class_weight: Optional[Union[str, Dict]] = None,
+                 ccp_alpha: float = 0.0,
+                 max_samples: Optional[float] = None,
                  n_estimators_per_chunk: int = 1,
-                 n_estimators: bool = None,
-                 max_n_estimators=np.inf,
-                 criterion="gini",
-                 max_depth=None,
-                 min_samples_split=2,
-                 min_samples_leaf=1,
-                 min_weight_fraction_leaf=0.,
-                 max_features="auto",
-                 max_leaf_nodes=None,
-                 min_impurity_decrease=0.,
-                 min_impurity_split=None,
-                 bootstrap=False,
-                 oob_score=False,
-                 n_jobs=None,
-                 random_state=None,
-                 verbose=0,
-                 warm_start=True,
-                 class_weight=None,
+                 max_n_estimators: float = np.inf,
                  dask_feeding: bool = True,
-                 spf_n_fits=100,
-                 spf_sample_prop: float = 0.1):
+                 spf_n_fits: int = 100,
+                 spf_sample_prop: float = 0.1
+                 ):
         super(ExtraTreesClassifier, self).__init__(
-            base_estimator=ExtraTreeClassifier(),
+            estimator=ExtraTreeClassifier(),
             n_estimators=n_estimators_per_chunk,
             estimator_params=("criterion", "max_depth", "min_samples_split",
                               "min_samples_leaf", "min_weight_fraction_leaf",
                               "max_features", "max_leaf_nodes",
-                              "min_impurity_decrease", "min_impurity_split",
-                              "random_state"),
+                              "min_impurity_decrease",
+                              "random_state", "ccp_alpha"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
             warm_start=warm_start,
-            class_weight=class_weight)
+            class_weight=class_weight,
+            max_samples=max_samples
+        )
 
         self.max_n_estimators: int = None
         self._fit_estimators: int = 0
         self.classes_: np.array = None  # NB: Needs to be array, not list.
         self.n_classes_: int = None
-
         self._fit_estimators = 0
         self.max_n_estimators = max_n_estimators
-        self.n_estimators_per_chunk = n_estimators
+        self.n_estimators_per_chunk = n_estimators_per_chunk
         self.criterion = criterion
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
@@ -64,7 +68,8 @@ def __init__(self,
         self.max_features = max_features
         self.max_leaf_nodes = max_leaf_nodes
         self.min_impurity_decrease = min_impurity_decrease
-        self.min_impurity_split = min_impurity_split
+        self.ccp_alpha = ccp_alpha
+        self.max_samples = max_samples
 
         # Set additional params.
         self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,

diff --git a/incremental_trees/models/classification/streaming_rfc.py b/incremental_trees/models/classification/streaming_rfc.py
@@ -1,4 +1,4 @@
-import warnings
+from typing import Optional, Union, Dict, List
 
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
@@ -15,27 +15,28 @@ class StreamingRFC(ClassifierAdditions, ClassifierOverloads, RandomForestClassif
     """
 
     def __init__(self,
-                 bootstrap=True,
-                 class_weight=None,
-                 criterion='gini',
-                 max_depth=None,
-                 max_features='auto',
-                 max_leaf_nodes=None,
-                 min_impurity_decrease=0.0,
-                 min_impurity_split=None,
-                 min_samples_leaf=1,
-                 min_samples_split=2,
-                 min_weight_fraction_leaf=0.0,
-                 n_estimators_per_chunk: int = 1,
-                 n_jobs=None,
-                 oob_score=False,
-                 random_state=None,
-                 verbose=0,
+                 criterion: str = 'gini',
+                 max_depth: Optional[int] = None,
+                 min_samples_split: int = 2,
+                 min_samples_leaf: int = 1,
+                 min_weight_fraction_leaf: float = 0.0,
+                 max_features: Optional[str] = 'sqrt',
+                 max_leaf_nodes: Optional[int] = None,
+                 min_impurity_decrease: float = 0.0,
+                 bootstrap: bool = True,
+                 oob_score: bool = False,
+                 n_jobs: Optional[int] = None,
+                 random_state: Optional[int] = None,
+                 verbose: int = 0,
                  warm_start: bool = True,
+                 class_weight: Optional[Union[str, Dict, List[Dict]]] = None,
+                 ccp_alpha: float = 0.0,
+                 max_samples: Optional[int] = None,
                  dask_feeding: bool = True,
-                 max_n_estimators=10,
-                 spf_n_fits=100,
-                 spf_sample_prop=0.1) -> None:
+                 n_estimators_per_chunk: int = 1,
+                 max_n_estimators: int = 10,
+                 spf_n_fits: int = 100,
+                 spf_sample_prop: float = 0.1) -> None:
         """
         :param bootstrap:
         :param class_weight:
@@ -44,7 +45,6 @@ def __init__(self,
         :param max_features:
         :param max_leaf_nodes:
         :param min_impurity_decrease:
-        :param min_impurity_split:
         :param min_samples_leaf:
         :param min_samples_split:
         :param min_weight_fraction_leaf:
@@ -73,7 +73,6 @@ def __init__(self,
                         max_features=max_features,
                         max_leaf_nodes=max_leaf_nodes,
                         min_impurity_decrease=min_impurity_decrease,
-                        min_impurity_split=min_impurity_split,
                         min_samples_leaf=min_samples_leaf,
                         min_samples_split=min_samples_split,
                         min_weight_fraction_leaf=min_weight_fraction_leaf,
@@ -89,4 +88,7 @@ def __init__(self,
                         max_n_estimators=max_n_estimators,
                         verb=0,
                         spf_n_fits=spf_n_fits,
-                        spf_sample_prop=spf_sample_prop)
+                        spf_sample_prop=spf_sample_prop,
+                        ccp_alpha=ccp_alpha,
+                        max_samples=max_samples
+                        )