Skip to content

Commit

Permalink
Update reqs (#13)
Browse files Browse the repository at this point in the history
* feat: Update to work with dask==2022.12, scikit-learn==1.2

* test: Tidy tests

* feat: Bump version to 0.6.0

* docs: Update readme and examples

* chore: Update CI python version targets

* test: Correct import

* chore: Update setup.py, tidy
  • Loading branch information
garethjns authored Jan 1, 2023
1 parent dbc6324 commit 8a23e7f
Show file tree
Hide file tree
Showing 44 changed files with 3,260 additions and 3,627 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.8, 3.9]

steps:
- uses: actions/checkout@v2
Expand All @@ -23,7 +23,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install .
- name: Lint with flake8
run: |
pip install flake8
Expand All @@ -33,5 +33,5 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pip install pytest
pip install -r tests/requirements.txt
pytest
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ laptop_env/
worker*/
*.dirlock
*.lock
notes/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
26 changes: 12 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,14 @@ from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Generate some data in memory
x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
centers=2, cluster_std=100)
x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40, centers=2, cluster_std=100)

srfc = StreamingRFC(n_estimators_per_chunk=3,
max_n_estimators=np.inf,
spf_n_fits=30, # Number of calls to .partial_fit()
spf_sample_prop=0.3) # Proportion of rows to sample on each call to .partial_fit()

srfc.fit(x, y,
sample_weight=np.ones_like(y)) # Optional, gets sampled along with the data
srfc.fit(x, y, sample_weight=np.ones_like(y)) # Optional, gets sampled along with the data

# Should be n_estimators_per_chunk * spf_n_fits
print(len(srfc.estimators_))
Expand Down Expand Up @@ -96,7 +94,7 @@ For example, this can be used to feed .partial_fit() sequentially (although belo
````python
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.trees import StreamingRFC
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

srfc = StreamingRFC(n_estimators_per_chunk=20,
max_n_estimators=np.inf,
Expand All @@ -110,11 +108,11 @@ x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
n_chunks = 30
chunk_size = int(2e3)
for i in range(n_chunks):
sample_idx = np.random.randint(0, x.shape[0], chunk_size)
# Call .partial_fit(), specifying expected classes, also supports other .fit args such as sample_weight
srfc.partial_fit(x[sample_idx, :], y[sample_idx],
classes=np.unique(y))
sample_idx = np.random.randint(0, x.shape[0], chunk_size)
# Call .partial_fit(), specifying expected classes, also supports other .fit args such as sample_weight
srfc.partial_fit(x[sample_idx, :], y[sample_idx],
classes=np.unique(y))

# Should be n_chunks * n_estimators_per_chunk
print(len(srfc.estimators_))
print(srfc.score(x, y))
Expand All @@ -126,17 +124,17 @@ There are a couple of different model setups worth considering. No idea which wo
#### "Incremental forest"
For the number of chunks/fits, sample rows from X, then fit a number of single trees (with different column subsets), e.g.:
````python
srfc = StreamingRFC(n_estimators_per_chunk=10,
max_features='sqrt')
srfc = StreamingRFC(n_estimators_per_chunk=10, max_features='sqrt')
````
#### "Incremental decision trees"
Single (or few) decision trees per data subset, with all features.
````python
srfc = StreamingRFC(n_estimators_per_chunk=1,
max_features=x.shape[1])
srfc = StreamingRFC(n_estimators_per_chunk=1, max_features=x.shape[1])
````

# Version history
## v0.6.0
- Update to work with scikit-learn==1.2, dask==2022.12, dask-glm==0.2.0, dask-ml==2022.5.27. Supports Python 3.8 and 3.9.
## v0.5.1
- Add support for passing fit args/kwargs via `.fit` (specifically, `sample_weight`)
## v0.5.0
Expand Down
40 changes: 0 additions & 40 deletions example_dask.py

This file was deleted.

21 changes: 0 additions & 21 deletions example_fit.py

This file was deleted.

2 changes: 1 addition & 1 deletion incremental_trees/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.1'
__version__ = '0.6.0'
59 changes: 32 additions & 27 deletions incremental_trees/models/classification/streaming_extc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional, Dict, Union

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import ExtraTreeClassifier
Expand All @@ -10,52 +12,54 @@ class StreamingEXTC(ClassifierAdditions, ClassifierOverloads, ExtraTreesClassifi
"""Overload sklearn.ensemble.ExtraTreesClassifier to add partial fit method and new params."""

def __init__(self,
criterion: str = "gini",
max_depth: Optional[int] = None,
min_samples_split: int = 2,
min_samples_leaf: int = 1,
min_weight_fraction_leaf: float = 0.0,
max_features: float = 1.0,
max_leaf_nodes: Optional[int] = None,
min_impurity_decrease: float = 0.0,
bootstrap: bool = False,
oob_score: bool = False,
n_jobs: Optional[int] = None,
random_state: Optional[int] = None,
verbose: int = 0,
warm_start: bool = True,
class_weight: Optional[Union[str, Dict]] = None,
ccp_alpha: float = 0.0,
max_samples: Optional[float] = None,
n_estimators_per_chunk: int = 1,
n_estimators: bool = None,
max_n_estimators=np.inf,
criterion="gini",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features="auto",
max_leaf_nodes=None,
min_impurity_decrease=0.,
min_impurity_split=None,
bootstrap=False,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=True,
class_weight=None,
max_n_estimators: float = np.inf,
dask_feeding: bool = True,
spf_n_fits=100,
spf_sample_prop: float = 0.1):
spf_n_fits: int = 100,
spf_sample_prop: float = 0.1
):
super(ExtraTreesClassifier, self).__init__(
base_estimator=ExtraTreeClassifier(),
estimator=ExtraTreeClassifier(),
n_estimators=n_estimators_per_chunk,
estimator_params=("criterion", "max_depth", "min_samples_split",
"min_samples_leaf", "min_weight_fraction_leaf",
"max_features", "max_leaf_nodes",
"min_impurity_decrease", "min_impurity_split",
"random_state"),
"min_impurity_decrease",
"random_state", "ccp_alpha"),
bootstrap=bootstrap,
oob_score=oob_score,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
max_samples=max_samples
)

self.max_n_estimators: int = None
self._fit_estimators: int = 0
self.classes_: np.array = None # NB: Needs to be array, not list.
self.n_classes_: int = None

self._fit_estimators = 0
self.max_n_estimators = max_n_estimators
self.n_estimators_per_chunk = n_estimators
self.n_estimators_per_chunk = n_estimators_per_chunk
self.criterion = criterion
self.max_depth = max_depth
self.min_samples_split = min_samples_split
Expand All @@ -64,7 +68,8 @@ def __init__(self,
self.max_features = max_features
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_decrease = min_impurity_decrease
self.min_impurity_split = min_impurity_split
self.ccp_alpha = ccp_alpha
self.max_samples = max_samples

# Set additional params.
self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
Expand Down
48 changes: 25 additions & 23 deletions incremental_trees/models/classification/streaming_rfc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import warnings
from typing import Optional, Union, Dict, List

import numpy as np
from sklearn.ensemble import RandomForestClassifier
Expand All @@ -15,27 +15,28 @@ class StreamingRFC(ClassifierAdditions, ClassifierOverloads, RandomForestClassif
"""

def __init__(self,
bootstrap=True,
class_weight=None,
criterion='gini',
max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators_per_chunk: int = 1,
n_jobs=None,
oob_score=False,
random_state=None,
verbose=0,
criterion: str = 'gini',
max_depth: Optional[int] = None,
min_samples_split: int = 2,
min_samples_leaf: int = 1,
min_weight_fraction_leaf: float = 0.0,
max_features: Optional[str] = 'sqrt',
max_leaf_nodes: Optional[int] = None,
min_impurity_decrease: float = 0.0,
bootstrap: bool = True,
oob_score: bool = False,
n_jobs: Optional[int] = None,
random_state: Optional[int] = None,
verbose: int = 0,
warm_start: bool = True,
class_weight: Optional[Union[str, Dict, List[Dict]]] = None,
ccp_alpha: float = 0.0,
max_samples: Optional[int] = None,
dask_feeding: bool = True,
max_n_estimators=10,
spf_n_fits=100,
spf_sample_prop=0.1) -> None:
n_estimators_per_chunk: int = 1,
max_n_estimators: int = 10,
spf_n_fits: int = 100,
spf_sample_prop: float = 0.1) -> None:
"""
:param bootstrap:
:param class_weight:
Expand All @@ -44,7 +45,6 @@ def __init__(self,
:param max_features:
:param max_leaf_nodes:
:param min_impurity_decrease:
:param min_impurity_split:
:param min_samples_leaf:
:param min_samples_split:
:param min_weight_fraction_leaf:
Expand Down Expand Up @@ -73,7 +73,6 @@ def __init__(self,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
min_impurity_decrease=min_impurity_decrease,
min_impurity_split=min_impurity_split,
min_samples_leaf=min_samples_leaf,
min_samples_split=min_samples_split,
min_weight_fraction_leaf=min_weight_fraction_leaf,
Expand All @@ -89,4 +88,7 @@ def __init__(self,
max_n_estimators=max_n_estimators,
verb=0,
spf_n_fits=spf_n_fits,
spf_sample_prop=spf_sample_prop)
spf_sample_prop=spf_sample_prop,
ccp_alpha=ccp_alpha,
max_samples=max_samples
)
Loading

0 comments on commit 8a23e7f

Please sign in to comment.