Skip to content

Commit

Permalink
Big bang!
Browse files Browse the repository at this point in the history
  • Loading branch information
Giulia Baldini committed May 31, 2024
0 parents commit d2a6413
Show file tree
Hide file tree
Showing 57 changed files with 6,948 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# flake8 lint configuration for the repository.
[flake8]
# Checks switched off in addition to flake8's default ignore list.
extend-ignore = E203,E266,E501,W503
max-line-length = 88
# Upper bound used by the complexity check (reported under the C codes selected below).
max-complexity = 18
# Enabled check families. NOTE(review): B/B9 look like flake8-bugbear codes and
# T4 like flake8-mypy codes -- confirm those plugins are installed wherever flake8 runs.
select = B,C,E,F,W,T4,B9
# NOTE(review): presumably read by the flake8-mypy plugin to locate the mypy
# settings file -- confirm it is still consumed.
mypy_config = .mypy.ini
48 changes: 48 additions & 0 deletions .github/workflows/mypy-flake-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# CI workflow: type-checks, lints and unit-tests the package on every
# supported Python version.
name: Q&A and Tests

on:
  # Run for pushes to main and for pull requests, but only when the package
  # sources, the tests, or the locked dependencies change.
  push:
    branches:
      - main
    paths:
      - "bico/**"
      - "tests/**"
      - "poetry.lock"
  pull_request:
    paths:
      - "bico/**"
      - "tests/**"
      - "poetry.lock"
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so that "3.10" is not parsed as the float 3.1.
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install poetry
        run: |
          curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.8.2 python3 -
          poetry run pip install -U pip
          poetry install --with dev

      - name: Run MyPy
        run: |
          # -p keeps the step from failing when the cache directory already exists.
          mkdir -p .mypy_cache
          poetry run mypy --install-types --non-interactive bico

      - name: Run Flake8
        run: |
          poetry run flake8 bico/

      - name: Run Tests
        run: |
          poetry run python -m unittest discover tests -v
30 changes: 30 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Release workflow: builds the package with Poetry and uploads it to PyPI.
name: Upload to PyPi

on:
  release:
    types: [published]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install poetry
        run: |
          curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.8.2 python3 -
          # The installer places poetry in the current user's ~/.local/bin.
          # Export it for this step and register it for the following steps
          # (an `export` alone does not survive past the end of the step).
          export PATH="$HOME/.local/bin:$PATH"
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          poetry run pip install -U pip
          poetry install

      - name: Publish package
        env:
          # Poetry reads the PyPI API token from this variable, so the secret
          # is never interpolated into the shell command line.
          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
        run: poetry publish --build
166 changes: 166 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Editor / IDE settings directories and macOS folder metadata
.vscode/
.idea/
.DS_Store
9 changes: 9 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# mypy type-checker configuration for the repository.
[mypy]
# Strictness: every function definition must be annotated, and annotated code
# may not call unannotated functions.
disallow_untyped_defs = True
disallow_untyped_calls = True
# Do not report errors for imports that mypy cannot resolve.
ignore_missing_imports = True
# Extra warnings: missing returns, returning Any from a typed function,
# unreachable code, and unused sections of this config file.
warn_no_return = True
warn_return_any = True
warn_unreachable = True
warn_unused_configs = True
# Pattern of paths left unchecked. NOTE(review): presumably vendored/third-party
# sources kept under a directory named `_external` -- confirm.
exclude = _external
26 changes: 26 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# pre-commit hooks: whitespace fixers, formatting (black, isort), linting
# (flake8 with bugbear) and type checking (mypy).
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      # `exclude` is a Python regular expression searched against the file
      # path; single quotes keep the backslash literal. PDFs are binary and
      # must not be rewritten by the whitespace fixers.
      - id: end-of-file-fixer
        exclude: '\.pdf$'
      - id: trailing-whitespace
        exclude: '\.pdf$'
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    hooks:
      - id: flake8
        additional_dependencies: ["flake8-bugbear==21.4.3"]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.10.0
    hooks:
      - id: mypy
        args: ["--install-types", "--non-interactive"]
104 changes: 104 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
[![Build Status](https://github.com/algo-hhu/bico/actions/workflows/mypy-flake-test.yml/badge.svg)](https://github.com/algo-hhu/bico/actions)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
[![Supported Python version](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/release/python-390/)
[![Stable Version](https://img.shields.io/pypi/v/bico?label=stable)](https://pypi.org/project/bico/)

# BICO

BICO is a fast streaming algorithm to compute high-quality solutions for the k-means problem on very large sets of points. It combines the tree data structure of the SIGMOD Test of Time Award-winning algorithm BIRCH with insights from clustering theory to obtain solutions fast while keeping the error regarding the k-means cost function low.

<!---
TODO: Add logo
<p align="center">
<img src="https://github.com/algo-hhu/bico/blob/main/images/logo.png" alt="BICO Logo"/>
</p>
-->

## Installation

```bash
pip install bico
```

## Example

```python
from bico import BICO
import numpy as np
import time

np.random.seed(42)

data = np.random.rand(10000, 10)

start = time.time()
bico = BICO(n_clusters=3, random_state=0, fit_coreset=True)
bico.fit(data)

print("Time:", time.time() - start)
# Time: 0.08275651931762695

print(bico.coreset_points_)
# BICO returns a set of points that act as a summary of the entire dataset.
# By default, at most 200 * n_clusters points are returned.
# This behaviour can be changed by setting the `summary_size` parameter.

# [[0.45224018 0.70183673 0.55506671 ... 0.70132665 0.57244196 0.66789088]
# [0.73712952 0.5250208 0.43809322 ... 0.61427161 0.67910981 0.56207661]
# [0.89905336 0.46942062 0.20677639 ... 0.74210482 0.75714522 0.49651055]
# ...
# [0.68744494 0.41508081 0.39197623 ... 0.44093386 0.21983902 0.37237243]
# [0.60820965 0.29406341 0.67067782 ... 0.66435474 0.2390822 0.20070476]
# [0.67385626 0.33474823 0.68238779 ... 0.3581703 0.65646253 0.41386131]]

print(bico.cluster_centers_)
# If the `fit_coreset` parameter is set to True, the cluster centers are computed using KMeans from sklearn based on the coreset.

# [[0.46892639 0.41968333 0.47302945 0.51782955 0.39390839 0.56209413
# 0.4481691 0.49521457 0.31394509 0.5104331 ]
# [0.54384638 0.518978 0.49456809 0.56677848 0.63881783 0.33627504
# 0.49873782 0.5541338 0.52913562 0.56017203]
# [0.48639347 0.55542596 0.54350474 0.41931257 0.48117255 0.60089563
# 0.55457724 0.44833238 0.67583389 0.43069267]]
```

## Development

Install [poetry](https://python-poetry.org/docs/#installation)
```bash
curl -sSL https://install.python-poetry.org | python3 -
```

Install clang
```bash
sudo apt-get install clang
```

Set clang variables
```bash
export CXX=/usr/bin/clang++
export CC=/usr/bin/clang
```

Install the package
```bash
poetry install
```

If the installation does not work and you do not see the C++ output, you can build the package to see the stack trace
```bash
poetry build
```

Run the tests
```bash
poetry run python -m unittest discover tests -v
```

## Citation

If you use this code, please cite [the following paper](https://link.springer.com/chapter/10.1007/978-3-642-40450-4_41):

```
Hendrik Fichtenberger, Marc Gillé, Melanie Schmidt, Chris Schwiegelshohn and Christian Sohler. "BICO: BIRCH Meets Coresets for k-Means Clustering" (2013). ESA 2013.
```
3 changes: 3 additions & 0 deletions bico/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Package entry point: re-exports the ``BICO`` class from ``bico.core``."""

from bico.core import BICO

# Names exported by ``from bico import *``.
__all__ = ["BICO"]
Loading

0 comments on commit d2a6413

Please sign in to comment.