From d2a6413b7d85742033f1d0608c12a90041161028 Mon Sep 17 00:00:00 2001 From: Giulia Baldini Date: Fri, 31 May 2024 16:41:37 +0200 Subject: [PATCH] Big bang! --- .flake8 | 6 + .github/workflows/mypy-flake-test.yml | 48 + .github/workflows/publish.yml | 30 + .gitignore | 166 + .mypy.ini | 9 + .pre-commit-config.yaml | 26 + README.md | 104 + bico/__init__.py | 3 + bico/_core.cpp | 136 + bico/base/algorithm.h | 36 + bico/base/attributecalculator.h | 29 + bico/base/clusterdissimilaritymeasure.h | 30 + bico/base/clustermeasuresetter.h | 17 + bico/base/combinedevaluation.h | 73 + bico/base/discreteboundedspace.h | 55 + bico/base/discreteproxyprovider.h | 69 + bico/base/dissimilaritymeasure.h | 30 + bico/base/euclideanspaceprovider.h | 24 + bico/base/evaluation.h | 24 + bico/base/inputsetter.h | 20 + bico/base/measuresetter.h | 20 + bico/base/partitionevaluation.h | 47 + bico/base/partitionprovider.h | 74 + bico/base/proxyevaluation.h | 63 + bico/base/proxygenerator.h | 32 + bico/base/proxyprovider.h | 62 + bico/base/solutionprovider.h | 33 + bico/base/streamingalgorithm.h | 27 + bico/base/weightedobject.h | 19 + bico/base/weightmodifier.h | 31 + bico/clustering/bico.h | 1003 ++++++ bico/clustering/cfentry.h | 141 + bico/clustering/cfrentry.h | 33 + bico/core.py | 212 ++ bico/datastructure/discreteproxysolution.h | 79 + bico/datastructure/proxysolution.h | 80 + bico/evaluation/kmeansevaluator.h | 428 +++ bico/exception/clueexception.h | 33 + bico/exception/invalidargumentexception.h | 28 + .../invalidruntimeconfigurationexception.h | 25 + bico/misc/randomgenerator.h | 35 + bico/misc/randomness.cpp | 5 + bico/misc/randomness.h | 36 + bico/point/l2metric.cpp | 14 + bico/point/l2metric.h | 28 + bico/point/point.cpp | 224 ++ bico/point/point.h | 154 + bico/point/pointcentroid.cpp | 16 + bico/point/pointcentroid.h | 35 + bico/point/pointweightmodifier.h | 44 + bico/point/realspaceprovider.cpp | 24 + bico/point/realspaceprovider.h | 30 + bico/point/squaredl2metric.cpp | 14 + 
bico/point/squaredl2metric.h | 28 + build.py | 47 + poetry.lock | 2771 +++++++++++++++++ pyproject.toml | 38 + 57 files changed, 6948 insertions(+) create mode 100644 .flake8 create mode 100644 .github/workflows/mypy-flake-test.yml create mode 100644 .github/workflows/publish.yml create mode 100644 .gitignore create mode 100644 .mypy.ini create mode 100644 .pre-commit-config.yaml create mode 100644 README.md create mode 100644 bico/__init__.py create mode 100644 bico/_core.cpp create mode 100644 bico/base/algorithm.h create mode 100644 bico/base/attributecalculator.h create mode 100644 bico/base/clusterdissimilaritymeasure.h create mode 100644 bico/base/clustermeasuresetter.h create mode 100644 bico/base/combinedevaluation.h create mode 100644 bico/base/discreteboundedspace.h create mode 100644 bico/base/discreteproxyprovider.h create mode 100644 bico/base/dissimilaritymeasure.h create mode 100644 bico/base/euclideanspaceprovider.h create mode 100644 bico/base/evaluation.h create mode 100644 bico/base/inputsetter.h create mode 100644 bico/base/measuresetter.h create mode 100644 bico/base/partitionevaluation.h create mode 100644 bico/base/partitionprovider.h create mode 100644 bico/base/proxyevaluation.h create mode 100644 bico/base/proxygenerator.h create mode 100644 bico/base/proxyprovider.h create mode 100644 bico/base/solutionprovider.h create mode 100644 bico/base/streamingalgorithm.h create mode 100644 bico/base/weightedobject.h create mode 100644 bico/base/weightmodifier.h create mode 100644 bico/clustering/bico.h create mode 100644 bico/clustering/cfentry.h create mode 100644 bico/clustering/cfrentry.h create mode 100644 bico/core.py create mode 100644 bico/datastructure/discreteproxysolution.h create mode 100644 bico/datastructure/proxysolution.h create mode 100644 bico/evaluation/kmeansevaluator.h create mode 100644 bico/exception/clueexception.h create mode 100644 bico/exception/invalidargumentexception.h create mode 100644 
bico/exception/invalidruntimeconfigurationexception.h create mode 100644 bico/misc/randomgenerator.h create mode 100644 bico/misc/randomness.cpp create mode 100644 bico/misc/randomness.h create mode 100644 bico/point/l2metric.cpp create mode 100644 bico/point/l2metric.h create mode 100644 bico/point/point.cpp create mode 100644 bico/point/point.h create mode 100644 bico/point/pointcentroid.cpp create mode 100644 bico/point/pointcentroid.h create mode 100644 bico/point/pointweightmodifier.h create mode 100644 bico/point/realspaceprovider.cpp create mode 100644 bico/point/realspaceprovider.h create mode 100644 bico/point/squaredl2metric.cpp create mode 100644 bico/point/squaredl2metric.h create mode 100644 build.py create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..adb7aff --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +extend-ignore = E203,E266,E501,W503 +max-line-length = 88 +max-complexity = 18 +select = B,C,E,F,W,T4,B9 +mypy_config = .mypy.ini diff --git a/.github/workflows/mypy-flake-test.yml b/.github/workflows/mypy-flake-test.yml new file mode 100644 index 0000000..418cb10 --- /dev/null +++ b/.github/workflows/mypy-flake-test.yml @@ -0,0 +1,48 @@ +name: Q&A and Tests + +on: + push: + branches: + - main + paths: + - "bico/**" + - "tests/**" + - "poetry.lock" + pull_request: + paths: + - "bico/**" + - "tests/**" + - "poetry.lock" + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install poetry + run: | + curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.8.2 python3 - + poetry run pip install -U pip + poetry install --with 
dev + + - name: Run MyPy + run: | + mkdir .mypy_cache + poetry run mypy --install-types --non-interactive bico + - name: Run Flake8 + run: | + poetry run flake8 bico/ + - name: Run Tests + run: | + poetry run python -m unittest discover tests -v diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..91be2d2 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,30 @@ +name: Upload to PyPi + +on: + release: + types: [published] + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install poetry + run: | + curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.8.2 python3 - + export PATH="/root/.local/bin:$PATH" + poetry run pip install -U pip + poetry install + - name: Publish package + run: poetry publish --build --username __token__ --password ${{ secrets.PYPI_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..23ffb0c --- /dev/null +++ b/.gitignore @@ -0,0 +1,166 @@ +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, 
so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.vscode/ +.idea/ +.DS_Store diff --git a/.mypy.ini b/.mypy.ini new file mode 100644 index 0000000..7e77cdb --- /dev/null +++ b/.mypy.ini @@ -0,0 +1,9 @@ +[mypy] +disallow_untyped_defs = True +disallow_untyped_calls = True +ignore_missing_imports = True +warn_no_return = True +warn_return_any = True +warn_unreachable = True +warn_unused_configs = True +exclude = _external diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ef9eb9e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: end-of-file-fixer + exclude: \\w+.pdf + - id: trailing-whitespace + exclude: \\w+.pdf +- repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort +- repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + additional_dependencies: ["flake8-bugbear==21.4.3"] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.10.0 + hooks: + - id: mypy + args: ["--install-types", "--non-interactive"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..ebfbaae --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +[![Build Status](https://github.com/algo-hhu/bico/actions/workflows/mypy-flake-test.yml/badge.svg)](https://github.com/algo-hhu/bico/actions) +[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) +[![Supported Python 
version](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/release/python-390/) +[![Stable Version](https://img.shields.io/pypi/v/bico?label=stable)](https://pypi.org/project/bico/) + +# BICO + +BICO is a fast streaming algorithm to compute high quality solutions for the k-means problem on very large sets of points. It combines the tree data structure of SIGMOD Test of Time Award winning algorithm BIRCH with insights from clustering theory to obtain solutions fast while keeping the error regarding the k-means cost function low. + + + +## Installation + +```bash +pip install bico +``` + +## Example + +```python +from bico import BICO +import numpy as np +import time + +np.random.seed(42) + +data = np.random.rand(10000, 10) + +start = time.time() +bico = BICO(n_clusters=3, random_state=0, fit_coreset=True) +bico.fit(data) + +print("Time:", time.time() - start) +# Time: 0.08275651931762695 + +print(bico.coreset_points_) +# BICO returns a set of points that act as a summary of the entire dataset. +# By default, at most 200 * n_clusters points are returned. +# This behaviour can be changed by setting the `summary_size` parameter. + +# [[0.45224018 0.70183673 0.55506671 ... 0.70132665 0.57244196 0.66789088] +# [0.73712952 0.5250208 0.43809322 ... 0.61427161 0.67910981 0.56207661] +# [0.89905336 0.46942062 0.20677639 ... 0.74210482 0.75714522 0.49651055] +# ... +# [0.68744494 0.41508081 0.39197623 ... 0.44093386 0.21983902 0.37237243] +# [0.60820965 0.29406341 0.67067782 ... 0.66435474 0.2390822 0.20070476] +# [0.67385626 0.33474823 0.68238779 ... 0.3581703 0.65646253 0.41386131]] + +print(bico.cluster_centers_) +# If the `fit_coreset` parameter is set to True, the cluster centers are computed using KMeans from sklearn based on the coreset. 
+ +# [[0.46892639 0.41968333 0.47302945 0.51782955 0.39390839 0.56209413 +# 0.4481691 0.49521457 0.31394509 0.5104331 ] +# [0.54384638 0.518978 0.49456809 0.56677848 0.63881783 0.33627504 +# 0.49873782 0.5541338 0.52913562 0.56017203] +# [0.48639347 0.55542596 0.54350474 0.41931257 0.48117255 0.60089563 +# 0.55457724 0.44833238 0.67583389 0.43069267]] +``` + +## Development + +Install [poetry](https://python-poetry.org/docs/#installation) +```bash +curl -sSL https://install.python-poetry.org | python3 - +``` + +Install clang +```bash +sudo apt-get install clang +``` + +Set clang variables +```bash +export CXX=/usr/bin/clang++ +export CC=/usr/bin/clang +``` + +Install the package +```bash +poetry install +``` + +If the installation does not work and you do not see the C++ output, you can build the package to see the stack trace +```bash +poetry build +``` + +Run the tests +```bash +poetry run python -m unittest discover tests -v +``` + +## Citation + +If you use this code, please cite [the following paper](https://link.springer.com/chapter/10.1007/978-3-642-40450-4_41): + +``` +Hendrik Fichtenberger, Marc Gillé, Melanie Schmidt, Chris Schwiegelshohn and Christian Sohler. "BICO: BIRCH Meets Coresets for k-Means Clustering" (2013). ESA 2013. 
+``` diff --git a/bico/__init__.py b/bico/__init__.py new file mode 100644 index 0000000..c499922 --- /dev/null +++ b/bico/__init__.py @@ -0,0 +1,3 @@ +from bico.core import BICO + +__all__ = ["BICO"] diff --git a/bico/_core.cpp b/bico/_core.cpp new file mode 100644 index 0000000..875eeb8 --- /dev/null +++ b/bico/_core.cpp @@ -0,0 +1,136 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "point/l2metric.h" +#include "point/squaredl2metric.h" +#include "point/point.h" +#include "point/pointweightmodifier.h" +#include "clustering/bico.h" +#include "misc/randomness.h" +#include "misc/randomgenerator.h" +#include "datastructure/proxysolution.h" +#include "point/pointcentroid.h" +#include "point/pointweightmodifier.h" +#include "point/realspaceprovider.h" + +using namespace CluE; + +class BicoExternal +{ +public: + BicoExternal( + uint d, + uint k, + uint p, + uint m, + int seed); + virtual ~BicoExternal(); + void addData(double const *array, uint n); + void addPoint(double const *array); + int compute(int *sample_weights, + double *points); + +private: + const uint _d; + Bico *_bico; +}; + +BicoExternal::BicoExternal(uint d, + uint k, + uint p, + uint m, + int seed) : _d(d), _bico(new Bico(d, k, p, m, seed, new SquaredL2Metric(), new PointWeightModifier())) +{} + +void BicoExternal::addData(double const *array, uint n) +{ + for (size_t i = 0; i < n * _d; i += _d) + { + addPoint(&array[i]); + } +} + +void BicoExternal::addPoint(double const *array) +{ + std::vector coords(array, array + _d); + Point p(coords); + // Call BICO point update + *_bico << p; +} + +int BicoExternal::compute(int *sample_weights, + double *points) +{ + // Retrieve coreset + ProxySolution *sol = _bico->compute(); + // Output coreset points + for (size_t i = 0; i < sol->proxysets[0].size(); ++i) + { + // Output weight + sample_weights[i] = sol->proxysets[0][i].getWeight(); + // Output center of gravity + for (size_t j = 0; j < sol->proxysets[0][i].dimension(); 
++j) + { + points[i * _d + j] = sol->proxysets[0][i][j]; + } + } + int m = sol->proxysets[0].size(); + delete sol; + + return m; +} + +BicoExternal::~BicoExternal() +{ + delete _bico; +} + +// Thank you https://github.com/dstein64/kmeans1d! + +extern "C" +{ +#if defined(_WIN32) || defined(__CYGWIN__) + __declspec(dllexport) +#endif + BicoExternal * + init(uint d, + uint k, + uint p, + uint m, + int seed) + { + return new BicoExternal(d, k, p, m, seed); + } + void addData(BicoExternal *bico, double const *array, uint n) { bico->addData(array, n); } + + void addPoint(BicoExternal *bico, double const *array) { bico->addPoint(array); } + + int compute(BicoExternal *bico, int *sample_weights, + double *points) { return bico->compute(sample_weights, points); } + + void freeBico(BicoExternal *bico) { + delete bico; + } +} // extern "C" + +static PyMethodDef module_methods[] = { + {NULL, NULL, 0, NULL}}; + +static struct PyModuleDef _coremodule = { + PyModuleDef_HEAD_INIT, + "bico._core", + NULL, + -1, + module_methods, +}; + +PyMODINIT_FUNC PyInit__core(void) +{ + return PyModule_Create(&_coremodule); +} diff --git a/bico/base/algorithm.h b/bico/base/algorithm.h new file mode 100644 index 0000000..a2ca46f --- /dev/null +++ b/bico/base/algorithm.h @@ -0,0 +1,36 @@ +#ifndef CLUEALGORITHM_H +#define CLUEALGORITHM_H + +#include "../base/solutionprovider.h" + +/** +* @brief namespace for the CluE library +*/ +namespace CluE +{ + +/** + * @brief Abstract base class for algorithms. + * + * @ingroup base_classes + */ +class Algorithm +{ +public: + virtual ~Algorithm() + { + } + + /** + * @brief Runs the algorithm and returns the computed solution. + * + * Implementing classes override this method with the computation of a SolutionProvider + * instance whose reference is returned. The responsibility for destructing the instance lies + * with the caller. 
+ */ + virtual SolutionProvider* compute() = 0; +}; + +} + +#endif diff --git a/bico/base/attributecalculator.h b/bico/base/attributecalculator.h new file mode 100644 index 0000000..6bb9262 --- /dev/null +++ b/bico/base/attributecalculator.h @@ -0,0 +1,29 @@ +#ifndef ATTRIBUTECALCULATOR_H +#define ATTRIBUTECALCULATOR_H + +namespace CluE +{ + +/** + * @brief Abstract base class for attribute calculation (e.g. diameter). + * + * @ingroup base_classes + */ +template class AttributeCalculator +{ +public: + virtual ~AttributeCalculator() + { + } + + virtual AttributeCalculator* clone() const = 0; + + /** + * @brief Computes a characteristic attribute of a given object. + */ + virtual double calculate(T const&) const = 0; +}; + +} + +#endif diff --git a/bico/base/clusterdissimilaritymeasure.h b/bico/base/clusterdissimilaritymeasure.h new file mode 100644 index 0000000..4273c3c --- /dev/null +++ b/bico/base/clusterdissimilaritymeasure.h @@ -0,0 +1,30 @@ +#ifndef CLUSTERDISSIMILARITYMEASURE_H +#define CLUSTERDISSIMILARITYMEASURE_H + +#include + +namespace CluE +{ + +/** + * @brief Abstract base class for cluster dissimilarity measurement. + * @ingroup base_classes + */ +template class ClusterDissimilarityMeasure +{ +public: + virtual ~ClusterDissimilarityMeasure() + { + } + + virtual ClusterDissimilarityMeasure* clone() const = 0; + + /** + * @brief Computes the dissimilarity between the two given clusters. 
+ */ + virtual double dissimilarity(std::vector const&, std::vector const&) = 0; +}; + +} + +#endif diff --git a/bico/base/clustermeasuresetter.h b/bico/base/clustermeasuresetter.h new file mode 100644 index 0000000..72362e7 --- /dev/null +++ b/bico/base/clustermeasuresetter.h @@ -0,0 +1,17 @@ +#ifndef CLUSTERMEASURESETTER_H +#define CLUSTERMEASURESETTER_H + +namespace CluE +{ +/** + * @brief Interface to propagate the ability to set a ClusterDissimilarityMeasure + * + * @ingroup base_classes + */ +template class ClusterMeasureSetter +{ + virtual void setMeasure(ClusterDissimilarityMeasure const *measure) = 0; +}; +} + +#endif diff --git a/bico/base/combinedevaluation.h b/bico/base/combinedevaluation.h new file mode 100644 index 0000000..f1dc6d1 --- /dev/null +++ b/bico/base/combinedevaluation.h @@ -0,0 +1,73 @@ +#ifndef COMBINEDEVALUATION_H +#define COMBINEDEVALUATION_H + +#include "../base/partitionprovider.h" +#include "../base/proxyprovider.h" +#include "../base/discreteproxyprovider.h" + +#include + +namespace CluE { + +/** + * @brief Abstract class. Proxy based evaluation algorithms may be optimized by deriving from this class. + * + * Evaluation algorithms requiring proxies and partitions for calculating the input's cost may derive from this class. + * Using an already existing partitioning instead of calculating it based on a given set of proxies will speed up computation time. + * + * @ingroup base_classes + */ +template class CombinedEvaluation : virtual public Evaluation { +public: + virtual ~CombinedEvaluation() { + } + + /** + * @brief Calculates the cost related to the proxies, based on the partitions. + * @note There are separate overloaded versions of this method for use with discrete proxies. + */ + virtual double combinedcost(std::vector > const& partitioning, std::vector const& proxies) const = 0; + /** + * @overload + * @param solutionIndex PartitionProvider and ProxyProvider index. 
+ */ + virtual double combinedcost(PartitionProvider const &partitioning, ProxyProvider const &proxies, unsigned int solutionIndex) const = 0; + /** + * @overload + */ + virtual double combinedcost(std::vector const &partition, T const &proxy) const = 0; + /** + * @overload + * @param solutionIndex PartitionProvider and ProxyProvider solution index. + * @param elementIndex Partition and proxy index. + */ + virtual double combinedcost(PartitionProvider const &partitioning, ProxyProvider const &proxies, unsigned int solutionIndex, unsigned int elementIndex) const = 0; + + /** + * @brief Calculates the cost related to the discrete proxies, based on the partitions. + */ + virtual double combinedcost(std::vector > const& partitioning, std::vector const& proxies) const = 0; + /** + * @overload + * @param solutionIndex PartitionProvider and ProxyProvider solution index. + */ + virtual double combinedcost(PartitionProvider const &partitioning, DiscreteProxyProvider const &proxies, unsigned int solutionIndex) const = 0; + /** + * @overload + */ + virtual double combinedcost(std::vector const &partition, T const * const proxy) const; + /** + * @overload + * @param solutionIndex PartitionProvider and ProxyProvider index. + * @param elementIndex Partition and proxy index. 
+ */ + virtual double combinedcost(PartitionProvider const &partitioning, DiscreteProxyProvider const &proxies, unsigned int solutionIndex, unsigned int elementIndex) const = 0; +}; + +template inline double CombinedEvaluation::combinedcost(std::vector const& cluster, T const * const proxy) const { + return combinedcost(cluster, *proxy); +} + +} + +#endif diff --git a/bico/base/discreteboundedspace.h b/bico/base/discreteboundedspace.h new file mode 100644 index 0000000..086b27c --- /dev/null +++ b/bico/base/discreteboundedspace.h @@ -0,0 +1,55 @@ +#ifndef DISCRETEBOUNDEDSPACE_H +#define DISCRETEBOUNDEDSPACE_H + +namespace CluE +{ + +/** + * @brief Interface to extend a template type to provide discrete (bounded) space {0, ..., n-1}^d features. + * + * @ingroup base_classes + */ +template class DiscreteBoundedSpace +{ +public: + typedef VectorType V; + + virtual DiscreteBoundedSpace* clone() const = 0; + + /** + * @brief Returns the vector represented by the given coordinates. + */ + virtual V getVector(std::vector coordinates) const = 0; + + /** + * @brief Returns the coordinates of the given vector. 
+ */ + virtual std::vector getCoordinates(V const & vector) const = 0; + + /** + * @brief Space dimension + */ + virtual size_t dimension() const = 0; + + /** + * @brief Number of discrete coordinates per dimension + */ + virtual size_space n() const = 0; + + /** + * @brief Upper bound = n-1 + */ + virtual size_space uBound() const + { + return n()-1; + } + + /** + * @brief Returns the space's origin + */ + virtual VectorType origin() const = 0; +}; + +} + +#endif diff --git a/bico/base/discreteproxyprovider.h b/bico/base/discreteproxyprovider.h new file mode 100644 index 0000000..e0385fb --- /dev/null +++ b/bico/base/discreteproxyprovider.h @@ -0,0 +1,69 @@ +#ifndef DISCRETEPROXYPROVIDER_H +#define DISCRETEPROXYPROVIDER_H + +#include "../base/solutionprovider.h" + +#include + +namespace CluE +{ + +/** + * @brief Abstract base class to access the results of proxy / center based clustering algorithms. + * + * @ingroup base_classes + */ +template class DiscreteProxyProvider +{ +public: + + virtual ~DiscreteProxyProvider() + { + } + + /** + * @brief returns the number of available solutions + * + * The algorithm may compute more than one solution of possibly different size, where size means + * number of computed clusters, proxies (e.g. cluster centers) or the size of a coreset. + * The sizes can be retrieved by a call to size_of_solution(). + */ + virtual unsigned int number_of_solutions() const = 0; + + /** + * @brief returns the size of a particular solution + * + * @param index number between 0 and @ref number_of_solutions()-1 + * @return the size for the requested clustering + */ + virtual unsigned int size_of_solution(unsigned int index) const = 0; + + /** + * @brief Returns a pointer to the proxy for the specified clustering and cluster. + * + * Returns a pointer to the element of the input set that was computed to be the proxy for + * cluster number proxyIndex in clustering number solutionIndex. 
+ */ + virtual T* discrete_proxy(unsigned int solutionIndex, unsigned int proxyIndex) const = 0; + + /** + * @brief Returns a vector of pointers to the proxies for the specified clustering. + * + * Returns a vector of pointers to the elements of the input set that were computed to be the + * proxies for clustering number proxyIndex. + */ + virtual std::vector discrete_proxies(unsigned int solutionIndex) const = 0; + + /** + * @brief Does a dynamic cast of the given SolutionProvider to a DiscreteProxyProvider. + * @return NULL if the SolutionProvider is not a DiscreteProxyProvider instance + */ + static DiscreteProxyProvider* toDiscreteProxyProvider(SolutionProvider* s) + { + return dynamic_cast*>(s); + } +}; + +} + +#endif diff --git a/bico/base/dissimilaritymeasure.h b/bico/base/dissimilaritymeasure.h new file mode 100644 index 0000000..10874e7 --- /dev/null +++ b/bico/base/dissimilaritymeasure.h @@ -0,0 +1,30 @@ +#ifndef DISSIMILARITYMEASURE_H +#define DISSIMILARITYMEASURE_H + +namespace CluE +{ + +/** + * @brief Abstract base class for dissimilarity measurement. + * + * @ingroup base_classes + */ +template class DissimilarityMeasure +{ +public: + + virtual ~DissimilarityMeasure() + { + } + + virtual DissimilarityMeasure* clone() const = 0; + + /** + * @brief Computes the dissimilarity between the two given objects. + */ + virtual double dissimilarity(T const&, T const&) const = 0; +}; + +} + +#endif diff --git a/bico/base/euclideanspaceprovider.h b/bico/base/euclideanspaceprovider.h new file mode 100644 index 0000000..75b6e23 --- /dev/null +++ b/bico/base/euclideanspaceprovider.h @@ -0,0 +1,24 @@ +#ifndef EUCLIDEANSPACEPROVIDER_H +#define EUCLIDEANSPACEPROVIDER_H + +namespace CluE +{ + +/** + * @brief Interface to extend a template type to provide euclidean vector space features. 
+ * + * @ingroup base_classes + */ +template class EuclideanSpaceProvider +{ +public: + typedef VectorType V; + + virtual EuclideanSpaceProvider* clone() const = 0; + + virtual V nullVector() const = 0; +}; + +} + +#endif diff --git a/bico/base/evaluation.h b/bico/base/evaluation.h new file mode 100644 index 0000000..0b30899 --- /dev/null +++ b/bico/base/evaluation.h @@ -0,0 +1,24 @@ +#ifndef EVALUATION_H +#define EVALUATION_H + +namespace CluE +{ + +/** + * @brief Abstract base class for clustering evaluations. + * + * Clustering evaluations should derive from this class. + * + * @ingroup base_classes + */ +class Evaluation +{ +public: + virtual ~Evaluation() + { + } +}; + +} + +#endif diff --git a/bico/base/inputsetter.h b/bico/base/inputsetter.h new file mode 100644 index 0000000..96850d7 --- /dev/null +++ b/bico/base/inputsetter.h @@ -0,0 +1,20 @@ +#ifndef INPUTSETTER_H +#define INPUTSETTER_H + +#include + +namespace CluE +{ +/** + * @brief Interface to propagate the ability to set input data + * + * @ingroup base_classes + */ +template class InputSetter +{ +public: + virtual void setInput(std::vector const*) = 0; +}; +} + +#endif diff --git a/bico/base/measuresetter.h b/bico/base/measuresetter.h new file mode 100644 index 0000000..c093356 --- /dev/null +++ b/bico/base/measuresetter.h @@ -0,0 +1,20 @@ +#ifndef MEASURESETTER_H +#define MEASURESETTER_H + +#include "../base/dissimilaritymeasure.h" + +namespace CluE +{ +/** + * @brief Interface to propagate the ability to set a DissimilarityMeasure + * + * @ingroup base_classes + */ +template class MeasureSetter +{ +public: + virtual void setMeasure(DissimilarityMeasure const *measure) = 0; +}; +} + +#endif diff --git a/bico/base/partitionevaluation.h b/bico/base/partitionevaluation.h new file mode 100644 index 0000000..14f0a22 --- /dev/null +++ b/bico/base/partitionevaluation.h @@ -0,0 +1,47 @@ +#ifndef PARTITIONEVALUATION_H +#define PARTITIONEVALUATION_H + +#include "../base/evaluation.h" +#include 
"../base/partitionprovider.h" + +#include + +namespace CluE +{ + +/** + * @brief Abstract class for partition-based evaluation algorithms. + * + * Evaluation algorithms requiring partitions for calculating the input's cost should derive from this class. + * + * @ingroup base_classes + */ +template class PartitionEvaluation : virtual public Evaluation +{ +public: + virtual ~PartitionEvaluation() + { + } + + /** + * @brief Calculates the cost of a given partitioning. + */ + virtual double partitioncost(std::vector > const &partitioning) const = 0; + /** + * @overload + */ + virtual double partitioncost(PartitionProvider const &partitioning, unsigned int solutionIndex) const = 0; + + /** + * @brief Calculates the cost of a given partition. + */ + virtual double partitioncost(std::vector const &partition) const = 0; + /** + * @overload + */ + virtual double partitioncost(PartitionProvider const &partitioning, unsigned int solutionIndex, unsigned int partitionIndex) const = 0; +}; + +} + +#endif diff --git a/bico/base/partitionprovider.h b/bico/base/partitionprovider.h new file mode 100644 index 0000000..22ce31f --- /dev/null +++ b/bico/base/partitionprovider.h @@ -0,0 +1,74 @@ +#ifndef PARTITIONPROVIDER_H +#define PARTITIONPROVIDER_H + +#include "../base/solutionprovider.h" + +#include + +namespace CluE +{ + +/** + * @brief Abstract base class to access results of partition based clustering algorithms. + * + * @ingroup base_classes + */ +template class PartitionProvider +{ +public: + + virtual ~PartitionProvider() + { + } + + /** + * @brief returns the number of available solutions + * + * The algorthm may compute more than one solution of possibly different size, where size means + * number of computed clusters, proxies (e.g. cluster centers) or the size of a coreset. + * The sizes can be retrieved by a call to size_of_solution(). 
+ */ + virtual unsigned int number_of_solutions() const = 0; + + /** + * @brief returns the size of a particular solution + * + * @param index number between 0 and @ref number_of_solutions()-1 + * @return the size for the requested clustering + */ + virtual unsigned int size_of_solution(unsigned int index) const = 0; + + /** + * @brief Returns the cardinality of the specified cluster from the computed clustering. + */ + virtual unsigned int clustersize(unsigned int solutionIndex, unsigned int partitionIndex) const = 0; + + /** + * @brief Returns a pointer to a particular element from the specified cluster and clustering. + */ + virtual T* element(unsigned int solutionIndex, unsigned int partitionIndex, unsigned int elementIndex) const = 0; + + /** + * @brief Returns a vector of pointers to the elements of a particular cluster from the specified + * clustering. + */ + virtual std::vector cluster(unsigned int solutionIndex, unsigned int partitionIndex) const = 0; + + /** + * @brief Returns the specified clustering as a vector of vector of pointers to the elements. + */ + virtual std::vector > clustering(unsigned int solutionIndex) const = 0; + + /** + * @brief Does a dynamic cast of the given SolutionProvider to a PartitionProvider. + * @return NULL if the SolutionProvider is not a PartitionProvider instance + */ + static PartitionProvider* toPartitionProvider(SolutionProvider* s) + { + return dynamic_cast*>(s); + } +}; + +} + +#endif diff --git a/bico/base/proxyevaluation.h b/bico/base/proxyevaluation.h new file mode 100644 index 0000000..d65a2a8 --- /dev/null +++ b/bico/base/proxyevaluation.h @@ -0,0 +1,63 @@ +#ifndef PROXYEVALUATION_H +#define PROXYEVALUATION_H + +#include "../base/evaluation.h" +#include "../base/proxyprovider.h" +#include "../base/discreteproxyprovider.h" + +#include + +namespace CluE { + +/** + * @brief Abstract class for proxy-based evaluation algorithms. 
+ * + * Evaluation algorithms requiring proxies for calculating the input's cost should derive from this class. + * + * @ingroup base_classes + */ +template class ProxyEvaluation : virtual public Evaluation { +public: + virtual ~ProxyEvaluation() { + } + + /** + * @brief Calculates the cost related to the proxies based on the input. + * @note There are separate overloaded versions of this method for use with discrete proxies. + */ + virtual double proxycost(std::vector const &input, std::vector const &proxies) const = 0; + /** + * @overload + */ + virtual double proxycost(std::vector const &input, ProxyProvider const &proxies, unsigned int solutionIndex) const = 0; + /** + * @brief Calculates the cost related to a single proxy chosen from the whole list, based on the input. + * @note There are separate overloaded versions of this method for use with discrete proxies. + */ + virtual double proxycost(std::vector const &input, std::vector const &proxies, unsigned int proxyIndex) const = 0; + /** + * @overload + */ + virtual double proxycost(std::vector const& input, ProxyProvider const &proxies, unsigned int solutionIndex, unsigned int proxyIndex) const = 0; + + /** + * @brief Calculates the cost related to the discrete proxies based on the input. + */ + virtual double proxycost(std::vector const &input, std::vector const &proxies) const = 0; + /** + * @overload + */ + virtual double proxycost(std::vector const &input, DiscreteProxyProvider const &proxies, unsigned int solutionIndex) const = 0; + /** + * @brief Calculates the cost related to a single discrete proxy chosen from the whole list, based on the input. 
+ */ + virtual double proxycost(std::vector const& input, std::vector const &proxies, unsigned int solutionIndex) const = 0; + /** + * @overload + */ + virtual double proxycost(std::vector const& input, DiscreteProxyProvider const &proxies, unsigned int solutionIndex, unsigned int proxyIndex) const = 0; +}; + +} + +#endif diff --git a/bico/base/proxygenerator.h b/bico/base/proxygenerator.h new file mode 100644 index 0000000..6476f1c --- /dev/null +++ b/bico/base/proxygenerator.h @@ -0,0 +1,32 @@ +#ifndef PROXYGENERATOR_H +#define PROXYGENERATOR_H + +#include + +namespace CluE +{ + +/** + * @brief Abstract base class for mechanisms that compute a proxy or representative object for a given set of objects, e.g. a cluster center. + * + * @ingroup base_classes + */ +template class ProxyGenerator +{ +public: + + virtual ~ProxyGenerator() + { + } + + virtual ProxyGenerator* clone() const = 0; + + /** + * Generates a proxy for the given vector of objects. + */ + virtual T generate(std::vector const&) const = 0; +}; + +} + +#endif diff --git a/bico/base/proxyprovider.h b/bico/base/proxyprovider.h new file mode 100644 index 0000000..5e8411d --- /dev/null +++ b/bico/base/proxyprovider.h @@ -0,0 +1,62 @@ +#ifndef PROXYPROVIDER_H +#define PROXYPROVIDER_H + +#include +#include "../base/solutionprovider.h" + +namespace CluE { + +/** + * @brief Abstract base class to access results of proxy / center based clustering algorithms. + * + * @ingroup base_classes + */ +template class ProxyProvider { +public: + + virtual ~ProxyProvider() { + } + + /** + * @brief returns the number of available solutions + * + * The algorithm may compute more than one solution of possibly different size, where size means + * number of computed clusters, proxies (e.g. cluster centers) or the size of a coreset. + * The sizes can be retrieved by a call to size_of_solution(). 
+ */ + virtual unsigned int number_of_solutions() const = 0; + + /** + * @brief returns the size of a particular solution + * + * @param index number between 0 and @ref number_of_solutions()-1 + * @return the size for the requested clustering + */ + virtual unsigned int size_of_solution(unsigned int index) const = 0; + + /** + * @brief returns the proxy for the specified clustering and cluster + * + * Returns the computed proxy for cluster number proxyIndex in clustering number solutionIndex. + */ + virtual T proxy(unsigned int solutionIndex, unsigned int proxyIndex) const = 0; + + /** + * @brief returns the proxies for the specified clustering + * + * Returns the computed proxies for clustering number solutionIndex. + */ + virtual std::vector proxies(unsigned int solutionIndex) const = 0; + + /** + * @brief does a dynamic cast of the given SolutionProvider to a ProxyProvider + * @return NULL if the SolutionProvider is not a ProxyProvider instance + */ + static ProxyProvider* toProxyProvider(SolutionProvider* s) { + return dynamic_cast*>(s); + } +}; + +} + +#endif diff --git a/bico/base/solutionprovider.h b/bico/base/solutionprovider.h new file mode 100644 index 0000000..1ef6a7a --- /dev/null +++ b/bico/base/solutionprovider.h @@ -0,0 +1,33 @@ +#ifndef SOLUTIONPROVIDER_H +#define SOLUTIONPROVIDER_H + +#include + +namespace CluE +{ + +/** + * @brief Abstract base class for algorithm solutions. + * + * Abstract base class for all algorithm's solutions (clustering, coreset, seeding, ...). 
+ * + * @ingroup base_classes + */ +class SolutionProvider +{ +public: + virtual ~SolutionProvider() + { + } + + /** + * @brief returns the time needed for the last computation + * + * @return time in seconds needed for last call to compute() + */ + virtual double computationtime() const = 0; +}; + +} + +#endif diff --git a/bico/base/streamingalgorithm.h b/bico/base/streamingalgorithm.h new file mode 100644 index 0000000..81902f9 --- /dev/null +++ b/bico/base/streamingalgorithm.h @@ -0,0 +1,27 @@ +#ifndef CLUESTREAMINGALGORITHM_H +#define CLUESTREAMINGALGORITHM_H + +#include "../base/algorithm.h" + +namespace CluE +{ +/** + * @brief Abstract base class for streaming algorithms + * + * 1. Pass the stream elements through the << operator. + * 2. Use compute() to request the result. + * + * @ingroup base_classes + */ +template class StreamingAlgorithm : public Algorithm +{ +public: + /** + * @brief Streaming operator + */ + virtual StreamingAlgorithm& operator<<(T const & element) = 0; +}; + +} + +#endif diff --git a/bico/base/weightedobject.h b/bico/base/weightedobject.h new file mode 100644 index 0000000..1fc07da --- /dev/null +++ b/bico/base/weightedobject.h @@ -0,0 +1,19 @@ +#ifndef WEIGHTEDOBJECT_H +#define WEIGHTEDOBJECT_H + +namespace CluE +{ + +/** + * @brief Abstract base class for weighted objects + */ +class WeightedObject +{ +public: + virtual double getWeight() const = 0; + virtual void setWeight(double w) = 0; +}; + +} + +#endif diff --git a/bico/base/weightmodifier.h b/bico/base/weightmodifier.h new file mode 100644 index 0000000..721d105 --- /dev/null +++ b/bico/base/weightmodifier.h @@ -0,0 +1,31 @@ +#ifndef WEIGHTMODIFIER_H +#define WEIGHTMODIFIER_H + +namespace CluE +{ + +/** + * @brief Abstract base class to modify the weight of weighted objects. 
+ * + * @ingroup base_classes + */ +template class WeightModifier +{ +public: + virtual ~WeightModifier() + { + } + + /** + * @brief make an exact copy of this object + * The clone method creates a copy of this object and returns a pointer to the new instance. + */ + virtual WeightModifier* clone() const = 0; + + virtual double getWeight(T&) = 0; + virtual void setWeight(T&, double) = 0; +}; + +} + +#endif diff --git a/bico/clustering/bico.h b/bico/clustering/bico.h new file mode 100644 index 0000000..bdce7cb --- /dev/null +++ b/bico/clustering/bico.h @@ -0,0 +1,1003 @@ +#ifndef BICO_H +#define BICO_H + +#include +#include +#include +#include +#include +#include +#include + +#include "../base/streamingalgorithm.h" +#include "../base/dissimilaritymeasure.h" +#include "../base/solutionprovider.h" +#include "../base/weightmodifier.h" +#include "../base/partitionprovider.h" +#include "../clustering/cfrentry.h" +#include "../datastructure/proxysolution.h" +#include "../evaluation/kmeansevaluator.h" +#include "../exception/invalidruntimeconfigurationexception.h" +#include "../misc/randomness.h" + +namespace CluE +{ + +/** + * @brief Fast computation of k-means coresets in a data stream + * + * BICO maintains a tree which is inspired by the clustering tree of BIRCH, + * a SIGMOD Test of Time award-winning clustering algorithm. + * Each node in the tree represents a subset of these points. Instead of + * storing all points as individual objects, only the number of points, + * the sum and the squared sum of the subset's points are stored as key features + * of each subset. Points are inserted into exactly one node. + * A detailed description of BICO can be found here: + * * Hendrik Fichtenberger, Marc Gillé, Melanie Schmidt, Chris Schwiegelshohn, + * Christian Sohler: BICO: BIRCH Meets Coresets for k-Means Clustering. 
+ * ESA 2013: 481-492 + * + * In this implementation, the nearest neighbour search on the first level + * of the tree is sped up by projecting all points to random 1-d subspaces. + * The first estimation of the optimal clustering cost is computed in a + * buffer phase at the beginning of the algorithm. + */ +template class Bico : public StreamingAlgorithm +{ +private: + + /** + * @brief Class representing a node in BICO's tree + */ + class BicoNode + { + public: + typedef std::pair, BicoNode*> FeaturePair; + typedef std::list FeatureList; + + /** + * Constructs a node for BICO's tree + * @param outer Parent BICO instance + */ + BicoNode(Bico& outer) : + objectId(outer.nodeIdCounter), + outer(outer), + features() + { + ++outer.nodeIdCounter; + } + + /** + * @brief Delete all nodes + */ + void clear() + { + for (auto it = features.begin(); it != features.end(); ++it) + delete it->second; + } + + /** + * Inserts a CFREntry into this node + * @param feature CFREntry to be inserted + * @return Iterator pointing to inserted CFREntry + */ + typename FeatureList::iterator insert(CFREntry const & feature) + { + return features.insert(features.end(), + FeaturePair(feature, new BicoNode(outer))); + } + + /** + * Iterator pointing at the first CFREntry + * @return Begin iterator + */ + typename FeatureList::iterator begin() + { + return features.begin(); + } + + /** + * Iterator pointing behind the last CFREntry + * @return End iterator + */ + typename FeatureList::iterator end() + { + return features.end(); + } + + /** + * Number of contained CFREntries + * @return Number of elements + */ + size_t size() + { + return features.size(); + } + + /** + * Indicates if node is empty + * @return Indicator + */ + bool empty() + { + return features.empty(); + } + + /** + * Returns an iterator to the CFREntry in this node whose reference point + * is nearest to a fixed point + * @param element Fixed point + * @param level Level of this node + * @return Nearest CFREntry + */ + typename 
FeatureList::iterator nearest(T const & element, int level) + { + typename FeatureList::iterator minIt = features.end(); + // Nearest neighbour search based on projections in level 1 + if (level == 1) + { + // Project point and calculate projection bucket number + double val = outer.project(element, 0); + int bucket_number = outer.calcBucketNumber(0, val); + int mini = 0; + int bucket_min = bucket_number; + int mins; + + if ((bucket_number < 0) || (bucket_number > outer.buckets[0].size() - 1)) + { + // The bucket does not exist (yet) + mins = 0; + } + else + { + // Search for the projection with smallest bucket size + mins = outer.buckets[mini][bucket_min].size(); + for (int i = 1; i < outer.L; i++) + { + val = outer.project(element, i); + bucket_number = outer.calcBucketNumber(i, val); + if ((bucket_number >= 0) & (bucket_number <= outer.buckets[i].size() - 1)) + { + int s = outer.buckets[i][bucket_number].size(); + if (s < mins) + { + mins = s; + bucket_min = bucket_number; + mini = i; + } + } + else + { + mins = 0; + bucket_min = bucket_number; + mini = i; + break; + } + } + + } + + bucket_number = bucket_min; + int rnd = mini; + + if (bucket_number < 0) + { + // Bucket does not exist => create one + outer.allocateBucket(rnd, true); + } + else if (bucket_number > outer.buckets[rnd].size() - 1) + { + // Bucket does not exist => create one + outer.allocateBucket(rnd, false); + } + else + { + // Bucket does exist => search nearest point in bucket + double minDist = -1; + + for (auto it = outer.buckets[rnd][bucket_number].begin(); it != outer.buckets[rnd][bucket_number].end(); ++it) + { + double tmpDist = outer.measure->dissimilarity((*it)->first.representative, element); + if (tmpDist < minDist || minDist == -1) + { + minDist = tmpDist; + minIt = (*it); + } + } + + } + } + // Simple nearest neighbour search in all other levels + else + { + double minDist = -1; + for (auto it = features.begin(); it != features.end(); ++it) + { + double tmpDist = 
outer.measure->dissimilarity(it->first.representative, element); + if (tmpDist < minDist || minDist == -1) + { + minDist = tmpDist; + minIt = it; + } + } + } + + return minIt; + } + + /** + * Removes a specified CFREntry + * @param pos Position of the CFREntry to be removed + */ + void erase(typename FeatureList::iterator pos) + { + features.erase(pos); + } + + /** + * Inserts all CFREntries of this node into a given FeatureList + * @param to Destination of insertion + * @param pos Position of insertion + */ + void spliceAllTo(BicoNode* to, typename FeatureList::iterator pos) + { + to->features.splice(pos, features); + } + + /** + * Inserts one CFREntry of this node into a given FeatureList + * @param it CFREntry to be inserted + * @param to Destination of insertion + * @param pos Position of insertion + */ + void spliceElementTo(typename FeatureList::iterator it, BicoNode* to, typename FeatureList::iterator pos) + { + to->features.splice(pos, features, it); + } + + /** + * Returns the unique object id + * @return Object id + */ + int id() + { + return objectId; + } + + private: + /** + * @brief Unique object id + */ + int objectId; + + /** + * @brief Parent BICO instance + */ + Bico& outer; + + /** + * List of all contained CFREntries + */ + FeatureList features; + }; + +public: + /** + * @brief Constructs BICO for points of type T + * T can be an arbitrary type but it has to fulfil the requirements + * of CFREntry. 
+ * + * @param dimension Dimension of the data + * @param k Number of desired centers + * @param p Number of random projections used for nearest neighbour search + * in the first level + * @param nMax Maximum coreset size + * @param measure Implementation of the squared L2 metric for T + * @param weightModifier Class to read and modify weight of T + */ + Bico(size_t dimension, size_t k, size_t p, size_t nMax, int seed, + DissimilarityMeasure* measure, WeightModifier* weightModifier); + + virtual ~Bico(); + + /** + * @brief Returns a coreset of all points read so far + * @return Coreset + */ + virtual ProxySolution* compute(); + + /** + * @brief Read a point + * Insert the point into BICO's tree + * + * @param element Point of type T + * @return This BICO instance + */ + virtual Bico& operator<<(T const & element); + + /** + * @brief Write the tree as GraphViz source into a stream + * @param os Output stream + */ + void print(std::ostream& os); + +private: + /** + * @brief Inserts an element into a BicoNode at a certain level + * @param node BicoNode to be inserted into + * @param level Level of this BicoNode + * @param element Element to be inserted + */ + void insert(BicoNode* node, int level, T const & element); + + /** + * @brief Inserts an element into the nearest neighbour data structure + * @param iteratorElement Feature to be inserted into NN data structure + */ + void insertIntoNN(typename BicoNode::FeatureList::iterator iteratorElement); + + /** + * @brief Initialize nearest neighbour data structure + */ + void initializeNN(); + + /** + * @brief Allocates a new bucket + * @param bucket Number of projection + * @param left Push front bucket (instead of push back) + */ + void allocateBucket(int bucket, bool left); + + /** + * Calculates the bucket number for a given value + * @param rnd Number of the projection + * @param val Value + * @return Bucket number + */ + int calcBucketNumber(int rnd, double val); + + /** + * @brief Projects a point onto a projection 
line + * @param point Point + * @param i Number of projection line + * @return Projected point + */ + double project(T point, int i); + + /** + * @brief Rebuilds the tree + */ + void rebuild(); + + /** + * Rebuilds the first level + * @param parent New root + * @param child Old root + */ + void rebuildFirstLevel(BicoNode* parent, BicoNode* child); + + /** + * Recursive rebuilding of the tree + * @param node Some node to be rebuilt + * @param level Level of this node + */ + void rebuildTraversMerge(BicoNode* node, int level); + + /** + * @brief Recursive computation of the coreset + * @param node Some node to be processed + * @param solution ProxySolution containing the coreset + */ + void computeTraverse(BicoNode* node, ProxySolution* solution); + + /** + * @brief Returns the threshold for a given level + * @param level Level + * @return Threshold at this level + */ + double getT(int level); + + /** + * @brief Returns the radius for a given level + * @param level Level + * @return Radius at this level + */ + double getR(int level); + + /** + * Writes a BicoNode as GraphViz source into a stream + * @param os Output stream + * @param node Some BicoNode + */ + void print(std::ostream& os, BicoNode* node); + + /** + * @brief Number of centers + */ + size_t k; + /** + * @brief Number of projections + */ + size_t L; + + /** + * @brief Random projection vectors + */ + std::vector> rndprojections; + + /** + * @brief Buckets for nearest neighbour search in first level + */ + std::vector> > buckets; + /** + * @brief Bucket borders + */ + std::vector> borders; + /** + * @brief Width of buckets + */ + std::vector bucket_radius; + + /** + * @brief Counter for unique BicoNode object ids + */ + int nodeIdCounter; + + /** + * @brief Buffer for DissimilarityMeasure + */ + std::unique_ptr> measure; + + /** + * @brief Buffer for WeightModifier + */ + std::unique_ptr> weightModifier; + + /** + * @brief Maximum coreset size + */ + size_t maxNumOfCFs; + + /** + * @brief Current 
coreset size + */ + size_t curNumOfCFs; + + /** + * @brief Dimension of the input points + */ + size_t dimension; + + /** + * @brief Current estimation of the optimal clustering cost + */ + double optEst; + + /** + * @brief Extreme values used for constructing the nearest neighbour buckets + */ + std::vector maxVal; + + /** + * @brief Root node of BICO's tree + */ + BicoNode* root; + + /** + * @brief Buffer phase indicator + */ + bool bufferPhase; + + /** + * @brief Buffer phase's buffer + */ + std::vector buffer; + + /** + * @brief Buffer phase's buffer for projected buffer points + */ + std::vector> projection_buffer; + + /** + * @brief Minimum pair distance of two points read in buffer phase + */ + double minDist; + + /** + * @brief Number of unique elements read in buffer phase + */ + size_t pairwise_different; + + /** + * @brief Current number of rebuilding + */ + int numOfRebuilds; +}; + +template