Skip to content

Commit

Permalink
Updaet readme
Browse files Browse the repository at this point in the history
  • Loading branch information
peekxc committed Sep 9, 2024
1 parent 3530a81 commit ee2098d
Show file tree
Hide file tree
Showing 5 changed files with 265 additions and 63 deletions.
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
`geomcover` is a Python package built to simplify constructing geometric set covers over a variety of geometric objects,
such as metric graphs and manifolds. Such covers can be used for simplifying surfaces, constructing fiber bundles,
or summarizing point cloud data.

The package currently includes efficient implementations of approximation algorithms for [the set cover problem](https://en.wikipedia.org/wiki/Set_cover_problem) (SCP), functions for constructing tangent bundles from range spaces, and tools for computing statistics on them (e.g. average alignment of tangent vectors). The package also exports useful auxiliary algorithms often used for preprocessing, such as [principle component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis), [classical multidimensional scaling](https://en.wikipedia.org/wiki/Multidimensional_scaling), and [mean shift](https://en.wikipedia.org/wiki/Mean_shift).

Below are the SCP solvers implemented, along with their relative scalabilities, approximation qualities, and dependencies:

| Algorithm | Scalability | Approximation Quality | Required Dependencies | Solvers |
|-----------------------------|---------------|-----------------------|-------------------------------|-----------------|
| Greedy | Very large $n$| Medium | Numpy | Direct |
| Randomized Rounding (LP) | Large $n$ | High | Numpy / SciPy | `linprog` |
| ILP Method | Medium $n$ | Optimal[^1] | Numpy / SciPy (or `ortools`) | `milp` (or MPSolver) |
| SAT Solver | Small $n$ | Optimal[^2] | pysat | RC2 |

[^1]: The integer-linear program (ILP) by default uses SciPy's port of the the HiGHs solver, which typically finds the global optimum of moderately challenging problems; alternatively, any solver supported by OR-Tools [MPSolver](https://developers.google.com/optimization/lp/mpsolver) can be used if the `ortools` package is installed.

[^2]: The SAT solver used here is a weighted MaxSAT solver, which starts with an approximation solution and then incrementally exhausts the solution space until the optimal solution. While it is the slowest method, the starting approximation has the tightest approximation factor of all the methods listed.

## Installation

The package is a pure Python package, which for now needs to be built from source, e.g.:

```bash
python -m pip install git+https://github.com/peekxc/geomcover.git
```
A PYPI distribution is planned.

## Usage and Docs

For some example usage, see the `notebooks` directory, or examples in the [API docs](https://peekxc.github.io/geomcover/).

44 changes: 25 additions & 19 deletions notebooks/circle.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,37 @@
X = np.c_[x, y]

# %% Plot
p = figure(width=325, height=325, match_aspect=True)
p.line(np.append(x, x[0]), np.append(y, y[0]), color="blue", line_width=2)
p.scatter(x, y, color="blue", size=5)
show(p)
p1 = figure(width=325, height=325, match_aspect=True)
p1.line(np.append(x, x[0]), np.append(y, y[0]), color="blue", line_width=2)
p1.scatter(x, y, color="blue", size=5)
show(p1)

## Our manifold will be the simple cycle graph (+ identity)
n = len(x)
M = cycle_graph(n, k=5) + dia_array(np.eye(n))
M = cycle_graph(n, k=7) + dia_array(np.eye(n))
TM = tangent_bundle(M=M, X=X, d=1, centers=X)

# %% Plot the bundle
p = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
show(p)
p2 = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
show(p2)

# %% Plot the cover
# p = plot_nerve(M, X, width = 450, height = 150)
# show(p)
p3 = plot_nerve(M, X, width=325, height=325, match_aspect=True)
show(p3)

# %% Step 2: choose a bundle weighting scheme
from geomcover.geometry import bundle_weights
from map2color import map2hex

TW = bundle_weights(M, TM, method="cosine", reduce=np.mean) # lambda x: np.ptp(x/2.0)
# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean) # lambda x: np.ptp(x/2.0)
# TW = bundle_weights(M, TM, method="distance", reduce=np.max)
# TW = bundle_weights(M, TM, method="angle", reduce=np.mean) # uses Stiefel Canonical Metric
TW = bundle_weights(M, TM, method="angle", reduce=np.mean) # uses Stiefel Canonical Metric

(x, y), (xs, ys) = plot_tangent_bundle(TM, data=True)
p = figure(width=325, height=325, match_aspect=True)
p.multi_line(xs, ys)
p.scatter(x, y, color=map2hex(TW, "viridis"), size=5, fill_alpha=1.0)
show(p)
p4 = figure(width=325, height=325, match_aspect=True)
p4.multi_line(xs, ys)
p4.scatter(x, y, color=map2hex(TW, "viridis"), size=5, fill_alpha=1.0)
show(p4)

# %% Step 3: form the minimal weight set cover
from geomcover.cover import set_cover
Expand All @@ -54,7 +54,13 @@
cover, cover_weight = set_cover(M, TW, "ILP")
cover_ind = np.flatnonzero(cover)

p = figure(width=325, height=325, match_aspect=True)
p.scatter(*X[cover].T, color="red", size=7, line_color="black")
p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
show(p)
p5 = figure(width=325, height=325, match_aspect=True, title="")
p5.scatter(*X[cover].T, color="red", size=7, line_color="black")
p5.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
show(p5)


# %%
from bokeh.layouts import row

show(row(p1, p2, p3, p4, p5))
91 changes: 52 additions & 39 deletions notebooks/spiral.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from geomcover.geometry import tangent_bundle
from geomcover.plotting import plot_tangent_bundle
from geomcover.plotting import plot_nerve, plot_tangent_bundle

output_notebook()

Expand All @@ -20,9 +20,9 @@ def spiral(N, sigma=1.0):

# %% Data set
X = spiral(N=300, sigma=0.50)
p = figure(width=325, height=325, match_aspect=True)
p.scatter(*X.T, size=4, color="red", line_color="gray")
show(p)
p1 = figure(width=325, height=325, match_aspect=True)
p1.scatter(*X.T, size=4, color="blue", line_color="gray")
show(p1)

# %% First step: tangent bundle estimation
from geomcover.geometry import neighbor_graph_knn
Expand All @@ -34,64 +34,77 @@ def spiral(N, sigma=1.0):
TM = tangent_bundle(M=M, X=X, d=1, centers=X)

## Plot the bundle
p = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
show(p)
p2 = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
show(p2)

## Plot the cover
# p = plot_nerve(M, X, width = 450, height = 150)
# show(p)

# %% Step 2: choose a bundle weighting scheme
from set_cover.covers import bundle_weights
from geomcover.geometry import bundle_weights
from map2color import map2hex

# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean) # lambda x: np.ptp(x/2.0)
# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean) # lambda x: np.ptp(x/2.0)
# TW = bundle_weights(M, TM, method="distance", reduce=np.max)
TW = bundle_weights(M, TM, method="angle", reduce=np.mean) # uses Stiefel Canonical Metric

(x, y), (xs, ys) = plot_tangent_bundle(TM, data=True, c=2.0)
p = figure(width=350, height=350, match_aspect=True)
p.multi_line(xs, ys, color=map2hex(TW, "viridis"))
p.scatter(x, y, color=map2hex(TW, "viridis"), size=3.5, fill_alpha=1.0)
show(p)
p3 = figure(width=350, height=350, match_aspect=True)
p3.multi_line(xs, ys, color=map2hex(TW, "viridis"))
p3.scatter(x, y, color=map2hex(TW, "viridis"), size=3.5, fill_alpha=1.0)
show(p3)


# %% Step 3: form the minimal weight set cover
from set_cover.wset_cover import wset_cover
from geomcover.cover import set_cover

TW_unit = np.ones(len(TW))

cover, cover_weight = wset_cover(M, TW_unit, "greedy")
cover, cover_weight = wset_cover(M, TW_unit, "LP", sparsity=0.25)
cover, cover_weight = wset_cover(M, TW, "LP", sparsity=0.25)
cover, cover_weight = wset_cover(M, TW, "sat")
wset_cover(M, TW_unit, "sat")
# cover, cover_weight = wset_cover(M, TW_unit, "greedy")
# cover, cover_weight = wset_cover(M, TW_unit, "LP", sparsity=0.25)
# cover, cover_weight = wset_cover(M, TW, "LP", sparsity=0.25)
# cover, cover_weight = wset_cover(M, TW, "sat")
# cover, cover_weight = set_cover(M, TW_unit, "ILP")
# cover, cover_weight = wset_cover(M, TW, "sat")
cover_ind = np.flatnonzero(cover)
# cover_ind = np.flatnonzero(cover)
# assert valid_cover(M, ind=np.flatnonzero(cover))
cover, cover_weight = set_cover(M, TW, "ILP")

xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
p = figure(width=350, height=350, match_aspect=True)
p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
p.scatter(*X[cover].T, color="red", size=5, line_color="black")
p.multi_line(xs_c, ys_c, color="red", line_width=4)
show(p)
p4 = figure(width=350, height=350, match_aspect=True)
p4.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
p4.scatter(*X[cover].T, color="red", size=5, line_color="black")
p4.multi_line(xs_c, ys_c, color="red", line_width=3)
show(p4)

# %% Step 4: Tweak the weights to match what we want
from scipy.stats import gaussian_kde
# %%
from bokeh.layouts import row

kde = gaussian_kde(X.T)
density = kde(X.T)
alignment = bundle_weights(M, TM, method="angle", reduce=np.mean)
show(row(p1, p2, p3, p4))

normalize = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
TW = 0.5 * normalize(alignment) + 0.5 * normalize(density)

cover, cover_weight = wset_cover(M, TW, "LP")
cover_ind = np.flatnonzero(cover)
# %% Show nerve simplification
X[cover]

xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
p = figure(width=350, height=350, match_aspect=True)
p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
p.scatter(*X[cover].T, color="red", size=5, line_color="black")
p.multi_line(xs_c, ys_c, color="red", line_width=4)
show(p)
show(plot_nerve(M, X))

# # %% Step 4: Tweak the weights to match what we want
# from scipy.stats import gaussian_kde

# kde = gaussian_kde(X.T)
# density = kde(X.T)
# alignment = bundle_weights(M, TM, method="angle", reduce=np.mean)

# normalize = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
# TW = 0.5 * normalize(alignment) + 0.5 * normalize(density)

# cover, cover_weight = wset_cover(M, TW, "LP")
# cover_ind = np.flatnonzero(cover)

# xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
# p = figure(width=350, height=350, match_aspect=True)
# p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
# p.scatter(*X[cover].T, color="red", size=5, line_color="black")
# p.multi_line(xs_c, ys_c, color="red", line_width=4)
# show(p)
146 changes: 146 additions & 0 deletions notebooks/test_mean_shift.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.palettes import Sunset8
from scipy.ndimage import shift
from scipy.stats import gaussian_kde, multivariate_normal, norm

output_notebook()
# multivariate_normal.pdf(X, mean=[0,0], cov=np.eye(2))
X = np.loadtxt("https://raw.githubusercontent.com/mattnedrich/MeanShift_py/master/data.csv", delimiter=",")
K = gaussian_kde(X.T)

# %% show the test data + the contours
p = figure(width=300, height=300)
p.scatter(*X.T)
show(p)

(x_min, y_min), (x_max, y_max) = np.min(X, axis=0), np.max(X, axis=0)
x_delta, y_delta = 0.10 * np.ptp(X[:, 0]), 0.10 * np.ptp(X[:, 1])
xp = np.linspace(x_min - x_delta, x_max + x_delta, 80)
yp = np.linspace(y_min - y_delta, y_max + y_delta, 80)
xg, yg = np.meshgrid(xp, yp)
Z = K.evaluate(np.vstack([xg.ravel(), yg.ravel()])).reshape(xg.shape)
levels = np.linspace(np.min(Z), np.max(Z), 9)

p = figure(width=550, height=400, match_aspect=True)
contour_renderer = p.contour(xg, yg, Z, levels, fill_alpha=0.0, fill_color=Sunset8, line_color=Sunset8, line_width=3)
colorbar = contour_renderer.construct_color_bar()
p.add_layout(colorbar, "right")
p.scatter(*X.T, fill_alpha=K.evaluate(X.T) / np.max(K.evaluate(X.T)))
show(p)

# %% Perform the mean shift
from geomcover.cluster import mean_shift, assign_clusters, MultivariateNormalZeroMean
from collections import Counter

shift_points = mean_shift(X, bandwidth=K.factor)
cl = assign_clusters(shift_points)

# %%
from map2color import map2hex

# p = figure(width=300, height=300)
p.scatter(*X.T, color=map2hex(cl))
p.scatter(*shift_points.T, color=map2hex(cl), size=8, line_color="gray")
show(p)

# %%
KDE = gaussian_kde(X.T, bw_method=1.0)
cov = Covariance.from_precision(KDE.inv_cov, KDE.covariance)
n, d = X.shape
MVN = MultivariateNormalZeroMean((n, n, d), cov)
# MVN(X)


# import timeit
# X = np.random.uniform(size=(1500, 2))
# n, d = X.shape
# KDE = gaussian_kde(X.T, bw_method=1.0)
# MVN = MultivariateNormalZeroMean(cov)
# delta = X[:, np.newaxis, :] - X[:150, :][np.newaxis, :, :]
# timeit.timeit(lambda: MVN(delta), number=150)
# timeit.timeit(lambda: multivariate_normal.pdf(delta, mean=mu, cov=cov), number=150)


# %% Ensure kDE matches
from scipy.stats import Covariance, multivariate_normal
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.stats import gaussian_kde

# X = np.random.uniform(size=(1500, 2), low=0, high=1)
# mu = np.zeros(2, dtype=np.float64)
# bw = np.repeat(0.25, len(mu))
# cov = Covariance.from_diagonal(bw)

# KDE = gaussian_kde(X.T, bw_method=0.20)
# cov = Covariance.from_precision(KDE.inv_cov, KDE.covariance)

# delta = X[:, np.newaxis, :] - X[np.newaxis, :, :]
# dens1 = np.sum(multivariate_normal.pdf(delta, mean=mu, cov=cov) / len(X), axis=1)
# dens2 = KDE.evaluate(X.T)
# assert np.allclose(dens1, dens2)

# import timeit

# timeit.timeit(lambda: np.sum(multivariate_normal.pdf(delta, mean=mu, cov=cov), axis=1) / len(X), number=10)
# timeit.timeit(lambda: KDE.evaluate(X.T), number=10)

# assert isinstance(kernel, Callable), "Kernel must a be a callable function."

# %% Test two larger gaussians

X1 = multivariate_normal.rvs(size=1500, mean=[-5, 0], cov=1)
X2 = multivariate_normal.rvs(size=1500, mean=[+5, 0], cov=np.diag([1, 2]))
X = np.vstack([X1, X2])

## Blurred mean shift takes 3.7s here, compared to regular MS taking 14.5s
## About 4x faster!
## custom mvn code takes 14.1s, compared to 19.2 w/ scipy
S = X[np.random.choice(range(len(X)), size=500, replace=False)]
SP = mean_shift(X, S, maxiter=200, batch=25)
cl = assign_clusters(SP, atol=0.5)


KDE = gaussian_kde(X.T)
density = KDE.evaluate(X.T)
## Straified samplign approach
# np.quantile(density, np.linspace(0, 1, 10))
cuts = np.linspace(np.min(density), np.max(density), 10)
binned = np.digitize(density, cuts)
bin_counts = np.bincount(binned)
n_samples = 100
(n_samples / len(X)) * bin_counts
# np.random.choice(range(len(X)), replace=False, p=bin_counts/np.sum(bin_counts))

np.digitize(density, bins=10)

p = figure(width=300, height=300)
p.scatter(*X.T, color=map2hex(cl, "turbo"))
# p.scatter(*SP.T, color="red", line_color="gray")
show(p)

from scipy.cluster.hierarchy import fcluster

L = single(pdist(SP))
cl = fcluster(L, 2, criterion="maxclust")


gaussian_kde(X.T)

# multivariate_normal.pdf(X[:5], mean=[0, 0], cov=1)
## covariance object 1.8s -> 1.6s
from line_profiler import LineProfiler
from geomcover.cluster import batch_shift_kernel

profile = LineProfiler()
profile.add_function(mean_shift)
profile.add_function(batch_shift_kernel)
profile.add_function(multivariate_normal.pdf)
profile.add_function(multivariate_normal._logpdf)
profile.add_function(MVN.__call__)
profile.enable_by_count()
# mean_shift(X, maxiter=1000)
MVN(delta)
multivariate_normal.pdf(delta, mean=mu, cov=cov)
profile.print_stats()
Loading

0 comments on commit ee2098d

Please sign in to comment.