From ee2098d1d1adac0c9d16afe0734a9810ab4747bc Mon Sep 17 00:00:00 2001
From: peekxc <matt.piekenbrock@gmail.com>
Date: Mon, 9 Sep 2024 12:36:05 -0400
Subject: [PATCH] Updaet readme

---
 README.md                    |  32 ++++++++
 notebooks/circle.py          |  44 ++++++-----
 notebooks/spiral.py          |  91 ++++++++++++----------
 notebooks/test_mean_shift.py | 146 +++++++++++++++++++++++++++++++++++
 src/geomcover/geometry.py    |  15 ++--
 5 files changed, 265 insertions(+), 63 deletions(-)
 create mode 100644 notebooks/test_mean_shift.py

diff --git a/README.md b/README.md
index e69de29..fd0f74f 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,32 @@
+`geomcover` is a Python package built to simplify constructing geometric set covers over a variety of geometric objects, 
+such as metric graphs and manifolds. Such covers can be used for simplifying surfaces, constructing fiber bundles, 
+or summarizing point cloud data.
+
+The package currently includes efficient implementations of approximation algorithms for [the set cover problem](https://en.wikipedia.org/wiki/Set_cover_problem) (SCP), functions for constructing tangent bundles from range spaces, and tools for computing statistics on them (e.g. average alignment of tangent vectors). The package also exports useful auxiliary algorithms often used for preprocessing, such as [principle component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis), [classical multidimensional scaling](https://en.wikipedia.org/wiki/Multidimensional_scaling), and [mean shift](https://en.wikipedia.org/wiki/Mean_shift).  
+
+Below are the SCP solvers implemented, along with their relative scalabilities, approximation qualities, and dependencies:
+
+| Algorithm                   | Scalability   | Approximation Quality | Required Dependencies         | Solvers         |
+|-----------------------------|---------------|-----------------------|-------------------------------|-----------------|
+| Greedy                      | Very large $n$| Medium                | Numpy                         | Direct          |
+| Randomized Rounding (LP)    | Large $n$     | High                  | Numpy / SciPy                 | `linprog`       |
+| ILP Method                  | Medium $n$    | Optimal[^1]              | Numpy / SciPy (or `ortools`)  | `milp` (or MPSolver) |
+| SAT Solver                  | Small $n$     | Optimal[^2]               | pysat                         |  RC2                   |
+
+[^1]: The integer-linear program (ILP) by default uses SciPy's port of the the HiGHs solver, which typically finds the global optimum of moderately challenging problems; alternatively, any solver supported by OR-Tools [MPSolver](https://developers.google.com/optimization/lp/mpsolver) can be used if the `ortools` package is installed.  
+
+[^2]: The SAT solver used here is a weighted MaxSAT solver, which starts with an approximation solution and then incrementally exhausts the solution space until the optimal solution. While it is the slowest method, the starting approximation has the tightest approximation factor of all the methods listed. 
+
+## Installation 
+
+The package is a pure Python package, which for now needs to be built from source, e.g.: 
+
+```bash
+python -m pip install git+https://github.com/peekxc/geomcover.git
+```
+A PYPI distribution is planned. 
+
+## Usage and Docs
+
+For some example usage, see the `notebooks` directory, or examples in the [API docs](https://peekxc.github.io/geomcover/). 
+
diff --git a/notebooks/circle.py b/notebooks/circle.py
index 7899655..963456e 100644
--- a/notebooks/circle.py
+++ b/notebooks/circle.py
@@ -15,37 +15,37 @@
 X = np.c_[x, y]
 
 # %% Plot
-p = figure(width=325, height=325, match_aspect=True)
-p.line(np.append(x, x[0]), np.append(y, y[0]), color="blue", line_width=2)
-p.scatter(x, y, color="blue", size=5)
-show(p)
+p1 = figure(width=325, height=325, match_aspect=True)
+p1.line(np.append(x, x[0]), np.append(y, y[0]), color="blue", line_width=2)
+p1.scatter(x, y, color="blue", size=5)
+show(p1)
 
 ## Our manifold will be the simple cycle graph (+ identity)
 n = len(x)
-M = cycle_graph(n, k=5) + dia_array(np.eye(n))
+M = cycle_graph(n, k=7) + dia_array(np.eye(n))
 TM = tangent_bundle(M=M, X=X, d=1, centers=X)
 
 # %% Plot the bundle
-p = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
-show(p)
+p2 = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
+show(p2)
 
 # %% Plot the cover
-# p = plot_nerve(M, X, width = 450, height = 150)
-# show(p)
+p3 = plot_nerve(M, X, width=325, height=325, match_aspect=True)
+show(p3)
 
 # %% Step 2: choose a bundle weighting scheme
 from geomcover.geometry import bundle_weights
 from map2color import map2hex
 
-TW = bundle_weights(M, TM, method="cosine", reduce=np.mean)  # lambda x: np.ptp(x/2.0)
+# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean)  # lambda x: np.ptp(x/2.0)
 # TW = bundle_weights(M, TM, method="distance", reduce=np.max)
-# TW = bundle_weights(M, TM, method="angle", reduce=np.mean)  # uses Stiefel Canonical Metric
+TW = bundle_weights(M, TM, method="angle", reduce=np.mean)  # uses Stiefel Canonical Metric
 
 (x, y), (xs, ys) = plot_tangent_bundle(TM, data=True)
-p = figure(width=325, height=325, match_aspect=True)
-p.multi_line(xs, ys)
-p.scatter(x, y, color=map2hex(TW, "viridis"), size=5, fill_alpha=1.0)
-show(p)
+p4 = figure(width=325, height=325, match_aspect=True)
+p4.multi_line(xs, ys)
+p4.scatter(x, y, color=map2hex(TW, "viridis"), size=5, fill_alpha=1.0)
+show(p4)
 
 # %% Step 3: form the minimal weight set cover
 from geomcover.cover import set_cover
@@ -54,7 +54,13 @@
 cover, cover_weight = set_cover(M, TW, "ILP")
 cover_ind = np.flatnonzero(cover)
 
-p = figure(width=325, height=325, match_aspect=True)
-p.scatter(*X[cover].T, color="red", size=7, line_color="black")
-p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
-show(p)
+p5 = figure(width=325, height=325, match_aspect=True, title="")
+p5.scatter(*X[cover].T, color="red", size=7, line_color="black")
+p5.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
+show(p5)
+
+
+# %%
+from bokeh.layouts import row
+
+show(row(p1, p2, p3, p4, p5))
diff --git a/notebooks/spiral.py b/notebooks/spiral.py
index 21e59c8..3294384 100644
--- a/notebooks/spiral.py
+++ b/notebooks/spiral.py
@@ -4,7 +4,7 @@
 from bokeh.io import output_notebook
 from bokeh.plotting import figure, show
 from geomcover.geometry import tangent_bundle
-from geomcover.plotting import plot_tangent_bundle
+from geomcover.plotting import plot_nerve, plot_tangent_bundle
 
 output_notebook()
 
@@ -20,9 +20,9 @@ def spiral(N, sigma=1.0):
 
 # %% Data set
 X = spiral(N=300, sigma=0.50)
-p = figure(width=325, height=325, match_aspect=True)
-p.scatter(*X.T, size=4, color="red", line_color="gray")
-show(p)
+p1 = figure(width=325, height=325, match_aspect=True)
+p1.scatter(*X.T, size=4, color="blue", line_color="gray")
+show(p1)
 
 # %% First step: tangent bundle estimation
 from geomcover.geometry import neighbor_graph_knn
@@ -34,64 +34,77 @@ def spiral(N, sigma=1.0):
 TM = tangent_bundle(M=M, X=X, d=1, centers=X)
 
 ## Plot the bundle
-p = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
-show(p)
+p2 = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
+show(p2)
 
 ## Plot the cover
 # p = plot_nerve(M, X, width = 450, height = 150)
 # show(p)
 
 # %% Step 2: choose a bundle weighting scheme
-from set_cover.covers import bundle_weights
+from geomcover.geometry import bundle_weights
 from map2color import map2hex
 
-# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean) # lambda x: np.ptp(x/2.0)
+# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean)  # lambda x: np.ptp(x/2.0)
 # TW = bundle_weights(M, TM, method="distance", reduce=np.max)
 TW = bundle_weights(M, TM, method="angle", reduce=np.mean)  # uses Stiefel Canonical Metric
 
 (x, y), (xs, ys) = plot_tangent_bundle(TM, data=True, c=2.0)
-p = figure(width=350, height=350, match_aspect=True)
-p.multi_line(xs, ys, color=map2hex(TW, "viridis"))
-p.scatter(x, y, color=map2hex(TW, "viridis"), size=3.5, fill_alpha=1.0)
-show(p)
+p3 = figure(width=350, height=350, match_aspect=True)
+p3.multi_line(xs, ys, color=map2hex(TW, "viridis"))
+p3.scatter(x, y, color=map2hex(TW, "viridis"), size=3.5, fill_alpha=1.0)
+show(p3)
+
 
 # %% Step 3: form the minimal weight set cover
-from set_cover.wset_cover import wset_cover
+from geomcover.cover import set_cover
 
 TW_unit = np.ones(len(TW))
 
-cover, cover_weight = wset_cover(M, TW_unit, "greedy")
-cover, cover_weight = wset_cover(M, TW_unit, "LP", sparsity=0.25)
-cover, cover_weight = wset_cover(M, TW, "LP", sparsity=0.25)
-cover, cover_weight = wset_cover(M, TW, "sat")
-wset_cover(M, TW_unit, "sat")
+# cover, cover_weight = wset_cover(M, TW_unit, "greedy")
+# cover, cover_weight = wset_cover(M, TW_unit, "LP", sparsity=0.25)
+# cover, cover_weight = wset_cover(M, TW, "LP", sparsity=0.25)
+# cover, cover_weight = wset_cover(M, TW, "sat")
+# cover, cover_weight = set_cover(M, TW_unit, "ILP")
 # cover, cover_weight = wset_cover(M, TW, "sat")
-cover_ind = np.flatnonzero(cover)
+# cover_ind = np.flatnonzero(cover)
 # assert valid_cover(M, ind=np.flatnonzero(cover))
+cover, cover_weight = set_cover(M, TW, "ILP")
 
 xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
-p = figure(width=350, height=350, match_aspect=True)
-p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
-p.scatter(*X[cover].T, color="red", size=5, line_color="black")
-p.multi_line(xs_c, ys_c, color="red", line_width=4)
-show(p)
+p4 = figure(width=350, height=350, match_aspect=True)
+p4.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
+p4.scatter(*X[cover].T, color="red", size=5, line_color="black")
+p4.multi_line(xs_c, ys_c, color="red", line_width=3)
+show(p4)
 
-# %% Step 4: Tweak the weights to match what we want
-from scipy.stats import gaussian_kde
+# %%
+from bokeh.layouts import row
 
-kde = gaussian_kde(X.T)
-density = kde(X.T)
-alignment = bundle_weights(M, TM, method="angle", reduce=np.mean)
+show(row(p1, p2, p3, p4))
 
-normalize = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
-TW = 0.5 * normalize(alignment) + 0.5 * normalize(density)
 
-cover, cover_weight = wset_cover(M, TW, "LP")
-cover_ind = np.flatnonzero(cover)
+# %% Show nerve simplification
+X[cover]
 
-xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
-p = figure(width=350, height=350, match_aspect=True)
-p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
-p.scatter(*X[cover].T, color="red", size=5, line_color="black")
-p.multi_line(xs_c, ys_c, color="red", line_width=4)
-show(p)
+show(plot_nerve(M, X))
+
+# # %% Step 4: Tweak the weights to match what we want
+# from scipy.stats import gaussian_kde
+
+# kde = gaussian_kde(X.T)
+# density = kde(X.T)
+# alignment = bundle_weights(M, TM, method="angle", reduce=np.mean)
+
+# normalize = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
+# TW = 0.5 * normalize(alignment) + 0.5 * normalize(density)
+
+# cover, cover_weight = wset_cover(M, TW, "LP")
+# cover_ind = np.flatnonzero(cover)
+
+# xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
+# p = figure(width=350, height=350, match_aspect=True)
+# p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
+# p.scatter(*X[cover].T, color="red", size=5, line_color="black")
+# p.multi_line(xs_c, ys_c, color="red", line_width=4)
+# show(p)
diff --git a/notebooks/test_mean_shift.py b/notebooks/test_mean_shift.py
new file mode 100644
index 0000000..5d5d3ff
--- /dev/null
+++ b/notebooks/test_mean_shift.py
@@ -0,0 +1,146 @@
+import numpy as np
+from bokeh.io import output_notebook
+from bokeh.plotting import figure, show
+from bokeh.palettes import Sunset8
+from scipy.ndimage import shift
+from scipy.stats import gaussian_kde, multivariate_normal, norm
+
+output_notebook()
+# multivariate_normal.pdf(X, mean=[0,0], cov=np.eye(2))
+X = np.loadtxt("https://raw.githubusercontent.com/mattnedrich/MeanShift_py/master/data.csv", delimiter=",")
+K = gaussian_kde(X.T)
+
+# %% show the test data + the contours
+p = figure(width=300, height=300)
+p.scatter(*X.T)
+show(p)
+
+(x_min, y_min), (x_max, y_max) = np.min(X, axis=0), np.max(X, axis=0)
+x_delta, y_delta = 0.10 * np.ptp(X[:, 0]), 0.10 * np.ptp(X[:, 1])
+xp = np.linspace(x_min - x_delta, x_max + x_delta, 80)
+yp = np.linspace(y_min - y_delta, y_max + y_delta, 80)
+xg, yg = np.meshgrid(xp, yp)
+Z = K.evaluate(np.vstack([xg.ravel(), yg.ravel()])).reshape(xg.shape)
+levels = np.linspace(np.min(Z), np.max(Z), 9)
+
+p = figure(width=550, height=400, match_aspect=True)
+contour_renderer = p.contour(xg, yg, Z, levels, fill_alpha=0.0, fill_color=Sunset8, line_color=Sunset8, line_width=3)
+colorbar = contour_renderer.construct_color_bar()
+p.add_layout(colorbar, "right")
+p.scatter(*X.T, fill_alpha=K.evaluate(X.T) / np.max(K.evaluate(X.T)))
+show(p)
+
+# %% Perform the mean shift
+from geomcover.cluster import mean_shift, assign_clusters, MultivariateNormalZeroMean
+from collections import Counter
+
+shift_points = mean_shift(X, bandwidth=K.factor)
+cl = assign_clusters(shift_points)
+
+# %%
+from map2color import map2hex
+
+# p = figure(width=300, height=300)
+p.scatter(*X.T, color=map2hex(cl))
+p.scatter(*shift_points.T, color=map2hex(cl), size=8, line_color="gray")
+show(p)
+
+# %%
+KDE = gaussian_kde(X.T, bw_method=1.0)
+cov = Covariance.from_precision(KDE.inv_cov, KDE.covariance)
+n, d = X.shape
+MVN = MultivariateNormalZeroMean((n, n, d), cov)
+# MVN(X)
+
+
+# import timeit
+# X = np.random.uniform(size=(1500, 2))
+# n, d = X.shape
+# KDE = gaussian_kde(X.T, bw_method=1.0)
+# MVN = MultivariateNormalZeroMean(cov)
+# delta = X[:, np.newaxis, :] - X[:150, :][np.newaxis, :, :]
+# timeit.timeit(lambda: MVN(delta), number=150)
+# timeit.timeit(lambda: multivariate_normal.pdf(delta, mean=mu, cov=cov), number=150)
+
+
+# %% Ensure kDE matches
+from scipy.stats import Covariance, multivariate_normal
+from scipy.spatial.distance import pdist, cdist, squareform
+from scipy.stats import gaussian_kde
+
+# X = np.random.uniform(size=(1500, 2), low=0, high=1)
+# mu = np.zeros(2, dtype=np.float64)
+# bw = np.repeat(0.25, len(mu))
+# cov = Covariance.from_diagonal(bw)
+
+# KDE = gaussian_kde(X.T, bw_method=0.20)
+# cov = Covariance.from_precision(KDE.inv_cov, KDE.covariance)
+
+# delta = X[:, np.newaxis, :] - X[np.newaxis, :, :]
+# dens1 = np.sum(multivariate_normal.pdf(delta, mean=mu, cov=cov) / len(X), axis=1)
+# dens2 = KDE.evaluate(X.T)
+# assert np.allclose(dens1, dens2)
+
+# import timeit
+
+# timeit.timeit(lambda: np.sum(multivariate_normal.pdf(delta, mean=mu, cov=cov), axis=1) / len(X), number=10)
+# timeit.timeit(lambda: KDE.evaluate(X.T), number=10)
+
+# assert isinstance(kernel, Callable), "Kernel must a be a callable function."
+
+# %% Test two larger gaussians
+
+X1 = multivariate_normal.rvs(size=1500, mean=[-5, 0], cov=1)
+X2 = multivariate_normal.rvs(size=1500, mean=[+5, 0], cov=np.diag([1, 2]))
+X = np.vstack([X1, X2])
+
+## Blurred mean shift takes 3.7s here, compared to regular MS taking 14.5s
+## About 4x faster!
+## custom mvn code takes 14.1s, compared to 19.2 w/ scipy
+S = X[np.random.choice(range(len(X)), size=500, replace=False)]
+SP = mean_shift(X, S, maxiter=200, batch=25)
+cl = assign_clusters(SP, atol=0.5)
+
+
+KDE = gaussian_kde(X.T)
+density = KDE.evaluate(X.T)
+## Straified samplign approach
+# np.quantile(density, np.linspace(0, 1, 10))
+cuts = np.linspace(np.min(density), np.max(density), 10)
+binned = np.digitize(density, cuts)
+bin_counts = np.bincount(binned)
+n_samples = 100
+(n_samples / len(X)) * bin_counts
+# np.random.choice(range(len(X)), replace=False, p=bin_counts/np.sum(bin_counts))
+
+np.digitize(density, bins=10)
+
+p = figure(width=300, height=300)
+p.scatter(*X.T, color=map2hex(cl, "turbo"))
+# p.scatter(*SP.T, color="red", line_color="gray")
+show(p)
+
+from scipy.cluster.hierarchy import fcluster
+
+L = single(pdist(SP))
+cl = fcluster(L, 2, criterion="maxclust")
+
+
+gaussian_kde(X.T)
+
+# multivariate_normal.pdf(X[:5], mean=[0, 0], cov=1)
+## covariance object 1.8s -> 1.6s
+from line_profiler import LineProfiler
+from geomcover.cluster import batch_shift_kernel
+
+profile = LineProfiler()
+profile.add_function(mean_shift)
+profile.add_function(batch_shift_kernel)
+profile.add_function(multivariate_normal.pdf)
+profile.add_function(multivariate_normal._logpdf)
+profile.add_function(MVN.__call__)
+profile.enable_by_count()
+# mean_shift(X, maxiter=1000)
+MVN(delta)
+multivariate_normal.pdf(delta, mean=mu, cov=cov)
+profile.print_stats()
diff --git a/src/geomcover/geometry.py b/src/geomcover/geometry.py
index 9ced553..1ae1c4b 100644
--- a/src/geomcover/geometry.py
+++ b/src/geomcover/geometry.py
@@ -8,7 +8,7 @@
 import numpy as np
 from combin import inverse_choose
 from numpy.typing import ArrayLike
-from scipy.sparse import coo_array, coo_matrix, csr_array, find, sparray
+from scipy.sparse import coo_array, coo_matrix, csr_array, find, sparray, diags
 from scipy.spatial import Delaunay
 from scipy.spatial.distance import cdist
 
@@ -104,7 +104,7 @@ def bundle_weights(M: sparray, TM: list, method: str, reduce: Union[str, Callabl
 	measures can at times be useful for constructing nerve complexes, removing outliers, detecting locally smooth areas, etc. 
 
 	The methods supported include 'distance', 'cosine', and 'angle', which do the following:
-	
+
 		* 'distance': Measures the distance from each neighborhood point to its projection onto the tangent space using the Euclidean norm.
 		* 'cosine': Measures the distance from each neighborhood tangent vector to a fixed tangent vector using the cosine distance. 
 		* 'angle': Measures the distance from each neighborhood tangent vector to a fixed tangent vector using the stiefel canonical metric. 
@@ -186,7 +186,9 @@ def neighbor_graph_ball(
 		C.extend(ind[c])
 		V.extend(v.astype(dtype))
 	G = coo_matrix((V, (R, C)), shape=(n, n), dtype=dtype)
-	return to_canonical(G, form="csc", diag=True, symmetrize=False)
+	G = to_canonical(G, form="csc")
+	G += diags(np.ones(G.shape[0]))
+	return G
 
 
 def neighbor_graph_knn(
@@ -208,7 +210,9 @@ def neighbor_graph_knn(
 	R, C = np.array(R), np.array(C)
 	dtype = np.float32 if weighted else bool
 	G = coo_matrix((V, (R, C)), shape=(n, n), dtype=dtype)
-	return to_canonical(G, form="csc", diag=True, symmetrize=False)
+	G = to_canonical(G, form="csc")
+	G += diags(np.ones(G.shape[0]))
+	return G
 
 
 def neighbor_graph_del(X: ArrayLike, weighted: bool = False, **kwargs):
@@ -224,7 +228,8 @@ def neighbor_graph_del(X: ArrayLike, weighted: bool = False, **kwargs):
 	V = np.ones(len(R)) if not weighted else np.linalg.norm(X[R] - X[C], axis=1)
 	dtype = np.bool if not weighted else np.float32
 	G = coo_array((V, (R, C)), shape=(n, n), dtype=dtype)
-	return to_canonical(G, form="csc", diag=True, symmetrize=False)
+	G += diags(np.ones(G.shape[0]))
+	return G
 
 
 def tangent_neighbor_graph(X: ArrayLike, d: int, r: float, ind=None):