Updaet readme

peekxc · Sep 9, 2024 · ee2098d · ee2098d
1 parent 3530a81
commit ee2098d
Show file tree

Hide file tree

Showing 5 changed files with 265 additions and 63 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,32 @@
+`geomcover` is a Python package built to simplify constructing geometric set covers over a variety of geometric objects, 
+such as metric graphs and manifolds. Such covers can be used for simplifying surfaces, constructing fiber bundles, 
+or summarizing point cloud data.
+
+The package currently includes efficient implementations of approximation algorithms for [the set cover problem](https://en.wikipedia.org/wiki/Set_cover_problem) (SCP), functions for constructing tangent bundles from range spaces, and tools for computing statistics on them (e.g. average alignment of tangent vectors). The package also exports useful auxiliary algorithms often used for preprocessing, such as [principle component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis), [classical multidimensional scaling](https://en.wikipedia.org/wiki/Multidimensional_scaling), and [mean shift](https://en.wikipedia.org/wiki/Mean_shift).  
+
+Below are the SCP solvers implemented, along with their relative scalabilities, approximation qualities, and dependencies:
+
+| Algorithm                   | Scalability   | Approximation Quality | Required Dependencies         | Solvers         |
+|-----------------------------|---------------|-----------------------|-------------------------------|-----------------|
+| Greedy                      | Very large $n$| Medium                | Numpy                         | Direct          |
+| Randomized Rounding (LP)    | Large $n$     | High                  | Numpy / SciPy                 | `linprog`       |
+| ILP Method                  | Medium $n$    | Optimal[^1]              | Numpy / SciPy (or `ortools`)  | `milp` (or MPSolver) |
+| SAT Solver                  | Small $n$     | Optimal[^2]               | pysat                         |  RC2                   |
+
+[^1]: The integer-linear program (ILP) by default uses SciPy's port of the the HiGHs solver, which typically finds the global optimum of moderately challenging problems; alternatively, any solver supported by OR-Tools [MPSolver](https://developers.google.com/optimization/lp/mpsolver) can be used if the `ortools` package is installed.  
+
+[^2]: The SAT solver used here is a weighted MaxSAT solver, which starts with an approximation solution and then incrementally exhausts the solution space until the optimal solution. While it is the slowest method, the starting approximation has the tightest approximation factor of all the methods listed. 
+
+## Installation 
+
+The package is a pure Python package, which for now needs to be built from source, e.g.: 
+
+```bash
+python -m pip install git+https://github.com/peekxc/geomcover.git
+```
+A PYPI distribution is planned. 
+
+## Usage and Docs
+
+For some example usage, see the `notebooks` directory, or examples in the [API docs](https://peekxc.github.io/geomcover/). 
+
diff --git a/notebooks/circle.py b/notebooks/circle.py
@@ -15,37 +15,37 @@
 X = np.c_[x, y]
 
 # %% Plot
-p = figure(width=325, height=325, match_aspect=True)
-p.line(np.append(x, x[0]), np.append(y, y[0]), color="blue", line_width=2)
-p.scatter(x, y, color="blue", size=5)
-show(p)
+p1 = figure(width=325, height=325, match_aspect=True)
+p1.line(np.append(x, x[0]), np.append(y, y[0]), color="blue", line_width=2)
+p1.scatter(x, y, color="blue", size=5)
+show(p1)
 
 ## Our manifold will be the simple cycle graph (+ identity)
 n = len(x)
-M = cycle_graph(n, k=5) + dia_array(np.eye(n))
+M = cycle_graph(n, k=7) + dia_array(np.eye(n))
 TM = tangent_bundle(M=M, X=X, d=1, centers=X)
 
 # %% Plot the bundle
-p = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
-show(p)
+p2 = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
+show(p2)
 
 # %% Plot the cover
-# p = plot_nerve(M, X, width = 450, height = 150)
-# show(p)
+p3 = plot_nerve(M, X, width=325, height=325, match_aspect=True)
+show(p3)
 
 # %% Step 2: choose a bundle weighting scheme
 from geomcover.geometry import bundle_weights
 from map2color import map2hex
 
-TW = bundle_weights(M, TM, method="cosine", reduce=np.mean)  # lambda x: np.ptp(x/2.0)
+# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean)  # lambda x: np.ptp(x/2.0)
 # TW = bundle_weights(M, TM, method="distance", reduce=np.max)
-# TW = bundle_weights(M, TM, method="angle", reduce=np.mean)  # uses Stiefel Canonical Metric
+TW = bundle_weights(M, TM, method="angle", reduce=np.mean)  # uses Stiefel Canonical Metric
 
 (x, y), (xs, ys) = plot_tangent_bundle(TM, data=True)
-p = figure(width=325, height=325, match_aspect=True)
-p.multi_line(xs, ys)
-p.scatter(x, y, color=map2hex(TW, "viridis"), size=5, fill_alpha=1.0)
-show(p)
+p4 = figure(width=325, height=325, match_aspect=True)
+p4.multi_line(xs, ys)
+p4.scatter(x, y, color=map2hex(TW, "viridis"), size=5, fill_alpha=1.0)
+show(p4)
 
 # %% Step 3: form the minimal weight set cover
 from geomcover.cover import set_cover
@@ -54,7 +54,13 @@
 cover, cover_weight = set_cover(M, TW, "ILP")
 cover_ind = np.flatnonzero(cover)
 
-p = figure(width=325, height=325, match_aspect=True)
-p.scatter(*X[cover].T, color="red", size=7, line_color="black")
-p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
-show(p)
+p5 = figure(width=325, height=325, match_aspect=True, title="")
+p5.scatter(*X[cover].T, color="red", size=7, line_color="black")
+p5.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
+show(p5)
+
+
+# %%
+from bokeh.layouts import row
+
+show(row(p1, p2, p3, p4, p5))
diff --git a/notebooks/spiral.py b/notebooks/spiral.py
@@ -4,7 +4,7 @@
 from bokeh.io import output_notebook
 from bokeh.plotting import figure, show
 from geomcover.geometry import tangent_bundle
-from geomcover.plotting import plot_tangent_bundle
+from geomcover.plotting import plot_nerve, plot_tangent_bundle
 
 output_notebook()
 
@@ -20,9 +20,9 @@ def spiral(N, sigma=1.0):
 
 # %% Data set
 X = spiral(N=300, sigma=0.50)
-p = figure(width=325, height=325, match_aspect=True)
-p.scatter(*X.T, size=4, color="red", line_color="gray")
-show(p)
+p1 = figure(width=325, height=325, match_aspect=True)
+p1.scatter(*X.T, size=4, color="blue", line_color="gray")
+show(p1)
 
 # %% First step: tangent bundle estimation
 from geomcover.geometry import neighbor_graph_knn
@@ -34,64 +34,77 @@ def spiral(N, sigma=1.0):
 TM = tangent_bundle(M=M, X=X, d=1, centers=X)
 
 ## Plot the bundle
-p = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
-show(p)
+p2 = plot_tangent_bundle(TM, width=325, height=325, match_aspect=True)
+show(p2)
 
 ## Plot the cover
 # p = plot_nerve(M, X, width = 450, height = 150)
 # show(p)
 
 # %% Step 2: choose a bundle weighting scheme
-from set_cover.covers import bundle_weights
+from geomcover.geometry import bundle_weights
 from map2color import map2hex
 
-# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean) # lambda x: np.ptp(x/2.0)
+# TW = bundle_weights(M, TM, method="cosine", reduce=np.mean)  # lambda x: np.ptp(x/2.0)
 # TW = bundle_weights(M, TM, method="distance", reduce=np.max)
 TW = bundle_weights(M, TM, method="angle", reduce=np.mean)  # uses Stiefel Canonical Metric
 
 (x, y), (xs, ys) = plot_tangent_bundle(TM, data=True, c=2.0)
-p = figure(width=350, height=350, match_aspect=True)
-p.multi_line(xs, ys, color=map2hex(TW, "viridis"))
-p.scatter(x, y, color=map2hex(TW, "viridis"), size=3.5, fill_alpha=1.0)
-show(p)
+p3 = figure(width=350, height=350, match_aspect=True)
+p3.multi_line(xs, ys, color=map2hex(TW, "viridis"))
+p3.scatter(x, y, color=map2hex(TW, "viridis"), size=3.5, fill_alpha=1.0)
+show(p3)
+
 
 # %% Step 3: form the minimal weight set cover
-from set_cover.wset_cover import wset_cover
+from geomcover.cover import set_cover
 
 TW_unit = np.ones(len(TW))
 
-cover, cover_weight = wset_cover(M, TW_unit, "greedy")
-cover, cover_weight = wset_cover(M, TW_unit, "LP", sparsity=0.25)
-cover, cover_weight = wset_cover(M, TW, "LP", sparsity=0.25)
-cover, cover_weight = wset_cover(M, TW, "sat")
-wset_cover(M, TW_unit, "sat")
+# cover, cover_weight = wset_cover(M, TW_unit, "greedy")
+# cover, cover_weight = wset_cover(M, TW_unit, "LP", sparsity=0.25)
+# cover, cover_weight = wset_cover(M, TW, "LP", sparsity=0.25)
+# cover, cover_weight = wset_cover(M, TW, "sat")
+# cover, cover_weight = set_cover(M, TW_unit, "ILP")
 # cover, cover_weight = wset_cover(M, TW, "sat")
-cover_ind = np.flatnonzero(cover)
+# cover_ind = np.flatnonzero(cover)
 # assert valid_cover(M, ind=np.flatnonzero(cover))
+cover, cover_weight = set_cover(M, TW, "ILP")
 
 xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
-p = figure(width=350, height=350, match_aspect=True)
-p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
-p.scatter(*X[cover].T, color="red", size=5, line_color="black")
-p.multi_line(xs_c, ys_c, color="red", line_width=4)
-show(p)
+p4 = figure(width=350, height=350, match_aspect=True)
+p4.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
+p4.scatter(*X[cover].T, color="red", size=5, line_color="black")
+p4.multi_line(xs_c, ys_c, color="red", line_width=3)
+show(p4)
 
-# %% Step 4: Tweak the weights to match what we want
-from scipy.stats import gaussian_kde
+# %%
+from bokeh.layouts import row
 
-kde = gaussian_kde(X.T)
-density = kde(X.T)
-alignment = bundle_weights(M, TM, method="angle", reduce=np.mean)
+show(row(p1, p2, p3, p4))
 
-normalize = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
-TW = 0.5 * normalize(alignment) + 0.5 * normalize(density)
 
-cover, cover_weight = wset_cover(M, TW, "LP")
-cover_ind = np.flatnonzero(cover)
+# %% Show nerve simplification
+X[cover]
 
-xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
-p = figure(width=350, height=350, match_aspect=True)
-p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
-p.scatter(*X[cover].T, color="red", size=5, line_color="black")
-p.multi_line(xs_c, ys_c, color="red", line_width=4)
-show(p)
+show(plot_nerve(M, X))
+
+# # %% Step 4: Tweak the weights to match what we want
+# from scipy.stats import gaussian_kde
+
+# kde = gaussian_kde(X.T)
+# density = kde(X.T)
+# alignment = bundle_weights(M, TM, method="angle", reduce=np.mean)
+
+# normalize = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
+# TW = 0.5 * normalize(alignment) + 0.5 * normalize(density)
+
+# cover, cover_weight = wset_cover(M, TW, "LP")
+# cover_ind = np.flatnonzero(cover)
+
+# xs_c, ys_c = list(np.array(xs)[cover]), list(np.array(ys)[cover])
+# p = figure(width=350, height=350, match_aspect=True)
+# p.scatter(*X[~cover].T, color="gray", size=4, line_color="gray", fill_alpha=0.50)
+# p.scatter(*X[cover].T, color="red", size=5, line_color="black")
+# p.multi_line(xs_c, ys_c, color="red", line_width=4)
+# show(p)
diff --git a/notebooks/test_mean_shift.py b/notebooks/test_mean_shift.py
@@ -0,0 +1,146 @@
+import numpy as np
+from bokeh.io import output_notebook
+from bokeh.plotting import figure, show
+from bokeh.palettes import Sunset8
+from scipy.ndimage import shift
+from scipy.stats import gaussian_kde, multivariate_normal, norm
+
+output_notebook()
+# multivariate_normal.pdf(X, mean=[0,0], cov=np.eye(2))
+X = np.loadtxt("https://raw.githubusercontent.com/mattnedrich/MeanShift_py/master/data.csv", delimiter=",")
+K = gaussian_kde(X.T)
+
+# %% show the test data + the contours
+p = figure(width=300, height=300)
+p.scatter(*X.T)
+show(p)
+
+(x_min, y_min), (x_max, y_max) = np.min(X, axis=0), np.max(X, axis=0)
+x_delta, y_delta = 0.10 * np.ptp(X[:, 0]), 0.10 * np.ptp(X[:, 1])
+xp = np.linspace(x_min - x_delta, x_max + x_delta, 80)
+yp = np.linspace(y_min - y_delta, y_max + y_delta, 80)
+xg, yg = np.meshgrid(xp, yp)
+Z = K.evaluate(np.vstack([xg.ravel(), yg.ravel()])).reshape(xg.shape)
+levels = np.linspace(np.min(Z), np.max(Z), 9)
+
+p = figure(width=550, height=400, match_aspect=True)
+contour_renderer = p.contour(xg, yg, Z, levels, fill_alpha=0.0, fill_color=Sunset8, line_color=Sunset8, line_width=3)
+colorbar = contour_renderer.construct_color_bar()
+p.add_layout(colorbar, "right")
+p.scatter(*X.T, fill_alpha=K.evaluate(X.T) / np.max(K.evaluate(X.T)))
+show(p)
+
+# %% Perform the mean shift
+from geomcover.cluster import mean_shift, assign_clusters, MultivariateNormalZeroMean
+from collections import Counter
+
+shift_points = mean_shift(X, bandwidth=K.factor)
+cl = assign_clusters(shift_points)
+
+# %%
+from map2color import map2hex
+
+# p = figure(width=300, height=300)
+p.scatter(*X.T, color=map2hex(cl))
+p.scatter(*shift_points.T, color=map2hex(cl), size=8, line_color="gray")
+show(p)
+
+# %%
+KDE = gaussian_kde(X.T, bw_method=1.0)
+cov = Covariance.from_precision(KDE.inv_cov, KDE.covariance)
+n, d = X.shape
+MVN = MultivariateNormalZeroMean((n, n, d), cov)
+# MVN(X)
+
+
+# import timeit
+# X = np.random.uniform(size=(1500, 2))
+# n, d = X.shape
+# KDE = gaussian_kde(X.T, bw_method=1.0)
+# MVN = MultivariateNormalZeroMean(cov)
+# delta = X[:, np.newaxis, :] - X[:150, :][np.newaxis, :, :]
+# timeit.timeit(lambda: MVN(delta), number=150)
+# timeit.timeit(lambda: multivariate_normal.pdf(delta, mean=mu, cov=cov), number=150)
+
+
+# %% Ensure kDE matches
+from scipy.stats import Covariance, multivariate_normal
+from scipy.spatial.distance import pdist, cdist, squareform
+from scipy.stats import gaussian_kde
+
+# X = np.random.uniform(size=(1500, 2), low=0, high=1)
+# mu = np.zeros(2, dtype=np.float64)
+# bw = np.repeat(0.25, len(mu))
+# cov = Covariance.from_diagonal(bw)
+
+# KDE = gaussian_kde(X.T, bw_method=0.20)
+# cov = Covariance.from_precision(KDE.inv_cov, KDE.covariance)
+
+# delta = X[:, np.newaxis, :] - X[np.newaxis, :, :]
+# dens1 = np.sum(multivariate_normal.pdf(delta, mean=mu, cov=cov) / len(X), axis=1)
+# dens2 = KDE.evaluate(X.T)
+# assert np.allclose(dens1, dens2)
+
+# import timeit
+
+# timeit.timeit(lambda: np.sum(multivariate_normal.pdf(delta, mean=mu, cov=cov), axis=1) / len(X), number=10)
+# timeit.timeit(lambda: KDE.evaluate(X.T), number=10)
+
+# assert isinstance(kernel, Callable), "Kernel must a be a callable function."
+
+# %% Test two larger gaussians
+
+X1 = multivariate_normal.rvs(size=1500, mean=[-5, 0], cov=1)
+X2 = multivariate_normal.rvs(size=1500, mean=[+5, 0], cov=np.diag([1, 2]))
+X = np.vstack([X1, X2])
+
+## Blurred mean shift takes 3.7s here, compared to regular MS taking 14.5s
+## About 4x faster!
+## custom mvn code takes 14.1s, compared to 19.2 w/ scipy
+S = X[np.random.choice(range(len(X)), size=500, replace=False)]
+SP = mean_shift(X, S, maxiter=200, batch=25)
+cl = assign_clusters(SP, atol=0.5)
+
+
+KDE = gaussian_kde(X.T)
+density = KDE.evaluate(X.T)
+## Straified samplign approach
+# np.quantile(density, np.linspace(0, 1, 10))
+cuts = np.linspace(np.min(density), np.max(density), 10)
+binned = np.digitize(density, cuts)
+bin_counts = np.bincount(binned)
+n_samples = 100
+(n_samples / len(X)) * bin_counts
+# np.random.choice(range(len(X)), replace=False, p=bin_counts/np.sum(bin_counts))
+
+np.digitize(density, bins=10)
+
+p = figure(width=300, height=300)
+p.scatter(*X.T, color=map2hex(cl, "turbo"))
+# p.scatter(*SP.T, color="red", line_color="gray")
+show(p)
+
+from scipy.cluster.hierarchy import fcluster
+
+L = single(pdist(SP))
+cl = fcluster(L, 2, criterion="maxclust")
+
+
+gaussian_kde(X.T)
+
+# multivariate_normal.pdf(X[:5], mean=[0, 0], cov=1)
+## covariance object 1.8s -> 1.6s
+from line_profiler import LineProfiler
+from geomcover.cluster import batch_shift_kernel
+
+profile = LineProfiler()
+profile.add_function(mean_shift)
+profile.add_function(batch_shift_kernel)
+profile.add_function(multivariate_normal.pdf)
+profile.add_function(multivariate_normal._logpdf)
+profile.add_function(MVN.__call__)
+profile.enable_by_count()
+# mean_shift(X, maxiter=1000)
+MVN(delta)
+multivariate_normal.pdf(delta, mean=mu, cov=cov)
+profile.print_stats()