Skip to content

Commit

Permalink
Merge pull request #89 from jakirkham/add_squareform
Browse files Browse the repository at this point in the history
Add squareform
  • Loading branch information
jakirkham authored Oct 9, 2017
2 parents 8dc9363 + 5d43aab commit 307e016
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 9 deletions.
87 changes: 79 additions & 8 deletions dask_distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from __future__ import division, unicode_literals

import numpy

import dask
import dask.array

Expand Down Expand Up @@ -165,16 +167,85 @@ def pdist(X, metric="euclidean", **kwargs):
if "V" not in kwargs:
kwargs["V"] = dask.array.var(X, axis=0, ddof=1)

result = cdist(X, X, metric, **kwargs)
result = squareform(cdist(X, X, metric, **kwargs), force="tovec")

result = [
result[i, i + 1:] for i in _pycompat.irange(0, len(result) - 1)
]
return result

if result:
result = dask.array.concatenate(result)
else:
result = dask.array.empty((0,), dtype=float, chunks=(1,))

def squareform(X, force="no"):
    """
    Converts between condensed distance vectors and square distance matrices

    A 1-D input is expanded into a square matrix with a zero diagonal whose
    upper and lower triangles both hold the given values. A 2-D square input
    is reduced to the row-wise concatenation of its strict upper triangle.

    Args:
        X:      1-D vector of distances or 2-D square matrix of distances
        force:  ``"tovec"`` or ``"tomatrix"`` to require that conversion;
                any other value picks the conversion from ``X.ndim``.
                Values with a ``decode`` method (e.g. bytes) are decoded
                as UTF-8 first.

    Returns:
        array:  2-D square matrix (from a vector) or 1-D vector (from a
                matrix) of distances

    Raises:
        ValueError: if ``X`` is not 1-D or 2-D, is not square when a vector
                    is requested, is not 1-D when a matrix is requested, or
                    has a length that is not ``d * (d - 1) / 2`` for an
                    integer ``d``.
    """

    X = _compat._asarray(X)

    try:
        force = force.decode("utf-8")
    except AttributeError:
        pass

    # Resolve the conversion direction, inferring it when not forced.
    if force in ("tovec", "tomatrix"):
        mode = force
    elif X.ndim == 1:
        mode = "tomatrix"
    elif X.ndim == 2:
        mode = "tovec"
    else:
        raise ValueError("X must be a vector or a square matrix.")

    if mode == "tovec":
        if not (X.ndim == 2 and X.shape[0] == X.shape[1]):
            raise ValueError("X must be a square matrix.")

        # Row i contributes everything to the right of the diagonal.
        upper_rows = [
            X[i, i + 1:] for i in range(0, len(X) - 1)
        ]

        if not upper_rows:
            return dask.array.empty((0,), dtype=X.dtype, chunks=(1,))

        return dask.array.concatenate(upper_rows)

    # Remaining case: mode == "tomatrix"
    if X.ndim != 1:
        raise ValueError("X must be a vector.")

    n = len(X)

    # Solve d * (d - 1) / 2 == n for the matrix side length d.
    d = int(numpy.round((1.0 + numpy.sqrt(1.0 + 8.0 * float(n))) / 2.0))

    if (2 * n) != (d * (d - 1)):
        raise ValueError("Unacceptable length for X.")

    # Cut X into the rows of the strict upper triangle; their widths
    # shrink from d - 1 down to 0.
    upper_rows = []
    start = 0
    for width in _pycompat.irange(d - 1, -1, -1):
        stop = start + width
        upper_rows.append(X[start:stop])
        start = stop

    diag_zero = dask.array.zeros((1,), dtype=X.dtype, chunks=(1,))

    rows = []
    for i in range(d):
        # Entries left of the diagonal are mirrored from the earlier rows.
        pieces = [
            upper_rows[j][i - j - 1:i - j] for j in range(i)
        ]
        pieces.append(diag_zero)
        pieces.append(upper_rows[i])

        # Empty pieces are dropped before concatenating.
        rows.append(dask.array.concatenate([
            p for p in pieces if p.size
        ]))

    return dask.array.stack(rows)

Expand Down
65 changes: 64 additions & 1 deletion tests/test_dask_distance.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import absolute_import, division

import numpy as np
import scipy.spatial.distance as spdist
Expand Down Expand Up @@ -369,3 +369,66 @@ def test_2d_bool_pdist(metric, seed, u_shape, u_chunks):

assert d_r.shape == a_r.shape
assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)


@pytest.mark.parametrize(
    "et, X_shape, X_chunks, force", [
        (ValueError, (4, 3, 2), (2, 2, 2), "no"),
        (ValueError, (4, 3, 2), (2, 2, 2), "tovec"),
        (ValueError, (4, 3, 2), (2, 2, 2), "tomatrix"),
        (ValueError, (4,), (2,), "tovec"),
        (ValueError, (4, 3), (2, 2), "tomatrix"),
        (ValueError, (4, 3), (2, 2), "no"),
        (ValueError, (4, 3), (2, 2), "tovec"),
        (ValueError, (2,), (2,), "no"),
        (ValueError, (2,), (2,), "tomatrix"),
    ]
)
def test_squareform_err(et, X_shape, X_chunks, force):
    """
    Checks that ``squareform`` raises ``et`` for each parametrized input.

    Each case builds a random array of shape ``X_shape``, wraps it in a
    Dask array chunked by ``X_chunks``, and calls ``squareform`` with the
    given ``force`` value.
    """

    # Fixed seed so the generated input is reproducible.
    np.random.seed(0)

    values = np.random.random(X_shape)
    lazy_values = da.from_array(values, chunks=X_chunks)

    with pytest.raises(et):
        dask_distance.squareform(lazy_values, force=force)


@pytest.mark.parametrize(
    "X_shape, X_chunks, force", [
        ((0,), (1,), "no"),
        ((0,), (1,), "tomatrix"),
        ((0, 0), (1, 1), "no"),
        ((0, 0), (1, 1), "tovec"),
        ((1,), (1,), "no"),
        ((1,), (1,), "tomatrix"),
        ((3,), (2,), "no"),
        ((3,), (2,), "tomatrix"),
        ((6,), (2,), "no"),
        ((6,), (2,), "tomatrix"),
        ((1, 1), (1, 1), "no"),
        ((1, 1), (1, 1), "tovec"),
        ((3, 3), (1, 2), "no"),
        ((3, 3), (1, 2), "tovec"),
        ((10, 10), (4, 5), "no"),
        ((10, 10), (4, 5), "tovec"),
    ]
)
@pytest.mark.parametrize(
    "seed", [
        0,
        137,
    ]
)
def test_squareform(seed, X_shape, X_chunks, force):
    """
    Compares ``dask_distance.squareform`` against SciPy's ``squareform``.

    A random array of shape ``X_shape`` is converted by SciPy (called with
    ``checks=False``) and, after being wrapped in a Dask array chunked by
    ``X_chunks``, by ``dask_distance``. The two results must agree in
    shape, dtype, and values.
    """

    np.random.seed(seed)

    values = np.random.random(X_shape)
    lazy_values = da.from_array(values, chunks=X_chunks)

    expected = spdist.squareform(values, force=force, checks=False)
    actual = dask_distance.squareform(lazy_values, force=force)

    assert actual.shape == expected.shape
    assert actual.dtype == expected.dtype

    # Materialize the Dask result before comparing values.
    actual_values = np.array(actual)[()]
    assert np.allclose(actual_values, expected, equal_nan=True)

0 comments on commit 307e016

Please sign in to comment.