-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkmeans.py
209 lines (186 loc) · 7.51 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python
# kmeans.py using any of the 20-odd metrics in scipy.spatial.distance
# kmeanssample 2 pass, first sample sqrt(N)
from __future__ import division
import random
import numpy as np
from scipy.spatial.distance import cdist # $scipy/spatial/distance.py
# http://docs.scipy.org/doc/scipy/reference/spatial.html
from scipy.sparse import issparse # $scipy/sparse/csr.py
__date__ = "2011-11-17 Nov denis"
# X sparse, any cdist metric: real app ?
# centres get dense rapidly, metrics in high dim hit distance whiteout
# vs unsupervised / semi-supervised svm
# ...............................................................................
def kmeans(X, centres, delta=.001, maxiter=10,
metric="euclidean", p=2, verbose=1):
""" centres, Xtocentre, distances = kmeans( X, initial centres ... )
in:
X N x dim may be sparse
centres k x dim: initial centres, e.g. random.sample( X, k )
delta: relative error, iterate until the average distance to centres
is within delta of the previous average distance
maxiter
metric: any of the 20-odd in scipy.spatial.distance
"chebyshev" = max, "cityblock" = L1, "minkowski" with p=
or a function( Xvec, centrevec ), e.g. Lqmetric below
p: for minkowski metric -- local mod cdist for 0 < p < 1 too
verbose: 0 silent, 2 prints running distances
out:
centres, k x dim
Xtocentre: each X -> its nearest centre, ints N -> k
distances, N
see also: kmeanssample below, class Kmeans below.
"""
if not issparse(X):
X = np.asanyarray(X) # ?
centres = centres.todense() if issparse(centres) \
else centres.copy()
N, dim = X.shape
k, cdim = centres.shape
if dim != cdim:
raise ValueError(
"kmeans: X %s and centres %s must have the same number of columns"
% (X.shape, centres.shape))
if verbose:
print("kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s" % (
X.shape, centres.shape, delta, maxiter, metric))
allx = np.arange(N)
prevdist = 0
for jiter in range(1, maxiter+1):
D = cdist_sparse(X, centres, metric=metric, p=p) # |X| x |centres|
xtoc = D.argmin(axis=1) # X -> nearest centre
distances = D[allx, xtoc]
avdist = distances.mean() # median ?
if verbose >= 2:
print("kmeans: av |X - nearest centre| = %.4g" % avdist)
if (1 - delta) * prevdist <= avdist <= prevdist \
or jiter == maxiter:
break
prevdist = avdist
for jc in range(k): # (1 pass in C)
c = np.where(xtoc == jc)[0]
if len(c) > 0:
centres[jc] = X[c].mean(axis=0)
if verbose:
print("kmeans: %d iterations cluster sizes: %r"
% (jiter, np.bincount(xtoc)))
if verbose >= 2:
r50 = np.zeros(k)
r90 = np.zeros(k)
for j in range(k):
dist = distances[xtoc == j]
if len(dist) > 0:
r50[j], r90[j] = np.percentile(dist, (50, 90))
print("kmeans: cluster 50 % radius", r50.astype(int))
print("kmeans: cluster 90 % radius", r90.astype(int))
# scale L1 / dim, L2 / sqrt(dim) ?
return centres, xtoc, distances
# ...............................................................................
def kmeanssample(X, k, nsample=0, **kwargs):
""" 2-pass kmeans, fast for large N:
1) kmeans a random sample of nsample ~ sqrt(N) from X
2) full kmeans, starting from those centres
"""
# merge w kmeans ? mttiw
# v large N: sample N^1/2, N^1/2 of that
# seed like sklearn ?
N, dim = X.shape
if nsample == 0:
nsample = max(2*np.sqrt(N), 10*k)
Xsample = randomsample(X, int(nsample))
pass1centres = randomsample(X, int(k))
samplecentres = kmeans(Xsample, pass1centres, **kwargs)[0]
return kmeans(X, samplecentres, **kwargs)
def cdist_sparse(X, Y, **kwargs):
""" -> |X| x |Y| cdist array, any cdist metric
X or Y may be sparse -- best csr
"""
# todense row at a time, v slow if both v sparse
sxy = 2*issparse(X) + issparse(Y)
if sxy == 0:
return cdist(X, Y, **kwargs)
d = np.empty((X.shape[0], Y.shape[0]), np.float64)
if sxy == 2:
for j, x in enumerate(X):
d[j] = cdist(x.todense(), Y, **kwargs)[0]
elif sxy == 1:
for k, y in enumerate(Y):
d[:, k] = cdist(X, y.todense(), **kwargs)[0]
else:
for j, x in enumerate(X):
for k, y in enumerate(Y):
d[j, k] = cdist(x.todense(), y.todense(), **kwargs)[0]
return d
def randomsample(X, n):
""" random.sample of the rows of X
X may be sparse -- best csr
"""
sampleix = random.sample(range(X.shape[0]), int(n))
return X[sampleix]
def nearestcentres(X, centres, metric="euclidean", p=2):
""" each X -> nearest centre, any metric
euclidean2 (~ withinss) is more sensitive to outliers,
cityblock (manhattan, L1) less sensitive
"""
D = cdist(X, centres, metric=metric, p=p) # |X| x |centres|
return D.argmin(axis=1)
def Lqmetric(x, y=None, q=.5):
# yes a metric, may increase weight of near matches; see ...
return (np.abs(x - y) ** q) .mean() if y is not None \
else (np.abs(x) ** q) .mean()
# ...............................................................................
class Kmeans:
""" km = Kmeans( X, k= or centres=, ... )
in: either initial centres= for kmeans
or k= [nsample=] for kmeanssample
out: km.centres, km.Xtocentre, km.distances
iterator:
for jcentre, J in km:
clustercentre = centres[jcentre]
J indexes e.g. X[J], classes[J]
"""
def __init__(self, X, k=0, centres=None, nsample=0, **kwargs):
self.X = X
if centres is None:
self.centres, self.Xtocentre, self.distances = kmeanssample(
X, k=k, nsample=nsample, **kwargs)
else:
self.centres, self.Xtocentre, self.distances = kmeans(
X, centres, **kwargs)
def __iter__(self):
for jc in range(len(self.centres)):
yield jc, (self.Xtocentre == jc)
# ...............................................................................
if __name__ == "__main__":
import random
import sys
from time import time
N = 10000
dim = 10
ncluster = 10
kmsample = 100 # 0: random centres, > 0: kmeanssample
kmdelta = .001
kmiter = 10
metric = "cosine" # "chebyshev" = max, "cityblock" L1, Lqmetric
seed = 1
exec("\n".join(sys.argv[1:])) # run this.py N= ...
np.set_printoptions(1, threshold=200, edgeitems=5, suppress=True)
np.random.seed(seed)
random.seed(seed)
print("N %d dim %d ncluster %d kmsample %d metric %s" % (
N, dim, ncluster, kmsample, metric))
X = np.random.exponential(size=(N, dim))
# cf scikits-learn datasets/
t0 = time()
if kmsample > 0:
centres, xtoc, dist = kmeanssample(X, ncluster, nsample=kmsample,
delta=kmdelta, maxiter=kmiter,
metric=metric, verbose=2)
else:
randomcentres = randomsample(X, ncluster)
centres, xtoc, dist = kmeans(X, randomcentres,
delta=kmdelta, maxiter=kmiter,
metric=metric, verbose=2)
print("%.0f msec" % ((time() - t0) * 1000))
# also ~/py/np/kmeans/test-kmeans.py