#!/usr/bin/env python3
"""
Author: Junda Huang
Student number: 910203370050
Implementation of the k-means clustering algorithm
Hints:
- write a function to obtain Euclidean distance between two points.
- write a function to initialize centroids by randomly selecting points
from the initial set of points. You can use the random.sample() method
- write a function to find the closest centroid to each point and assign
the points to the clusters.
- write a function to calculate the centroids given the clusters
- write a function to implement k-means
- write a function to calculate WGSS given the clusters and the points
- write a function to calculate BGSS given the clusters and the points
"""
# import statements
from math import sqrt
from random import choices, sample
from copy import deepcopy
from statistics import mean, stdev
# Functions
def csv_parser(lines):
"""Return list of point coordinates as [[x1,y1,z1,...],[x2,y2,z2,...]]
lines: open file or list of lines. Expected format:
The file has a single header line.
Each line contains the coordinates for one data point, starting
with a label. A data point can be specified in arbitrary dimensions.
Output: List of lists with the coordinates of the points.
This function does not capture the labels of the data points. In case
they are needed later, this function should be adjusted.
"""
data_points = []
for line in lines:
items = line.strip().split(",")
try: #will fail on header line in file
data_points.append(list(map(float, items[1:]))) #skip label
except ValueError: #must be the header
continue
return data_points
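# A minimal doctest-style sketch of csv_parser (assumed toy input): the
# header line fails float() and is skipped, and labels are dropped.
# >>> csv_parser(["name,x,y", "p1,1.0,2.0", "p2,3.0,4.0"])
# [[1.0, 2.0], [3.0, 4.0]]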
def average_point(points):
"""
calculate the average point of given data points
input:
points: (list) of list as data point coords of floats
output:
average_p: (list) of average point coords of floats
"""
average_p = [sum(m)/len(points) for m in zip(*points)]
return average_p
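# Illustrative check for average_point (assumed toy points): the mean is
# taken per coordinate across all points.
# >>> average_point([[0.0, 0.0], [2.0, 4.0]])
# [1.0, 2.0]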
def euclidean(p1, p2):
"""
Calculate the euclidean distance between two data points
input:
p1, p2: (list or tuple) of data points coordinates in float or int
e.g. p1 = [5.3, 4.6]; p2 = (3.2, 7.1)
output:
dis: (float) distance of two input data points
"""
    if len(p1) != len(p2):
        raise ValueError("points must have the same number of dimensions")
    dist = sqrt(sum((p1[c] - p2[c]) ** 2 for c in range(len(p1))))
    return dist
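# Quick sanity check (assumed 3-4-5 triangle):
# >>> euclidean([0.0, 0.0], [3.0, 4.0])
# 5.0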
def centroids_init(data_points, k):
"""
Randomly select k points in data as the centroids
input:
data_points: (list) of list as data point coords of floats
k: (int) number of clusters
output:
centr_points: (list) of k lists of randomly selected datapoints
"""
    # sample() draws k *distinct* points; choices() samples with replacement
    # and could produce duplicate centroids and hence empty clusters
    centr_points = sample(data_points, k)
    return centr_points
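# Usage sketch for centroids_init: the output is random, so no fixed result
# can be shown, but with k = 2 it returns two distinct points drawn from the
# data, e.g. something like [[3.0, 4.0], [1.0, 2.0]].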
def clustering_init(data_points, centr_points):
"""
Find the closest centroid to each point and assign to clusters
input:
data_points: (list) of list as data point coords of floats
centr_points: (list) of k lists of randomly selected datapoints
output:
clusters: (list) of list as index of dataset
e.g. [[1, 3, 5], [2, 4, 6], [7, 8, 9]]
centroids: (list) of clusters' centroids
"""
k = len(centr_points)
clusters = []
    cps = [] # initialise a list for the running (in-between) centroids
for i in range(k):
if centr_points[i] in data_points:
clusters.append([data_points.index(centr_points[i])])
else:
clusters.append([])
    # data points used as original centroids are added to their clusters
cps.append(centr_points[i])
    for j, dp in enumerate(data_points):
        if dp in centr_points: # points used as original centroids are done
            continue
        cluster = 0
        closest = 0
        for l, cp in enumerate(cps):
            dis = euclidean(cp, dp)
            if l == 0:
                closest = dis
            elif dis < closest:
                closest = dis
                cluster = l
        # append the point's index to its closest cluster; using the
        # enumeration index j avoids mis-assignment when points repeat
        clusters[cluster].append(j)
        # recalculate the average of the edited cluster and update the
        # running centroid
        points = [data_points[index] for index in clusters[cluster]]
        cps[cluster] = average_point(points)
    centroids = deepcopy(cps)
    return clusters, centroids
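# Worked example for clustering_init on assumed toy data with two obvious
# groups; the returned clusters hold indexes into data_points, and the
# centroids are the running averages after all assignments.
# >>> data = [[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]]
# >>> clustering_init(data, [[0.0, 0.0], [10.0, 10.0]])
# ([[0, 1], [2, 3]], [[0.0, 0.5], [10.0, 10.5]])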
def kmean_cluster(data_points, centroids):
"""
    iterate k-means clustering until the centroids are stabilised
input:
data_points: (list) of list as data point coords of floats
centroids: (list) of clusters' centroids initially generated
output:
kmean_clusters: (list) of list as index of dataset
e.g. [[1, 3, 5], [2, 4, 6], [7, 8, 9]]
new_centroid: (list) of clusters' final centroids
        iteration_n: (int) number of clustering passes, counting the
            initial pass performed before this function is called
"""
    old_centroids = None
    iteration_n = 1 # the initial clustering pass counts as iteration 1
    while centroids != old_centroids: # if the centroids stay the same, stop
        old_centroids = centroids
        kmean_clusters, centroids = \
            clustering_init(data_points, old_centroids)
        iteration_n += 1
    new_centroid = centroids
    return kmean_clusters, new_centroid, iteration_n
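# Usage sketch for kmean_cluster, continuing the toy example above: that
# clustering is already stable, so one further pass confirms convergence and
# iteration_n is reported as 2 (the initial pass plus the confirming pass).
# >>> kmean_cluster(data, [[0.0, 0.5], [10.0, 10.5]])
# ([[0, 1], [2, 3]], [[0.0, 0.5], [10.0, 10.5]], 2)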
def wgss_c(cluster, centroid, data_points):
"""
calculate WGSS for given cluster
input:
cluster: (list) of int as index of dataset.
e.g. [1, 3, 5]
centroid: (list) of given clusters' centroid
data_points: (list) of list as data point coords of floats
output:
        wgss_c: (float) WGSS of the given cluster (normalised here by the
            total number of data points)
"""
    total_dis = sum(euclidean(data_points[j], centroid) ** 2 for j in cluster)
    wgss_c = total_dis/len(data_points)
return wgss_c
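# In formula terms (note the normalisation this implementation applies):
# WGSS_c = (1/n) * sum over j in cluster of ||x_j - centroid||^2,
# with n the total number of data points. For the toy data above:
# >>> wgss_c([0, 1], [0.0, 0.5], data)
# 0.125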
def wgss(kmean_clusters, new_centroid, data_points):
"""
calculate total WGSS for given clusters
input:
kmean_clusters: (list) of list as index of dataset
e.g. [[1, 3, 5], [2, 4, 6], [7, 8, 9]]
new_centroid: (list) of clusters' final centroid
data_points: (list) of list as data point coords of floats
output:
        wgss: (float) total value of WGSS of given clusters
"""
wgss = 0
for i, cluster in enumerate(kmean_clusters):
wgss += wgss_c(cluster, new_centroid[i], data_points)
return wgss
def bgss(kmean_clusters, new_centroid, data_points):
"""
calculate BGSS for given clusters
input:
kmean_clusters: (list) of list as index of dataset
e.g. [[1, 3, 5], [2, 4, 6], [7, 8, 9]]
new_centroid: (list) of clusters' final centroid
data_points: (list) of list as data point coords of floats
output:
        bgss: (float) total value of BGSS of given clusters
"""
    grand_mean = average_point(data_points) # computed once for all clusters
    bgss = 0
    for i, centroid in enumerate(new_centroid):
        bgss += len(kmean_clusters[i]) * \
            (euclidean(centroid, grand_mean) ** 2)
    return bgss
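# In formula terms: BGSS = sum over clusters i of |C_i| * ||c_i - mean||^2,
# where mean is the grand mean of all data points. For the toy data above,
# the grand mean is [5.0, 5.5] and each cluster contributes 2 * 50.0:
# >>> bgss([[0, 1], [2, 3]], [[0.0, 0.5], [10.0, 10.5]], data)
# 200.0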
def clusters_quality(clusters_list, centroid_list, data_points):
"""
Evaluate clusters quality and return W value
input:
        clusters_list: (list) of clusterings, each a list of lists of
            indexes of the dataset
            e.g. [[[1, 3, 5], [2, 4, 6], [7, 8, 9]],
                  [[3, 4, 5], [1, 2, 6], [7, 8, 9]]]
        centroid_list: (list) of the clusterings' final centroids as lists
        data_points: (list) of list as data point coords of floats
    output:
        clusters_qual: (list) of (clusters, W) tuples, where W is a float
            and lower W indicates a better clustering
            e.g. [([[1, 3, 5], [2, 4, 6], [7, 8, 9]], 3.4),
                  ([[3, 4, 5], [1, 2, 6], [7, 8, 9]], 5.7)]
"""
clusters_qual = []
for clusters, centroids in zip(clusters_list, centroid_list):
w = wgss(clusters, centroids, data_points)/\
bgss(clusters, centroids, data_points)
clusters_qual.append((clusters, w))
return clusters_qual
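# Usage sketch: W = WGSS / BGSS, so a lower W means tighter, better-separated
# clusters. For the toy clustering above, WGSS = 0.125 + 0.125 = 0.25 and
# BGSS = 200.0, giving:
# >>> clusters_quality([[[0, 1], [2, 3]]], [[[0.0, 0.5], [10.0, 10.5]]], data)
# [([[0, 1], [2, 3]], 0.00125)]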
def result_output(filename, k, rep, resampling):
"""
    Run k-means rep times and collect the results needed for the questions
    input:
        filename: (string) of the csv filename
        k: (int) number of clusters
        rep: (int) number of repetitions of kmeans required
        resampling: (string or None) resampling method:
            'jackknifing', 'bootstrapping' or 'subsampling'
    output:
        average_iter: (float) average number of iterations needed to converge
        stdev_iter: (float) standard deviation of the iteration counts
        kmeans: (list) of clusterings, each a list of lists of indexes
            e.g. [[1, 3, 5], [2, 4, 6], [7, 8, 9]]
        centers: (list) of the final centroids of each clustering
        w_list: (list) of the W values of all clusterings
        average_w: (float) average W value of the clusterings
        stdev_w: (float) standard deviation of the W values
        best_w: (tuple) of the clustering with the lowest W value and its W
"""
    if resampling is None: # check if resampling is activated
with open(filename) as lines:
data_points = csv_parser(lines)
else:
with open(filename) as lines:
data = csv_parser(lines)
data_points = resample(data, resampling = resampling)
iterations = [] # store numbers of iteration in a list of int
    kmeans = [] # store different sets of clusters in a list of lists
centers = [] # store centroids of each set of clusters
    for i in range(rep): # run kmeans rep times
centr_points = centroids_init(data_points, k = k)
clusters, centroids = clustering_init(data_points, centr_points)
kmean_clusters, new_centroids, iteration\
= kmean_cluster(data_points, centroids)
iterations.append(iteration)
kmeans.append(kmean_clusters)
centers.append(new_centroids)
average_iter = mean(iterations)
stdev_iter = stdev(iterations)
clusters_qual = clusters_quality(kmeans, centers, data_points)
w_list = []
for i in clusters_qual:
w_list.append(i[1])
average_w = mean(w_list)
stdev_w = stdev(w_list)
best_w = min(clusters_qual, key = lambda t: t[1])
return average_iter, stdev_iter, kmeans, centers, \
w_list, average_w, stdev_w, best_w
def k_optimization(filename, k_list, rep, resampling):
"""
    Use the defined functions to find the k best suited for clustering
    input:
        filename: (string) of the csv filename
        k_list: (list) of int of numbers of clusters to be tested
        rep: (int) number of repetitions of kmeans required
        resampling: (string or None) resampling method:
            'jackknifing', 'bootstrapping' or 'subsampling'
    output:
        k_dict: (dictionary) mapping each tested k to its result_output tuple
        best_k: (int) number of clusters giving the lowest average W value
        best_clusters_k: (tuple) best clustering for best_k with its W value
        best_clusters_all: (tuple) best clustering over all k with its W
            value (only best_k is returned when resampling is activated)
"""
    k_dict = {}
    for k in k_list:
        k_dict[k] = []
        k_dict[k].extend((result_output\
            (filename, k = k, rep = rep, resampling = resampling)))
    kmin_list = []
    best_list = []
    for key, value in k_dict.items():
        kmin_list.append((key, value[5])) # (k, average W value)
        best_list.append(value[-1]) # best (clusters, W) tuple for this k
    best_k = min(kmin_list, key = lambda t: t[1])[0]
    best_clusters_k = k_dict[best_k][-1]
    best_clusters_all = min(best_list, key = lambda t: t[1])
    if resampling is None:
        return k_dict, best_k, best_clusters_k, best_clusters_all
    else: # if resampling is activated, this function only returns the k
        return best_k
def bootstrapping(data_points):
    """
    Resample data set by bootstrapping - sampling with replacement
    input:
        data_points: (list) of list as data point coords of floats
    output:
        bootstrap_points: (list) of list as data point coords of floats
    """
    bootstrap_points = choices(data_points, k = len(data_points))
    return bootstrap_points
def subsampling(data_points):
    """
    Resample data set by subsampling - sampling without replacement.
    Note: drawing all len(data_points) points without replacement would
    merely permute the data, so a smaller subset is drawn instead (80% is
    an arbitrary choice here).
    input:
        data_points: (list) of list as data point coords of floats
    output:
        subsample_points: (list) of list as data point coords of floats
    """
    subsample_points = sample(data_points, k = round(0.8 * len(data_points)))
    return subsample_points
def jackknifing(data_points):
"""
    Resample data set by jackknifing - randomly remove one observation
input:
data_points: (list) of list as data point coords of floats
output:
jackknife_points: (list) of list as data point coords of floats
"""
jackknife_points = sample(data_points, k = len(data_points) - 1)
return jackknife_points
def resample(data, resampling):
"""
    Resample the data set by jackknifing, bootstrapping or subsampling
input:
data: (list) of list as data point coords of floats
        resampling: (string) of resampling methods:
            'jackknifing', 'bootstrapping' or 'subsampling'
output:
data_points: (list) of list as data point coords of floats
"""
data_points = []
if resampling == 'jackknifing':
data_points = jackknifing(data)
    elif resampling == 'bootstrapping':
        data_points = bootstrapping(data)
elif resampling == 'subsampling':
data_points = subsampling(data)
    else:
        raise ValueError("unknown resampling method: {}".format(resampling))
return data_points
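# Dispatch sketch (assumed toy data from above): each method returns a new
# list of points.
# >>> resample(data, resampling = 'jackknifing') # 3 of the 4 toy points
# >>> resample(data, resampling = 'bootstrapping') # 4 draws, repeats possible
# >>> resample(data, resampling = 'subsampling') # ~80% drawn w/o replacement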
if __name__ == "__main__":
# the code below should produce the results necessary to answer
# the questions. In other words, if we run your code, we should see
# the data that you used to answer the questions.
# Question 1:
filename1 = '2dtest.csv'
average_iter, stdev_iter, kmeans, centers, w_list, average_w, \
stdev_w, best_w = result_output\
(filename1, k = 3, rep = 20, resampling = None)
# 1a:
    print('Answer to question 1:\n1a:\n\
The average number of iterations needed to converge is {:.3f}.\n\
The standard deviation of the iterations is {:.3f}.'.format\
(average_iter, stdev_iter))
# 1b:
    print('1b:\nThe clusters are as follows:')
for index, clusters in enumerate(kmeans):
print(index + 1, clusters)
# 1c:
    print('1c:\nThe W values are {}.\n\
The average W value is {:.3f}.\n\
The standard deviation of the W values is {:.3f}.'\
.format(w_list, average_w, stdev_w))
# 1d:
    print('1d:\nThe best clusters, with the lowest W value, are {}'.format(best_w))
# Question 2:
filename2 = 'LargeSet_1.csv'
k_large1 = (2, 3, 4, 5, 6)
k_large1_dict, best_k, best_clusters_k, best_clusters_all = \
k_optimization(filename2, k_large1, rep = 10, resampling = None)
# 2a:
print('Answer to question 2:\n2a:')
for key, value in k_large1_dict.items():
        print('The average number of iterations to converge for k = {} is {:.3f}.\n\
The standard deviation of the iterations for k = {} is {:.3f}.'.format\
(key, value[0], key, value[1]))
# 2b:
    print('2b:')
for key, value in k_large1_dict.items():
        print('The average W value for k = {} is {:.3f}.\n\
The standard deviation of the W values for k = {} is {:.3f}.'\
.format(key, value[-3], key, value[-2]))
# 2c:
print('2c:\nThe value k = {} leads to the best clustering'.format\
(best_k))
# 2d:
print('2d(1):\nThe best set of clusters is:\n{}\nIts W value is {:.3f}.'\
.format(best_clusters_k[0], best_clusters_k[1]))
print('2d(2):\nThe best set of clusters is:\n{}\nIts W value is {:.3f}.'\
.format(best_clusters_all[0], best_clusters_all[1]))
# Question 8:
    # bootstrap
    bootstrap_k = k_optimization\
        (filename2, k_large1, rep = 10, resampling = 'bootstrapping')
    average_iter_bt, stdev_iter_bt, kmeans_bt, centers_bt, w_list_bt, \
        average_w_bt, stdev_w_bt, best_w_bt = result_output\
        (filename2, k = bootstrap_k, rep = 10, resampling = None)
# jackknife
jackknife_k = k_optimization\
(filename2, k_large1, rep = 10, resampling = 'jackknifing')
average_iter_jk, stdev_iter_jk, kmeans_jk, centers_jk, w_list_jk, \
average_w_jk, stdev_w_jk, best_w_jk = result_output\
(filename2, k = jackknife_k, rep = 10, resampling = None)
# subsample
subsample_k = k_optimization\
(filename2, k_large1, rep = 10, resampling = 'subsampling')
average_iter_ss, stdev_iter_ss, kmeans_ss, centers_ss, w_list_ss, \
average_w_ss, stdev_w_ss, best_w_ss = result_output\
(filename2, k = subsample_k, rep = 10, resampling = None)
    print('Answer to question 8:\n',\
        'The optimal k from bootstrapping is {}.\nThe clusters are:\n{}\n\
The optimal k from jackknifing is {}.\nThe clusters are:\n{}\n\
The optimal k from subsampling is {}.\nThe clusters are:\n{}\n'.format\
(bootstrap_k, best_w_bt, jackknife_k, best_w_jk, \
subsample_k, best_w_ss))