-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathclassify.f90
369 lines (330 loc) · 11.9 KB
/
classify.f90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
module classify
contains
! Following kmeans subroutine is from https://rosettacode.org/wiki/K-means%2B%2B_clustering#Fortran
!**********************************************************************
! KMPP - K-Means++ - Traditional data clustering with a special initialization
! Public Domain - This program may be used by any person for any purpose.
!
! Origin:
! Hugo Steinhaus, 1956
!
! Refer to:
! "kmeans++: the advantages of careful seeding"
! David Arthur and Sergei Vassilvitskii
! Proceedings of the eighteenth annual ACM-SIAM symposium
! on Discrete algorithms, 2007
!
!____Variable_______I/O_______Description___________________Type_______
! X(P,N) In Data points Real
! P In Dimension of the data Integer
! N In Number of points Integer
! K In # clusters Integer
! C(P,K) Out Center points of clusters Real
! Z(N) Out What cluster a point is in Integer
! WORK(N) Neither Real
! IFAULT Out Error code Integer
!***********************************************************************
subroutine kmeanspp (X, P, N, K, C, Z, WORK, IFAULT)
IMPLICIT NONE
INTEGER P, N, K, Z(N), IFAULT
REAL*8 X(P,N), C(P,K), WORK(N)
INTEGER ITER ! maximum iterations
REAL BIG ! arbitrary large number
PARAMETER (ITER = 1000,BIG = 1E33)
INTEGER H ! count iterations
INTEGER I ! count points
INTEGER I1 ! point marked as initial center
INTEGER J ! count dimensions
INTEGER L ! count clusters
INTEGER L0 ! present cluster ID
INTEGER L1 ! new cluster ID
REAL*8 BEST ! shortest distance to a center
REAL*8 D2 ! squared distance
REAL*8 TOT ! a total
REAL*8 W ! temp scalar
LOGICAL CHANGE ! whether any points have been reassigned
IF (K < 1 .OR. K > N) THEN ! K out of bounds
IFAULT = 3
RETURN
END IF
! Begin.
IFAULT = 0 ; Z = 0 ; WORK = BIG
! choose initial center from data points
CALL RANDOM_NUMBER (W)
I1 = MIN(INT(W * FLOAT(N)) + 1, N) ! choose first center at random
C(:,1) = X(:,I1)
! initialize other centers
DO L = 2, K
TOT = 0.
DO I = 1, N ! measure from each point
BEST = WORK(I)
D2 = 0. ! to prior center
DO J = 1, P
D2 = D2 + (X(J,I) - C(J,L-1)) **2 ! Squared Euclidean distance
IF (D2 .GE. BEST) GO TO 10 ! needless to add to D2
END DO ! next J
IF (D2 < BEST) BEST = D2 ! shortest squared distance
WORK(I) = BEST
10 TOT = TOT + BEST ! cumulative squared distance
END DO ! next data point
! Choose center with probability proportional to its squared distance from existing centers.
CALL RANDOM_NUMBER (W)
W = W * TOT ! uniform at random over cumulative distance
TOT = 0.
DO I = 1, N
I1 = I
TOT = TOT + WORK(I)
IF (TOT > W) GO TO 20
END DO ! next I
20 CONTINUE
C(:,L) = X(:,I1)
END DO
! main loop for k-means
DO H = 1, ITER
CHANGE = .FALSE.
! find nearest center for each point
DO I = 1, N
L0 = Z(I)
L1 = 0
BEST = BIG
DO L = 1, K
D2 = 0.
DO J = 1, P
D2 = D2 + (X(J,I) - C(J,L)) **2
IF (D2 .GE. BEST) GO TO 30
END DO
30 CONTINUE
IF (D2 < BEST) THEN ! new nearest center
BEST = D2
L1 = L
END IF
END DO ! next L
IF (L0 .NE. L1) THEN
Z(I) = L1 ! reassign point
CHANGE = .TRUE.
END IF
END DO ! next I
IF (.NOT. CHANGE) RETURN ! success
! find cluster centers
WORK(1:K) = 0.
C(1:P,1:K) = 0.
DO I = 1, N
L = Z(I)
WORK(L) = WORK(L) + 1. ! count
DO J = 1, P
C(J,L) = C(J,L) + X(J,I) ! add
END DO
END DO
DO L = 1, K
IF (WORK(L) < 0.5) THEN ! empty cluster check
IFAULT = 1 ! fatal error
RETURN
END IF
W = 1. / WORK(L)
DO J = 1, P
C(J,L) = C(J,L) * W ! multiplication is faster than division
END DO
END DO
END DO ! next H
IFAULT = 2 ! too many iterations
RETURN
endsubroutine ! of KMPP
! Following kmeans subroutine is from https://people.sc.fsu.edu/~jburkardt/f_src/kmeans/kmeans.html
! KMEANS, a FORTRAN90 code which handles the K-Means problem, which organizes a set of N points in M dimensions into K clusters;
subroutine kmeans ( dim_num, point_num, cluster_num, it_max, it_num, point, &
cluster, cluster_center, cluster_population, cluster_energy )
!*****************************************************************************80
!
!! KMEANS_01 applies the K-Means algorithm.
!
! Discussion:
!
! Given a matrix of POINT_NUM observations on DIM_NUM variables, the
! observations are to be allocated to CLUSTER_NUM clusters in such
! a way that the within-cluster sum of squares is minimized.
!
! Licensing:
!
! This code is distributed under the GNU LGPL license.
!
! Modified:
!
! 18 November 2004
!
! Author:
!
! Original FORTRAN77 version by David Sparks.
! FORTRAN90 version by John Burkardt.
!
! Reference:
!
! David Sparks,
! Algorithm AS 58:
! Euclidean Cluster Analysis,
! Applied Statistics,
! Volume 22, Number 1, 1973, pages 126-130.
!
! Parameters:
!
! Input, integer ( kind = 4 ) DIM_NUM, the number of spatial dimensions.
!
! Input, integer ( kind = 4 ) POINT_NUM, the number of points.
!
! Input, integer ( kind = 4 ) CLUSTER_NUM, the number of clusters.
!
! Input, integer ( kind = 4 ) IT_MAX, the maximum number of iterations.
!
! Output, integer ( kind = 4 ) IT_NUM, the number of iterations taken.
!
! Input, real ( kind = 8 ) POINT(DIM_NUM,POINT_NUM), the points.
!
! Output, integer ( kind = 4 ) CLUSTER(POINT_NUM), indicates which cluster
! each point belongs to.
!
! Input/output, real ( kind = 8 ) CLUSTER_CENTER(DIM_NUM,CLUSTER_NUM),
! the cluster centers.
!
! Output, integer ( kind = 4 ) CLUSTER_POPULATION(CLUSTER_NUM), the number
! of points in each cluster.
!
! Output, real ( kind = 8 ) CLUSTER_ENERGY(CLUSTER_NUM), the
! cluster energies.
!
implicit none
integer ( kind = 4 ) cluster_num
integer ( kind = 4 ) dim_num
integer ( kind = 4 ) point_num
integer ( kind = 4 ) cluster(point_num)
real ( kind = 8 ) cluster_center(dim_num,cluster_num)
real ( kind = 8 ) cluster_energy(cluster_num)
integer ( kind = 4 ) cluster_population(cluster_num)
real ( kind = 8 ) dc
real ( kind = 8 ) de
real ( kind = 8 ) f(point_num)
integer ( kind = 4 ) i
integer ( kind = 4 ) il
integer ( kind = 4 ) ir
integer ( kind = 4 ) it_max
integer ( kind = 4 ) it_num
integer ( kind = 4 ) j
integer ( kind = 4 ) k
integer ( kind = 4 ) list(1)
real ( kind = 8 ) point(dim_num,point_num)
integer ( kind = 4 ) swap
it_num = 0
! Idiot checks.
if ( cluster_num < 1 ) then
write ( *, '(a)' ) 'KMEANS - Fatal error!'
write ( *, '(a)' ) ' CLUSTER_NUM < 1.'
stop 1
end if
if ( dim_num < 1 ) then
write ( *, '(a)' ) 'KMEANS - Fatal error!'
write ( *, '(a)' ) ' DIM_NUM < 1.'
stop 1
end if
if ( point_num < 1 ) then
write ( *, '(a)' ) 'KMEANS - Fatal error!'
write ( *, '(a)' ) ' POINT_NUM < 1.'
stop 1
end if
! For each observation, calculate the distance from each cluster center, and assign to the nearest.
do i = 1, point_num
do j = 1, cluster_num
cluster_energy(j) = sum ( ( point(1:dim_num,i) - cluster_center(1:dim_num,j) )**2 )
end do
list = minloc ( cluster_energy(1:cluster_num) )
cluster(i) = list(1)
end do
! Determine the cluster population counts.
cluster_population(1:cluster_num) = 0
do i = 1, point_num
j = cluster(i)
cluster_population(j) = cluster_population(j) + 1
end do
! Calculate the mean and sum of squares for each cluster.
cluster_center(1:dim_num,1:cluster_num) = 0.0D+00
do i = 1, point_num
j = cluster(i)
cluster_center(1:dim_num,j) = cluster_center(1:dim_num,j) + point(1:dim_num,i)
end do
do i = 1, cluster_num
if ( 0 < cluster_population(i) ) then
cluster_center(1:dim_num,i) = cluster_center(1:dim_num,i) / real ( cluster_population(i), kind = 8 )
end if
end do
! Set the point energies.
f(1:point_num) = 0.0D+00
do i = 1, point_num
j = cluster(i)
f(i) = sum ( ( point(1:dim_num,i) - cluster_center(1:dim_num,j) )**2 )
end do
! Set the cluster energies.
cluster_energy(1:cluster_num) = 0.0D+00
do i = 1, point_num
j = cluster(i)
cluster_energy(j) = cluster_energy(j) + f(i)
end do
! Adjust the point energies by a weight factor.
do i = 1, point_num
j = cluster(i)
if ( 1 < cluster_population(j) ) then
f(i) = f(i) * real ( cluster_population(j))/ real ( cluster_population(j) - 1)
end if
end do
! Examine each observation in turn to see if it should be reassigned to a different cluster.
it_num = 0
do while ( it_num < it_max )
it_num = it_num + 1
swap = 0
do i = 1, point_num
il = cluster(i)
ir = il
if ( cluster_population(il) <= 1 ) then
cycle
end if
dc = f(i)
do j = 1, cluster_num
if ( j /= il ) then
de = sum ( ( point(1:dim_num,i) - cluster_center(1:dim_num,j) )**2 ) * real(cluster_population(j)) / real ( cluster_population(j) + 1)
if ( de < dc ) then
dc = de
ir = j
end if
end if
end do
if ( ir == il ) cycle ! If the lowest value was obtained by staying in the current cluster, then cycle.
! Reassign the point from cluster IL to cluster IR.
cluster_center(1:dim_num,il) =( cluster_center(1:dim_num,il) * real ( cluster_population(il)) &
- point(1:dim_num,i) ) / real ( cluster_population(il) - 1)
cluster_center(1:dim_num,ir) = ( cluster_center(1:dim_num,ir) * real ( cluster_population(ir)) &
+ point(1:dim_num,i) ) / real ( cluster_population(ir) + 1)
cluster_energy(il) = cluster_energy(il) - f(i)
cluster_energy(ir) = cluster_energy(ir) + dc
cluster_population(ir) = cluster_population(ir) + 1
cluster_population(il) = cluster_population(il) - 1
cluster(i) = ir
! Adjust the value of F for points in clusters IL and IR.
do j = 1, point_num
k = cluster(j)
if ( k == il .or. k == ir ) then
f(j) = sum ( ( point(1:dim_num,j) - cluster_center(1:dim_num,k) )**2 )
if ( 1 < cluster_population(k) ) then
f(j) = f(j) * real ( cluster_population(k))/ ( real ( cluster_population(k) - 1) )
end if
end if
end do
swap = swap + 1
end do
! Exit if no reassignments were made during this iteration.
if ( swap == 0 ) exit
end do
! Compute the cluster energies.
cluster_energy(1:cluster_num) = 0.0D+00
do i = 1, point_num
j = cluster(i)
cluster_energy(j) = cluster_energy(j) + sum ( ( point(1:dim_num,i) - cluster_center(1:dim_num,j) )**2 )
end do
return
endsubroutine
endmodule