-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblanket_clusterer.py
144 lines (124 loc) · 5.58 KB
/
blanket_clusterer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import csv
from algorithms.agglomer_clustering import AgglomerClustering
from algorithms.birch_clustering import BirchClustering
from algorithms.dbscan_clustering import DBSCANClustering
from algorithms.kmeans_clustering import KMeansClustering
# Module metadata: authorship, licensing, version and release status.
__author__ = "Konstantin Bogdanoski"
__copyright__ = "Copyright 2020, BlanketClusterer"
__credits__ = ["Konstantin Bogdanoski", "Prof. PhD. Dimitar Trajanov", "MSc. Kostadin Mishev"]
__license__ = "MIT"
__version__ = "2.0.0"
__maintainer__ = "Konstantin Bogdanoski"
__email__ = "konstantin.b@live.com"
__status__ = "Production"
def _check_key_value_header(path, label):
    """Verify that the first row of the CSV file at ``path`` names both columns.

    :param path: path to the ``.csv`` file to inspect.
    :param label: subject used at the start of the error message
        (``"Names"`` or ``"Group names"``).
    :raises ValueError: if the first row lacks a ``key`` or a ``value`` field.
    :raises OSError: if the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, including when the
    # header is rejected and ValueError propagates.
    with open(path, "r", newline="") as handle:
        header = next(csv.reader(handle, delimiter=","), None)
    if header is None:
        # An empty file has no first row to check; it is accepted here, as it
        # always has been.
        return
    if "key" not in header or "value" not in header:
        raise ValueError(label + " are not in specified format\n"
                         "File must start with the following line:\n"
                         "key,value\n"
                         "and must be a .csv file")


def validate_constructor(n_clusters,
                         clustering_type,
                         embeddings,
                         names,
                         items_in_cluster,
                         max_depth, group_names=None):
    """Validate the arguments given to ``BlanketClusterer``.

    The checks run in the order listed and the first failing one raises.

    :param n_clusters: number of clusters; ``int(n_clusters)`` must be positive.
    :param clustering_type: one of ``"k-means"``, ``"agglomerative"``,
        ``"dbscan"`` or ``"birch"``.
    :param embeddings: must not be ``None``.
    :param names: path to a ``.csv`` file whose first row contains the fields
        ``key`` and ``value``; must not be ``None``.
    :param items_in_cluster: ``int(items_in_cluster)`` must be greater than 5.
    :param max_depth: ``int(max_depth)`` must lie in the range 1-6.
    :param group_names: optional path to a ``.csv`` file that is checked for
        the same ``key``/``value`` first row as ``names``.
    :return: ``None`` when every check passes.
    :raises ValueError: if any check fails, or if a numeric argument is a
        string that ``int()`` cannot parse.
    :raises TypeError: if a numeric argument is of a type ``int()`` rejects.
    :raises OSError: if ``names`` or ``group_names`` cannot be opened.
    """
    if int(n_clusters) <= 0:
        raise ValueError("Invalid number of clusters")
    if clustering_type not in ["k-means", "agglomerative", "dbscan", "birch"]:
        raise ValueError("Invalid clustering type\nAllowed values: ['k-means', 'agglomerative', 'dbscan', 'birch']")
    if embeddings is None:
        raise ValueError("No embeddings specified")
    if names is None:
        raise ValueError("No names .csv file specified")

    _check_key_value_header(names, "Names")
    if group_names is not None:
        _check_key_value_header(group_names, "Group names")

    if int(items_in_cluster) <= 5:
        raise ValueError("Number of items in clusters must be greater than 5")
    if not 0 < int(max_depth) <= 6:
        raise ValueError("Invalid argument for max depth, choose in range 1-6")
    return
class BlanketClusterer:
    """
    BlanketClusterer

    Checks the clustering settings once, stores them, and hands them to the
    algorithm class selected by ``clustering_type``.

    Read more in the :ref:`UserGuide <blanket_clusterer>`.

    Parameters
    ----------
    :param n_clusters
        :type Integer, default=10
        The number of clusters to be generated together with centroids.
        Must be positive.

    :param clustering_type
        :type String, default="k-means"
        Type of algorithm to run.
        Allowed types:
            `k-means`, `agglomerative`, `dbscan`, `birch`
        Each type is handled by the matching class imported from the
        `algorithms` package.

    :param embeddings
        path to embeddings model on file system.
        it needs to be imported for the algorithm to operate.
        Model must contain key-value pairs, where value is a
        matrix embedding. The key is needed for the name extraction.
        Allowed models:
            `Word2Vec`,

    :param names
        path to `.csv` file containing the key-value pairs of
        names. The keys must be the same as the
        keys in the embedding model.
        The first row of the file must contain `key` and `value`.

    :param items_in_cluster
        :type Integer, default=20
        Must be greater than 5. Passed unchanged to the selected algorithm.

    :param max_depth
        :type Integer, default=6
        Must be in the range 1-6. Passed unchanged to the selected algorithm.

    :param output_path
        :type String, default="./output.json"
        Passed unchanged to the selected algorithm
        (presumably the location of the generated result - confirm
        against the algorithm classes).

    :param group_names
        path to `.csv` file containing key-value pairs of group names.
        They must be in a specific format
        'left_boundary-right_boundary, generic_name'
        Example:
            '001-009, Example name'
        IMPORTANT:
            If you add group-names, the clusters will also be colorized based
            on the prevailing group
        This format is crucial if you want to name
        the clusters with specific names
    """

    def __init__(self, n_clusters=10,
                 clustering_type="k-means",
                 embeddings=None,
                 names=None,
                 items_in_cluster=20,
                 max_depth=6,
                 output_path="./output.json",
                 group_names=None):
        # Reject bad settings before any state is stored on the instance.
        validate_constructor(n_clusters, clustering_type, embeddings, names,
                             items_in_cluster, max_depth, group_names)

        # Numeric settings are normalised to int (they may arrive as strings).
        self.n_clusters = int(n_clusters)
        self.items_in_cluster = int(items_in_cluster)
        self.max_depth = int(max_depth)

        # Everything else is stored exactly as given.
        self.clustering_type = clustering_type
        self.embeddings = embeddings
        self.names = names
        self.group_names = group_names
        self.output_path = output_path

    def clusterize(self):
        """
        Run the algorithm selected by ``clustering_type``.

        Builds the matching algorithm object from the stored settings and
        calls its ``clusterize()`` method. Nothing is returned; a
        ``clustering_type`` outside the four known values results in no
        action.
        """
        selected = self.clustering_type
        if selected == "k-means":
            algorithm_cls = KMeansClustering
        elif selected == "agglomerative":
            algorithm_cls = AgglomerClustering
        elif selected == "dbscan":
            algorithm_cls = DBSCANClustering
        elif selected == "birch":
            algorithm_cls = BirchClustering
        else:
            # No matching algorithm: nothing to run.
            return

        # All algorithm classes take the same positional arguments.
        runner = algorithm_cls(self.n_clusters, self.embeddings, self.names,
                               self.items_in_cluster, self.max_depth,
                               self.output_path, self.group_names)
        runner.clusterize()