"""
First, we import the packages we will use and set important paths.
Note that `hyperbolicTSNE.util` and `hyperbolicTSNE.visualization` contain
useful functions for reading, processing and exporting embeddings.
This requires that hyperbolicTSNE has been set up as detailed in the main readme of the repository.
"""
import os
import traceback
from hyperbolicTSNE.util import find_last_embedding
from hyperbolicTSNE.visualization import plot_poincare, animate
from hyperbolicTSNE import load_data, Datasets, SequentialOptimizer, initialization, HyperbolicTSNE
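# If these imports fail, the hyperbolicTSNE package has likely not been set up
# yet; per the main readme, a typical editable install from the repository root
# would be something like `pip install -e .` (assumed setup step, see the readme
# for the authoritative instructions).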
"""
We assume that there is a top-level folder datasets that holds the MNIST data set.
Refer to the main readme of the repository for where to find the data sets used in this repository.
"""
data_home = "datasets"
log_path = "temp/poincare/" # path for saving embedding snapshots
only_animate = False
seed = 42
dataset = Datasets.MNIST # the Datasets handler provides access to several data sets used throughout the repository
num_points = 10000 # we use a subset for demonstration purposes, full MNIST has N=70000
perp = 30 # we use a perplexity of 30 in this example
"""
HyperbolicTSNE follows a similar API to other t-SNE libraries like OpenTSNE and sklearn.
The configuration process consists of loading the data to embed and defining the settings of the embedder.
We create a dict with parameters manually to demonstrate all the customization options.
Nevertheless, `hyperbolicTSNE.hyperbolicTSNE` provides parameter templates to start with.
"""
dataX, dataLabels, D, V, _ = load_data(
    dataset,
    data_home=data_home,
    random_state=seed,
    to_return="X_labels_D_V",
    hd_params={"perplexity": perp},
    sample=num_points,
    knn_method="hnswlib"  # we use an approximation of high-dimensional neighbors to speed up computations
)
print(f"Loaded dataX with shape {dataX.shape}")
exaggeration_factor = 12 # Just like regular t-SNE, we use early exaggeration with a factor of 12
learning_rate = (dataX.shape[0] * 1) / (exaggeration_factor * 1000) # We adjust the learning rate to the hyperbolic setting
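# With the values above, N = 10000 and exaggeration_factor = 12, so
# learning_rate = (10000 * 1) / (12 * 1000) ≈ 0.83.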
ex_iterations = 250 # The embedder is to execute 250 iterations of early exaggeration, ...
main_iterations = 750 # ... followed by 750 iterations of non-exaggerated gradient descent.
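# Together, the schedule runs 250 + 750 = 1000 iterations of gradient descent in total.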
opt_config = dict(
    learning_rate_ex=learning_rate,  # learning rate during early exaggeration
    learning_rate_main=learning_rate,  # learning rate during the main optimization
    exaggeration=exaggeration_factor,
    exaggeration_its=ex_iterations,
    gradientDescent_its=main_iterations,
    vanilla=False,  # if True, perform plain gradient descent without any modifications; if False, make use of momentum and gains
    momentum_ex=0.5,  # momentum during early exaggeration
    momentum=0.8,  # momentum during the non-exaggerated gradient descent
    exact=False,  # if False, accelerate the gradient via the polar quad tree (analogous to Barnes-Hut in the Euclidean setting); if True, evaluate it exactly
    area_split=False,  # if True, split polar quad tree cells by equal area; if False, by equal length
    n_iter_check=40,  # interval (in iterations) at which the early stopping criterion is checked
    size_tol=0.999,  # size of the embedding used as early stopping criterion
    polar_or_cartesian="cartesian"
)
opt_params = SequentialOptimizer.sequence_poincare(**opt_config)
# Start: configure logging
logging_dict = {
    "log_path": log_path
}
opt_params["logging_dict"] = logging_dict
log_path = opt_params["logging_dict"]["log_path"]
# Delete old log path
if os.path.exists(log_path) and not only_animate:
    import shutil
    shutil.rmtree(log_path)
# End: logging
print(f"config: {opt_config}")
"""
Run HyperbolicTSNE
Embedding the high dimensional data consists of three steps:
Initializating the embedding
Initializing the embedder
Embedding the data
The following three cells demonstrate this process.
Note that use set metric to "precomputed" because we pass the distance matrix to the fit method.
"""
# Compute an initial embedding of the data via PCA
X_embedded = initialization(
    n_samples=dataX.shape[0],
    n_components=2,
    X=dataX,
    random_state=seed,
    method="pca"
)
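# Quick sanity check (a sketch; assumes the initialization returns an (N, 2)
# NumPy array, as the embedder below expects for a 2D embedding):
print(f"Initial embedding shape: {X_embedded.shape}")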
# Initialize the embedder
htsne = HyperbolicTSNE(
    init=X_embedded,
    n_components=2,
    metric="precomputed",
    verbose=True,
    opt_method=SequentialOptimizer,
    opt_params=opt_params
)
# Compute the embedding
try:
    hyperbolicEmbedding = htsne.fit_transform((D, V))
except ValueError:
    # If the optimization aborts with a ValueError, fall back to the last
    # embedding snapshot that was written to the log path.
    hyperbolicEmbedding = find_last_embedding(log_path)
    traceback.print_exc()
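# Optional sanity check (a sketch; assumes hyperbolicEmbedding is an (N, 2)
# NumPy array of Poincaré-disk coordinates): every embedded point should lie
# strictly inside the unit disk.
import numpy as np
radii = np.linalg.norm(hyperbolicEmbedding, axis=1)
print(f"Max point radius: {radii.max():.4f} (should be < 1 in the Poincaré disk)")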
"""
After running the embedding process, the embeddings arrays are saved to the log_path.
We can use this information to visualize the embeddings using utility functions defined
in hyperbolicTSNE.visualization as shown below.
"""
# Create a rendering of the embedding and save it to a file
if not os.path.exists("results"):
os.mkdir("results")
fig = plot_poincare(hyperbolicEmbedding, dataLabels)
fig.savefig(f"results/{dataset.name}.png")
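# Since plot_poincare presumably returns a matplotlib figure (savefig is used
# above), a vector-graphics version can also be written, e.g.:
fig.savefig(f"results/{dataset.name}.pdf")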
# This renders a GIF animation of the embedding process. If FFmpeg is installed, an .mp4 file extension is also supported.
animate(logging_dict, dataLabels, f"results/{dataset.name}_ani.gif", fast=True, plot_ee=True)