-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstrans.py
70 lines (56 loc) · 2.22 KB
/
strans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
from sentence_transformers import SentenceTransformer
from umap import UMAP
from scipy import linalg as LA
plt.ion()
# ==============================================================================
def soPCA(data, dims_rescaled_data=2):
"""
returns: data transformed in 2 dims/columns + regenerated original data
pass in: data as 2D NumPy array
"""
m, n = data.shape
# Mean center the data
data -= data.mean(axis=0)
# Calculate the covariance matrix
R = np.cov(data, rowvar=False)
# Calculate eigenvectors & eigenvalues of the covariance matrix
# Use 'eigh' rather than 'eig' since R is symmetric, the performance gain is substantial
evals, evecs = LA.eigh( R )
# Sort eigenvalues in decreasing order
idx = np.argsort(evals)[::-1]
evecs = evecs[:,idx]
# Sort eigenvectors according to same index
evals = evals[idx]
# Select the first n eigenvectors (n is desired dimension of rescaled data array, or dims_rescaled_data)
evecs = evecs[:, :dims_rescaled_data]
# Carry out the transformation on the data using eigenvectors and return the re-scaled data, eigenvalues, and eigenvectors
return np.dot(evecs.T, data.T).T, evals, evecs
# ==============================================================================
df = pd.read_csv("./results_simple_multipass_text-davinci-003.csv")
sentences = []
for pid in tqdm(range(len(df))):
person = df.iloc[pid]
sentences.append( person['response'] )
# ==============================================================================
model = SentenceTransformer('sentence-t5-xxl')
sentence_embeddings = model.encode( sentences )
np.save("./nsyr_embs.npy",sentence_embeddings)
# ==============================================================================
sse_pca = soPCA( sentence_embeddings, 50 )
umap = UMAP(
n_components=2,
n_neighbors=15,
min_dist=0.001,
spread=1.0,
metric='cosine',
init='random',
random_state=0
)
projections = umap.fit_transform( sse_pca[0] )
np.save("./simple_multipass_pca50.npy", sse_pca[0])
np.save("./simple_multipass_projections.npy", projections)
print("strans.py finished.")