\documentclass[a4paper, 11pt, landscape]{article}
\usepackage{mathptmx} % more compact font
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{mathtools}
\usepackage{multicol}
\usepackage{enumitem}
\usepackage{paralist} % for compacter lists
\usepackage{hyperref} % for Todo's and similar things
\usepackage[left=4.5mm, right=4.5mm, top=4.5mm, bottom=6mm, landscape, nohead, nofoot]{geometry}
\usepackage[small,compact]{titlesec}
\usepackage[usenames,dvipsnames,svgnames,table]{xcolor}
\usepackage{xparse}
% compact text
\linespread{0.9}
\setlength{\parindent}{0pt}
% compact lists even more
\setdefaultleftmargin{0em}{0em}{0em}{0em}{0em}{0em}
% compact sections
\titlespacing*{\section}{0pt}{0em}{0em}
\titlespacing*{\subsection}{0pt}{0em}{0em}
\titlespacing*{\subsubsection}{0pt}{0em}{0em}
% coloured section headings for easier read
\titleformat{name=\section}[block]
{\sffamily}
{}
{0pt}
{\colorsection}
\newcommand{\colorsection}[1]{%
\colorbox{red!10}{\parbox[t][0em]{\dimexpr\columnwidth-2\fboxsep}{\thesection\ #1}}}
\titleformat{name=\subsection}[block]
{\sffamily}
{}
{0pt}
{\subcolorsection}
\newcommand{\subcolorsection}[1]{%
\colorbox{orange!10}{\parbox[t][0em]{\dimexpr\columnwidth-2\fboxsep}{\thesubsection\ #1}}}
\titleformat{name=\subsubsection}[block]
{\sffamily}
{}
{0pt}
{\subsubcolorsection}
\newcommand{\subsubcolorsection}[1]{%
\colorbox{blue!10}{\parbox[t][0em]{\dimexpr\columnwidth-2\fboxsep}{\thesubsubsection\ #1}}}
% environment for multicols inside a list
\NewDocumentEnvironment{listcols}{O{2} O{0pt}}
{%
\bgroup %
\setlength{\multicolsep}{#2} %
\begin{multicols*}{#1} %
}
{%
\end{multicols*} %
\egroup %
}
% multicols lines & spacing
\setlength{\columnsep}{0.2cm}
\setlength{\columnseprule}{0.2pt}
% No page numbers
\pagenumbering{gobble}
% math helpers
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\begin{document}
\begin{multicols*}{3}
\section{PCA}
$\mathbf{X} \in \mathbb{R}^{D \times N}$: $N$ observations with $D$ properties each. Target: low-dimensional representation $\overline{\mathbf{Z}} \in \mathbb{R}^{K \times N}$ with $K < D$.
\begin{compactenum}
\item Empirical Mean: $\overline{\mathbf{x}} = \frac{1}{N} \sum_{n=1}^N \mathbf{x}_n$
\item Center Data: $\overline{\mathbf{X}} = \mathbf{X} - [\overline{\mathbf{x}}, \ldots, \overline{\mathbf{x}}] = \mathbf{X} - \mathbf{M}$
\item Covariance Matrix: $\boldsymbol{\Sigma} = \frac{1}{N } \sum_{n=1}^N (\mathbf{x}_n - \overline{\mathbf{x}}) (\mathbf{x}_n - \overline{\mathbf{x}})^T = \frac{1}{N} \overline{\mathbf{X}}\overline{\mathbf{X}}^T$
\item Eigenvalue Decomposition: $\boldsymbol{\Sigma} = \mathbf{U} \boldsymbol{\Lambda} \mathbf{U}^T$, sort eigenvalues (and eigenvectors) in descending order
\item Select $K < D$, keep only the first $K$ eigenvalues and corresponding eigenvectors $\Rightarrow \mathbf{U}_K, \boldsymbol{\Lambda}_K$
\item Transform data onto the new basis: $\overline{\mathbf{Z}} = \mathbf{U}_K^T \overline{\mathbf{X}}$
\item Reconstruct to original basis: $\tilde{\overline{\mathbf{X}}} = \mathbf{U}_K \overline{\mathbf{Z}}$
\item Reverse centring: $\tilde{\mathbf{X}} = \tilde{\overline{\mathbf{X}}} + \mathbf{M}$
\end{compactenum}
\begin{compactitem}
\item For compression save $\mathbf{U}_K, \overline{\mathbf{Z}}, \overline{\mathbf{x}}$.
\item $\mathbf{U}_K \in \mathbb{R}^{D \times K}, \boldsymbol{\Sigma} \in \mathbb{R}^{D \times D}, \overline{\mathbf{Z}} \in \mathbb{R}^{K \times N}, \overline{\mathbf{X}} \in \mathbb{R}^{D \times N}$
\end{compactitem}
\begin{compactdesc}
\item[Approx. Error:] Sum of discarded eigenvalues
\end{compactdesc}
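A minimal NumPy sketch of the steps above (illustrative only; \texttt{X} is assumed to be a $D \times N$ array, \texttt{K} the target dimension):
\begin{verbatim}
import numpy as np

def pca(X, K):
    # X: D x N data matrix, K: target dimension
    x_bar = X.mean(axis=1, keepdims=True)   # empirical mean
    Xc = X - x_bar                          # centred data
    Sigma = Xc @ Xc.T / X.shape[1]          # covariance matrix
    lam, U = np.linalg.eigh(Sigma)          # eigendecomposition
    idx = np.argsort(lam)[::-1][:K]         # K largest eigenvalues
    U_K = U[:, idx]
    Z = U_K.T @ Xc                          # transform (K x N)
    X_rec = U_K @ Z + x_bar                 # reconstruction (D x N)
    return Z, X_rec
\end{verbatim}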
\section{Eigenvalue / -vectors}
Eigenvalue Problem: $\mathbf{Ax} = \lambda \mathbf{x}$
\begin{compactenum}
\item solve $\operatorname{det}(\mathbf{A} - \lambda \mathbf{I}) \overset{!}{=} 0$ resulting in $\{\lambda_i\}_i$
\item $\forall \lambda_i$:
solve $(\mathbf{A} - \lambda_i \mathbf{I}) \mathbf{x}_i = \mathbf{0}$, $\mathbf{x}_i$ is the $i$-th eigenvector.
\item (opt.) normalize each eigenvector: $\mathbf{x}_i^{\text{norm}} = \frac{1}{\|\mathbf{x}_i\|_2} \mathbf{x}_i$.
\end{compactenum}
\section{Eigendecomposition}
$\mathbf{A} \in \mathbb{R}^{N \times N}$ with $N$ linearly independent eigenvectors, then: $\mathbf{A} = \mathbf{Q} \boldsymbol{\Lambda} \mathbf{Q}^{-1}$. $\mathbf{Q} \in \mathbb{R}^{N \times N}$, $i$-th column is the $i$-th eigenvector of $\mathbf{A}$. $\boldsymbol{\Lambda}$ is diagonal and $\boldsymbol{\Lambda}_{i,i}$ is the $i$-th eigenvalue. Every symmetric matrix can be ``eigendecomposed''.
$\mathbf{A}^{-1} = \mathbf{Q} \boldsymbol{\Lambda}^{-1} \mathbf{Q}^{-1}$, $(\boldsymbol{\Lambda}^{-1})_{i,i} = \frac{1}{\lambda_i}$
Calculate eigenvalues $\lambda_i$ and eigenvectors $q_i$. Normalise the eigenvectors. Put eigenvectors as columns into $\mathbf{Q}$ and eigenvalues into the diagonal of $\boldsymbol{\Lambda}$.
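A quick numerical check of the decomposition (sketch; the small symmetric \texttt{A} is a made-up example):
\begin{verbatim}
import numpy as np

A = np.array([[2.0, 1.0], [1.0, 2.0]])
lam, Q = np.linalg.eig(A)     # eigenvalues, normalized eigenvectors
A_rec = Q @ np.diag(lam) @ np.linalg.inv(Q)
assert np.allclose(A, A_rec)  # A = Q Lambda Q^{-1}
\end{verbatim}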
\section{SVD}
\begin{compactitem}
\item $\mathbf{A} = \mathbf{U} \mathbf{D} \mathbf{V}^T = \sum_{k=1}^{\operatorname{rank}(\mathbf{A})} d_{k,k} u_k (v_k)^T$
\item $\mathbf{A} \in \mathbb{R}^{N \times P}, \mathbf{U} \in \mathbb{R}^{N \times N}, \mathbf{D} \in \mathbb{R}^{N \times P}, \mathbf{V} \in \mathbb{R}^{P \times P}$
\item $\mathbf{U}^T \mathbf{U} = I = \mathbf{V}^T \mathbf{V}$ ($\mathbf{U}, \mathbf{V}$ columns are orthonormal)
\item $\mathbf{U}$ columns are eigenvectors of $\mathbf{A} \mathbf{A}^T$, $\mathbf{V}$ columns are eigenvectors of $\mathbf{A}^T \mathbf{A}$, $\mathbf{D}$ diagonal elements are singular values, i.e. the square roots of the eigenvalues ($\mathbf{A}^T \mathbf{A}$ and $\mathbf{A} \mathbf{A}^T$ have the same eigenvalues)
\item $(\mathbf{D}^{-1})_{i,i} = \frac{1}{\mathbf{D}_{i, i}}$ ($\mathbf{D} \in \mathbb{R}^{N \times P} \to \mathbf{D}^{-1} \in \mathbb{R}^{P \times N}$, i.e. don't forget to transpose)
\item Missing columns in $\mathbf{U}$ are basis of $\operatorname{null}(A^T)$ and in $\mathbf{V}$ are basis of $\operatorname{null}(A)$. Calculate: $\mathbf{A}^T \mathbf{u} = \mathbf{0}$ or $\mathbf{A} \mathbf{v} = \mathbf{0}$ for $\mathbf{u}$ or $\mathbf{v}$.
\end{compactitem}
\begin{compactenum}
\item calculate $\mathbf{A}^T \mathbf{A}$.
\item calculate the eigenvalues of $\mathbf{A}^T \mathbf{A}$; their square roots (sorted in descending order) are the diagonal elements of $\mathbf{D}$.
\item calculate eigenvectors of $\mathbf{A}^T \mathbf{A}$ using the eigenvalues resulting in the columns of $\mathbf{V}$.
\item calculate the missing matrix: $\mathbf{U} = \mathbf{A} \mathbf{V} \mathbf{D}^{-1}$. Can be checked by calculating the eigenvectors of $\mathbf{A} \mathbf{A}^T$.
\item normalize each column of $\mathbf{U}$ and $\mathbf{V}$.
\end{compactenum}
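The hand computation can be cross-checked against \texttt{numpy} (sketch; \texttt{A} is an arbitrary example matrix):
\begin{verbatim}
import numpy as np

A = np.array([[3.0, 1.0], [1.0, 3.0], [0.0, 2.0]])  # N x P
U, d, Vt = np.linalg.svd(A)       # full SVD, d sorted descending
D = np.zeros_like(A)
np.fill_diagonal(D, d)
assert np.allclose(A, U @ D @ Vt)
# singular values are square roots of eigenvalues of A^T A
assert np.allclose(np.sort(d**2),
                   np.sort(np.linalg.eigvalsh(A.T @ A)))
\end{verbatim}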
\subsection{Low-Rank approximation}
Keep only the $K$ largest singular values and the corresponding singular vectors. $\tilde{\mathbf{A}}_{i, j} = \sum_{k=1}^{K} \mathbf{U}_{i, k} \mathbf{D}_{k,k} \mathbf{V}_{j, k} = \sum_{k=1}^{K} \mathbf{U}_{i, k} \mathbf{D}_{k,k} (\mathbf{V}^T)_{k, j}$.
\begin{compactdesc}
\item[Error Frobenius:] $\|\mathbf{A} - \tilde{\mathbf{A}}\|_F = \sqrt{\sum_{i > K} \sigma_i^2} = \sqrt{\sum_{i > K} \lambda_i}$
\item[Error Euclidean:] $\|\mathbf{A} - \tilde{\mathbf{A}}\|_2 = \sigma_{K+1}$
\end{compactdesc}
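Sketch of a rank-$K$ approximation together with its Frobenius error (\texttt{A} and \texttt{K} are assumed inputs):
\begin{verbatim}
import numpy as np

def low_rank(A, K):
    U, d, Vt = np.linalg.svd(A, full_matrices=False)
    A_K = U[:, :K] @ np.diag(d[:K]) @ Vt[:K, :]
    err_fro = np.sqrt(np.sum(d[K:] ** 2))  # ||A - A_K||_F
    return A_K, err_fro
\end{verbatim}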
\section{Basis Transform}
$\mathbf{V}, \mathbf{W}$ are two basis matrices (each column is a basis vector, invertible). Representing $\mathbf{x}$ in basis $\mathbf{V}$ by \emph{coordinate vector} $\mathbf{a}$: $\mathbf{x} = \mathbf{V} \mathbf{a} \Rightarrow \mathbf{a} = \mathbf{V}^{-1} \mathbf{x}$. Knowing $\mathbf{a}$ representing $\mathbf{x}$ in $\mathbf{V}$ we can calculate the coordinate vector $\mathbf{b}$ representing $\mathbf{x}$ in $\mathbf{W}$: $\mathbf{x} = \mathbf{W} \mathbf{b} = \mathbf{V} \mathbf{a} \Rightarrow \mathbf{b} = \mathbf{W}^{-1} \mathbf{V} \mathbf{a}$.
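Tiny numerical sketch of the change of basis (the bases and the coordinate vector are made-up values):
\begin{verbatim}
import numpy as np

V = np.array([[1.0, 1.0], [0.0, 1.0]])  # columns = basis V
W = np.array([[2.0, 0.0], [0.0, 1.0]])  # columns = basis W
a = np.array([3.0, 1.0])                # x in basis V
x = V @ a
b = np.linalg.solve(W, x)               # b = W^{-1} V a
assert np.allclose(W @ b, V @ a)
\end{verbatim}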
\section{Matrix/Vector}
\begin{compactdesc}
\item[Orthogonal:] (i.e. columns are orthonormal!) $\mathbf{A}^{-1} = \mathbf{A}^T$, $\mathbf{A} \mathbf{A}^T = \mathbf{A}^T \mathbf{A} = \mathbf{I}$, $\operatorname{det}(\mathbf{A}) \in \{+1, -1\}$, $\operatorname{det}(\mathbf{A}^T \mathbf{A}) = 1$
\item[Inner Product:] (in $\mathbb{R}^D$) $\langle \mathbf{x}, \mathbf{y} \rangle = \mathbf{x}^T \mathbf{y} = \sum_{i=1}^{D} \mathbf{x}_i \mathbf{y}_i$.
\begin{inparaitem}[\color{red}\textbullet]
\item $\langle \mathbf{x} \pm \mathbf{y}, \mathbf{x} \pm \mathbf{y} \rangle = \langle \mathbf{x}, \mathbf{x} \rangle \pm 2 \langle \mathbf{x}, \mathbf{y} \rangle + \langle \mathbf{y}, \mathbf{y} \rangle$
\item $\langle \mathbf{x}, \mathbf{y} + \mathbf{z} \rangle = \langle \mathbf{x}, \mathbf{y} \rangle + \langle \mathbf{x}, \mathbf{z} \rangle$
\item $\langle \mathbf{x} + \mathbf{y}, \mathbf{z} \rangle = \langle \mathbf{x}, \mathbf{z} \rangle + \langle \mathbf{y}, \mathbf{z} \rangle$
\item $\langle \mathbf{x}, \mathbf{y} \rangle = \|\mathbf{x}\|_2 \cdot \|\mathbf{y}\|_2 \cdot \cos(\theta)$
\item If $\mathbf{y}$ is a unit vector, then $\langle \mathbf{x}, \mathbf{y} \rangle$ is the (signed) length of the projection of $\mathbf{x}$ onto $\mathbf{y}$
\end{inparaitem}
\item[Outer Product:] $\mathbf{u} \mathbf{v}^T$, $(\mathbf{u} \mathbf{v}^T)_{i, j} = \mathbf{u}_i \mathbf{v}_j$
\item[Transpose:] $(\mathbf{A}^T)^{-1} = (\mathbf{A}^{-1})^T$
\item[Gram-Schmidt:] $\{\mathbf{w}_i\}_i$ non-orthogonal basis. $\mathbf{v}_n = \mathbf{w}_n - \sum_{i=1}^{n-1} \frac{\langle \mathbf{v}_i, \mathbf{w}_n \rangle}{\langle \mathbf{v}_i, \mathbf{v}_i \rangle} \mathbf{v}_i$ results in $\{\mathbf{v}_i\}_i$ an orthogonal basis
\end{compactdesc}
\section{Norms}
\begin{inparadesc}
\item[\color{red}$l_0$:] $\|\mathbf{x}\|_0 := |\{i | x_i \neq 0\}|$
\item[\color{red}Nuclear:] $\|\mathbf{X}\|_\star = \sum_{i=1}^{\min(m, n)} \sigma_i$
\item[\color{red}Euclidean:] $\|\mathbf{x}\|_2 := \sqrt{\sum_{i=1}^{N} \mathbf{x}_i^2} = \sqrt{\mathbf{x}^T \mathbf{x}} = \sqrt{\langle \mathbf{x}, \mathbf{x} \rangle}$
\item[\color{red}$p$-norm:] $\|\mathbf{x}\|_p := \left( \sum_{i=1}^{N} |x_i|^p \right)^{\frac{1}{p}}$
\item[\color{red}Frobenius:] $\|\mathbf{A}\|_F :=\allowbreak \sqrt{\sum_{i=1}^{M} \sum_{j=1}^{N} |\mathbf{A}_{i, j}|^2} =\allowbreak \sqrt{\operatorname{trace}(\mathbf{A}^T \mathbf{A})} =\allowbreak \sqrt{\sum_{i=1}^{\min\{M, N\}} \sigma_i^2}$ ($\sigma_i$ is the $i$-th singular value), $\mathbf{A} \in \mathbb{R}^{M \times N}$
\end{inparadesc}
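The matrix norms can be evaluated numerically as follows (sketch; \texttt{A} is an example matrix):
\begin{verbatim}
import numpy as np

A = np.array([[1.0, 2.0], [3.0, 4.0]])
fro = np.linalg.norm(A, 'fro')                  # Frobenius
nuc = np.linalg.svd(A, compute_uv=False).sum()  # nuclear
l1  = np.abs(A).sum()                           # entry-wise l1
assert np.isclose(fro, np.sqrt(np.trace(A.T @ A)))
\end{verbatim}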
\section{$K$-means Algorithm}
\begin{inparadesc}
\item[\color{red}Target:] $\min_{\mathbf{U}, \mathbf{Z}} J(\mathbf{U}, \mathbf{Z}) = \|\mathbf{X} - \mathbf{U} \mathbf{Z}\|_F^2 = \sum_{n=1}^N \sum_{k=1}^K \mathbf{z}_{k,n} \|\mathbf{x}_n - \mathbf{u}_k\|_2^2$
\end{inparadesc}
\begin{inparaenum}[\color{red} 1.]
\item Initialize: randomly choose $K$ centroids $\mathbf{U} = [\mathbf{u}_1, \ldots, \mathbf{u}_K]$ (usually $K$ randomly chosen data points).
\item Assign data points to clusters: $k^\star(\mathbf{x}_n) = \argmin_k \{ \|\mathbf{x}_n - \mathbf{u}_1\|_2^2, \ldots, \|\mathbf{x}_n - \mathbf{u}_K\|_2^2 \}$ gives the cluster $k$ whose centroid $\mathbf{u}_k$ is closest to data point $\mathbf{x}_n$. Set the $k$-th element of the $n$-th column of the assignment matrix $\mathbf{Z}$ to $1$, all others to $0$.
\item Update centroids: $\mathbf{u}_k = \frac{\sum_{n=1}^N z_{k,n} \mathbf{x}_n}{\sum_{n=1}^N z_{k,n}}$ (each centroid is the mean of all data points assigned to its cluster).
\item Repeat from step 2; stop if $\frac{1}{2} \|\mathbf{Z} - \mathbf{Z}^\text{new}\|_0 = 0$, i.e.\ no assignment changed (sketch below).
\end{inparaenum}
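A compact NumPy sketch of the algorithm above (illustrative; \texttt{X} is $D \times N$ and no cluster is assumed to become empty):
\begin{verbatim}
import numpy as np

def kmeans(X, K, max_iters=100, seed=0):
    rng = np.random.default_rng(seed)
    U = X[:, rng.choice(X.shape[1], K, replace=False)]
    for _ in range(max_iters):
        # squared distances, shape K x N
        d2 = ((X[:, None, :] - U[:, :, None]) ** 2).sum(axis=0)
        assign = d2.argmin(axis=0)          # closest centroid
        U_new = np.stack([X[:, assign == k].mean(axis=1)
                          for k in range(K)], axis=1)
        if np.allclose(U, U_new):           # assignments stable
            break
        U = U_new
    return U, assign
\end{verbatim}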
\section{Probability / Statistics}
\begin{compactitem}
\item $P(x) := Pr[X = x] := \sum_{y \in Y} P(x, y)$
\item $P(x|y) := Pr[X = x | Y = y] := \frac{P(x,y)}{P(y)},\quad \text{if } P(y) > 0$
\item $\forall y \in Y: \sum_{x \in X} P(x|y) = 1$ (property for any fixed $y$)
\item $P(x, y) = P(x|y) P(y)$
\item $P(x|y) = \frac{P(y|x)P(x)}{P(y)}$ (Bayes' rule)
\item $P(x|y) = P(x) \Leftrightarrow P(y|x) = P(y)$ (iff $X$, $Y$ independent)
\item $P(x_1, \ldots, x_n) = \prod_{i=1}^n P(x_i)$ (iff IID)
\end{compactitem}
\section{Gaussian Mixture Models (GMM)}
\begin{compactdesc}
\item[GMM:] $p(\mathbf{x}) = \sum_{k=1}^K \pi_k \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_k, \Sigma_k)$
\item[Assignment variable:] $\mathbf{z}_k \in \{0, 1\}$, $\sum_{k=1}^K \mathbf{z}_k = 1$, $\operatorname{Pr}(\mathbf{z}_k = 1) = \boldsymbol{\pi}_k \Leftrightarrow p(\mathbf{z}) = \prod_{k=1}^K \boldsymbol{\pi}_k^{\mathbf{z}_k}$
\item[Complete data distribution:] $p(\mathbf{x}, \mathbf{z}) = \prod_{k=1}^K \left( \boldsymbol{\pi}_k \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right)^{\mathbf{z}_k}$
\item[Posterior Probabilities:] $\operatorname{Pr}(\mathbf{z}_k = 1 | \mathbf{x}) = \frac{\operatorname{Pr}(\mathbf{z}_k = 1) p(\mathbf{x} | \mathbf{z}_k = 1)}{\sum_{l=1}^K \operatorname{Pr}(\mathbf{z}_l = 1) p(\mathbf{x} | \mathbf{z}_l = 1)} = \frac{\boldsymbol{\pi}_k \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{l=1}^K \boldsymbol{\pi}_l \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_l, \boldsymbol{\Sigma}_l)}$
\item[Likelihood of observed data $\mathbf{X}$:] $p(\mathbf{X} | \boldsymbol{\pi}, \boldsymbol{\mu}, \boldsymbol{\Sigma}) = \prod_{n=1}^N p(\mathbf{x}_n) = \prod_{n=1}^N \sum_{k=1}^K \pi_k \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)$
\item[Log-likelihood:] $\ln p(\mathbf{X} | \boldsymbol{\pi}, \boldsymbol{\mu}, \boldsymbol{\Sigma}) =\break \sum_{i=1}^N \ln \left( \sum_{k=1}^K \pi_k \mathcal{N}(\mathbf{x}_i | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right)$
\item[Negative Log-likelihood:] The smaller, the better the fit: $-\ln p(\mathbf{X} | \boldsymbol{\theta}) = -\sum_{n=1}^N \ln\left( \sum_{k=1}^K \boldsymbol{\pi}_k \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right)$
\end{compactdesc}
\subsection{AIC / BIC for GMM}
Trade-off: balance between data fit (i.e. likelihood $p(\mathbf{X} | \theta)$) and complexity (i.e. \# of free parameters $\kappa(\cdot)$). Heuristics:
\begin{compactdesc}
\item[Akaike Information Criterion (AIC):] $\operatorname{AIC}(\theta | \mathbf{X}) = -\ln(p_\theta(\mathbf{X})) + \kappa(\theta)$
\item[Bayesian Information Criterion (BIC):] $\operatorname{BIC}(\theta | \mathbf{X}) = -\ln(p_\theta(\mathbf{X})) + \frac{1}{2} \kappa(\theta) \ln(N)$
\item[\# of free params:] (fixed covariance matrix!) $\kappa(\theta) = K \cdot D + T$ ($K$ number of clusters, $D$ data dimension (i.e.\ dimension of the $\boldsymbol{\mu}$'s), $T$ number of free $\boldsymbol{\pi}$'s).
\end{compactdesc}
Note that the first term is the log-likelihood multiplied by $-1$. Compare AIC/BIC for different numbers of clusters $K$; smaller is better.
\section{Expectation-Maximization (EM) for GMM}
\begin{compactenum}
\item Initialize $\boldsymbol{\pi}_k^{(0)}, \boldsymbol{\mu}_k^{(0)}, \boldsymbol{\Sigma}_k^{(0)}, t = 1$ for $k = 1, \ldots, K$
\item E-Step: $q_{k,n} := \mathbb{E}[z_{k,n}] = \frac{\boldsymbol{\pi}_k^{(t-1)} \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_k^{(t-1)}, \boldsymbol{\Sigma}_k^{(t-1)})}{\sum_{j=1}^K \boldsymbol{\pi}_j^{(t-1)} \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_j^{(t-1)}, \boldsymbol{\Sigma}_j^{(t-1)})}$
\item M-Step: $\boldsymbol{\mu}_k^{(t)} := \frac{\sum_{n=1}^N q_{k,n} \mathbf{x}_n}{\sum_{n=1}^N q_{k,n}}$, $\boldsymbol{\pi}_k^{(t)} := \frac{1}{N} \sum_{n=1}^N q_{k,n}$, and $\boldsymbol{\Sigma}_k^{(t)} = \frac{\sum_{n=1}^N q_{k, n} (\mathbf{x}_n - \boldsymbol{\mu}_k^{(t)})(\mathbf{x}_n - \boldsymbol{\mu}_k^{(t)})^T}{\sum_{n=1}^N q_{k,n}}$
\item Repeat from (2.) with $t \gets t + 1$ until $| \log p(\mathbf{X} | \boldsymbol{\pi}^{(t)}, \boldsymbol{\mu}^{(t)}, \boldsymbol{\Sigma}^{(t)}) - \log p(\mathbf{X} | \boldsymbol{\pi}^{(t-1)}, \boldsymbol{\mu}^{(t-1)}, \boldsymbol{\Sigma}^{(t-1)}) | < \epsilon$
\end{compactenum}
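A minimal sketch of the E-/M-steps in NumPy/SciPy (illustrative; here the data matrix \texttt{X} is stored row-wise, i.e.\ $N \times D$, and no convergence check is done):
\begin{verbatim}
import numpy as np
from scipy.stats import multivariate_normal

def em_gmm(X, K, iters=50, seed=0):
    N, D = X.shape
    rng = np.random.default_rng(seed)
    mu = X[rng.choice(N, K, replace=False)]   # K x D
    Sigma = np.array([np.eye(D)] * K)         # K x D x D
    pi = np.full(K, 1.0 / K)
    for _ in range(iters):
        # E-step: responsibilities q_{n,k}
        q = np.stack([pi[k] * multivariate_normal.pdf(X, mu[k], Sigma[k])
                      for k in range(K)], axis=1)
        q /= q.sum(axis=1, keepdims=True)
        # M-step: update mu, Sigma, pi
        Nk = q.sum(axis=0)
        mu = (q.T @ X) / Nk[:, None]
        for k in range(K):
            Xc = X - mu[k]
            Sigma[k] = (q[:, k, None] * Xc).T @ Xc / Nk[k]
        pi = Nk / N
    return pi, mu, Sigma
\end{verbatim}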
\section{Expectation-Maximization (EM) for Inpainting}
\begin{inparaenum}[\color{red} 1.]
\item Assume initial $\mathbf{z}^{(0)}$, $\mathbf{x}^{(0)}$ original image, $t = 1$
\item Reconstruct image $\mathbf{x}^{(t)} = \mathbf{M}\mathbf{x}^{(t-1)} + (\mathbf{I} - \mathbf{M})\mathbf{U}\mathbf{z}^{(t-1)}$
\item Sparse coding of $\mathbf{x}^{(t)}$: $\mathbf{z}^{(t)} \in \argmin_{\mathbf{z}} \|\mathbf{z}\|_0$ s.t. $\|\mathbf{x}^{(t)} - \mathbf{Uz}\|_2 < \sigma$
\item Repeat (2.) \& (3.) till converging/$\mathit{maxIters}$
\item Image reconstruction: $\hat{\mathbf{x}} = \mathbf{Mx} + (\mathbf{I} - \mathbf{M})\mathbf{x}^{(t)}$
\end{inparaenum}
\section{Haar Wavelet}
\begin{compactdesc}
\item[Mother function:] $\psi(t)$ is $1$ for $0 \leq t < \frac{1}{2}$, $-1$ for $\frac{1}{2} \leq t < 1$, $0$ otherwise.
\item[Haar function:] $\psi_{n,k}(t) = 2^{n/2} \psi(2^n t - k)$; the factor $2^{n/2}$ is only needed for a normalized basis!
\end{compactdesc}
\begin{compactitem}
\item For the non-normalized basis: $\mathbf{W}^{-1} = (\mathbf{W}^T \mathbf{W})^{-1} \mathbf{W}^T = \operatorname{diag}(\frac{1}{l_1^2}, \ldots, \frac{1}{l_n^2}) \mathbf{W}^T$ where $l_i$ is the norm of the $i$-th column of $\mathbf{W}$. This is valid because the columns of $\mathbf{W}$ are mutually orthogonal (but not of unit length)!
\item Don't forget $[1, 1, \ldots, 1]^T$, the first (non-normalized) basis vector!
\item \textbf{``Coding'' step:} Transform $\mathbf{x}$ into coordinate vector $\mathbf{c} = \mathbf{W}^{-1} \mathbf{x}$
\item \textbf{``Thresholding'' step:} Set small absolute values in $\mathbf{c}$ to $0$
\end{compactitem}
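A small sketch of the coding and thresholding steps with the non-normalized Haar basis for signals of length $4$ (the signal and the threshold are made-up values):
\begin{verbatim}
import numpy as np

# columns = non-normalized Haar basis vectors
W = np.array([[1,  1,  1,  0],
              [1,  1, -1,  0],
              [1, -1,  0,  1],
              [1, -1,  0, -1]], dtype=float)
x = np.array([4.0, 2.0, 5.0, 5.0])
# coding: c = W^{-1} x, using the orthogonal columns of W
c = np.diag(1.0 / (W ** 2).sum(axis=0)) @ W.T @ x
c[np.abs(c) < 1.5] = 0.0     # thresholding (lossy)
x_hat = W @ c                # reconstruction
\end{verbatim}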
\section{Optimization}
Gradient of $f: \mathbb{R}^D \to \mathbb{R}$ is $\nabla f(\mathbf{x}) := \left( \frac{\partial f(\mathbf{x})}{\partial \mathbf{x}_1}, \ldots, \frac{\partial f(\mathbf{x})}{\partial \mathbf{x}_D} \right)^T \in \mathbb{R}^D$
\subsection{Convex Functions}
A function $f(x)$ is convex if domain of $f$ (set of input values) is convex (e.g. $\mathbb{R}^D$) and $\forall y, z, \forall t \in [0, 1]: f(t \cdot y + (1-t) \cdot z) \leq t \cdot f(y) + (1-t) \cdot f(z)$.
\subsection{Coordinate Descent}
Update one coordinate (the $d$-th), others are fixed.
\begin{inparaenum}[\color{red} 1.]
\item Init: $\mathbf{x}^{(0)} \in \mathbb{R}^D$
\item For $t = 0 \ \text{to} \ \mathit{maxIters}$:
\item choose $d$ uniformly from $1, \ldots, D$
\item $u^\star = \argmin_{u \in \mathbb{R}} f(x_1^{(t)}, \ldots, u, \ldots, x_D^{(t)})$
\item $\mathbf{x}_d^{(t+1)} = u^\star$ and $\mathbf{x}_i^{(t+1)} = \mathbf{x}_i^{(t)}$ for $i \neq d$
\end{inparaenum}
\subsection{Gradient Descent}
\begin{inparaenum}[\color{red} 1.]
\item Init: $\mathbf{x}^{(0)} \in \mathbb{R}^D$, stepsize usually $\gamma \approx \frac{1}{t}$
\item For $t = 0 \ \text{to} \ \mathit{maxIters}$: $\mathbf{x}^{(t+1)} = \mathbf{x}^{(t)} - \gamma \nabla f(\mathbf{x}^{(t)})$
\end{inparaenum}
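Minimal sketch of gradient descent with a $\frac{1}{t}$ step size on a toy quadratic (the objective and values are made up for illustration):
\begin{verbatim}
import numpy as np

b = np.array([1.0, -2.0, 3.0])
grad = lambda x: 2.0 * (x - b)   # gradient of ||x - b||^2
x = np.zeros(3)
for t in range(1, 101):
    x = x - (1.0 / t) * grad(x)  # step size gamma = 1/t
assert np.allclose(x, b)
\end{verbatim}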
\subsection{Stochastic Gradient Descent}
\begin{inparaenum}[\color{red} 1.]
\item Init: $\mathbf{x}^{(0)} \in \mathbb{R}^D$, stepsize usually $\gamma \approx \frac{1}{t}$
\item For $t = 0 \ \text{to} \ \mathit{maxIters}$:
\item choose $n$ uniformly from $1, \ldots, N$ (for $f = \sum_{n=1}^N f_n$)
\item $\mathbf{x}^{(t+1)} = \mathbf{x}^{(t)} - \gamma \nabla f_n(\mathbf{x}^{(t)})$
\end{inparaenum}
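Sketch of SGD on a least-squares objective $f(\mathbf{x}) = \sum_n (\mathbf{a}_n^T \mathbf{x} - y_n)^2$ (the data is randomly generated, purely illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(100, 3))           # rows a_n
y = A @ np.array([1.0, -1.0, 2.0])      # noiseless targets
x = np.zeros(3)
for t in range(1, 2001):
    n = rng.integers(100)               # pick one data point
    g = 2.0 * (A[n] @ x - y[n]) * A[n]  # gradient of f_n
    x -= (1.0 / t) * g
\end{verbatim}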
\subsection{Projected Gradient Descent}
In the gradient update step use a projection onto the constraint set $Q \subseteq \mathbb{R}^D$ with $P_Q(\mathbf{x}) = \argmin_{\mathbf{y} \in Q} \|\mathbf{y} - \mathbf{x}\|_2$: $\mathbf{x}^{(t+1)} = P_Q\left(\mathbf{x}^{(t)} - \gamma \nabla f(\mathbf{x}^{(t)})\right)$
\subsection{Lagrangian Multipliers}
Minimize $f(\mathbf{x})$ subject to $g_i(\mathbf{x}) \leq 0,\ i = 1, \ldots, m$ and $h_i(\mathbf{x}) = \mathbf{a}_i^T \mathbf{x} - b_i = 0,\ i = 1, \ldots, p$.
\begin{compactdesc}
\item[Lagrangian:] $L(\mathbf{x}, \boldsymbol{\lambda}, \boldsymbol{\nu}) := f(\mathbf{x}) + \sum_{i=1}^m \lambda_i g_i(\mathbf{x}) + \sum_{i=1}^p \nu_i h_i(\mathbf{x})$
\item[Dual function:] $d(\boldsymbol{\lambda}, \boldsymbol{\nu}) := \inf_{\mathbf{x}} L(\mathbf{x}, \boldsymbol{\lambda}, \boldsymbol{\nu}) \in \mathbb{R}$
\item[Dual Problem:] $\max_{\boldsymbol{\lambda}, \boldsymbol{\nu}} d(\boldsymbol{\lambda}, \boldsymbol{\nu})$ s.t. $\boldsymbol{\lambda} \geq \mathbf{0}$. Result is lower bound of optimum $f(\mathbf{x}^\star)$. Also only interested in $d(\cdot)$ where it is finite (if needed add (in)equality as a ``subject to'').
\end{compactdesc}
\section{Matching Pursuit (MP): Over-Complete Dictionaries}
Sparse approximation of a vector $\mathbf{x}$ onto an over-complete dictionary $\mathcal{D}$ represented by matrix $\mathbf{U}$ using $K$ dictionary entries.
\begin{compactenum}
\item Initialize: Coefficient vector $\mathbb{R}^{|\mathcal{D}|} \ni \mathbf{z} = \mathbf{0}$, Residual $\mathbf{r} = \mathbf{x}$
\item While $\|\mathbf{z}\|_0 < K$ repeat the following steps
\item Find next dictionary index: $d^\star = \argmax_d |\langle \mathbf{u}_d, \mathbf{r} \rangle|$
\item Update coefficients: $\mathbf{z}_{d^\star} = \mathbf{z}_{d^\star} + \langle \mathbf{u}_{d^\star}, \mathbf{r} \rangle$
\item Update residual: $\mathbf{r} = \mathbf{r} - \langle \mathbf{u}_{d^\star}, \mathbf{r} \rangle \mathbf{u}_{d^\star}$.
\end{compactenum}
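Sketch of matching pursuit in NumPy (assumes a dictionary \texttt{U} with unit-norm columns):
\begin{verbatim}
import numpy as np

def matching_pursuit(x, U, K):
    # U: D x L dictionary (unit-norm atoms), K: sparsity level
    z = np.zeros(U.shape[1])
    r = x.astype(float).copy()
    while np.count_nonzero(z) < K and np.linalg.norm(r) > 1e-12:
        d = np.argmax(np.abs(U.T @ r))   # best-matching atom
        c = U[:, d] @ r
        z[d] += c                        # update coefficient
        r = r - c * U[:, d]              # update residual
    return z
\end{verbatim}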
\section{Matching Pursuit (MP): Inpainting}
Diagonal matrix $\mathbf{M}$ with $\mathbf{M}_{i, i} = 1$ if the $i$-th pixel is known, otherwise $\mathbf{M}_{i,i} = 0$. Interested in $\mathbf{z}^\star \in \argmin_{\mathbf{z}} \|\mathbf{z}\|_0$ subject to $\|\mathbf{M}(\mathbf{x} - \mathbf{Uz})\|_2 < \sigma$. $\mathbf{U}$ over-complete dictionary, $\sigma$ allowed error. Reconstructed image: $\hat{\mathbf{x}} = \mathbf{Mx} + (\mathbf{I} - \mathbf{M})\mathbf{U} \mathbf{z}^\star$.
\begin{compactenum}
\item Initialize: Coefficient vector $\mathbf{z} = \mathbf{0}$, residual $\mathbf{r} = \mathbf{M}\mathbf{x}$.
\item While $\|\mathbf{z}\|_0 < K$ and $\|\mathbf{r}\|_2 > 0$ repeat the following steps
\begin{listcols}[2][5pt]
\item $d^\star = \argmax_d \frac{|\langle \mathbf{M}\mathbf{u}_d, \mathbf{r}\rangle|}{\|\mathbf{M}\mathbf{u}_d\|_2}$
\item $\mathbf{z}_{d^\star} = \mathbf{z}_{d^\star} + \frac{\langle \mathbf{M}\mathbf{u}_{d^\star}, \mathbf{r} \rangle}{\|\mathbf{M}\mathbf{u}_{d^\star}\|_2^2}$
\item $\mathbf{r} = \mathbf{r} - \frac{\langle \mathbf{M}\mathbf{u}_{d^\star}, \mathbf{r} \rangle}{\|\mathbf{M}\mathbf{u}_{d^\star}\|_2^2} \mathbf{M}\mathbf{u}_{d^\star}$
\end{listcols}
\end{compactenum}
\section{Coherence ($m(\cdot)$)}
\begin{compactitem}
\item $D$: dimension of the data, $L$: number of atoms; if $L > D$ the dictionary is over-complete
\item $m(\mathbf{U}) = \max_{i,j:\, i \neq j} | \mathbf{u}_i^T \mathbf{u}_j |$
\item $m(\mathbf{B}) = 0$ if $\mathbf{B}$ is orthogonal matrix and a basis
\item $m([\mathbf{B}, \mathbf{u}]) \geq \frac{1}{\sqrt{D}}$ if atom $\mathbf{u}$ is added to orthogonal $\mathbf{B}$
\end{compactitem}
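One-line-style sketch for computing $m(\mathbf{U})$ (assumes unit-norm atoms in the columns of \texttt{U}):
\begin{verbatim}
import numpy as np

def coherence(U):
    G = np.abs(U.T @ U)      # pairwise |<u_i, u_j>|
    np.fill_diagonal(G, 0.0)
    return G.max()
\end{verbatim}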
\section{Robust PCA}
\begin{compactitem}
\item Idea: Approximate $\mathbf{X}$ with $\mathbf{L} + \mathbf{S}$, $\mathbf{L}$ is low-rank, $\mathbf{S}$ is sparse.
\item Minimize $\|\mathbf{L}\|_\star + \lambda \|\mathbf{S}\|_1$ s. t. $\mathbf{L} + \mathbf{S} = \mathbf{X}$
\item A perfect reconstruction is possible if $\mathbf{L}$ is not sparse and $\mathbf{S}$ is not low-rank, i.e.\ $\mathbf{X}$ must not be low-rank \textit{and} sparse at the same time
\end{compactitem}
\subsection{Alternating Direction Method of Multipliers (ADMM)}
$\min_{\mathbf{x}_1, \mathbf{x}_2} f_1(\mathbf{x}_1) + f_2(\mathbf{x}_2)$ s. t. $\mathbf{A}_1 \mathbf{x}_1 + \mathbf{A}_2 \mathbf{x}_2 = \mathbf{b}$, $f_1, f_2$ convex
\begin{inparaitem}[\color{red}\textbullet]
\item Augmented Lagrangian: $L_p(\mathbf{x}_1, \mathbf{x}_2, \boldsymbol{\nu}) = f_1(\mathbf{x}_1) + f_2(\mathbf{x}_2) + \boldsymbol{\nu}^T (\mathbf{A}_1 \mathbf{x}_1 + \mathbf{A}_2 \mathbf{x}_2 - \mathbf{b}) + \frac{p}{2}\| \mathbf{A}_1 \mathbf{x}_1 + \mathbf{A}_2 \mathbf{x}_2 - \mathbf{b} \|_2^2$
\item ADMM: $\mathbf{x}_1^{(t+1)} := \argmin_{\mathbf{x}_1} L_p(\mathbf{x}_1, \mathbf{x}_2^{(t)}, \boldsymbol{\nu}^{(t)})$, $\mathbf{x}_2^{(t+1)} := \argmin_{\mathbf{x}_2} L_p(\mathbf{x}_1^{(t+1)}, \mathbf{x}_2, \boldsymbol{\nu}^{(t)})$, $\boldsymbol{\nu}^{(t+1)} := \boldsymbol{\nu}^{(t)} + p(\mathbf{A}_1 \mathbf{x}_1^{(t+1)} + \mathbf{A}_2 \mathbf{x}_2^{(t+1)} - \mathbf{b})$
\item ADMM for RPCA: $f_1(\mathbf{L}) = \|\mathbf{L}\|_\star$, $f_2(\mathbf{S}) = \lambda \| \mathbf{S} \|_1$, $\mathbf{A}_1 \mathbf{x}_1 + \mathbf{A}_2 \mathbf{x}_2 = \mathbf{b} \text{ becomes } \mathbf{L} + \mathbf{S} = \mathbf{X}$, therefore $L_p(\mathbf{L}, \mathbf{S}, \boldsymbol{\nu}) = \ldots$, $\boldsymbol{\nu}$ is a matrix here!
\end{inparaitem}
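A compact sketch of the ADMM iteration specialised to RPCA, using singular-value thresholding as the proximal step for the nuclear norm and entry-wise soft-thresholding for the $\ell_1$ term (these proximal operators are not spelled out in the notes above; penalty \texttt{p} and iteration count are arbitrary choices):
\begin{verbatim}
import numpy as np

def rpca_admm(X, lam, p=1.0, iters=200):
    L = np.zeros(X.shape); S = np.zeros(X.shape)
    nu = np.zeros(X.shape)
    soft = lambda A, t: np.sign(A) * np.maximum(np.abs(A) - t, 0)
    for _ in range(iters):
        # L-update: singular-value thresholding
        U, d, Vt = np.linalg.svd(X - S - nu / p,
                                 full_matrices=False)
        L = U @ np.diag(soft(d, 1.0 / p)) @ Vt
        # S-update: entry-wise soft-thresholding
        S = soft(X - L - nu / p, lam / p)
        # dual update
        nu = nu + p * (L + S - X)
    return L, S
\end{verbatim}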
\section{Non-Negative Matrix Factorization (NMF)}
\begin{inparaitem}[\color{red}\textbullet]
\item $\mathbf{X} \in \mathbb{R}^{D \times N}_{\geq 0}$, $D$ words, $N$ documents, $\mathbf{X}$ stores the word counts per document
\item Target: $\mathbf{X} \approx \mathbf{UZ}$, $\mathbf{U} \in \mathbb{R}^{D \times K}_{\geq 0}$, $\mathbf{Z} \in \mathbb{R}^{K \times N}_{\geq 0}$ (semi-NMF: $\mathbf{U}$ allows negative entries)
\end{inparaitem}
\subsection{LSI}
Approx. $\mathbf{X}$ using SVD keeping $K$ largest singular values with $\tilde{\mathbf{X}} = \mathbf{U} \boldsymbol{\Sigma}_K \mathbf{V}^T$. Interpret $(\mathbf{U} \boldsymbol{\Sigma}_K)^T$ as mapping from $D$ dim. word space to $K$ dim. latent topic space. New document $\mathbf{x} \mapsto (\mathbf{U} \boldsymbol{\Sigma}_K)^T \mathbf{x}$. Compare using inner product $\langle \tilde{\mathbf{x}}_1, \tilde{\mathbf{x}}_2 \rangle = \langle (\mathbf{U} \boldsymbol{\Sigma}_K)^T \mathbf{x}_1, (\mathbf{U} \boldsymbol{\Sigma}_K)^T \mathbf{x}_2 \rangle$: Larger value $\Rightarrow$ more similar.
\subsection{pLSI}
\begin{compactitem}
\item $P(\mathit{word} | \mathit{doc}) = \sum_{k=1}^K P(\mathit{word} | \mathit{topic} = k) P(\mathit{topic} = k | \mathit{doc})$
\item $P(\mathit{word}, \mathit{doc}) = P(\mathit{word} | \mathit{doc}) P(\mathit{doc}) \break= \sum_z P(\mathit{word} | \mathit{topic}) P(\mathit{doc} | \mathit{topic}) P(\mathit{topic})$
\end{compactitem}
\textbf{Matrix Factorization View:}
\begin{inparaitem}[\color{red}\textbullet]
\item Normalize $\mathbf{X}$ resulting in $\hat{\mathbf{X}}$: $\hat{\mathbf{X}}_{d, n} := \frac{\mathbf{X}_{d,n}}{\sum_{d'=1}^D \sum_{n'=1}^N \mathbf{X}_{d', n'}}$
\item $\hat{\mathbf{X}} \approx \mathbf{U} \mathbf{Z}$, $\hat{\mathbf{X}} \in \mathbb{R}^{D \times N}_{\geq 0}$, $D$ words, $N$ documents
\item $\mathbf{U}_{d, k} \equiv P(\mathit{word}=d | \mathit{topic} = k)$, $\sum_{d=1}^D \mathbf{U}_{d,k} = 1\, \forall k$
\item $\mathbf{Z}_{k,n} \equiv P(\mathit{topic} = k | \mathit{doc} = n) P(\mathit{doc} = n)$, $\sum_{k, n} \mathbf{Z}_{k,n} = 1$
\item $(\mathbf{U}\mathbf{Z})_{d, n} \equiv P(\mathit{word = d}, \mathit{doc} = n)$
\item Log-likelihood: $\log P(\hat{\mathbf{X}}; \mathbf{U}, \mathbf{Z}) = \sum_{d=1}^D \sum_{n=1}^N \hat{\mathbf{X}}_{d,n} \log \sum_{k=1}^K \mathbf{U}_{d, k} \mathbf{Z}_{k, n}$
\end{inparaitem}
\textbf{Expectation Maximization (EM):}
\begin{compactenum}
\item E-Step: $P(z|d, w) = \frac{P(z)P(d|z)P(w|z)}{\sum_{z' \in \mathcal{Z}} P(z') P(d|z') P(w|z')}$
\item M-Steps: $P(w|z) \propto \sum_{d} f(d, w) P(z|d, w)$, $P(d|z) \propto \sum_{w} f(d, w) P(z|d, w)$, $P(z) \propto \sum_{d} \sum_{w} f(d, w) P(z|d, w)$, $f(d, w)$ is frequency of term $w$ in doc $d$.
\end{compactenum}
\begin{inparaitem}[\color{red}\textbullet]
\item $P(w|z) \in \mathbb{R}^{D \times K}$
\item $P(d|z) \in \mathbb{R}^{N \times K}$
\item $P(z) \in \mathbb{R}^{K}$
\end{inparaitem}
\subsection{NMF Algorithm for quadratic cost function}
$\min_{\mathbf{U}, \mathbf{Z}} J(\mathbf{U}, \mathbf{Z}) = \frac{1}{2} \|\mathbf{X} - \mathbf{U}\mathbf{Z}\|_F^2$ s.t. $\mathbf{U}_{d,k} \in [0, \infty)\ \forall d, k$ \& $\mathbf{Z}_{k, n} \in [0, \infty)\ \forall k, n$
\begin{inparaenum}[\color{red} 1.]
\item Init: $\mathbf{U} \gets \mathrm{rand}(D,K), \mathbf{Z} \gets \mathrm{rand}(K,N)$
\item repeat for $\mathit{maxIters}$:
\item Update $\mathbf{U}$: $u_{d,k} \gets u_{d,k} \frac{(\mathbf{X}\mathbf{Z}^T)_{d,k}}{(\mathbf{U}\mathbf{Z}\mathbf{Z}^T)_{d,k}}$
\item Update $\mathbf{Z}$: $z_{k,n} \gets z_{k,n} \frac{(\mathbf{U}^T\mathbf{X})_{k,n}}{(\mathbf{U}^T\mathbf{U}\mathbf{Z})_{k,n}}$
\end{inparaenum}
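Sketch of the multiplicative updates in NumPy (the small constant \texttt{eps} is added only to avoid division by zero):
\begin{verbatim}
import numpy as np

def nmf(X, K, iters=200, eps=1e-9, seed=0):
    D, N = X.shape
    rng = np.random.default_rng(seed)
    U = rng.random((D, K)); Z = rng.random((K, N))
    for _ in range(iters):
        U *= (X @ Z.T) / (U @ Z @ Z.T + eps)
        Z *= (U.T @ X) / (U.T @ U @ Z + eps)
    return U, Z
\end{verbatim}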
\raggedcolumns
\end{multicols*}
\end{document}