diff --git a/Lectures/lecture01.tex b/Lectures/lecture01.tex index 6806926..984fc95 100644 --- a/Lectures/lecture01.tex +++ b/Lectures/lecture01.tex @@ -1,54 +1,81 @@ \subsection{Lecture} \begin{edefn} - Let $p(x \vert \theta)$ be distribution. Concidering $\frac{\partial \log p(\hat{x} | \theta)}{\partial \theta}$, where $\hat{x} \sim p(x | \theta)$. - + Let $p(x \vert \theta)$ be a distribution with $\theta \in \RR^n$, and consider the score $\frac{\partial \log p(\hat{x} | \theta)}{\partial \theta}$, where $\hat{x} \sim p(x | \theta)$. Its expectation is zero: + \begin{equation*} - \int p(x | \theta) \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} d\hat{x} = \frac{\partial}{\partial \theta} \int p(x | \theta) d\hat{x} = \frac{\partial}{\partial \theta} 1 = 0 + \int p(\hat{x} | \theta) \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} d\hat{x} = \frac{\partial}{\partial \theta} \int p(\hat{x} | \theta) d\hat{x} = \frac{\partial}{\partial \theta} 1 = 0 \end{equation*} - And \cursed{Fisher information} is defined as + And the \cursed{Fisher information matrix} is defined as \begin{equation*} \begin{aligned} - F(\theta) &= \int p(x | \theta) \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right) \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right)^T dx -\int p(x | \theta) \left( \frac{\partial^2 \log p(\hat{x} | \theta)}{\partial \theta} \right) dx \succeq 0 + F(\theta) &= \int p(\hat{x} | \theta) \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right) \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right)^T \dd \hat{x} \\ + &= -\int p(\hat{x} | \theta) \left( \frac{\partial^2 \log p(\hat{x} | \theta)}{\partial \theta^2} \right) \dd \hat{x} \\ + &= \mathbb{E}_{\hat{x} \sim p(x | \theta)} \left[ \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right) \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right)^T \right]. \end{aligned} \end{equation*} - - or - \begin{equation*} - F(\theta) = \mathbb{E}_{\hat{x} \sim p(x | \theta)} \left[ \left( \frac{\partial \log p(\hat{x} | \theta)}{\partial \theta} \right)^2 \right]. - \end{equation*} \end{edefn} -Let $f(x)$ be a function, $\grad f(x) \propto \argmax f(x + \delta)$ such that $\rho(x, x+ \delta) < \varepsilon$. $$\int p(x | \theta) g(x) dx = \MI(p(x | \theta)) = \MI(\theta)$$ +To prove the second equality in the definition above, consider the $(i,j)$-th element of the matrix: +\begin{equation*} + \begin{aligned} + &-\int p(\hat{x} | \theta) \left( \pdv{\log p(\hat{x} | \theta)}{\theta_j}{\theta_i} \right) \dd \hat{x} \\ + &=-\int p(\hat{x} | \theta) \left( \pdv{\theta_j}\left(\frac{1}{p(\hat{x} | \theta)}\pdv{p(\hat{x} | \theta)}{\theta_i}\right) \right) \dd \hat{x} \\ + &=\int p(\hat{x} | \theta) \left(\frac{1}{p(\hat{x} | \theta)}\right)^2 \left(\pdv{p(\hat{x} | \theta)}{\theta_j}\right)\left(\pdv{p(\hat{x} | \theta)}{\theta_i}\right) \dd \hat{x} + - \underbrace{\int \pdv{p(\hat{x} | \theta)}{\theta_i}{\theta_j} \dd \hat{x}}_{0} \\ + &= \int p(\hat{x} | \theta) \left( \pdv{\log p(\hat{x} | \theta)}{\theta_i} \right) \left( \pdv{\log p(\hat{x} | \theta)}{\theta_j} \right) \dd \hat{x} \\ + &\equiv F_{i,j}(\theta) + \end{aligned} +\end{equation*} + +\begin{eremark} + $F(\theta) \succeq 0$, i.e.\ the Fisher information matrix is positive semi-definite for all values of $\theta$. + To prove this, denote by $g$ the gradient $\pdv{\log p(x\mid \theta)}{\theta}$ and note that for any vector $v$ we can write $v^T F(\theta) v = \int p(x\mid \theta) v^T g g^T v \dd x = \int p(x\mid \theta) (v^T g)^2 \dd x \ge 0$. 
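+ As a quick sanity check of both the definition and this remark (the same Gaussian example is worked out in the seminar below): for $p(x \mid \mu) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp \left( -\frac{(x - \mu)^2}{2\sigma^2} \right)$ with scalar parameter $\theta = \mu$, the score is $g = \frac{x - \mu}{\sigma^2}$, so $F(\mu) = \EE \left[ g^2 \right] = \frac{1}{\sigma^2} > 0$.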
+\end{eremark} + +\vspace{3em} + +Let $f(x)$ be a function; then its gradient can be expressed in the following peculiar way. +$$\grad f(x) \propto \argmax_{\delta} f(x + \delta) +\text{ such that } \rho(x, x+ \delta) < \varepsilon \text{ and } \varepsilon>0 \text{ is sufficiently small } +$$ +% % I tried, but this definition is too sloppy to be formalized. +% % Moreover, the original paper presents it the same way: https://arxiv.org/abs/1206.7051 +% Formally, +% $$\grad f(x) = C(x) \lim_{\varepsilon \to 0+} [ \argmax_{\delta \in B_{\varepsilon}} f(x + \delta)]$$ +% where $B_{\varepsilon} \subset \RR^n$ is the zero-centered ball with radius $\varepsilon$ and $C(x)$ is some positive-real-valued function. +% This is true because $f(x + \delta) = f(x) + \langle \grad f(x), \delta \rangle + O(\norm{\delta}^2)$. +% $$\int p(x | \theta) g(x) dx = \MI(p(x | \theta)) = \MI(\theta)$$ \begin{edefn} - \cursed{Natural gradient}, $\operatorname{natgrad} \MI \propto \argmax \MI(\theta + \delta) = F^{-1}(\theta) \grad \MI (\theta)$ such that $$\operatorname{KL} \left( p(x | \theta) \parallel p(x | \theta + \delta)\right) < \varepsilon.$$ + % proposed in paper https://doi.org/10.1162/089976698300017746 + \cursed{Natural gradient}: $\operatorname{natgrad} \MI(\theta) = F^{-1}(\theta) \grad \MI (\theta) \propto \argmax_{\delta} \MI(\theta + \delta)$ such that $$\operatorname{KL} \left( p(x | \theta) \parallel p(x | \theta + \delta)\right) < \varepsilon.$$ \end{edefn} \begin{eremark} - Natural gradient is a parametrisation invariant. + The natural gradient is parametrisation-invariant. \end{eremark} Let's consider $$p(X, Z, \theta) = \prod_{i = 1}^n p(x_i, z_i | \theta) p(\theta),$$ where $x_i$ are observed, $z_i$ are hidden and $\theta$ are parameters. We want to find $p(Z, \theta | X)$ assuming that it is integrable and that we have $Z$ -- conjugate to $\theta$ and vice versa. Then we can approximate $p(Z, \theta | X) \approx q(Z) q(\theta)$ and from conjugacy and exponential family conclude \begin{equation*} \begin{aligned} - p(x, z | \theta) &= f(\theta) \rho(x, z) \exp (\theta^T h(x, z)) \\ - p(\theta) &= \frac{f(\theta)}{g(\nu_0, \eta_0)} \exp (\eta_0^T \theta) + p(x, z | \theta) &= f(\theta) \rho(x, z) \exp (\theta^T h(x, z)) \\ + p(\theta) &= \frac{f(\theta)^{\nu_0}}{g(\nu_0, \eta_0)} \exp (\eta_0^T \theta) \end{aligned} \end{equation*} -Therefore, +Therefore, \begin{equation*} \begin{aligned} - \log q(Z) &= \EE_{q(\theta)} \log p(X, Z, \theta) + \const = \sum_{i = 1}^n \EE_{\theta} \log p(x_i, z_i | \theta) + \EE_\theta\log p(\theta) + \const \\ - &= \sum_{i = 1}^n \left( \EE_\theta \log f(\theta) + \log p(x_i, z_i) + \EE \theta^T h(x_i, z_i)\right) + \const \\ - &= \underbrace{\sum_{i = 1}^n \left(\log p(x_i, z_i) + \EE \theta^T h(x_i, z_i) \right)}_{eq. 1} + \const \\ + \log q(Z) &= \EE_{q(\theta)} \log p(X, Z, \theta) + \const = \sum_{i = 1}^n \EE_{\theta} \log p(x_i, z_i | \theta) + \EE_\theta\log p(\theta) + \const \\ + &= \sum_{i = 1}^n \left( \EE_\theta \log f(\theta) + \log \rho(x_i, z_i) + \EE \theta^T h(x_i, z_i)\right) + \const \\ + &= \underbrace{\sum_{i = 1}^n \left(\log \rho(x_i, z_i) + \EE \theta^T h(x_i, z_i) \right)}_{\text{eq.~1}} + \const \\ &= \sum_{i = 1}^n \log q(z_i) + \const, \end{aligned} \end{equation*} -hence, $q(Z) = \prod_{i = 1}^n q(z_i)$. And +hence, $q(Z) = \prod_{i = 1}^n q(z_i)$. 
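+Spelling out a single factor makes the structure explicit (this is exactly the per-object update used in the iterative algorithm at the end of this lecture): +\begin{equation*} + \log q(z_i) = \log \rho(x_i, z_i) + \EE_{q(\theta)} [\theta]^T h(x_i, z_i) + \const, \qquad \text{i.e.} \quad q(z_i) \propto \rho(x_i, z_i) \exp \left( \EE_{q(\theta)} [\theta]^T h(x_i, z_i) \right). +\end{equation*}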
And \begin{equation*} \begin{aligned} \log q(\theta) &= \EE_{q(Z)} \log p(X, Z, \theta) + \const = \sum_{i = 1}^n \EE_{z_i} \log p(x_i, z_i | \theta) + \EE_{Z} \log p(\theta) + \const = \\ @@ -57,65 +84,65 @@ \subsection{Lecture} \end{equation*} \begin{eremark} - Here, when $n \gg 1$, we willl struggle. + Here, when $n \gg 1$, we will struggle: every update of $q(\theta)$ requires a full pass over all $n$ objects. \end{eremark} -Let's take a look at +Let's take a look at \begin{equation*} - \log p(X) = \int q(Z) q(\theta) \log \frac{p(X, Z, \theta)}{q(Z) q(\theta)} dZ d\theta + \operatorname{KL}(q(Z) q(\theta) \parallel p(Z, \theta | X)) = \LL(q(Z), q(\theta)), + \log p(X) = \underbrace{\int q(Z) q(\theta) \log \frac{p(X, Z, \theta)}{q(Z) q(\theta)} dZ d\theta}_{\LL(q(Z), q(\theta))} + \operatorname{KL}(q(Z) q(\theta) \parallel p(Z, \theta | X)), \end{equation*} which is greater than the first term, as the second term is non-negative. Now, instead of block-coordinate optimization, we can use gradient-based optimization. \begin{equation*} \begin{aligned} - q(\theta) &= f(\theta) \exp \left( \theta^T \left( \underbrace{\eta_0 + \sum_{i = 1}^n \EE_{z_i} h(x_1, z_i)}_{\eta_1}\right) \right) \frac{1}{g(\nu_0 + n, \eta_0 + \sum_{i = 1}^n \EE h(x_1, z_i))} \\ + q(\theta) &= f(\theta)^{\nu_0 + n} \exp \left( \theta^T \left( \underbrace{\eta_0 + \sum_{i = 1}^n \EE_{z_i} h(x_i, z_i)}_{\eta_1}\right) \right) \frac{1}{g(\nu_0 + n, \eta_0 + \sum_{i = 1}^n \EE h(x_i, z_i))} \\ &= \frac{f(\theta)^{\nu_1}}{g(\nu_1, \eta_1)} \exp \left( \theta^T \eta_1 \right), \end{aligned} \end{equation*} - -where $\nu_1 = \nu_0 + n$. So that + +where $\nu_1 = \nu_0 + n$. Then \begin{equation*} \begin{aligned} - \MI(\eta_1) &= \int q(Z) q(\theta | \eta_1) \left[\log p(X, Z, \theta) - \log q(\theta, \eta_1) \right] dZ d\theta + \const \\ + \MI(\eta_1) &= \int q(Z) q(\theta | \eta_1) \left[\log p(X, Z, \theta) - \log q(\theta | \eta_1) \right] dZ d\theta + \const \\ &= \int q(Z) q(\theta | \eta_1) \left[ \sum_{i = 1}^n \log p(x_i, z_i | \theta) + \log p(\theta) - \log q(\theta | \eta_1) \right] dZ d\theta + \const \\ - &= \int q(Z) q(\theta | \eta_1) \big[ n \log f(\theta) + \sum_{i = 1}^n \log p(x_i, z_i) + \theta^T \big( \sum_{i = 1}^n h(x_i, z_i) + \nu_0 \log f(\theta) + \eta_0^T \theta -\\ + &= \int q(Z) q(\theta | \eta_1) \big[ n \log f(\theta) + \sum_{i = 1}^n \log \rho(x_i, z_i) + \theta^T \sum_{i = 1}^n h(x_i, z_i) + \nu_0 \log f(\theta) + \eta_0^T \theta -\\ &\equad - \log g(\nu_0, \eta_0) - (n + \nu_0) \log f(\theta) - \theta^T \eta_1 + \log g(\nu_1, \eta_1)\big] dZ d\theta + \const \\ - &= q(\theta | \eta_1) \left[ \theta^T \left( \sum_{i = 1}^n \EE h(x_i, z_i) + \eta_0 - \eta_1\right) + \log g(\nu_1, \eta_1)\right] d \theta \\ + &= \int q(\theta | \eta_1) \left[ \theta^T \left( \sum_{i = 1}^n \EE h(x_i, z_i) + \eta_0 - \eta_1\right) + \log g(\nu_1, \eta_1)\right] d \theta + \const \\ &= \int q(\theta | \eta_1) \left[ \theta^T \left( \eta_0 - \eta_1 + \sum_{i = 1}^n \EE h(x_i, z_i)\right)\right] d \theta + \log g(\nu_1, \eta_1) + \const, 
\end{aligned} \end{equation*} -so that +so that \begin{equation*} - \frac{\partial}{\partial \eta_1} \MI(\eta_1) = \underbrace{\frac{\partial^2 \log g(\eta_1, \nu_1)}{\partial^2 \eta_1}}_{F(\eta_1)} \left(\eta_0 - \eta_1 + \sum_{i = 1}^n \EE h(x_i, z_i) \right), + \frac{\partial}{\partial \eta_1} \MI(\eta_1) = \underbrace{\frac{\partial^2 \log g(\nu_1, \eta_1)}{\partial \eta_1^2}}_{F(\eta_1)} \left(\eta_0 - \eta_1 + \sum_{i = 1}^n \EE h(x_i, z_i) \right), \end{equation*} \begin{equation*} \begin{aligned} - \operatorname{natgrad} \MI(\eta_1) = \eta_0 - \eta_1 + \sum_{i = 1}^n \EE h(x_i, z_i); \\ + \operatorname{natgrad} \MI(\eta_1) = \eta_0 - \eta_1 + \sum_{i = 1}^n \EE h(x_i, z_i); \\ \operatorname{stochnatgrad} \MI(\eta_1) = \eta_0 - \eta_1 + n \EE_{z_j} h(x_j, z_j), \end{aligned} \end{equation*} -where $j \sim U\{1, \ldots, n\}$. And the iterative algorithm is +where $j \sim U\{1, \ldots, n\}$, so the stochastic version is an unbiased estimate of the natural gradient. The iterative algorithm is \begin{itemize} - \item Set $j \sim U\{1, \ldots, n\}$; + \item Sample $j \sim U\{1, \ldots, n\}$; \item Update $\log q(z_j) = \log \rho(x_j, z_j) + \EE \theta^T h(x_j, z_j) + \const$; \item $\eta_1^{t+1} = \eta_1^t + \alpha \left( \eta_0 - \eta_1^t + n \EE_{z_j} h(x_j, z_j) \right)$. \end{itemize} \subsection{Seminar} -And now once again about Fisher information. Let $p(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp \left( -\frac{(x - \mu)^2}{2\sigma^2} \right)$, then we can say that $\MI_x(\mu)$ represents the information contained in $x$ about $\mu$. $\MI_x(\mu) \propto \frac{1}{\sigma^2}$. +And now once again about Fisher information. Let $p(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp \left( -\frac{(x - \mu)^2}{2\sigma^2} \right)$; then $\MI_x(\mu)$ represents the information contained in $x$ about $\mu$, and $\MI_x(\mu) \propto \frac{1}{\sigma^2}$. \begin{edefn} The \cursed{Fisher information} is defined as $$\MI(\theta) = \EE \left[ \left( \frac{\partial \log p(x | \theta)}{\partial \theta} \right)^2 \right] = -\EE \left[ \frac{\partial^2 \log p(x | \theta)}{\partial \theta^2} \right].$$ \end{edefn} -So for Gaussian distribution we have $\theta = \mu$, +So for the Gaussian distribution with $\theta = \mu$ we have \begin{equation*} \begin{aligned} - \log p(x | \mu) &= \log \frac{1}{\sqrt{2 \pi \sigma^2}}- \frac{(x - \mu)^2}{2\sigma^2}, \\ + \log p(x | \mu) &= \log \frac{1}{\sqrt{2 \pi \sigma^2}} - \frac{(x - \mu)^2}{2\sigma^2}, \\ \frac{\partial \log p(x | \mu)}{\partial \mu} &= \frac{x - \mu}{\sigma^2}, \\ \MI(\theta) &= \frac{1}{\sigma^2}. \end{aligned} @@ -123,9 +150,8 @@ \subsection{Seminar} Alternative definitions of Fisher information are \begin{enumerate} - \item $\MI_x(\theta) = - \EE(l''(\theta | x))$; + \item $\MI_x(\theta) = - \EE(l''(\theta | x))$; \item $\MI_x(\theta) = \Var (l'(\theta |x))$, \end{enumerate} -where $l$ is a score function, $l(\theta | x) = \log p(x | \theta)$. - +where $l(\theta | x) = \log p(x | \theta)$ is the log-likelihood and $l'$ is the score function. 
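+As a quick check, both of these definitions agree on the Gaussian example above: +\begin{equation*} + \begin{aligned} + l'(\mu | x) &= \frac{x - \mu}{\sigma^2}, \qquad l''(\mu | x) = -\frac{1}{\sigma^2}, \\ + \Var(l'(\mu | x)) &= \frac{\Var(x)}{\sigma^4} = \frac{1}{\sigma^2} = -\EE(l''(\mu | x)). + \end{aligned} +\end{equation*}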
diff --git a/neurobayes.tex b/neurobayes.tex index 1c358b2..f5a8f90 100644 --- a/neurobayes.tex +++ b/neurobayes.tex @@ -3,31 +3,34 @@ \input{preambule.tex} \usepackage{fancyhdr} -\pagestyle{fancy} - \fancyhead{} +\pagestyle{fancy} + \fancyhead{} \fancyhead[CO]{\normalsize{Fall 2024}} - \fancyhead[LO]{\normalsize{CUB}} + \fancyhead[LO]{\normalsize{CUB}} -\title{Deep Bayesian methods} -\author{By Dmitryi Vetrov} +\title{Deep Bayesian Models} +\author{By Dmitry Vetrov} \date{Spring 2024} -\begin{document} +\begin{document} \maketitle \tableofcontents \newpage +\setlength{\parindent}{0pt} -\section{Lecture 1. Stochastical variational inference} +\section{Lecture 1: Stochastic Variational Inference} \input{Lectures/lecture01.tex} -\section{Lecture 2} +\section{Lecture 2: Doubly Stochastic Variational Inference} \input{Lectures/lecture02.tex} +\section{Lecture 3: Dropout} + \input{Lectures/lecture03.tex} -\end{document} \ No newline at end of file +\end{document} diff --git a/preambule.tex b/preambule.tex index 7251e26..77278d9 100644 --- a/preambule.tex +++ b/preambule.tex @@ -14,6 +14,7 @@ \usepackage{amsmath,amsfonts,amssymb,amsthm,mathtools} \usepackage{icomma} \usepackage{euscript} +\usepackage{physics} \usepackage{mathrsfs} \usepackage[dvipsnames]{xcolor} \usepackage[left=2cm,right=2cm, @@ -94,21 +95,21 @@ \DeclareMathOperator{\Imf}{Im} \DeclareMathOperator{\cont}{cont} \DeclareMathOperator{\id}{id} -\DeclareMathOperator{\ev}{ev} +% \DeclareMathOperator{\ev}{ev} \DeclareMathOperator{\lcm}{lcm} \DeclareMathOperator{\chard}{char} \DeclareMathOperator{\codim}{codim} -\DeclareMathOperator{\rank}{rank} +% \DeclareMathOperator{\rank}{rank} \DeclareMathOperator{\ord}{ord} \DeclareMathOperator{\End}{End} \DeclareMathOperator{\Ann}{Ann} \DeclareMathOperator{\Real}{Re} -\DeclareMathOperator{\Res}{Res} +% \DeclareMathOperator{\Res}{Res} \DeclareMathOperator{\Rad}{Rad} \DeclareMathOperator{\disc}{disc} \DeclareMathOperator{\rk}{rk} \DeclareMathOperator{\const}{const} -\DeclareMathOperator{\grad}{grad} +% \DeclareMathOperator{\grad}{grad} \DeclareMathOperator{\Aff}{Aff} \DeclareMathOperator{\Lin}{Lin} \DeclareMathOperator{\Prf}{Pr} @@ -116,7 +117,7 @@ \DeclareMathOperator{\cov}{cov} \DeclareMathOperator{\argmax}{argmax} \DeclareMathOperator{\argmin}{argmin} -\DeclareMathOperator{\tr}{\textbf{tr}} +% \DeclareMathOperator{\tr}{\textbf{tr}} %specific_shit \DeclareMathOperator{\Tors}{Tors} \DeclareMathOperator{\form}{Form} @@ -148,7 +149,7 @@ \newcommand{\bea}{\begin{eqnarray*}} \newcommand{\eea}{\end{eqnarray*}} - \newcommand{\abs}[1]{\lvert#1\rvert} + % \newcommand{\abs}[1]{\lvert#1\rvert} \newcommand{\bp}{\begin{prob}} \newcommand{\ep}{\end{prob}} \newcommand{\be}{\begin{ex}} @@ -199,32 +200,17 @@ %envirnoments \theoremstyle{indented} -\newtheorem{theorem}{Теорема} -\newtheorem{lemma}{Лемма} -\newtheorem{alg}{Алгоритм} \newtheorem*{etheorem}{Theorem} \newtheorem{elemma}{Lemma} -\newtheorem{ealg}{Algorythm} -\newtheorem{st}{Статья} +\newtheorem{ealg}{Algorithm} \theoremstyle{definition} -\newtheorem{defn}{Определение} -\newtheorem*{exl}{Пример(ы)} \newtheorem{prob}{ } -\newtheorem{problem}{Задача} \newtheorem{edefn}{Definition} \newtheorem*{eexl}{Example(s)} \newtheorem{eproblem}{Problem} \theoremstyle{remark} -\newtheorem*{remark}{Примечание} -\newtheorem*{hint}{Подсказка} -\newtheorem*{cons}{Следствие} -\newtheorem{exer}{Упражнение} -\newtheorem{stat}{Утверждение} -\newtheorem*{prop}{Свойство(а)} -\newtheorem*{sol}{Решение} -\newtheorem*{ans}{Ответ} \newtheorem*{eremark}{Remark} 
\newtheorem*{ehint}{Hint} \newtheorem*{econs}{Corollary} @@ -240,4 +226,3 @@ \newcommand{\reset}{% \setcounter{prob}{0}% } -