The generative process of Latent Dirichlet Allocation (LDA) is as follows:
- for each document d = 1, 2, \dots, D
-
\theta_d \sim \mathrm{Dir}(\alpha) , \alpha = (\alpha_1, \dots, \alpha_K)
-
\theta_d denotes the topic distribution of document d.
- for each topic k = 1, 2, \dots, K
-
\phi_k \sim \mathrm{Dir}(\beta) , \beta = (\beta_1, \dots, \beta_V)
- Let \mathcal{V} = \{1, 2, \dots, V\} be the index set of the vocabulary.
-
\phi_k denotes the word distribution of topic k.
- for each word position i in the document d
-
z_{d,i} \sim \mathrm{Mult}(\theta_d) , z_{d,i} \in \{1, 2, \dots, K\}
-
w_{d,i} \sim \mathrm{Mult}(\phi_{z_{d,i}}) , w_{d, i} \in \mathcal{V}
The joint probability is
\begin{align*}
p(w, z, \theta, \phi | \alpha, \beta) &= p(w|z,\phi) p(z|\theta) p(\theta|\alpha) p(\phi|\beta) \\
&= \left( \prod_{d=1}^D \prod_{i=1}^{n_d} p(w_{d,i} | \phi_{z_{d,i}}) p(z_{d,i} | \theta_{d}) \right)
\left( \prod_{d=1}^D p(\theta_d | \alpha) \right)
\left( \prod_{k=1}^K p(\phi_k | \beta) \right)
\end{align*}
To obtain Gibbs samplers, we need the full conditional distributions of the latent variables z, \theta, \phi.
\begin{align*}
p(z_{d,i} &= k | w_{d,i} = v, w^{\backslash d,i}, z^{\backslash d,i}, \phi, \theta, \alpha, \beta) \\
&\propto p(z_{d,i} = k, w_{d,i} = v, w^{\backslash d,i}, z^{\backslash d,i}, \phi, \theta | \alpha, \beta) \\
&= p(w_{d,i} | z_{d,i}, \phi) p(z_{d,i}|\theta_d)
p(w^{\backslash d,i} | z^{\backslash d,i}, \phi) p(z^{\backslash d,i} | \theta)
p(\phi | \beta) p(\theta | \alpha) \\
&\propto p(w_{d,i} | z_{d,i}, \phi) p(z_{d,i}|\theta_d) \\
&= \phi_{k,v} \theta_{d,k}
\end{align*}
\begin{align*}
p(\theta_d &| w, z, \phi, \theta^{\backslash d}, \alpha, \beta) \\
&\propto p(\theta_d, w, z, \phi, \theta^{\backslash d} | \alpha, \beta) \\
&= p(w|z,\phi) p(z_d|\theta_d) p(z^{\backslash d} | \theta^{\backslash d}) p(\theta_d | \alpha) p(\theta^{\backslash d} | \alpha) p(\phi|\beta) \\
&\propto p(z_d|\theta_d) p(\theta_d | \alpha)
\end{align*}
\begin{align*}
p(z_d|\theta_d) &= \prod_{i=1}^{n_d} p(z_{d,i} | \theta_d)
= \prod_{k=1}^K \prod_{i=1}^{n_d} \theta_{d,k}^{\delta(z_{d,i}, k)}
= \prod_{k=1}^{K} \theta_{d,k}^{\sum_{i=1}^{n_d} \delta(z_{d,i}, k)}
= \prod_{k=1}^{K} \theta_{d,k}^{n_{d,k}}
\end{align*}
\begin{align*}
p(\theta_d | \alpha) = \mathrm{Dir}(\theta_d \mid \alpha) \propto \prod_{k=1}^K \theta_{d,k}^{\alpha_k - 1}
\end{align*}
\begin{align*}
\therefore p(\theta_d &| w, z, \phi, \theta^{\backslash d}, \alpha, \beta) \propto
\prod_{k=1}^{K} \theta_{d,k}^{n_{d,k}} \theta_{d,k}^{\alpha_k - 1}
= \prod_{k=1}^{K} \theta_{d,k}^{n_{d,k} + \alpha_k - 1}
\end{align*}
\begin{align*}
p(\phi_k &| w, z, \phi^{\backslash k}, \alpha, \beta) \\
&\propto p(\phi_k, w, z, \phi^{\backslash k} | \alpha, \beta) \\
&= p(w|z,\phi) p(\phi|\beta) p(z|\theta) p(\theta|\alpha) \\
&\propto \left( \prod_{d=1}^D \prod_{i=1}^{n_d} p(w_{d,i} | \phi, z_{d,i}) \right)
\prod_{k=1}^K p(\phi_k | \beta)
\end{align*}
\begin{align*}
\prod_{d=1}^D \prod_{i=1}^{n_d} p(w_{d,i} | \phi, z_{d,i})
&= \prod_{d=1}^D \prod_{i=1}^{n_d} \prod_{k=1}^K \prod_{v=1}^V \phi_{k,v}^{\delta(w_{d,i}=v, z_{d,i}=k)} \\
&= \prod_{k=1}^K \prod_{v=1}^V \phi_{k,v}^{\sum_{d=1}^D \sum_{i=1}^{n_d} \delta(w_{d,i}=v, z_{d,i}=k)} \\
&= \prod_{k=1}^K \prod_{v=1}^V \phi_{k,v}^{n_{k,v}} \\
p(\phi_k | \beta) &\propto \prod_{v=1}^V \phi_{k,v}^{\beta_v -1}
\end{align*}
\begin{align*}
\therefore p(\phi_k | w, z, \phi^{\backslash k}, \alpha, \beta)
&\propto \prod_{v=1}^V \phi_{k,v}^{n_{k,v}} \prod_{v=1}^V \phi_{k,v}^{\beta_v -1} \\
&= \prod_{v=1}^V \phi_{k,v}^{n_{k,v} + \beta_v -1}
\end{align*}
Discussion