Given a dataset y \in \mathbb{R}^N, X \in \mathbb{R}^{N \times D}.
Let w \in \mathbb{R}^D be weights.
A model of Bayesian linear regression is,
\begin{align*}
p(y, w|X) &= p(y|X, w) p(w) \\
&= \mathcal{N}(y | X w, \sigma_y^2 I_N) \mathcal{N}(w | 0, \sigma_w^2 I_D)
\end{align*}
The posterior is
\begin{align*}
p(w|X, y) &= \frac{p(X, y, w)}{p(X, y)} \\
&= \frac{p(X, y | w) p(w)}{p(X, y)} \\
&= \frac{p(y | X, w) p(X) p(w)}{p(y|X) p(X)} \\
&= \frac{p(y|X,w) p(w)}{p(y|X)} \\
&\propto p(y|X,w) p(w) \\
&= \mathcal{N}(y | X w, \sigma_y^2 I_N) \mathcal{N}(w | 0, \sigma_w^2 I_D) \\
&= \frac{1}{\sqrt{(2 \pi)^N |\sigma_y^2 I_N|}} \exp \left( -\frac{1}{2 \sigma_y^2} (y - X w)^\top (y - X w) \right)
\frac{1}{\sqrt{(2 \pi)^D |\sigma_w^2 I_D|}} \exp \left( -\frac{1}{2 \sigma_w^2} w^\top w \right)
\end{align*}
\begin{align*}
\ln p(w|y, X)
&\propto -\frac{1}{2 \sigma_y^2} (y - X w)^\top (y - X w) -\frac{1}{2 \sigma_w^2} w^\top w \\
&\propto -\frac{1}{2} \left\{
\frac{1}{\sigma_y^2} \left( y^\top y -2 y^\top X w + w^\top X^\top X w \right)
+ \frac{1}{\sigma_w^2} w^\top w
\right\} \\
&\propto -\frac{1}{2} \left\{
w^\top (\frac{1}{\sigma_y^2} X^\top X + \frac{1}{\sigma_w^2} I_D) w - 2 \frac{1}{\sigma_y^2} y^\top X w
\right\}
\end{align*}
Therefore,
\begin{align*}
p(w|X, y) &= \mathcal{N}(w| \mu_w, \Sigma_w) \,, \mu_w \in \mathbb{R}^D \,, \Sigma_w \in \mathbb{R}^{D \times D}\\
\Sigma_w^{-1} &= \frac{1}{\sigma_y^2} X^\top X + \frac{1}{\sigma_w^2} I_D \\
\mu_w^\top \Sigma_w^{-1} &= \frac{1}{\sigma_y^2} y^\top X
\iff \mu_w^\top = \frac{1}{\sigma_y^2} y^\top X \Sigma_w
\iff \mu_w = \frac{1}{\sigma_y^2} \Sigma_w X^\top y
\end{align*}
Let X_* \in \mathbb{R}^{M \times D} be new data points and y_* \in \mathbb{R}^M be predictions.
Let's derive the predictive distribution p(y_*|X_*).
\begin{align*}
p(y_* | X_*, X, y) &= \int p(y_*, w | X_*, X, y) dw \\
&= \int p(y_*|X_*, w) p(w|X, y) dw \\
&= \int \mathcal{N}(y_* | X_* w, \sigma_y^2 I_M) \mathcal{N}(w|\mu_w, \Sigma_w) dw \\
\end{align*}
Here is a useful formula.
\begin{align*}
p(x) &= \mathcal{N}(x| \mu_x, \Sigma_x) \\
p(y|x) &= \mathcal{N}(y | A x + b, \Sigma_{y|x}) \\
p(y) &= \int p(y | x) p(x) dx \\
&= \mathcal{N}(y|A \mu_x + b, \Sigma_{y|x} + A \Sigma_{x} A^\top)
\end{align*}
Let x=w , \mu_x = \mu_w , \Sigma_x = \Sigma_w , y=y_* , A = X_* , b=0 , \Sigma_{y|x} = \sigma_y^2 I_M and we have
\begin{align*}
p(y_* | X_*, X, y) &= \mathcal{N}(y_*|\mu_{y_*}, \Sigma_{y_*}) \\
\mu_{y_*} &= X_* \mu_w \\
\Sigma_{y_*} &= \sigma_y^2 I_M + X_* \Sigma_w X_*^\top
\end{align*}
Discussion