[GIST 2022 Spring Machine Learning and Deep Learning HW3] Gaussian Mixture Model: detailed derivation of the EM algorithm for GMM in the case of arbitrary covariance matrices.

This post contains the problem and my solution for Homework 3 of Machine Learning and Deep Learning, taught by Professor 손진희 at GIST in the 2022 Spring semester. There may be mistakes, so please read with that in mind; I will take the post down immediately if any issue arises.

Problem 1 Gaussian Mixture Model (100 Points)

A Gaussian mixture model is a family of distributions whose pdf has the following form:
\begin{align}
\text{gmm}(\mathbf{x}) = p(\mathbf{x}) = \sum_{k=1}^{K} \pi_k \mathcal{N}(\mathbf{x}|\mathbf{\mu}_k,\Sigma_k),
\end{align}
where $\mathcal{N}(\cdot|\mathbf{\mu},\Sigma)$ denotes the Gaussian pdf with mean $\mathbf{\mu}$ and covariance matrix $\Sigma$, and $\{\pi_1, \dots, \pi_K\}$ are mixing coefficients satisfying
\begin{align}
\pi_k = p(y=k), ~~\sum_{k=1}^K \pi_k=1,~~~\pi_k\ge0, ~~~k\in\{1,\dots,K\}.
\end{align}
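As a quick concrete illustration (not part of the problem statement), here is a minimal NumPy/SciPy sketch that evaluates this mixture density; the function and variable names are my own choices.

```python
import numpy as np
from scipy.stats import multivariate_normal

def gmm_pdf(X, pis, mus, Sigmas):
    """Evaluate p(x) = sum_k pi_k * N(x | mu_k, Sigma_k) for each row of X."""
    p = np.zeros(X.shape[0])
    for pi_k, mu_k, Sigma_k in zip(pis, mus, Sigmas):
        p += pi_k * multivariate_normal.pdf(X, mean=mu_k, cov=Sigma_k)
    return p

# Example: a two-component mixture in 2D
X = np.random.randn(5, 2)
print(gmm_pdf(X, pis=[0.3, 0.7], mus=[np.zeros(2), np.ones(2)], Sigmas=[np.eye(2), 2.0 * np.eye(2)]))
```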

– Do detailed derivations of EM algorithm for GMM, in the case of arbitrary covariance matrices.

[Hint1]
Derive the E-step and M-step of the EM algorithm for GMM as follows.

$\cdot$Compute the “responsibilities” $\gamma_{ik}=p(y_i=k|x_i,\theta)$ of each cluster with the current parameters.
$\cdot$Compute the complete data log-likelihood and its lower bound (the expected complete data log-likelihood) using these responsibilities.
$\cdot$Re-estimate the parameters.

[Hint2] You can find the derivatives of matrices and vectors in “The Matrix Cookbook” (https://www2.imm.dtu.dk/pubdb/edoc/imm3274.pdf).

[Hint3] In order to solve for $\pi$, you have to use a Lagrange multiplier to handle the above constraints.

[Hint4] You can use these properties of matrix derivatives.
\begin{eqnarray}
\frac{\partial\det(\mathbf{X})}{\partial\mathbf{X}}=\det(\mathbf{X})(\mathbf{X}^{-1})^T\\
\frac{\partial\mathbf{a}^T\mathbf{X}^{-1}\mathbf{b}}{\partial\mathbf{X}}=-\mathbf{X}^{-T}\mathbf{ab}^T\mathbf{X}^{-T}
\end{eqnarray}
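These two identities can be sanity-checked numerically with finite differences; the following sketch is purely illustrative and assumes nothing beyond NumPy.

```python
import numpy as np

rng = np.random.default_rng(0)
D = 3
X = rng.standard_normal((D, D)) + D * np.eye(D)        # a well-conditioned (non-symmetric) test matrix
a, b = rng.standard_normal(D), rng.standard_normal(D)
h = 1e-6

# Analytic gradients from the two identities above
grad_det = np.linalg.det(X) * np.linalg.inv(X).T
grad_quad = -np.linalg.inv(X).T @ np.outer(a, b) @ np.linalg.inv(X).T

# Finite-difference approximations, entry by entry
fd_det, fd_quad = np.zeros((D, D)), np.zeros((D, D))
for i in range(D):
    for j in range(D):
        E = np.zeros((D, D)); E[i, j] = h
        fd_det[i, j] = (np.linalg.det(X + E) - np.linalg.det(X)) / h
        fd_quad[i, j] = (a @ np.linalg.inv(X + E) @ b - a @ np.linalg.inv(X) @ b) / h

print(np.allclose(grad_det, fd_det, atol=1e-4), np.allclose(grad_quad, fd_quad, atol=1e-4))
```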

 

$\cdot$Initialize the means $\widehat{\mathbf{\mu}_k}$, covariances $\widehat{\Sigma_k}$ and mixing coefficients $\widehat{\pi_k}$\\
$\cdot$Let $\widehat{\theta}=(\widehat{\mathbf{\mu}},\widehat{\Sigma}, \widehat{\pi})$ and $\widehat{\mathbf{\mu}} = (\widehat{\mathbf{\mu}_1},…,\widehat{\mathbf{\mu}_K}),\widehat{\Sigma} = (\widehat{\Sigma_1},…,\widehat{\Sigma_K}),\widehat{\pi}=(\widehat{\pi_1},…,\widehat{\pi_K}) $\\
$\cdot$Evaluate the initial value of the marginal log-likelihood
\begin{align}
\log(p(\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N | \widehat{\theta})) = \log \left (\sum_{k_1,k_2,…,k_N}p(\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N,y_1=k_1,y_2=k_2,…,y_N=k_N|\widehat{\theta})\right)
\end{align}
$\cdot$Let $N$, $D$ be the number of data points and the dimension of $\mathbf{x}$, respectively.
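Since the data points are independent given $\widehat{\theta}$, the marginal log-likelihood above reduces to $\sum_{n=1}^N \log \sum_{k=1}^K \widehat{\pi_k}\,\mathcal{N}(\mathbf{x}_n|\widehat{\mathbf{\mu}_k},\widehat{\Sigma_k})$, which is the form one would actually compute. A minimal initialization and log-likelihood sketch follows (random initialization is just one convenient choice, and all names here are my own, not from the assignment).

```python
import numpy as np
from scipy.stats import multivariate_normal

def init_params(X, K, seed=0):
    """Random initialization of the K means, covariances, and mixing coefficients."""
    rng = np.random.default_rng(seed)
    N, D = X.shape
    mus = X[rng.choice(N, size=K, replace=False)]             # K distinct data points as initial means
    Sigmas = np.stack([np.cov(X.T) + 1e-6 * np.eye(D)] * K)   # shared empirical covariance, slightly regularized
    pis = np.full(K, 1.0 / K)                                 # uniform mixing coefficients
    return mus, Sigmas, pis

def log_likelihood(X, mus, Sigmas, pis):
    """Marginal log-likelihood: sum_n log sum_k pi_k * N(x_n | mu_k, Sigma_k)."""
    dens = np.column_stack([
        pi_k * multivariate_normal.pdf(X, mean=mu_k, cov=Sigma_k)
        for mu_k, Sigma_k, pi_k in zip(mus, Sigmas, pis)
    ])
    return np.sum(np.log(dens.sum(axis=1)))
```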

E-Step

$\cdot$Responsibilities
\begin{align}
\gamma_{nk} = p(y_n = k |\mathbf{x}_n,\widehat{\theta}) = \frac{p(\mathbf{x}_n|y_n=k,\widehat{\theta})p(y_n=k|\widehat{\theta})}{\sum_{l=1}^K p(\mathbf{x}_n | y_n=l,\widehat{\theta}) p(y_n=l|\widehat{\theta})}=\frac{\widehat{\pi_k} \mathcal{N}(\mathbf{x}_n|\widehat{\mathbf{\mu}_k},\widehat{\Sigma_k})}{\sum_{l=1}^{K}\widehat{\pi_l} \mathcal{N}(\mathbf{x}_n|\widehat{\mathbf{\mu}_l},\widehat{\Sigma_l})}
\end{align}
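In code, this E-step is a direct translation; the sketch below reuses the imports and helper names from the snippets above (my own naming, not part of the assignment).

```python
def e_step(X, mus, Sigmas, pis):
    """Responsibilities gamma[n, k] = p(y_n = k | x_n, theta_hat)."""
    weighted = np.column_stack([
        pi_k * multivariate_normal.pdf(X, mean=mu_k, cov=Sigma_k)   # pi_k * N(x_n | mu_k, Sigma_k)
        for mu_k, Sigma_k, pi_k in zip(mus, Sigmas, pis)
    ])                                                              # shape (N, K)
    return weighted / weighted.sum(axis=1, keepdims=True)           # normalize over k
```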
$\cdot$Complete data log-likelihood\\
Let $\theta=(\mathbf{\mu},\Sigma,\pi)$ be an arbitrary parameter, where ${\mathbf{\mu}} = ({\mathbf{\mu}_1},…,{\mathbf{\mu}_K}),{\Sigma} = ({\Sigma_1},…,{\Sigma_K}),{\pi}=({\pi_1},…,{\pi_K})$.
\begin{align*}
\log p(\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N,y_1=k_1,…,y_N=k_N|\theta) & = \log \prod_{n=1}^N p(\mathbf{x}_n, y_n=k_n |\theta)\\
&=\sum_{n=1}^N \log p(\mathbf{x}_n | y_n=k_n,\theta)p(y_n=k_n|\theta)\\
&=\sum_{n=1}^N a_n(k_n,\theta)
\end{align*}
where
\begin{align}
a_n(k,\theta) = -\frac{D}{2}\log 2\pi - \frac{1}{2} \log \det \Sigma_k - \frac{1}{2}(\mathbf{x}_n-\mu_k)^T \Sigma_k^{-1}(\mathbf{x}_n-\mu_k)+\log \pi_k
\end{align}
$\cdot$Expected complete data log-likelihood\\
Take $q$ to be the posterior over the latent assignments under the current parameters $\widehat{\theta}$, which factorizes into the responsibilities,
\begin{align}
q(y_1=k_1,…,y_N=k_N) = \prod_{n=1}^N \gamma_{nk_n},
\end{align}
because
\begin{align*}
q(y_1=k_1,…,y_N=k_N) &= p(y_1=k_1,…,y_N=k_N|\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N,\widehat{\theta})\\
&= \frac{p(\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N,y_1=k_1,…,y_N=k_N|\widehat{\theta})}{p(\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N|\widehat{\theta})}\\
&=\frac{\prod_{n=1}^N p(\mathbf{x}_n,y_n=k_n | \widehat{\theta})}{\prod_{n=1}^N p(\mathbf{x}_n| \widehat{\theta})}\\
&=\prod_{n=1}^N\frac{ p(\mathbf{x}_n,y_n=k_n | \widehat{\theta})}{ p(\mathbf{x}_n| \widehat{\theta})}\\
&= \prod_{n=1}^N p(y_n=k_n | \mathbf{x}_n,\widehat{\theta})\\
&= \prod_{n=1}^N \gamma_{nk_n}
\end{align*}
\begin{align}
L(q, \theta) = E_{q(Y_1,…,Y_N)}\left[\log \frac{p(\mathbf{x}_1,\mathbf{x}_2,…,\mathbf{x}_N,Y_1,Y_2,…,Y_N|\theta)}{q(Y_1,…,Y_N)} \right]
\end{align}

\begin{align*}
L(q, \theta) &= E_{q(Y_1,…,Y_N)}\left[\sum_{n=1}^N(a_n(Y_n,\theta) -\log \gamma_{nY_n})\right]\\
&=\sum_{n=1}^N E_{q(Y_1,…,Y_N)}\left[ a_n(Y_n,\theta)-\log \gamma_{nY_n}\right]\\
&=\sum_{n=1}^N\sum_{k=1}^K\gamma_{nk}\left(a_n(k,\theta) - \log \gamma_{nk}\right)
\end{align*}
\begin{align}
L(q,\theta)= \sum_{n=1}^{N} \left(\sum_{k=1}^K \gamma_{nk} \left( -\frac{D}{2}\log 2\pi - \frac{1}{2} \log \det \Sigma_k - \frac{1}{2}(\mathbf{x}_n-\mu_k)^T \Sigma_k^{-1}(\mathbf{x}_n-\mu_k)+\log \pi_k - \log \gamma_{nk}\right) \right)
\end{align}
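As a side note, evaluating $L(q,\theta)$ directly is a useful debugging check, since it must not decrease across EM iterations. Here is a small sketch (a hypothetical helper, not required by the problem); it uses the grouping $a_n(k,\theta)=\log\pi_k+\log\mathcal{N}(\mathbf{x}_n|\mathbf{\mu}_k,\Sigma_k)$.

```python
import numpy as np
from scipy.stats import multivariate_normal

def lower_bound(X, gamma, mus, Sigmas, pis, eps=1e-12):
    """L(q, theta) = sum_n sum_k gamma_nk * (log pi_k + log N(x_n | mu_k, Sigma_k) - log gamma_nk)."""
    L = 0.0
    for k in range(gamma.shape[1]):
        log_norm = multivariate_normal.logpdf(X, mean=mus[k], cov=Sigmas[k])   # log N(x_n | mu_k, Sigma_k)
        L += np.sum(gamma[:, k] * (np.log(pis[k]) + log_norm - np.log(gamma[:, k] + eps)))
    return L
```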

M-Step

\begin{align*}
\frac{\partial L}{\partial \mathbf{\mu}_k} = -\frac{1}{2}\sum_{n=1}^{N}\gamma_{nk} \frac{\partial }{\partial \mathbf{\mu}_k} (\mathbf{x}_n-\mathbf{\mu}_k)^T \Sigma_k^{-1}(\mathbf{x}_n-\mathbf{\mu}_k)=-\sum_{n=1}^N\gamma_{nk}\Sigma_k^{-1}(\mathbf{\mu}_k-\mathbf{x}_n)
\end{align*}
\begin{align}
\frac{\partial L}{\partial \mathbf{\mu}_k} =0 \to \sum_{n=1}^N(\mathbf{\mu}_k-\mathbf{x}_n)\gamma_{nk} = 0 \to \mathbf{\mu}_{k_{new}} = \frac{\sum_{n=1}^N\gamma_{nk}\mathbf{x}_n}{\sum_{n=1}^N\gamma_{nk}}
\end{align}

From the fact $\Sigma_k$ is symmetric,
\begin{align*}
\frac{\partial L}{\partial \Sigma_k} &= -\frac{1}{2} \sum_{n=1}^N\frac{\partial }{\partial \Sigma_k}\left(\gamma_{nk} \left( \log \det \Sigma_k +(\mathbf{x}_n-\mu_k)^T \Sigma_k^{-1}(\mathbf{x}_n-\mu_k)\right) \right)\\
&= -\frac{1}{2}\sum_{n=1}^N\left(\gamma_{nk} \left( \frac{1}{\det \Sigma_k}\det (\Sigma_k)\Sigma_k^{-1}- \Sigma_k^{-1}(\mathbf{x}_n-\mathbf{\mu}_k)(\mathbf{x}_n-\mathbf{\mu}_k)^T\Sigma_k^{-1} \right)\right)\\
&=-\frac{1}{2}\left(\sum_{n=1}^N\gamma_{nk}\right) \Sigma_k^{-1}+\frac{1}{2}\Sigma_k^{-1}\left(\sum_{n=1}^N \gamma_{nk}(\mathbf{x}_n-\mathbf{\mu}_k)(\mathbf{x}_n-\mathbf{\mu}_k)^T\right)\Sigma_k^{-1}
\end{align*}
\begin{align}
\frac{\partial L}{\partial \Sigma_k} = 0 \to \Sigma_k = \frac{\sum_{n=1}^N\gamma_{nk}(\mathbf{x}_n-\mathbf{\mu}_k)(\mathbf{x}_n-\mathbf{\mu}_k)^T}{\sum_{n=1}^N \gamma_{nk}} \to \Sigma_{k_{new}} = \frac{\sum_{n=1}^N\gamma_{nk}(\mathbf{x}_n-\mathbf{\mu}_{k_{new}})(\mathbf{x}_n-\mathbf{\mu}_{k_{new}})^T}{\sum_{n=1}^N \gamma_{nk}}
\end{align}
Let $g(\pi)=\sum_{k=1}^K \pi_k$. By the definition of $\pi_k$, $g(\pi)=1$. Consider the Lagrange multiplier method below:
\begin{align*}
&\frac{\partial L}{\partial \pi} = \lambda \frac{\partial g}{\partial \pi}\\
&\text{subject to } g(\pi)=1
\end{align*}
\begin{align}
\frac{\partial L}{\partial \pi_k}=\lambda \frac{\partial g}{\partial \pi_k} \to \sum_{n=1}^N \frac{\gamma_{nk}}{\pi_k} = \lambda\to\pi_k = \frac{\sum_{n=1}^N \gamma_{nk}}{\lambda} \to 1=\sum_{k=1}^K \pi_k = \frac{1}{\lambda} \sum_{n=1}^{N}\sum_{k=1}^K \gamma_{nk}=\frac{N}{\lambda} \to \lambda = N \to \pi_{k_{new}} = \frac{\sum_{n=1}^N\gamma_{nk}}{N}
\end{align}
Updated parameters are obtained as below.
\begin{align}
&\mathbf{\mu}_{k_{new}} = \frac{\sum_{n=1}^N\gamma_{nk}\mathbf{x}_n}{\sum_{n=1}^N\gamma_{nk}}\\
& \Sigma_{k_{new}} = \frac{\sum_{n=1}^N\gamma_{nk}(\mathbf{x}_n-\mathbf{\mu}_{k_{new}})(\mathbf{x}_n-\mathbf{\mu}_{k_{new}})^T}{\sum_{n=1}^N \gamma_{nk}}\\
&\pi_{k_{new}} = \frac{\sum_{n=1}^N\gamma_{nk}}{N}
\end{align}
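Put together, the three update equations become a short M-step in code (again a sketch with my own naming, following the earlier snippets; `gamma` is the $N\times K$ array of responsibilities).

```python
import numpy as np

def m_step(X, gamma):
    """Re-estimate (mu_k, Sigma_k, pi_k) from data X of shape (N, D) and responsibilities gamma of shape (N, K)."""
    N, D = X.shape
    Nk = gamma.sum(axis=0)                        # sum_n gamma_nk, the effective cluster sizes
    mus = (gamma.T @ X) / Nk[:, None]             # mu_k = sum_n gamma_nk x_n / Nk
    Sigmas = np.empty((gamma.shape[1], D, D))
    for k in range(gamma.shape[1]):
        diff = X - mus[k]                         # (x_n - mu_k), row-wise
        Sigmas[k] = (gamma[:, k, None] * diff).T @ diff / Nk[k]
    pis = Nk / N                                  # pi_k = Nk / N
    return mus, Sigmas, pis
```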
Evaluate the log-likelihood
\begin{align*}
\log p(\mathbf{x}_1,…,\mathbf{x}_N|\mathbf{\mu}_{new},\Sigma_{new},\pi_{new})= \sum_{n=1}^N \log\left\{\sum_{k=1}^K\pi_{k_{new}} \mathcal{N}(\mathbf{x}_n | \mathbf{\mu}_{k_{new}}, \Sigma_{k_{new}}) \right\}
\end{align*}
If the parameters and the log-likelihood have not converged, return to the E-step.
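A minimal driver that loops the two steps until the log-likelihood stops improving might look as follows, assuming the `init_params`, `log_likelihood`, `e_step`, and `m_step` helpers sketched above.

```python
def fit_gmm(X, K, max_iter=200, tol=1e-6, seed=0):
    """Run EM until the marginal log-likelihood improves by less than tol."""
    mus, Sigmas, pis = init_params(X, K, seed=seed)
    prev_ll = log_likelihood(X, mus, Sigmas, pis)
    for _ in range(max_iter):
        gamma = e_step(X, mus, Sigmas, pis)       # E-step: responsibilities under current parameters
        mus, Sigmas, pis = m_step(X, gamma)       # M-step: re-estimate parameters
        ll = log_likelihood(X, mus, Sigmas, pis)
        if abs(ll - prev_ll) < tol:               # convergence check on the log-likelihood
            break
        prev_ll = ll
    return mus, Sigmas, pis
```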
