From 2ce8baccede43f3cacba6ca530c9ff167efc7062 Mon Sep 17 00:00:00 2001
From: Tran Le <45803288+tranktle@users.noreply.github.com>
Date: Sun, 3 Apr 2022 16:56:01 -0500
Subject: [PATCH] Add files via upload

---
 02_mixture_3Gaussian_1d.html | 1812 ++++++++++++++++++++++++++++++++
 Note_3G_1D.html              | 1917 ++++++++++++++++++++++++++++++++++
 2 files changed, 3729 insertions(+)
 create mode 100644 02_mixture_3Gaussian_1d.html
 create mode 100644 Note_3G_1D.html

diff --git a/02_mixture_3Gaussian_1d.html b/02_mixture_3Gaussian_1d.html
new file mode 100644
index 0000000..d942224
--- /dev/null
+++ b/02_mixture_3Gaussian_1d.html
@@ -0,0 +1,1812 @@

01_ Mixture 3 Gaussian_1 Dimension
1 Introduction

+

Here is the R code that I use to illustrate a simulation example of the EM algorithm for a Gaussian mixture model with 3 components in 1D. The EM formula derivation and the pseudo-code can be found here.
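The utility functions below lean on a few packages that I assume are loaded elsewhere in the project (this setup sketch is my addition, not part of the original file): tidyverse for the pipe, ggplot2, purrr::map(), stringr::str_sub(), and readr::write_rds(); cowplot for plot_grid(); and janitor for tabyl().

library(tidyverse)   # dplyr pipes, ggplot2, purrr::map(), stringr::str_sub(), readr::write_rds()
library(cowplot)     # plot_grid() for the side-by-side histograms
library(janitor)     # tabyl() for the error-rate tables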

+
+
+

2 Utility functions

+
    +
  • Functions to create and plot the synthetic data.
+
creat_3G_dat_f <- function(n, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3){
+   # Create synthetic data
+   # n: the total sample size across the three distributions
+   # p1, p2: the probabilities that a data point belongs to the 1st and 2nd distributions
+   # p1 + p2 < 1, since p3 = 1 - (p1 + p2) is the proportion of the 3rd distribution
+   n1 <- floor(n*p1); Y1 <- rnorm(n1, mu1, sig1)
+   n2 <- floor(n*p2); Y2 <- rnorm(n2, mu2, sig2)
+   n3 <- ceiling(n*(1-p1-p2)); Y3 <- rnorm(n3, mu3, sig3)
+   data <- data.frame(
+      dist = c( rep("1", n1), rep("2", n2) , rep("3", n3)),
+      value = c( Y1, Y2, Y3))
+   return(data)
+}
+
+plot_3G_syn_dat_f <- function(data){
+   # data: a data set created by creat_3G_dat_f
+   p1 <- data %>%
+      ggplot(aes(x=value, fill=dist), addDensityCurve=TRUE) +
+      geom_histogram( color="#e9ecef", alpha=0.6, 
+                      position = 'identity', binwidth=1) +
+      scale_fill_manual(values=c("#69b3a2", "#404080", "#80427d"))
+   p2 <- data %>%
+      ggplot(aes(x=value)) +
+      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=1) 
+   theplot <- plot_grid(p1, p2, labels=c("Three data components","Mixture data"),
+                        ncol = 2, nrow = 1)
+   return(theplot)
+}
+
    +
  • Function to compute the posterior probabilities.
+
com_3G_pi_f <-function(y, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3){
+   # compute the posterior probabilities associated with each y_i
+   f1 <- dnorm(y, mean = mu1, sd = sig1)
+   f2 <- dnorm(y, mean = mu2, sd = sig2)
+   f3 <- dnorm(y, mean = mu3, sd = sig3)
+   denorm <- (p1*f1)+(p2*f2)+((1-p1-p2)*f3)
+   pi_i1 <- (p1*f1)/denorm
+   pi_i2 <- (p2*f2)/denorm
+   return(c(pi_i1, pi_i2, 1-pi_i1-pi_i2))
+}
+
    +
  • The main EM function
+
EM_Three_gaussian_f <-function(Y, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3, eps, max_iter, verbose) { 
+   # Function to estimate parameters based on initial guesses
+   # p1, mu1, sig1, p2, mu2, sig2, mu3, sig3: INITIAL  parameter values 
+   # eps: stopping criterion threshold
+   # max_iter: maximum number of EM iterations
+   # verbose = TRUE if you want to see the updated estimates at each iteration step
+   n <- length(Y)
+   for (i in (1:max_iter)){
+      # E-step: compute the posterior probabilities pi_ij from the step-(s) parameters
+      pij  <- map(Y, ~com_3G_pi_f(.x, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3))
+      pij <- as.data.frame(do.call(rbind, pij)) 
+      p1_new <- sum(pij$V1)/n
+      p2_new <- sum(pij$V2)/n
+      # M-step: weighted means and standard deviations (note that the previous means mu1, mu2, mu3 are used in the sigma updates below)
+      mu1_new <- sum(Y*pij$V1)/sum(pij$V1)
+      sig1_new <- sqrt(sum(((Y-mu1)^2)*pij$V1)/sum(pij$V1))
+      mu2_new <- sum(Y*pij$V2)/sum(pij$V2)
+      sig2_new <- sqrt(sum(((Y-mu2)^2)*pij$V2)/sum(pij$V2))
+      mu3_new <- sum(Y*pij$V3)/sum(pij$V3)
+      sig3_new <- sqrt(sum(((Y-mu3)^2)*pij$V3)/sum(pij$V3))
+      if(verbose){cat("\n at step ", i, ": ", 
+                      mu1_new, sig1_new, mu2_new,sig2_new, mu3_new, sig3_new)}
+      # stopping criteria
+      if (max(abs(p1_new-p1), abs(p2_new -p2), 
+              abs(mu1_new -mu1), abs(mu2_new-mu2), abs(mu3_new-mu3))< eps){break}
+      # Update
+      p1 <- p1_new; mu1 <- mu1_new; sig1 <- sig1_new
+      p2 <- p2_new; mu2 <- mu2_new; sig2 <- sig2_new
+      mu3 <- mu3_new; sig3 <- sig3_new
+   } # end for loop
+   if (i==max_iter){cat("\n The stopping criterion was not met \n")
+   } else {cat("\n The EM converged at iteration", i, '\n')}
+   return(list(pij=pij, 
+               p1=p1_new, mu1=mu1_new, sig1=sig1_new, 
+               p2=p2_new, mu2=mu2_new, sig2=sig2_new, 
+               p3= 1- p1_new-p2_new, mu3=mu3_new, sig3=sig3_new))
+}
+
    +
  • A function that assigns each data point to the component with the highest posterior probability and compares the EM clustering with the true labels.
+
create_comp_dat_3G_1D_f <- function(data, pij){
+   # Output: comp_dat_2D contains the original label (dist) and the EM cluster prediction (pred)
+   pij$Max<-colnames(pij)[apply(pij,1,which.max)]
+   pij <- pij %>% mutate(Max_col = str_sub(Max, start = 2, end = 2))
+   comp_dat_2D <- cbind(data, pred = pij$Max_col)
+   comp_dat_2D <- comp_dat_2D %>% mutate(err = as.integer(dist)-as.integer(pred))
+   return(comp_dat_2D)
+}
+
    +
  • Function to plot histograms of the original data components and of the components predicted by EM clustering.
+
plot_1D_org_pre_dat_f <- function(comp_dat){
+   # Plot the original components and the components predicted by the EM algorithm
+   p1 <- comp_dat %>%
+      ggplot(aes(x=value, fill=dist), addDensityCurve=TRUE) +
+      geom_histogram( color="#e9ecef", alpha=0.6,
+                      position = 'identity', binwidth=1) +
+      scale_fill_manual(values=c("#69b3a2", "#404080", "#80427d"))
+   p2 <- comp_dat %>%
+      ggplot(aes(x=value, fill=pred)) +
+      geom_histogram( color="#e9ecef", alpha=0.6, 
+                      position = 'identity', binwidth=1) 
+   theplot <- plot_grid(p1, p2, 
+                        labels=c("Original data components",
+                                 "Predicted data components"), 
+                        ncol = 2, nrow = 1)
+   return(theplot)
+}
+
+
+

3 Example

+
# EXAMPLE 1: Set up the model ---------------------------------------
+set.seed(538725)
+# True parameter values
+Tv <- data.frame(p1=0.5, mu1=50, sig1=3,
+                 p2=0.2, mu2=80, sig2=4,
+                 mu3=85, sig3=3)
+data <- creat_3G_dat_f(n=1000, 
+                       p1=Tv$p1, mu1=Tv$mu1, sig1=Tv$sig1, 
+                       p2=Tv$p2, mu2=Tv$mu2, sig2=Tv$sig2, 
+                       mu3=Tv$mu3, sig3=Tv$sig3)
+# png("./EX_Result/EX1_3G_1d_data.png");
+plot_3G_syn_dat_f(data)
+

+
# ;dev.off()
+# Initial parameter values
+
+

3.1 If we choose the correct number of clusters

+
Iv <-  data.frame(p1=0.3, mu1=50, sig1=sd(data$value)/3, 
+                 p2=0.3, mu2=75, sig2=sd(data$value)/3, 
+                 mu3=89, sig3=sd(data$value)/3)
+# Solve the problem using EM
+EX3 <- EM_Three_gaussian_f(Y=data$value, 
+                           p1=Iv$p1, mu1=Iv$mu1, sig1=sd(data$value)/3,
+                           p2=Iv$p2, mu2=Iv$mu2, sig2=sd(data$value)/3, 
+                           mu3=Iv$mu3, sig3=sd(data$value)/3, 
+                           eps=0.02, max_iter=1000, verbose=F) 
+
## 
+##  The EM converged at iteration 20
+
# Get the comparison data
+EX3_pij <- as.data.frame(EX3$pij)
+EX3_comp_dat <- create_comp_dat_3G_1D_f(data, EX3_pij)
+# png("./EX_Result/EX1_3G_1d_orgpre.png");
+plot_1D_org_pre_dat_f(EX3_comp_dat)
+

+
# ;dev.off()
+EX3_error <- tabyl(EX3_comp_dat$err) %>% mutate(error = `EX3_comp_dat$err`) %>% 
+   select(c("error", "n", "percent"))
+# get the parameter estimates
+EX3 <- as.data.frame(EX3[c(2:10)]) 
+save(Tv, Iv, EX3, EX3_error, file= "./EX_Result/EX3.RData")
+
+
+

3.2 If we choose a wrong number of clusters

+
# If we use the number of clusters k = 2
+source("01_Mixture_2Gaussian_1d.R")
+EX3_2 <- EM_two_gaussian_f(mu1_0 = 50, sig1_0= 10, 
+                           mu2_0=85, sig2_0=10, 
+                           p_0=0.5, Y= data$value, 
+                           max_iter=1000, 
+                           eps=0.01, dist_thres_prob= 0.5, verbose=F) %>%
+   as.data.frame()
+U_vec <- com_U_vec_f(dist_thres_prob=0.5, EX3_2$p, Y=data$value, 
+                     mu1=EX3_2$mu1, sig1=EX3_2$sig1, 
+                     mu2=EX3_2$mu2, sig2=EX3_2$sig2)
+EX3_2_comdat <- cbind(data, U_vec) %>% 
+   mutate(dist = as.numeric(dist)) %>% 
+   mutate(err= dist-U_vec)
+EX3_2_comdat_err <- tabyl(EX3_2_comdat$err)
+EX3_2
+
##   i   p      mu1     sig1      mu2     sig2
+## 1 4 0.5 49.94347 3.073976 82.99561 4.113227
+
EX3_2_comdat_err
+
##  EX3_2_comdat$err   n percent
+##                 0 700     0.7
+##                 1 300     0.3
+
# write_rds(EX3_2, "./EX_Result/EX3_2.rds")
+# write_rds(EX3_2_comdat_err, "./EX_Result/EX3_2_comdat_err.rds")
+
plot_org_pre_wrongk_dat_f <- function(comp_dat){
+   # Plot the original and the predicted data from the EM algorithm
+   # when we choose a WRONG number of clusters
+   comp_dat <- comp_dat %>% mutate(across(c("dist", "U_vec"), as.factor)) %>% 
+      rename(pre_dist = U_vec)
+   p1 <- comp_dat %>%
+      ggplot(aes(x=value, fill=dist), addDensityCurve=TRUE) +
+      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=1) +
+      scale_fill_manual(values=c("#69b3a2", "#404080", "#80427d"))
+   p2 <- comp_dat %>%
+      ggplot(aes(x=value, fill=pre_dist), addDensityCurve=TRUE) +
+      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=1) +
+      scale_fill_manual(values=c("#69b3a2", "#404080")) 
+   theplot <- plot_grid(p1, p2, 
+                        labels=c("Original data",
+                                 "Predicted data"), ncol = 2, nrow = 1)
+   return(theplot)
+}
+
+# png("./EX_Result/EX3_wrong_k.png");
+plot_org_pre_wrongk_dat_f(EX3_2_comdat)
+

+
# ;dev.off()
+
+
diff --git a/Note_3G_1D.html b/Note_3G_1D.html
new file mode 100644
index 0000000..7905570
--- /dev/null
+++ b/Note_3G_1D.html
@@ -0,0 +1,1917 @@

Note_kGaussian
1 Introduction

+

This is the note that I use for learning about the EM algorithm. Other sections (an EM introduction, R code, a more detailed derivation of the E and M steps, and EM for mixtures of \(k=2\) Gaussian distributions in 1D and 2D) can be found [here].

+
+
+

2 Mixtures of a fixed number \(k\geq 2\) of Gaussians with unknown means and variances

+

We now consider the case of a mixture of \(k\geq 2\) Normal distributions with unknown parameters in \(1\) dimension.

+

Let \(Y_i, i=\overline{1,n}\) be observed data drawn from a mixture of \(k\) Normal distributions with unknown parameters in 1 dimension. We also do not know which distribution each data point belongs to.

+

\[\begin{align} +f(y) &= \sum_{j=1}^k p_j f_j(y|\mu_j, \sigma_j^2), \text{ where } \sum_{j=1}^k p_j=1\\ +&\quad f_j \equiv N(\mu_j, \sigma_j^2), \text{ where } \mu_j, \sigma_j^2 \text{ unknown } +\end{align}\]
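As a quick illustration (my own sketch, not part of the original note; the function name dmix_f and its vectorised arguments are hypothetical), this mixture density can be evaluated in R as:

dmix_f <- function(y, p, mu, sigma){
   # y: points at which to evaluate the density
   # p, mu, sigma: length-k vectors of mixing proportions, means, and standard deviations
   # returns sum_j p_j * dnorm(y, mu_j, sigma_j), evaluated elementwise in y
   Reduce(`+`, lapply(seq_along(p), function(j) p[j] * dnorm(y, mu[j], sigma[j])))
}
# e.g. dmix_f(c(50, 80), p = c(0.5, 0.2, 0.3), mu = c(50, 80, 85), sigma = c(3, 4, 3))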

+

We want to use the EM algorithm to estimate \(\boldsymbol{\theta}= (\boldsymbol{p}=(p_1, ..., p_k), \mu_j, \sigma_j^2, \forall j=\overline{1,k})\).

+

Let \(U_i\) be the label of the distribution to which the data point \(Y_i\) belongs. Then the complete-data density of one observation is

+

\[ +f(y_i, u_i|\boldsymbol{\theta})= \prod_{j=1}^k [p_jf_j(y_i)]^{I(u_i=j)} +\]

+
+

2.1 E step

+

We want to compute the \(Q\) function

+

\[\begin{align} +Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)} ) &= E_{Y_{miss}}\left[ln\left(f(Y_{obs}, Y_{miss}|\boldsymbol{\theta})\right) | Y_{obs}, \boldsymbol{\theta}^{(s)} \right]\\ +&= E_{U}\left[ln\left(f(\boldsymbol{Y},\boldsymbol{U}|\boldsymbol{\theta})\right) | \boldsymbol{Y}, \boldsymbol{\theta}^{(s)} \right] +\end{align}\]

+
    +
  • The complete data likelihood
+

\[\begin{align} +f(\boldsymbol{Y},\boldsymbol{U}|\boldsymbol{\theta})&= \prod_{i=1}^n f(y_i, u_i|\boldsymbol{\theta}) = \prod_{i=1}^n \prod_{j=1}^k [p_jf_j(y_i)]^{I(u_i=j)} +\end{align}\]

+
    +
  • The complete data log-likelihood:
+

\[\begin{align} +ln(f(\boldsymbol{Y},\boldsymbol{U}|\boldsymbol{\theta}))&= \sum_{i=1}^n \sum_{j=1}^k I(U_i=j) ln(p_jf_j(y_i))\\ +&= \sum_{i=1}^n \sum_{j=1}^k I(U_i=j) [ln(p_j) + ln(f_j(y_i))] +\end{align}\]

+
    +
  • The \(Q\) function
+

\[\begin{align*} +Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)}) &= E_{U}\left[ln\left(f(\boldsymbol{Y},\boldsymbol{U}|\boldsymbol{\theta})\right) | \boldsymbol{Y}, \boldsymbol{\theta}^{(s)} \right]\\ +&= E_U\left[\sum_{i=1}^n \sum_{j=1}^k I(U_i=j) [ln(p_j) + ln(f_j(y_i))] \Big| \boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right]\\ +&= \sum_{i=1}^n \sum_{j=1}^k ln(p_j) E_U\left[I(U_i=j)|\boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right]\\ + & \quad + \sum_{i=1}^n \sum_{j=1}^k ln(f_j(y_i)) E_U\left[I(U_i=j)|\boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right]\\ +\end{align*}\]

+
+

Put

+

\[ +\pi_{ij}^{(s)}:= E_U\left[I(U_i=j)|\boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right] = P(U_i=j| \boldsymbol{Y}, \boldsymbol{\theta}^{(s)}) +\] +By Bayes' rule, the posterior probability \(\pi_{ij}^{(s)}\) is +\[\begin{align} +\pi_{ij}^{(s)} = P(U_i=j| \boldsymbol{Y}, \boldsymbol{\theta}^{(s)}) = +\frac{P(U_i=j|\boldsymbol{\theta}^{(s)})f_j(y_i)}{\sum_{l=1}^k P(U_i=l|\boldsymbol{\theta}^{(s)})f_l(y_i)} = +\frac{p_j^{(s)} f_j(y_i)}{\sum_{l=1}^k p_l^{(s)} f_l(y_i)} +\end{align}\]
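For a general \(k\), this E-step has a compact vectorised form in R. The sketch below is my own illustration (e_step_f and its arguments are hypothetical names, not the code used in the companion script):

e_step_f <- function(Y, p, mu, sigma){
   # Y: numeric vector of n observations; p, mu, sigma: length-k parameter vectors
   # returns an n x k matrix whose (i, j) entry is pi_ij = p_j f_j(y_i) / sum_l p_l f_l(y_i)
   dens <- sapply(seq_along(p), function(j) p[j] * dnorm(Y, mu[j], sigma[j]))
   dens <- matrix(dens, nrow = length(Y))   # keep a matrix even when length(Y) == 1
   dens / rowSums(dens)                     # divide each row by its sum
}

Each row of the returned matrix sums to one, a fact used repeatedly in the M-step derivation below.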

+
+
    +
  • We have the Q function
+

\[\begin{align*} +Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)}) &= \sum_{i=1}^n \sum_{j=1}^k ln(p_j) \pi_{ij}^{(s)} + \sum_{i=1}^n \sum_{j=1}^k ln(f_j(y_i)) \pi_{ij}^{(s)}\\ +&= \sum_{i=1}^n \sum_{j=1}^k \pi_{ij}^{(s)} ln(p_j)+ \\ +& \quad +\sum_{i=1}^n \sum_{j=1}^k \pi_{ij}^{(s)} \left(-\frac{1}{2}ln(2\pi)-\frac{1}{2}ln(\sigma_j^2)-\frac{1}{2\sigma_j^2}(y_i-\mu_j)^2\right) +\end{align*}\]

+
+
+

2.2 M step

+

We now want to find the \(\boldsymbol{\theta}= (\boldsymbol{p}=(p_1, ..., p_k), \mu_j, \sigma_j^2, \forall j=\overline{1,k})\) that maximizes the \(Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)})\) function.

+
    +
  • First, we want to find the \(\hat{p}_j\)
+

\[ +\frac{\partial Q}{\partial p_j} = \sum_{i=1}^n \pi_{ij} \frac{1}{p_j} +\]

+

Setting \(\frac{\partial Q}{\partial p_j} = 0\) alone would not give us \(p_j\), because of the constraint \(\sum_{j=1}^k p_j = 1\). We construct the Lagrangian 1 by adding a Lagrange-multiplier term to the \(Q\) function

+

\[Q(\boldsymbol{p}, \boldsymbol{p}^{(s)}) = \sum_{j=1}^k ln(p_j) \sum_{i=1}^n \pi_{ij}+ \lambda \left(1-\sum_{j=1}^k p_j\right)\] +Setting \(\frac{\partial Q}{ \partial p_j} = 0\), we have

+

\[\begin{equation} +\frac{\partial Q}{ \partial p_j} = \frac{1}{p_j} \sum_{i=1}^n \pi_{ij}-\lambda=0 \Rightarrow p_j = \frac{\sum_{i=1}^n \pi_{ij}}{\lambda}\tag{2.1} +\end{equation}\]

+

Setting \(\frac{\partial Q}{\partial \lambda}=0\), we have

+

\[\begin{align*} +\frac{\partial Q}{\partial \lambda}=0 &\Rightarrow 1-\sum_{j=1}^k p_j =0\\ +& \Rightarrow 1- \sum_{j=1}^k \sum_{i=1}^n \frac{\pi_{ij}}{\lambda} = 0\\ +& \Rightarrow \lambda = \sum_{j=1}^k \sum_{i=1}^n \pi_{ij} = \sum_{i=1}^n \sum_{j=1}^k \pi_{ij} = \sum_{i=1}^n 1 = n, +\end{align*}\] +since the posterior probabilities of each observation sum to one.

+

Substituting \(\lambda=n\) back into (2.1), we have

+

\[\begin{equation} +\hat{p}_j = \frac{\sum_{i=1}^n\pi_{ij}^{(s)}}{n} +\end{equation}\]
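As a quick sanity check (not in the original derivation), these estimates automatically satisfy the constraint, again because the posterior probabilities of each observation sum to one:
\[
\sum_{j=1}^k \hat{p}_j = \frac{1}{n}\sum_{j=1}^k \sum_{i=1}^n \pi_{ij}^{(s)} = \frac{1}{n}\sum_{i=1}^n \sum_{j=1}^k \pi_{ij}^{(s)} = \frac{n}{n} = 1.
\]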

+
    +
  • Find \(\mu_j, \sigma_j^2, \forall j=\overline{1, k}\) that maximize \(Q\).
+

Setting the derivative of \(Q\) with respect to \(\mu_j\) equal to 0, we have

+

\[\begin{align} +& \frac{\partial Q}{\partial \mu_j}= \sum_{i=1}^n \pi_{ij}^{(s)}\frac{1}{\sigma_j^2}(y_i-\mu_j)=\frac{1}{\sigma_j^2}\left[\sum_{i=1}^n y_i \pi_{ij}^{(s)}- \mu_j \sum_{i=1}^n \pi_{ij}^{(s)}\right]=0\\ +& \Rightarrow \hat{\mu}_j = \frac{\sum_{i=1}^n y_i \pi_{ij}^{(s)}}{\sum_{i=1}^n \pi_{ij}^{(s)}} +\end{align}\]

+

Setting the derivative of \(Q\) with respect to \(\sigma_j^2\) equal to 0 and multiplying through by \(2\sigma_j^2\), we have +\[\begin{align} +&\frac{\partial Q}{\partial \sigma_j^2} = \sum_{i=1}^n \left(\frac{-1}{2\sigma_j^2} + \frac{(y_i-\mu_j)^2}{2\sigma_j^4}\right) \pi_{ij}^{(s)} = 0 \;\Leftrightarrow\; - \sum_{i=1}^n \pi_{ij}^{(s)} + \frac{\sum_{i=1}^n (y_i-\mu_j)^2 \pi_{ij}^{(s)}}{\sigma_j^2}=0\\ +& \Rightarrow \hat{\sigma}_j^2 = \frac{\sum_{i=1}^n(y_i-\mu_j)^2 \pi_{ij}^{(s)}}{\sum_{i=1}^n \pi_{ij}^{(s)}} +\end{align}\]
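The three closed-form updates translate directly into R. The sketch below is illustrative only (m_step_f is a hypothetical name; pij is the n x k posterior matrix from the e_step_f sketch above; the freshly updated means are used in the variance update, which matches the exact M-step):

m_step_f <- function(Y, pij){
   # Y: numeric vector of n observations; pij: n x k matrix of posterior probabilities
   Nj    <- colSums(pij)                  # "effective" number of points in each component
   p     <- Nj / length(Y)                # p_j  = sum_i pi_ij / n
   mu    <- colSums(Y * pij) / Nj         # mu_j = sum_i y_i pi_ij / sum_i pi_ij
   sigma <- sqrt(colSums((outer(Y, mu, "-"))^2 * pij) / Nj)   # weighted standard deviations
   list(p = p, mu = mu, sigma = sigma)
}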

+
+
+

2.3 Pseudo-code for estimating parameters

+
    +
  1. Set initial values: \(\boldsymbol{p}^{(0)}= (p_1^{(0)}, ..., p_{k-1}^{(0)}, 1-p_1^{(0)}-\cdots-p_{k-1}^{(0)})\) and \(\mu_j^{(0)}, \sigma_j^{2(0)}, \forall j=\overline{1, k}\).
  2. For each EM iteration step:
+
    +
  1. Update the posterior probabilities +\[\pi_{ij}^{(s)} = \frac{p_j^{(s)} f_j(y_i)}{\sum_{l=1}^k p_l^{(s)} f_l(y_i)}\]
  2. Update the parameter estimates \(p_j^{(s+1)}\), \(\mu_j^{(s+1)}, \sigma_j^{2(s+1)}\) +\[\begin{align*} +p_j^{(s+1)}&= \frac{\sum_{i=1}^n\pi_{ij}^{(s)}}{n}\\ +\mu_j^{(s+1)} &= \frac{\sum_{i=1}^n y_i \pi_{ij}^{(s)}}{\sum_{i=1}^n \pi_{ij}^{(s)}} \\ +\sigma_j^{2(s+1)} &= \frac{\sum_{i=1}^n(y_i-\mu_j)^2 \pi_{ij}^{(s)}}{\sum_{i=1}^n \pi_{ij}^{(s)}} +\end{align*}\]
+
    +
  3. Stopping criteria for step 2, using either of:
+
    +
  1. \(\boldsymbol{\theta}\) stops changing: \(\Vert \boldsymbol{\theta}^{(s+1)}-\boldsymbol{\theta}^{(s)}\Vert < \epsilon_1\), or
  2. the log-likelihood stops changing: \(|l(\boldsymbol{\theta}^{(s+1)})-l(\boldsymbol{\theta}^{(s)})|<\epsilon_2\).
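Putting the pseudo-code together, a compact EM driver for a general \(k\)-component 1-D mixture could look like the sketch below. It is my own illustration (it reuses the hypothetical e_step_f and m_step_f sketches above and the parameter-change stopping rule), not the code used in the examples:

em_kG_1d_f <- function(Y, p, mu, sigma, eps = 1e-4, max_iter = 1000){
   # Y: data vector; p, mu, sigma: length-k vectors of initial parameter values
   for (s in 1:max_iter){
      pij <- e_step_f(Y, p, mu, sigma)   # E-step: posterior probabilities pi_ij
      upd <- m_step_f(Y, pij)            # M-step: closed-form updates
      converged <- max(abs(c(upd$p - p, upd$mu - mu, upd$sigma - sigma))) < eps
      p <- upd$p; mu <- upd$mu; sigma <- upd$sigma
      if (converged) break               # stop once the parameters stop changing
   }
   list(p = p, mu = mu, sigma = sigma, pij = pij, iterations = s)
}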
+
+
+
+
+

3 Mixtures of Gaussian distributions in dimension \(d\).

+

The derivation is the same as in dimension 1, except that the covariance matrix \(\Sigma_j\) replaces the variance \(\sigma_j^2\), and the formula for updating \(\Sigma_j\) is

+

\[\begin{equation} +\Sigma_j =\frac{\sum_{i=1}^n \pi_{ij} (x^{(i)}-\mu_j)(x^{(i)}-\mu_j)^T}{\sum_{i=1}^n \pi_{ij}} +\end{equation}\]
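A minimal R sketch of this covariance update (my own illustration; it assumes X is an n x d data matrix, pij an n x k matrix of posterior probabilities, and mu_j the current length-d mean of component j):

update_Sigma_f <- function(X, pij, mu_j, j){
   centered <- sweep(X, 2, mu_j)              # rows x^(i) - mu_j
   weighted <- centered * pij[, j]            # scale row i by pi_ij
   t(weighted) %*% centered / sum(pij[, j])   # sum_i pi_ij (x^(i)-mu_j)(x^(i)-mu_j)^T / sum_i pi_ij
}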

+
+
+

4 Examples of the EM clustering algorithm for Gaussian mixture models

+
+

4.1 Example 1: Three Gaussian distributions in 1D

+

In this example, we explore how sensitive EM is to the prechosen number of clusters. The true components are shown on the LHS of Fig. @ref(fig:EX3_data), and the true parameters are

+

\[\begin{equation*} +\begin{cases} +p_1=0.5, p_2=0.2, p_3=0.3\\ +Y_1\sim N(\mu_1=50, \sigma_1=3), Y_2 \sim N(\mu_2=80, \sigma_2=4), Y_3\sim N(\mu_3=85,\sigma_3=3) +\end{cases} +\end{equation*}\]

+
(#fig:EX3_data) Example of a case where it is hard to choose the correct number of clusters. The figure on the LHS presents the three true Gaussian components; the figure on the RHS presents the observed mixed data, from which we can hardly tell that there are 3 clusters.

+
+

Looking at the figure on the RHS of Fig. @ref(fig:EX3_data), we can hardly tell that there are three different components. Let’s see what happens when we run EM with the correct number of clusters and then with a wrong number of clusters.

+
    +
  • When we choose the correct number of clusters (3 in this example), the EM algorithm estimates the parameters well and clusters the data with an error rate of \(11.7\%\), as shown in the tables below.
Parameters    \(p_1\)  \(\mu_1\)  \(\sigma_1\)  \(p_2\)  \(\mu_2\)  \(\sigma_2\)  \(p_3\)  \(\mu_3\)  \(\sigma_3\)
True Values   0.5      50         3             0.2      80         4             0.3      85         3
Initial       0.3      50         5.64          0.3      75         5.64          0.4      89         5.64
Predicted     0.5      49.9       3.07          0.173    80         4.25          0.327    84.6       3

error   n     percent
-1      100   0.100
0       883   0.883
1       17    0.017
+
    +
  • When we choose a wrong number of clusters, say 2 (a reasonable choice if we only look at the mixed data and have no information about the true number of components), we get the following parameter estimates.
+
(#fig:EX3_correctk) LHS: choosing the correct number of clusters, error rate = 11.7 percent. RHS: choosing a wrong number of clusters, error rate = 30 percent.

+
Parameter   \(p_1\)  \(\mu_1\)  \(\sigma_1\)  \(p_2\)  \(\mu_2\)  \(\sigma_2\)
Predicted   0.5      49.9       3.07          0.5      83         4.11

Error   n     percent
0       700   0.7
1       300   0.3
+
+
+
+

5 References

+

McLachlan, G. J., & Krishnan, T. (2007). The EM algorithm and extensions (Vol. 382). John Wiley & Sons.

+

Ng, A. (2000). CS229 Lecture notes. CS229 Lecture notes, 1(1), 1-3.

+
+
+
+
    +
  1. The method of Lagrange multipliers is a strategy for finding the local maxima and minima of a function \(f\) subject to an equality constraint \(g(x)=0\), by working with \(L(x, \lambda) = f(x) + \lambda g(x)\).↩︎
+