From 2ce8baccede43f3cacba6ca530c9ff167efc7062 Mon Sep 17 00:00:00 2001
From: Tran Le <45803288+tranktle@users.noreply.github.com>
Date: Sun, 3 Apr 2022 16:56:01 -0500
Subject: [PATCH] Add files via upload

---
 02_mixture_3Gaussian_1d.html | 1812 ++++++++++++++++++++++++++++++++
 Note_3G_1D.html              | 1917 ++++++++++++++++++++++++++++++++++
 2 files changed, 3729 insertions(+)
 create mode 100644 02_mixture_3Gaussian_1d.html
 create mode 100644 Note_3G_1D.html

diff --git a/02_mixture_3Gaussian_1d.html b/02_mixture_3Gaussian_1d.html
new file mode 100644
index 0000000..d942224
--- /dev/null
+++ b/02_mixture_3Gaussian_1d.html
@@ -0,0 +1,1812 @@
+Here is the R code that I use to illustrate a simulation example of the EM algorithm for a Gaussian mixture model with 3 (Gaussian) components in 1D. The EM formula construction and the pseudo-code can be found here.
+# Packages used throughout this script (assumed to have been loaded in the original document):
+library(tidyverse)  # dplyr, purrr, ggplot2, stringr, readr
+library(cowplot)    # plot_grid
+library(janitor)    # tabyl
+creat_3G_dat_f <- function(n, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3){
+  # Create synthetic data
+  # n: the total sample size across the three distributions
+  # p1, p2: the probability that a data point belongs to the 1st, 2nd distribution
+  # p1+p2 < 1, since p3 = 1-(p1+p2) is the proportion of the 3rd distribution
+ n1 <- floor(n*p1); Y1 <- rnorm(n1, mu1, sig1)
+ n2 <- floor(n*p2); Y2 <- rnorm(n2, mu2, sig2)
+ n3 <- ceiling(n*(1-p1-p2)); Y3 <- rnorm(n3, mu3, sig3)
+ data <- data.frame(
+ dist = c( rep("1", n1), rep("2", n2) , rep("3", n3)),
+ value = c( Y1, Y2, Y3))
+ return(data)
+}
+
+plot_3G_syn_dat_f <- function(data){
+  # data: a data set created by creat_3G_dat_f
+ p1 <- data %>%
+    ggplot(aes(x=value, fill=dist)) +
+ geom_histogram( color="#e9ecef", alpha=0.6,
+ position = 'identity', binwidth=1) +
+ scale_fill_manual(values=c("#69b3a2", "#404080", "#80427d"))
+ p2 <- data %>%
+ ggplot(aes(x=value)) +
+ geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=1)
+ theplot <- plot_grid(p1, p2, labels=c("Three data components","Mixture data"),
+ ncol = 2, nrow = 1)
+ return(theplot)
+}
+com_3G_pi_f <-function(y, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3){
+  # compute the posterior probabilities associated with each yi
+ f1 <- dnorm(y, mean = mu1, sd = sig1)
+ f2 <- dnorm(y, mean = mu2, sd = sig2)
+ f3 <- dnorm(y, mean = mu3, sd = sig3)
+ denorm <- (p1*f1)+(p2*f2)+((1-p1-p2)*f3)
+ pi_i1 <- (p1*f1)/denorm
+ pi_i2 <- (p2*f2)/denorm
+ return(c(pi_i1, pi_i2, 1-pi_i1-pi_i2))
+}
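+# A quick sanity check (hypothetical call, using the true parameter values of Example 1 below):
+# the three returned posterior probabilities always sum to 1.
+# com_3G_pi_f(y=60, p1=0.5, mu1=50, sig1=3, p2=0.2, mu2=80, sig2=4, mu3=85, sig3=3)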
+EM_Three_gaussian_f <-function(Y, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3, eps, max_iter, verbose) {
+ # Function to estimate parameters based on initial guesses
+ # p1, mu1, sig1, p2, mu2, sig2, mu3, sig3: INITIAL parameter values
+  # eps: stopping criterion threshold
+  # max_iter: maximum number of EM iterations
+  # verbose = TRUE to print the updated estimates at each iteration step
+ n <- length(Y)
+ for (i in (1:max_iter)){
+    # Compute the (s+1)-step estimates from the (s)-step estimates
+ pij <- map(Y, ~com_3G_pi_f(.x, p1, mu1, sig1, p2, mu2, sig2, mu3, sig3))
+ pij <- as.data.frame(do.call(rbind, pij))
+ p1_new <- sum(pij$V1)/n
+ p2_new <- sum(pij$V2)/n
+ #
+    mu1_new  <- sum(Y*pij$V1)/sum(pij$V1)
+    sig1_new <- sqrt(sum(((Y-mu1_new)^2)*pij$V1)/sum(pij$V1))
+    mu2_new  <- sum(Y*pij$V2)/sum(pij$V2)
+    sig2_new <- sqrt(sum(((Y-mu2_new)^2)*pij$V2)/sum(pij$V2))
+    mu3_new  <- sum(Y*pij$V3)/sum(pij$V3)
+    sig3_new <- sqrt(sum(((Y-mu3_new)^2)*pij$V3)/sum(pij$V3))
+ if(verbose){cat("\n at step ", i, ": ",
+ mu1_new, sig1_new, mu2_new,sig2_new, mu3_new, sig3_new)}
+    # stopping criterion
+ if (max(abs(p1_new-p1), abs(p2_new -p2),
+ abs(mu1_new -mu1), abs(mu2_new-mu2), abs(mu3_new-mu3))< eps){break}
+ # Update
+ p1 <- p1_new; mu1 <- mu1_new; sig1 <- sig1_new
+ p2 <- p2_new; mu2 <- mu2_new; sig2 <- sig2_new
+ mu3 <- mu3_new; sig3 <- sig3_new
+ } # end for loop
+  if (i==max_iter){cat("\n The stopping criterion was not met \n")
+  } else {cat("\n The EM algorithm converged at iteration", i, '\n')}
+ return(list(pij=pij,
+ p1=p1_new, mu1=mu1_new, sig1=sig1_new,
+ p2=p2_new, mu2=mu2_new, sig2=sig2_new,
+ p3= 1- p1_new-p2_new, mu3=mu3_new, sig3=sig3_new))
+}
+create_comp_dat_3G_1D_f <- function(data, pij){
+  # Output: a data frame with the original label (dist) and the EM cluster prediction (pred)
+ pij$Max<-colnames(pij)[apply(pij,1,which.max)]
+ pij <- pij %>% mutate(Max_col = str_sub(Max, start = 2, end = 2))
+ comp_dat_2D <- cbind(data, pred = pij$Max_col)
+ comp_dat_2D <- comp_dat_2D %>% mutate(err = as.integer(dist)-as.integer(pred))
+ return(comp_dat_2D)
+}
+plot_1D_org_pre_dat_f <- function(comp_dat){
+  # Plot the original and predicted data obtained from the EM algorithm
+ p1 <- comp_dat %>%
+    ggplot(aes(x=value, fill=dist)) +
+ geom_histogram( color="#e9ecef", alpha=0.6,
+ position = 'identity', binwidth=1) +
+ scale_fill_manual(values=c("#69b3a2", "#404080", "#80427d"))
+ p2 <- comp_dat %>%
+ ggplot(aes(x=value, fill=pred)) +
+ geom_histogram( color="#e9ecef", alpha=0.6,
+ position = 'identity', binwidth=1)
+ theplot <- plot_grid(p1, p2,
+                       labels=c("Original data components",
+ "Predicted data components"),
+ ncol = 2, nrow = 1)
+ return(theplot)
+}
+# EXAMPLE 1 Set up a model----------------------------------------
+set.seed(538725)
+# True parameter values
+Tv <- data.frame(p1=0.5, mu1=50, sig1=3,
+ p2=0.2, mu2=80, sig2=4,
+ mu3=85, sig3=3)
+data <- creat_3G_dat_f(n=1000,
+ p1=Tv$p1, mu1=Tv$mu1, sig1=Tv$sig1,
+ p2=Tv$p2, mu2=Tv$mu2, sig2=Tv$sig2,
+ mu3=Tv$mu3, sig3=Tv$sig3)
+# png("./EX_Result/EX1_3G_1d_data.png");
+plot_3G_syn_dat_f(data)
+# ;dev.off()
+# Initial parameter values
+Iv <- data.frame(p1=0.3, mu1=50, sig1=sd(data$value)/3,
+ p2=0.3, mu2=75, sig2=sd(data$value)/3,
+ mu3=89, sig3=sd(data$value)/3)
+# Solve the problem using the EM algorithm
+EX3 <- EM_Three_gaussian_f(Y=data$value,
+ p1=Iv$p1, mu1=Iv$mu1, sig1=sd(data$value)/3,
+ p2=Iv$p2, mu2=Iv$mu2, sig2=sd(data$value)/3,
+ mu3=Iv$mu3, sig3=sd(data$value)/3,
+ eps=0.02, max_iter=1000, verbose=F)
+##
+## The EM algorithm converged at iteration 20
+# Get the comparison data
+EX3_pij <- as.data.frame(EX3$pij)
+EX3_comp_dat <- create_comp_dat_3G_1D_f(data, EX3_pij)
+# png("./EX_Result/EX1_3G_1d_orgpre.png");
+plot_1D_org_pre_dat_f(EX3_comp_dat)
+# ;dev.off()
+EX3_error <- tabyl(EX3_comp_dat$err) %>% mutate(error = `EX3_comp_dat$err`) %>%
+ select(c("error", "n", "percent"))
+# Get the parameter estimates
+EX3 <- as.data.frame(EX3[c(2:10)])
+save(Tv, Iv, EX3, EX3_error, file= "./EX_Result/EX3.RData")
+# If we use the number of clusters = 2 (a wrong k)
+source("01_Mixture_2Gaussian_1d.R")
+EX3_2 <- EM_two_gaussian_f(mu1_0 = 50, sig1_0= 10,
+ mu2_0=85, sig2_0=10,
+ p_0=0.5, Y= data$value,
+ max_iter=1000,
+ eps=0.01, dist_thres_prob= 0.5, verbose=F) %>%
+ as.data.frame()
+U_vec <- com_U_vec_f(dist_thres_prob=0.5, EX3_2$p, Y=data$value,
+ mu1=EX3_2$mu1, sig1=EX3_2$sig1,
+ mu2=EX3_2$mu2, sig2=EX3_2$sig2)
+EX3_2_comdat <- cbind(data, U_vec) %>%
+ mutate(dist = as.numeric(dist)) %>%
+ mutate(err= dist-U_vec)
+EX3_2_comdat_err <- tabyl(EX3_2_comdat$err)
+EX3_2
+## i p mu1 sig1 mu2 sig2
+## 1 4 0.5 49.94347 3.073976 82.99561 4.113227
+EX3_2_comdat_err
+## EX3_2_comdat$err n percent
+## 0 700 0.7
+## 1 300 0.3
+# write_rds(EX3_2, "./EX_Result/EX3_2.rds")
+# write_rds(EX3_2_comdat_err, "./EX_Result/EX3_2_comdat_err.rds")
+plot_org_pre_wrongk_dat_f <- function(comp_dat){
+  # Plot the original and predicted data obtained from the EM algorithm
+  # when we choose a WRONG number of clusters
+ comp_dat <- comp_dat %>% mutate(across(c("dist", "U_vec"), as.factor)) %>%
+ rename(pre_dist = U_vec)
+ p1 <- comp_dat %>%
+    ggplot(aes(x=value, fill=dist)) +
+ geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=1) +
+ scale_fill_manual(values=c("#69b3a2", "#404080", "#80427d"))
+ p2 <- comp_dat %>%
+    ggplot(aes(x=value, fill=pre_dist)) +
+ geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=1) +
+ scale_fill_manual(values=c("#69b3a2", "#404080"))
+ theplot <- plot_grid(p1, p2,
+                       labels=c("Original data",
+ "Predicted data"), ncol = 2, nrow = 1)
+ return(theplot)
+}
+
+# png("./EX_Result/EX3_wrong_k.png");
+plot_org_pre_wrongk_dat_f(EX3_2_comdat)
+# ;dev.off()
+This is the note that I use for learning about the EM algorithm. Other sections (the EM introduction, R code, a more detailed derivation of the E and M steps, and EM for mixtures of \(k=2\) Gaussian distributions in 1D and 2D) can be found [here].
+We now consider the case of a mixture of \(k\) Normal distributions with unknown parameters in \(1\) dimension.
+Let \(Y_i, i=\overline{1,n}\), be observed data drawn from a mixture of \(k\) Normal distributions with unknown parameters in a one-dimensional space. We also do not know which distribution each data point belongs to.
+\[\begin{align} +f(y) &= \sum_{j=1}^k p_j f_j(y|\mu_j, \sigma_j^2), \text{ where } \sum_{j=1}^k p_j=1\\ +&\quad f_j \equiv N(\mu_j, \sigma_j^2), \text{ where } \mu_j, \sigma_j^2 \text{ unknown } +\end{align}\]
+We want to use the EM algorithm to estimate \(\boldsymbol{\theta}= (\boldsymbol{p}=(p_1, ..., p_k), \mu_j, \sigma_j^2, \forall j=\overline{1,k})\).
+Let \(U_i\) be the label of the distribution that the data point \(Y_i\) belongs to. The complete-data density of \((Y_i, U_i)\) is
+\[
+f(y_i, u_i \mid \boldsymbol{\theta}) = \prod_{j=1}^k [p_jf_j(y_i)]^{I(u_i=j)}
+\]
+We want to compute the \(Q\) function
+\[\begin{align}
+Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)} ) &= E_{Y_{miss}}\left[ln\left(f(Y_{obs}, Y_{miss}|\boldsymbol{\theta})\right) | Y_{obs}, \boldsymbol{\theta}^{(s)} \right]\\
+&= E_{U}\left[ln\left(f(\boldsymbol{Y},\boldsymbol{U}\mid\boldsymbol{\theta})\right) | \boldsymbol{Y}, \boldsymbol{\theta}^{(s)} \right]
+\end{align}\]
+\[\begin{align}
+f(\boldsymbol{Y},\boldsymbol{U}\mid\boldsymbol{\theta})&= \prod_{i=1}^n f(y_i, u_i \mid \boldsymbol{\theta}) = \prod_{i=1}^n \prod_{j=1}^k [p_jf_j(y_i)]^{I(u_i=j)}
+\end{align}\]
+\[\begin{align}
+ln(f(\boldsymbol{Y},\boldsymbol{U}\mid\boldsymbol{\theta}))&= \sum_{i=1}^n \sum_{j=1}^k I(U_i=j) ln(p_jf_j(y_i))\\
+&= \sum_{i=1}^n \sum_{j=1}^k I(U_i=j) [ln(p_j) + ln(f_j(y_i))]
+\end{align}\]
+\[\begin{align*}
+Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)}) &= E_{U}\left[ln\left(f(\boldsymbol{Y},\boldsymbol{U}\mid\boldsymbol{\theta})\right) | \boldsymbol{Y}, \boldsymbol{\theta}^{(s)} \right]\\
+&= E_U\left[\sum_{i=1}^n \sum_{j=1}^k I(U_i=j) [ln(p_j) + ln(f_j(y_i))] \,\Big|\, \boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right]\\
+&= \sum_{i=1}^n \sum_{j=1}^k ln(p_j)\, E_U\!\left(I(U_i=j)\mid\boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right)\\
+ & \quad + \sum_{i=1}^n \sum_{j=1}^k ln(f_j(y_i))\, E_U\!\left(I(U_i=j)\mid\boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right)
+\end{align*}\]
+Put
+\[
+\pi_{ij}^{(s)} := E_U\!\left(I(U_i=j)\mid\boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right) = P\!\left(U_i=j\mid \boldsymbol{Y}, \boldsymbol{\theta}^{(s)}\right)
+\]
+By Bayes' rule, the posterior probability \(\pi_{ij}^{(s)}\) is
+\[\begin{align}
+\pi_{ij}^{(s)} = P(U_i=j\mid \boldsymbol{Y}, \boldsymbol{\theta}^{(s)}) =
+\frac{P(U_i=j\mid \boldsymbol{\theta}^{(s)})\,f_j(y_i)}{\sum_{l=1}^k P(U_i=l\mid \boldsymbol{\theta}^{(s)})\,f_l(y_i)} =
+\frac{p_j^{(s)} f_j(y_i)}{\sum_{l=1}^k p_l^{(s)} f_l(y_i)}
+\end{align}\]
+\[\begin{align*}
+Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)}) &= \sum_{i=1}^n \sum_{j=1}^k ln(p_j) \pi_{ij}^{(s)} + \sum_{i=1}^n \sum_{j=1}^k ln(f_j(y_i)) \pi_{ij}^{(s)}\\
+&= \sum_{i=1}^n \sum_{j=1}^k \pi_{ij}^{(s)} ln(p_j) \\
+& \quad +\sum_{i=1}^n \sum_{j=1}^k \pi_{ij}^{(s)} \left(-\frac{1}{2}ln(2\pi)-\frac{1}{2}ln(\sigma_j^2)-\frac{1}{2\sigma_j^2}(y_i-\mu_j)^2\right)
+\end{align*}\]
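+A minimal R sketch of this E step for a general \(k\) (assumptions: Y is the data vector; p, mu and sig are length-\(k\) vectors holding the current mixing proportions, means and standard deviations; compute_resp_f is a hypothetical helper, not part of the script above):
+compute_resp_f <- function(Y, p, mu, sig){
+  # hypothetical helper: E step for a general-k 1D Gaussian mixture
+  # n x k matrix of weighted densities; entry (i, j) is p_j * f_j(y_i)
+  wdens <- sapply(seq_along(p), function(j) p[j] * dnorm(Y, mean = mu[j], sd = sig[j]))
+  # divide each row by its sum to obtain the posterior probabilities pi_ij
+  wdens / rowSums(wdens)
+}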
+We now want to find \(\boldsymbol{\theta}= (\boldsymbol{p}=(p_1, ..., p_k), \mu_j, \sigma_j^2, \forall j=\overline{1,k})\) that maximizes the \(Q(\boldsymbol{\theta}|\boldsymbol{\theta}^{(s)})\) function.
+\[ +\frac{\partial Q}{\partial p_j} = \sum_{i=1}^n \pi_{ij} \frac{1}{p_j} +\]
+Setting \(\frac{\partial Q}{\partial p_j} = 0\) alone does not determine \(p_j\) because of the constraint \(\sum_{j=1}^k p_j = 1\). We construct the Lagrangian 1 by adding a Lagrange multiplier to the \(Q\) function
+\[Q(\boldsymbol{p}, \boldsymbol{p}^{(s)}) = \sum_{j=1}^k ln(p_j) \sum_{i=1}^n \pi_{ij}+ \lambda (1-\sum_{j=1}^k p_j)\]
+Setting \(\frac{\partial Q}{ \partial p_j} = 0\), we have
+\[\begin{equation} +\frac{\partial Q}{ \partial p_j} = \frac{1}{p_j} \sum_{i=1}^n \pi_{ij}-\lambda=0 \Rightarrow p_j = \frac{\sum_{i=1}^n \pi_{ij}}{\lambda}\tag{2.1} +\end{equation}\]
+Setting \(\frac{\partial Q}{\partial \lambda}=0\), we have
+\[\begin{align*}
+\frac{\partial Q}{\partial \lambda}=0 &\Rightarrow 1-\sum_{j=1}^k p_j =0\\
+& \Rightarrow 1- \sum_{j=1}^k \sum_{i=1}^n \frac{\pi_{ij}}{\lambda} = 0\\
+& \Rightarrow \lambda = \sum_{j=1}^k \sum_{i=1}^n \pi_{ij} = \sum_{i=1}^n \underbrace{\sum_{j=1}^k \pi_{ij}}_{=1} = n
+\end{align*}\]
+Substituting \(\lambda=n\) back into (2.1), we have
+\[\begin{equation} +\hat{p}_j = \frac{\sum_{i=1}^n\pi_{ij}^{(s)}}{n} +\end{equation}\]
+Setting the derivative of \(Q\) with respect to \(\mu_j\) equal to 0, we have
+\[\begin{align} +& \frac{\partial Q}{\partial \mu_j}= \sum_{i=1}^n \pi_{ij}^{(s)}\frac{1}{\sigma_j^2}(y_i-\mu_j)=\frac{1}{\sigma_j^2}\left[\sum_{i=1}^n y_i \pi_{ij}^{(s)}- \mu_j \sum_{i=1}^n \pi_{ij}^{(s)}\right]=0\\ +& \Rightarrow \hat{\mu}_j = \frac{\sum_{i=1}^n y_i \pi_{ij}^{(s)}}{\sum_{i=1}^n \pi_{ij}^{(s)}} +\end{align}\]
+Setting the derivative of \(Q\) with respect to \(\sigma_j^2\) equal to 0, and multiplying through by \(2\sigma_j^2\), we have
+\[\begin{align}
+&\frac{\partial Q}{\partial \sigma_j^2} = \sum_{i=1}^n \left(\frac{-1}{2\sigma_j^2} + \frac{(y_i-\mu_j)^2}{2\sigma_j^4}\right) \pi_{ij}^{(s)} = 0
+\;\Rightarrow\; - \sum_{i=1}^n \pi_{ij}^{(s)} + \frac{\sum_{i=1}^n (y_i-\mu_j)^2 \pi_{ij}^{(s)}}{\sigma_j^2}=0\\
+& \Rightarrow \hat{\sigma}_j^2 = \frac{\sum_{i=1}^n(y_i-\mu_j)^2 \pi_{ij}^{(s)}}{\sum_{i=1}^n \pi_{ij}^{(s)}}
+\end{align}\]
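+Putting the three updates together, here is a hedged R sketch of the M step for a general \(k\) (assumption: resp is the \(n \times k\) responsibility matrix from the E step, e.g. produced by the hypothetical compute_resp_f sketched above):
+m_step_f <- function(Y, resp){
+  # hypothetical helper: M step for a general-k 1D Gaussian mixture
+  n    <- length(Y)
+  p    <- colSums(resp)/n                          # hat p_j = sum_i pi_ij / n
+  mu   <- colSums(resp*Y)/colSums(resp)            # hat mu_j
+  sig2 <- sapply(seq_along(mu), function(j)
+            sum(resp[, j]*(Y - mu[j])^2)/sum(resp[, j]))  # hat sigma_j^2
+  list(p = p, mu = mu, sig = sqrt(sig2))
+}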
+The proof in higher dimensions is the same as in dimension 1, except that the covariance matrix \(\Sigma_j\) is used instead of the variance \(\sigma_j^2\), and the formula for updating \(\Sigma_j\) is
+\[\begin{equation}
+\Sigma_j =\frac{\sum_{i=1}^n \pi_{ij} (x^{(i)}-\mu_j)(x^{(i)}-\mu_j)^T}{\sum_{i=1}^n \pi_{ij}}
+\end{equation}\]
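+A minimal R sketch of this covariance update (assumptions: X is an \(n \times d\) data matrix, resp the \(n \times k\) responsibility matrix, mu_j the updated mean vector of component \(j\); update_Sigma_j_f is a hypothetical helper):
+update_Sigma_j_f <- function(X, resp, j, mu_j){
+  # hypothetical helper: covariance update for component j
+  centered <- sweep(X, 2, mu_j)                  # rows are x^(i) - mu_j
+  num <- t(centered) %*% (resp[, j] * centered)  # sum_i pi_ij (x^(i)-mu_j)(x^(i)-mu_j)^T
+  num / sum(resp[, j])
+}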
+In this example, we explore how sensitive the EM algorithm is to the pre-chosen number of clusters. The true data components are shown on the LHS of Fig. @ref(fig:EX3_data), with true parameters
+\[\begin{equation*}
+\begin{cases}
+p_1=0.5, p_2=0.2, p_3=0.3\\
+Y_1\sim N(50, 3), Y_2 \sim N(80, 4), Y_3\sim N(85,3)
+\end{cases}
+\end{equation*}\]
+(here the second argument of \(N(\cdot,\cdot)\) is the standard deviation \(\sigma\), matching the table below).
+(#fig:EX3_data) Example of a case where it is hard to choose the correct number of clusters. The figure on the LHS presents the three true Gaussian components. The figure on the RHS presents the observed mixed data. From the mixed data alone, we can hardly tell that there are 3 clusters.
+Looking at the figure on the RHS of Fig. @ref(fig:EX3_data), we can hardly tell that there are three different components in it. Let's see what happens when we run the EM algorithm with the correct number of clusters and with a wrong number of clusters.
+Parameters  | \(p_1\) | \(\mu_1\) | \(\sigma_1\) | \(p_2\) | \(\mu_2\) | \(\sigma_2\) | \(p_3\) | \(\mu_3\) | \(\sigma_3\)
+------------|---------|-----------|--------------|---------|-----------|--------------|---------|-----------|-------------
+True Values | 0.5     | 50        | 3            | 0.2     | 80        | 4            | 0.3     | 85        | 3
+Initial     | 0.3     | 50        | 5.64         | 0.3     | 75        | 5.64         | 0.4     | 89        | 5.64
+Predicted   | 0.5     | 49.9      | 3.07         | 0.173   | 80        | 4.25         | 0.327   | 84.6      | 3
+error | n   | percent
+------|-----|--------
+-1    | 100 | 0.100
+0     | 883 | 0.883
+1     | 17  | 0.017
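+The 11.7 percent error rate quoted in the figure caption below is simply the share of mismatched labels in this table: \((100 + 17)/1000 = 0.117\).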
+(#fig:EX3_correctk) LHS: choosing the correct number of clusters, error rate = 11.7 percent. RHS: choosing a wrong number of clusters, error rate = 30 percent.
+Parameter | \(p_1\) | \(\mu_1\) | \(\sigma_1\) | \(p_2\) | \(\mu_2\) | \(\sigma_2\)
+----------|---------|-----------|--------------|---------|-----------|-------------
+Predicted | 0.5     | 49.9      | 3.07         | 0.5     | 83        | 4.11
+Error | n   | percent
+------|-----|--------
+0     | 700 | 0.7
+1     | 300 | 0.3
+The method of Lagrange multipliers is a strategy for finding the local maxima and minima of a function \(f\) subject to equality constraints, using the Lagrangian \(L(x, \lambda) = f(x) + \lambda g(x)\) with constraint \(g(x)=0\).↩︎