Fixed a number of issues with the LL calculation and sampling.
Added the parameters for the mixing coefficients

Renamed some variables to be more descriptive
JamesAllingham committed May 13, 2018
1 parent 6e40ff4 commit 2a6e451
Showing 1 changed file with 32 additions and 27 deletions.
59 changes: 32 additions & 27 deletions auto_impute/mmm.py
@@ -49,6 +49,9 @@ def __init__(self, data, num_components, assignments=None, verbose=None):
kmeans = KMeans(n_clusters=self.num_components, random_state=0).fit(mean_imputed_X)

# create parameters depending on the feature assignments
+ rs = np.zeros(shape=(self.N, self.num_components))
+ rs[np.arange(self.N), kmeans.labels_] = 1
+ self.πs = np.mean(rs, axis=0)
self.μs = np.stack([np.mean(mean_imputed_X[np.where(kmeans.labels_ == k)[0], :], axis=0) for k in range(self.num_components)], axis=0)
self.Σs = np.stack([regularise_Σ(np.diag(np.var(mean_imputed_X[np.where(kmeans.labels_ == k)[0], :], axis=0))) for k in range(self.num_components)], axis=0)
# self.ps = np.array([[np.mean([self.one_hot_lookups[d][self.X.data[n, d]] for n in range(self.N) if ~self.X.mask[n, d]], axis=0)
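For reference, a minimal standalone sketch of the initialization pattern added above, with illustrative names (`X` and `num_components` are stand-ins, not the module's API): hard one-hot responsibilities are built from the k-means labels, and the new mixing coefficients πs are simply their column means.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(100, 4)                      # stand-in for the mean-imputed data
num_components = 3

kmeans = KMeans(n_clusters=num_components, random_state=0).fit(X)

# one-hot "responsibilities" from the hard k-means labels
rs = np.zeros((X.shape[0], num_components))
rs[np.arange(X.shape[0]), kmeans.labels_] = 1

# mixing coefficient of each component: the fraction of points it starts with
pis = np.mean(rs, axis=0)                       # shape (num_components,), sums to 1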
@@ -73,7 +76,7 @@ def fit(self, max_iters=100, ϵ=1e-1):
best_ll = self.ll
if self.verbose: print("Fitting model:")
for i in range(max_iters):
- old_μs, old_Σs, old_ps, old_rs, old_expected_X = self.μs.copy(), self.Σs.copy(), self.ps.copy(), self.rs.copy(), self.expected_X.copy()
+ old_μs, old_Σs, old_ps, old_πs, old_rs, old_expected_X = self.μs.copy(), self.Σs.copy(), self.ps.copy(), self.πs.copy(), self.rs.copy(), self.expected_X.copy()
# E-step
self._calc_rs()

@@ -84,7 +87,7 @@
# if the log likelihood stops improving then stop iterating
self._calc_ll()
if self.ll < best_ll or self.ll - best_ll < ϵ:
- self.μs, self.Σs, self.ps, self.rs, self.expected_X = old_μs, old_Σs, old_ps, old_rs, old_expected_X
+ self.μs, self.Σs, self.ps, self.πs, self.rs, self.expected_X = old_μs, old_Σs, old_ps, old_πs, old_rs, old_expected_X
self.ll = best_ll

if self.verbose and not self.assignments_given:
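The loop above uses a snapshot-and-roll-back pattern: every parameter (now including πs) is copied before the EM iteration, and if the log-likelihood drops, or improves by less than ϵ, the previous state is restored and fitting stops. A schematic sketch of that control flow, with hypothetical `copy_params`/`restore_params`/`e_step`/`m_step`/`log_likelihood` stand-ins for the module's internals:

def fit(model, max_iters=100, eps=1e-1):
    best_ll = model.log_likelihood()
    for _ in range(max_iters):
        snapshot = model.copy_params()        # hypothetical: copies μs, Σs, ps, πs, rs, expected_X
        model.e_step()                        # responsibilities (_calc_rs)
        model.m_step()                        # parameter updates (_update_params)
        ll = model.log_likelihood()
        if ll < best_ll or ll - best_ll < eps:
            model.restore_params(snapshot)    # roll back the last iteration
            break
        best_ll = ll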
@@ -117,19 +120,22 @@ def _calc_rs(self):
if np.all(rs[n, :] == 0): # deal with the case where no components want to take charge of an example
rs[n, :] = 1e-20
else:
- rs[n, :] = np.mean(self.rs, axis=0)
+ rs[n, :] = self.πs

self.rs = rs/np.sum(rs, axis=1, keepdims=True)

# M-step
def _update_params(self):
+ # update πs
+ self.πs = np.mean(self.rs, axis=0)
+
# update the assignment params
# print(self.real_columns)
if not self.assignments_given:
for n in range(self.N):
for d in range(self.num_features):
tmp = np.sum([stats.norm.pdf(self.X.data[n, d] if not self.X.mask[n, d] else self.μs[k][d], loc=self.μs[k][d], scale=np.diag(self.Σs[k])[d]) for k in range(self.num_components)])
- foo = np.array([
+ cont = np.array([
np.sum([
np.sum([
self.rs[n, k]*stats.norm.pdf(self.X.data[n, d] if not self.X.mask[n, d] else self.μs[k][d], loc=self.μs[k][d], scale=np.diag(self.Σs[k])[d])
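Two of the smaller changes in this hunk: rows with no observed values now fall back to the mixing coefficients πs rather than the previous responsibilities' mean, and the new M-step line re-estimates πs as the per-component mean responsibility. A compact sketch of both, assuming an (N, K) likelihood matrix and a boolean mask of fully missing rows (toy names, not the module's API):

import numpy as np

def calc_responsibilities(likelihoods, fully_missing, pis):
    # likelihoods: (N, K) per-example, per-component likelihoods
    # fully_missing: (N,) bool mask of rows with no observed values
    rs = likelihoods.copy()
    rs[np.all(rs == 0, axis=1)] = 1e-20   # no component takes charge of the example
    rs[fully_missing] = pis               # fall back to the mixing coefficients
    return rs / rs.sum(axis=1, keepdims=True)

def update_pis(rs):
    # M-step: πk is the average responsibility component k receives
    return rs.mean(axis=0)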
@@ -139,7 +145,7 @@ def _update_params(self):
])
for d in range(self.num_features)
])
- tmp = np.array([
+ disc = np.array([
np.sum([
np.sum([
self.rs[n, k]*stats.multinomial.pmf(self.one_hot_lookups[d][self.X.data[n, d]] if not self.X.mask[n, d] else self.ps[k, d], 1, self.ps[k, d])
@@ -149,7 +155,7 @@
])
for d in range(self.num_features)
])
- self.real_columns = foo/(tmp + foo)
+ self.real_columns = cont/(disc + cont)

# update the discrete params
ps = np.array([[np.zeros(shape=(self.unique_vals[d].size)) for d in range(self.num_features)] for k in range(self.num_components)])
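The rename from `foo`/`tmp` to `cont`/`disc` makes the intent of this block clearer: per feature, it accumulates the responsibility-weighted evidence under the Gaussian model and under the categorical model, and `real_columns` becomes the continuous share of that evidence. A simplified sketch of the idea, with a hypothetical `cat_pmf` callable standing in for the module's one-hot/multinomial lookup:

import numpy as np
from scipy import stats

def real_column_weights(X, rs, mus, scales, cat_pmf):
    # X: (N, D) data; rs: (N, K) responsibilities; mus, scales: (K, D)
    # cat_pmf(k, d, x): categorical pmf of value x in feature d under component k
    N, D = X.shape
    K = rs.shape[1]
    cont = np.array([
        sum(rs[n, k] * stats.norm.pdf(X[n, d], loc=mus[k, d], scale=scales[k, d])
            for n in range(N) for k in range(K))
        for d in range(D)
    ])
    disc = np.array([
        sum(rs[n, k] * cat_pmf(k, d, X[n, d])
            for n in range(N) for k in range(K))
        for d in range(D)
    ])
    return cont / (disc + cont)   # per-feature weight that the column is real-valued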
@@ -215,25 +221,25 @@ def _calc_ll(self):
mask_row = self.X[n, :].mask
if np.all(~mask_row): continue

- m_r_locs = np.where(np.logical_and(mask_row, self.real_columns))[0]
- m_d_locs = np.where(np.logical_and(mask_row, np.logical_not(self.real_columns)))[0]
+ m_locs = np.where(mask_row)[0]

prob = 0
for k in range(self.num_components):
tmp = 1

- # probability of real values
- if m_r_locs.size:
- μmo = self.μs[k, m_r_locs] # TODO: make this work with full cov mat
- Σmm = self.Σs[k, :, :][np.ix_(m_r_locs, m_r_locs)]
+ for d in m_locs:
+ # real part
+ r_tmp = self.real_columns[d] * stats.norm.pdf(self.expected_X[n, d], loc=self.μs[k, d], scale=self.Σs[k, d, d])

- tmp *= self.rs[n, k] * stats.multivariate_normal.pdf(self.expected_X[n, m_r_locs], mean=μmo, cov=Σmm, allow_singular=True)
+ # disc part
+ try:
+ d_tmp = (1 - self.real_columns[d]) * stats.multinomial.pmf(self.one_hot_lookups[d][self.expected_X[n, d]], 1, self.ps[k, d])
+ except KeyError as _: # if the value cannot be looked up then its probability is 0 according to the categorical distribution
+ d_tmp = 0

- # probability of discrete values
- for d in m_d_locs:
- tmp *= self.rs[n, k]*stats.multinomial.pmf(self.one_hot_lookups[d][self.expected_X[n, d]], 1, self.ps[k, d])
+ tmp *= r_tmp + d_tmp

- prob += tmp
+ prob += self.rs[n, k]*tmp

lls.append(np.log(prob))
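After this fix, each missing coordinate contributes a per-feature mixture of the real and the discrete density, weighted by `real_columns`, and the responsibility multiplies the product once per component instead of once per feature: roughly p(x_n) = Σ_k r_nk · Π_d [w_d·N(x_nd) + (1−w_d)·Cat(x_nd)]. A minimal sketch of that computation for one row, with toy shapes (w: per-feature real weights; `cat_pmf` as before):

import numpy as np
from scipy import stats

def row_log_likelihood(x_m, rs_n, w, mus, scales, cat_pmf, m_locs):
    # x_m: expected values of the row; rs_n: (K,) responsibilities
    # w: (D,) real-column weights; mus, scales: (K, D)
    prob = 0.0
    for k in range(len(rs_n)):
        tmp = 1.0
        for d in m_locs:
            r_part = w[d] * stats.norm.pdf(x_m[d], loc=mus[k, d], scale=scales[k, d])
            d_part = (1 - w[d]) * cat_pmf(k, d, x_m[d])
            tmp *= r_part + d_part
        prob += rs_n[k] * tmp   # responsibility applied once per component
    return np.log(prob)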

@@ -249,18 +255,17 @@ def _sample(self, num_samples):
# if there are no missing values then go to next iter
if np.all(~mask_row): continue

- m_r_locs = np.where(np.logical_and(mask_row, self.real_columns))[0]
- m_d_locs = np.where(np.logical_and(mask_row, np.logical_not(self.real_columns)))[0]
+ m_locs = np.where(mask_row)[0]
k = np.random.choice(self.num_components, p=self.rs[n, :])

- # sample real values
- μmo = self.μs[k, m_r_locs] # TODO: make this work with full cov mat
- Σmm = self.Σs[k, :, :][np.ix_(m_r_locs, m_r_locs)]
- sampled_Xs[i, n, m_r_locs] = stats.multivariate_normal.rvs(mean=μmo, cov=Σmm, size=1)

- # sample discrete values
- for d in m_d_locs:
- sampled_Xs[i, n, d] = self.unique_vals[d][np.argmax(stats.multinomial.rvs(1, self.ps[k, d]))]
+ for d in m_locs:
+ # sample to determine if real or not
+ if np.random.rand(1) <= self.real_columns[d]:
+ # sample real value
+ sampled_Xs[i, n, d] = stats.norm.rvs(loc=self.μs[k, d], scale=self.Σs[k, d, d])
+ else:
+ # sample discrete value
+ sampled_Xs[i, n, d] = self.unique_vals[d][np.argmax(stats.multinomial.rvs(1, self.ps[k, d]))]

return sampled_Xs
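The rewritten sampler is a two-stage draw: pick a component from the row's responsibilities, then, independently per missing feature, flip a coin with probability `real_columns[d]` to decide between the component's Gaussian and its categorical. A condensed sketch for a single row, reusing the same toy names as above:

import numpy as np
from scipy import stats

def sample_missing_row(rs_n, w, mus, scales, cat_ps, unique_vals, m_locs, rng=np.random):
    # rs_n: (K,) responsibilities; w: (D,) real-column weights; mus, scales: (K, D)
    # cat_ps[k][d]: categorical probabilities; unique_vals[d]: the category values
    k = rng.choice(len(rs_n), p=rs_n)               # one component per row
    sample = {}
    for d in m_locs:
        if rng.rand() <= w[d]:                      # treat feature d as real-valued
            sample[d] = stats.norm.rvs(loc=mus[k, d], scale=scales[k, d])
        else:                                       # treat feature d as discrete
            one_hot = stats.multinomial.rvs(1, cat_ps[k][d])
            sample[d] = unique_vals[d][np.argmax(one_hot)]
    return sample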
