Bernoulli Mixture

A Bernoulli mixture model, following Section 9.3.3 of Bishop’s Pattern Recognition and Machine Learning.

The code below gives a basic implementation of a Bernoulli mixture fitted with the EM algorithm; the essential equations are 9.57 and 9.58 in Bishop. We apply it to the Fashion-MNIST dataset, which has 10 classes, but we look for 20 clusters and so pick up variations among the purses, etc. The resulting figure is essentially a version of Figure 9.10 in Bishop. Note that different runs of this code produce different results, because the means are initialized randomly.
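For reference, here is a paraphrase of the relevant updates from Section 9.3.3 (the equation numbers above refer to Bishop’s numbering; what follows is restated, not quoted). In the E-step the responsibility of cluster $k$ for a binary data point $\mathbf{x}_n$ is

$$
\gamma(z_{nk}) \;=\; \frac{\pi_k \prod_i \mu_{ki}^{x_{ni}} (1-\mu_{ki})^{1-x_{ni}}}{\sum_j \pi_j \prod_i \mu_{ji}^{x_{ni}} (1-\mu_{ji})^{1-x_{ni}}},
$$

and in the M-step the means and mixing weights are re-estimated as

$$
N_k = \sum_n \gamma(z_{nk}), \qquad
\boldsymbol{\mu}_k = \frac{1}{N_k}\sum_n \gamma(z_{nk})\,\mathbf{x}_n, \qquad
\pi_k = \frac{N_k}{N}.
$$

In the code, `_P` computes $\log \gamma(z_{nk})$ using a log-sum-exp for numerical stability, and `fit` adds the small constants `alpha1` and `alpha2` to the M-step counts so that no mean or mixing weight is driven exactly to 0 or 1.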

import pandas as pd
import numpy as np
from numpy import log
from scipy.special import logsumexp
import matplotlib
import matplotlib.pyplot as plt
from itertools import product

class BernoulliMixture:

    def __init__(self, n_clusters=2, n_iter=100, tolerance=1e-8, alpha1=1e-6, alpha2=1e-6):
        '''sets things up'''
        self._n_clusters = n_clusters
        self._n_iter = n_iter
        self._tolerance = tolerance
        self._alpha1 = alpha1
        self._alpha2 = alpha2
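        # start with uniform mixing weights, one per cluster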
        self.Theta = 1/self._n_clusters*np.ones(self._n_clusters)

    def _P(self, x, Mu, Theta):
        '''computes the log of the conditional probability of the latent variable given the data and Mu, Theta'''
        # log p(x | mu_k) for each sample and cluster, shape (n_samples, n_clusters)
        ll =, log(Mu)) + - x, log(1 - Mu))
        # normalize with log-sum-exp to get log posteriors over clusters
        Z = (log(Theta) + ll - logsumexp(ll + log(Theta), axis=1, keepdims=True))
        return Z

    def fit(self, data):
        '''carries out the EM iteration'''
        self._n_samples, self._n_features = data.shape
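        # random initial Bernoulli means, shape (n_features, n_clusters)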
        self.Mu = np.random.uniform(.25, .75, size=self._n_clusters*self._n_features).reshape(self._n_features, self._n_clusters)
        N = self._n_samples

        for i in range(self._n_iter):
            # E-step: responsibilities of each cluster for each sample
            V = np.exp(self._P(data, self.Mu, self.Theta))
            W = V/V.sum(axis=1, keepdims=True)
            # M-step: weighted pixel sums and effective cluster sizes
            R =, W)
            Q = np.sum(W, axis=0, keepdims=True)
            self._old = self.Theta
            # smoothed updates of the Bernoulli means and mixing weights
            self.Mu, self.Theta = (R + self._alpha1)/(Q + self._n_features*self._alpha1), (Q + self._alpha2)/(N + self._n_clusters * self._alpha2)
            # stop once the mixing weights have converged
            if np.allclose(self._old, self.Theta):
                break

    def predict_proba(self, data):
        '''computes the conditional probability giving cluster membership'''
        return np.exp(self._P(data, self.Mu, self.Theta))

    def generate(self):
        '''generates a binary sample from the fitted mixture'''
        # pick a cluster according to the mixing weights, then sample each pixel
        x = np.random.multinomial(1, self.Theta.ravel())
        mu =, x)
        return np.random.binomial(1, mu)

if __name__ == "__main__":
    # load a subset of Fashion-MNIST and binarize the pixel intensities to {0, 1}
    data_df = pd.read_csv('data/fashion-mnist_train.csv', nrows=5000)
    data = data_df.iloc[:, 1:].values
    data = data // 128
    # fit a 20-component mixture and plot the learned Bernoulli means
    M = BernoulliMixture(n_clusters=20)
    fig, ax = plt.subplots(5, 4)
    for i, j in product(range(5), range(4)):
        ax[i, j].imshow(M.Mu[:, 4*i + j].reshape(28, 28))
        ax[i, j].axis('off')
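
As a quick usage sketch (this assumes the `` call and the binarized `data` from the script above), the fitted model can also be queried for hard cluster assignments, or asked to generate a new image:

# a minimal sketch: hard assignments and one generated image
labels = M.predict_proba(data).argmax(axis=1)   # most probable cluster per image
print(pd.Series(labels).value_counts())         # rough cluster sizes
sample = M.generate()                           # one binary 28x28 image from the model
plt.imshow(sample.reshape(28, 28))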