Density estimationΒΆ

This example shows how to do pdf estimation for one dimensional data. It estimates a Gaussian mixture model for several number of components, and it determines the “best” one according to the Bayesian Information Criterion.

It uses old faitfhul waiting time as the one dimension data, and plots the best model as well as the BIC as a function of the number of component.

Source code: pdfestimation1d.py

# Example of doing pdf estimation with EM algorithm. Requires matplotlib.
import numpy as N

import pylab as P
import matplotlib as MPL

from scikits.learn.em import EM, GM, GMM
import utils

oldfaithful = utils.get_faithful()

duration = oldfaithful[:, :1]
waiting = oldfaithful[:, 1:]

#dt = utils.scale(duration)
#dt = duration / 60.
dt = waiting / 60.

# This function train a mixture model with k components, returns the trained
# model and the BIC
def cluster(data, k):
    d = data.shape[1]
    gm = GM(d, k)
    gmm = GMM(gm)
    em = EM()
    em.train(data, gmm, maxiter = 20)
    return gm, gmm.bic(data)

# bc will contain a list of BIC values for each model trained, gml the
# corresponding mixture model
bc = []
gml = []

for k in range(1, 8):
    gm, b = cluster(dt, k = k)
    bc.append(b)
    gml.append(gm)

mbic = N.argmax(bc)

# Below is code to display a figure with histogram and best model (in the BIC
# sense) pdf, with the BIC as a function of the number of components on the
# right.
P.figure(figsize = [12, 7])
#---------------------------
# histogram + pdf estimation
#---------------------------
P.subplot(1, 2, 1)
h = gml[mbic].plot1d(gpdf=True)
# You can manipulate the differents parts of the plot through the returned
# handles
h['gpdf'][0].set_linestyle('-')
h['gpdf'][0].set_label('pdf of the mixture')
h['pdf'][0].set_label('pdf of individual component')
[l.set_linestyle('-') for l in h['pdf']]
[l.set_color('g') for l in h['pdf']]

prop = MPL.font_manager.FontProperties(size='smaller')
P.legend(loc = 'best', prop = prop)

P.hist(dt, 25, normed = 1, fill = False)
P.xlabel('waiting time between consecutive eruption (in min)')

#------------------------------------------
# BIC as a function of number of components
#------------------------------------------
P.subplot(1, 2, 2)
P.plot(N.arange(1, 8), bc, 'o:')
P.xlabel("number of components")
P.ylabel("BIC")
print "According to the BIC, model with %d components is better" % (N.argmax(bc) + 1)

Previous topic

Density estimation with mixture models

Next topic

Ellipse fitting with Gaussian mixtures

This Page