9 """Gaussian Naive Bayes Classifier
10
11 EXPERIMENTAL ;)
12 Basic implementation of Gaussian Naive Bayes classifier.
13 """
14
15 __docformat__ = 'restructuredtext'
16
17 import numpy as N
18
19 from numpy import ones, zeros, sum, abs, isfinite, dot
20 from mvpa.base import warning, externals
21 from mvpa.clfs.base import Classifier
22 from mvpa.misc.param import Parameter
23 from mvpa.misc.state import StateVariable
24
25
26
27
28 if __debug__:
29 from mvpa.base import debug
30
31
class GNB(Classifier):
33 """Gaussian Naive Bayes `Classifier`.
34
35 GNB is a probabilistic classifier relying on Bayes rule to
36 estimate posterior probabilities of labels given the data. Naive
37 assumption in it is an independence of the features, which allows
38 to combine per-feature likelihoods by a simple product across
39 likelihoods of"independent" features.
40 See http://en.wikipedia.org/wiki/Naive_bayes for more information.
41
42 Provided here implementation is "naive" on its own -- various
43 aspects could be improved, but has its own advantages:
44
45 - implementation is simple and straightforward
46 - no data copying while considering samples of specific class
47 - provides alternative ways to assess prior distribution of the
48 classes in the case of unbalanced sets of samples (see parameter
49 `prior`)
50 - makes use of NumPy broadcasting mechanism, so should be
51 relatively efficient
52 - should work for any dimensionality of samples
53
54 GNB is listed both as linear and non-linear classifier, since
55 specifics of separating boundary depends on the data and/or
56 parameters: linear separation is achieved whenever samples are
57 balanced (or prior='uniform') and features have the same variance
58 across different classes (i.e. if common_variance=True to enforce
59 this).
60
61 Whenever decisions are made based on log-probabilities (parameter
62 logprob=True, which is the default), then state variable `values`
63 if enabled would also contain log-probabilities. Also mention
64 that normalization by the evidence (P(data)) is disabled by
65 default since it has no impact per se on classification decision.
66 You might like set parameter normalize to True if you want to
67 access properly scaled probabilities in `values` state variable.
68 """

    _clf_internals = [ 'gnb', 'linear', 'non-linear',
                       'binary', 'multiclass' ]

    common_variance = Parameter(False, allowedtype='bool',
             doc="""Use the same variance across all classes.""")

    prior = Parameter('laplacian_smoothing',
             allowedtype='basestring',
             choices=["laplacian_smoothing", "uniform", "ratio"],
             doc="""How to compute prior distribution.""")

    logprob = Parameter(True, allowedtype='bool',
             doc="""Operate on log probabilities.  Preferable, to avoid
             unneeded exponentiation and loss of precision.
             If set, log-probabilities are stored in `values`.""")

    normalize = Parameter(False, allowedtype='bool',
             doc="""Normalize (log)prob by P(data).  Requires probabilities,
             so in the `logprob` case it would require exponentiation of the
             log-probabilities; hence it is disabled by default, since it does
             not impact the classification output.""")

93 """Initialize an GNB classifier.
94 """
95
96
97 Classifier.__init__(self, **kwargs)
98
99
100 self.means = None
101 """Means of features per class"""
102 self.variances = None
103 """Variances per class, but "vars" is taken ;)"""
104 self.ulabels = None
105 """Labels classifier was trained on"""
106 self.priors = None
107 """Class probabilities"""
108
109
110 self._norm_weight = None
111
113 """Train the classifier using `dataset` (`Dataset`).
114 """
115 params = self.params
116
117
118 X = dataset.samples
119 labels = dataset.labels
120 self.ulabels = ulabels = dataset.uniquelabels
121 nlabels = len(ulabels)
122
123 label2index = dict((l, il) for il, l in enumerate(ulabels))
124
125
126 nsamples = len(X)
127 s_shape = X.shape[1:]
128
129 self.means = means = \
130 N.zeros((nlabels, ) + s_shape)
131 self.variances = variances = \
132 N.zeros((nlabels, ) + s_shape)
133
134 nsamples_per_class = N.zeros((nlabels,) + (1,)*len(s_shape))
135

        # accumulate per-class sums and sample counts to obtain the means
        for s, l in zip(X, labels):
            il = label2index[l]
            nsamples_per_class[il] += 1
            means[il] += s

        # divide only for classes which actually received any samples
        non0labels = (nsamples_per_class.squeeze() != 0)
        means[non0labels] /= nsamples_per_class[non0labels]

        # second pass: accumulate squared deviations from the class means
        for s, l in zip(X, labels):
            il = label2index[l]
            variances[il] += (s - means[il])**2

        if params.common_variance:
            # pooled variance: total squared deviations over all samples
            cvar = N.sum(variances, axis=0)/nsamples
            # assign the same variance to every class
            variances[:] = cvar
        else:
            variances[non0labels] /= nsamples_per_class[non0labels]

        # estimate the prior probabilities of the classes
        prior = params.prior
        if prior == 'uniform':
            self.priors = N.ones((nlabels,))/nlabels
        elif prior == 'laplacian_smoothing':
            self.priors = (1+N.squeeze(nsamples_per_class)) \
                          / (float(nsamples) + nlabels)
        elif prior == 'ratio':
            self.priors = N.squeeze(nsamples_per_class) / float(nsamples)
        else:
            raise ValueError(
                "No idea on how to handle '%s' way to compute priors"
                % params.prior)

        # precompute the normalization term of the Gaussian density
        if params.logprob:
            # log of 1/sqrt(2*pi*var)
            self._norm_weight = -0.5 * N.log(2*N.pi*variances)
        else:
            self._norm_weight = 1.0/N.sqrt(2*N.pi*variances)

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "training finished on data.shape=%s " % (X.shape, )
                  + "min:max(data)=%f:%f" % (N.min(X), N.max(X)))


    def untrain(self):
        """Untrain classifier and reset all learnt params
        """
        self.means = None
        self.variances = None
        self.ulabels = None
        self.priors = None
        super(GNB, self).untrain()

    def _predict(self, data):
        """Predict the output for the provided data.
        """
        params = self.params
        # squared distance term of the Gaussian density,
        # -0.5 * (x - mean)**2 / var, broadcast across classes and samples
        scaled_distances = \
             -0.5 * (((data - self.means[:, N.newaxis, ...])**2) \
                     / self.variances[:, N.newaxis, ...])
        if params.logprob:
            # operate in log-probabilities: add the log of the normalization
            # term to get per-class per-sample per-feature log-likelihoods
            lprob_csfs = self._norm_weight[:, N.newaxis, ...] + scaled_distances

            # flatten all feature dimensions so we can sum across features
            # regardless of the dimensionality of the samples
            lprob_csf = lprob_csfs.reshape(
                lprob_csfs.shape[:2] + (-1,))
            # "naive" part: sum of log-likelihoods across features
            lprob_cs = lprob_csf.sum(axis=2)

            # add the log prior to get the (unnormalized) log posterior
            prob_cs_cp = lprob_cs + N.log(self.priors[:, N.newaxis])

        else:
            # operate on plain probabilities: per-feature Gaussian likelihoods
            prob_csfs = \
                 self._norm_weight[:, N.newaxis, ...] * N.exp(scaled_distances)

            # flatten all feature dimensions
            prob_csf = prob_csfs.reshape(
                prob_csfs.shape[:2] + (-1,))
            # "naive" part: product of likelihoods across features
            prob_cs = prob_csf.prod(axis=2)

            # weight by the prior to get the (unnormalized) posterior
            prob_cs_cp = prob_cs * self.priors[:, N.newaxis]


        # optionally normalize by the evidence P(data)
        if params.normalize:
            if params.logprob:
                prob_cs_cp_real = N.exp(prob_cs_cp)
            else:
                prob_cs_cp_real = prob_cs_cp
            prob_s_cp_marginals = N.sum(prob_cs_cp_real, axis=0)
            if params.logprob:
                prob_cs_cp -= N.log(prob_s_cp_marginals)
            else:
                prob_cs_cp /= prob_s_cp_marginals

        # take the class with the highest (log)posterior for each sample
        winners = prob_cs_cp.argmax(axis=0)
        predictions = [self.ulabels[c] for c in winners]

        # store per-class (log)probabilities in the `values` state variable
        self.values = prob_cs_cp.T

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "predict on data.shape=%s min:max(data)=%f:%f " %
                  (data.shape, N.min(data), N.max(data)))

        return predictions
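

# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It exercises _train/_predict directly, since those rely only on an object
# exposing `.samples`, `.labels` and `.uniquelabels`; in a real analysis one
# would pass a proper PyMVPA dataset to the public train()/predict() interface
# inherited from `Classifier`.  `_DemoDataset` is a hypothetical stand-in that
# exists only for this sketch.
if __name__ == '__main__':
    class _DemoDataset(object):
        """Bare-bones container with just the attributes _train relies upon."""
        def __init__(self, samples, labels):
            self.samples = N.asarray(samples)
            self.labels = N.asarray(labels)
            self.uniquelabels = N.unique(self.labels)

    # two well-separated 2D Gaussian blobs, 20 samples per class
    N.random.seed(0)
    samples = N.vstack((N.random.randn(20, 2) - 2,
                        N.random.randn(20, 2) + 2))
    labels = [0] * 20 + [1] * 20

    clf = GNB()
    clf._train(_DemoDataset(samples, labels))
    # one query point near each class mean -- should yield [0, 1]
    print clf._predict(N.array([[-2., -2.], [2., 2.]]))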