
Source Code for Module mvpa.clfs.gnb

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Gaussian Naive Bayes Classifier

   EXPERIMENTAL ;)
   Basic implementation of Gaussian Naive Bayes classifier.
"""

__docformat__ = 'restructuredtext'

import numpy as N

from numpy import ones, zeros, sum, abs, isfinite, dot
from mvpa.base import warning, externals
from mvpa.clfs.base import Classifier
from mvpa.misc.param import Parameter
from mvpa.misc.state import StateVariable
#from mvpa.measures.base import Sensitivity
#from mvpa.misc.transformers import SecondAxisMaxOfAbs # XXX ?


if __debug__:
    from mvpa.base import debug


class GNB(Classifier):
    """Gaussian Naive Bayes `Classifier`.

    GNB is a probabilistic classifier relying on Bayes rule to
    estimate posterior probabilities of labels given the data.  Its
    naive assumption is the independence of the features, which allows
    per-feature likelihoods to be combined by a simple product across
    the likelihoods of "independent" features.
    See http://en.wikipedia.org/wiki/Naive_bayes for more information.

    The implementation provided here is "naive" on its own -- various
    aspects could be improved, but it has its own advantages:

    - implementation is simple and straightforward
    - no data copying while considering samples of a specific class
    - provides alternative ways to assess the prior distribution of the
      classes in the case of unbalanced sets of samples (see parameter
      `prior`)
    - makes use of the NumPy broadcasting mechanism, so should be
      relatively efficient
    - should work for any dimensionality of samples

    GNB is listed both as a linear and a non-linear classifier, since
    the specifics of the separating boundary depend on the data and/or
    parameters: linear separation is achieved whenever samples are
    balanced (or prior='uniform') and features have the same variance
    across different classes (i.e. if common_variance=True to enforce
    this).

    Whenever decisions are made based on log-probabilities (parameter
    logprob=True, which is the default), the state variable `values`,
    if enabled, will also contain log-probabilities.  Also note that
    normalization by the evidence (P(data)) is disabled by default
    since it has no impact per se on the classification decision.  Set
    the parameter normalize to True if you want to access properly
    scaled probabilities in the `values` state variable.
    """
    # XXX decide when should we set corresponding internal,
    #     since it depends actually on the data -- no clear way,
    #     so set both linear and non-linear
    _clf_internals = [ 'gnb', 'linear', 'non-linear',
                       'binary', 'multiclass' ]

    common_variance = Parameter(False, allowedtype='bool',
             doc="""Use the same variance across all classes.""")
    prior = Parameter('laplacian_smoothing',
             allowedtype='basestring',
             choices=["laplacian_smoothing", "uniform", "ratio"],
             doc="""How to compute prior distribution.""")

    logprob = Parameter(True, allowedtype='bool',
             doc="""Operate on log probabilities.  Preferable to avoid unneeded
             exponentiation and loss of precision.
             If set, logprobs are stored in `values`.""")
    normalize = Parameter(False, allowedtype='bool',
             doc="""Normalize (log)prob by P(data).  Requires probabilities, so
             in the `logprob` case it entails exponentiation of the 'logprob's;
             disabled by default since it does not impact the classification
             output.
             """)

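    # For reference, the `prior` parameter selects how self.priors is
    # estimated in _train() from the per-class sample counts n_c, the total
    # number of samples n, and the number of classes K:
    #   'uniform'             : P(c) = 1 / K
    #   'laplacian_smoothing' : P(c) = (n_c + 1) / (n + K)
    #   'ratio'               : P(c) = n_c / n
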
    def __init__(self, **kwargs):
        """Initialize a GNB classifier.
        """

        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint friendly initializations
        self.means = None
        """Means of features per class"""
        self.variances = None
        """Variances per class, but "vars" is taken ;)"""
        self.ulabels = None
        """Labels classifier was trained on"""
        self.priors = None
        """Class probabilities"""

        # Define internal state of classifier
        self._norm_weight = None

    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).
        """
        params = self.params

        # get the dataset information into easy vars
        X = dataset.samples
        labels = dataset.labels
        self.ulabels = ulabels = dataset.uniquelabels
        nlabels = len(ulabels)
        #params = self.params       # for quicker access
        label2index = dict((l, il) for il, l in enumerate(ulabels))

        # set the feature dimensions
        nsamples = len(X)
        s_shape = X.shape[1:]       # shape of a single sample

        self.means = means = \
                     N.zeros((nlabels, ) + s_shape)
        self.variances = variances = \
                     N.zeros((nlabels, ) + s_shape)
        # degenerate dimensions are added for easy broadcasting later on
        nsamples_per_class = N.zeros((nlabels,) + (1,)*len(s_shape))

        # Estimate means and number of samples per each label
        for s, l in zip(X, labels):
            il = label2index[l]     # index of the label
            nsamples_per_class[il] += 1
            means[il] += s

        ## Actually compute the means
        non0labels = (nsamples_per_class.squeeze() != 0)
        means[non0labels] /= nsamples_per_class[non0labels]

        # Estimate variances
        # better loop than repmat! ;)
        for s, l in zip(X, labels):
            il = label2index[l]     # index of the label
            variances[il] += (s - means[il])**2

        ## Actually compute the variances
        if params.common_variance:
            # we need to get global std
            cvar = N.sum(variances, axis=0)/nsamples # sum across labels
            # broadcast the same variance across labels
            variances[:] = cvar
        else:
            variances[non0labels] /= nsamples_per_class[non0labels]

        # Store prior probabilities
        prior = params.prior
        if prior == 'uniform':
            self.priors = N.ones((nlabels,))/nlabels
        elif prior == 'laplacian_smoothing':
            self.priors = (1+N.squeeze(nsamples_per_class)) \
                          / (float(nsamples) + nlabels)
        elif prior == 'ratio':
            self.priors = N.squeeze(nsamples_per_class) / float(nsamples)
        else:
            raise ValueError(
                "No idea on how to handle '%s' way to compute priors"
                % params.prior)

        # Precompute and store weighting coefficient for Gaussian
        if params.logprob:
            # it would be added to exponent
            self._norm_weight = -0.5 * N.log(2*N.pi*variances)
        else:
            self._norm_weight = 1.0/N.sqrt(2*N.pi*variances)

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "training finished on data.shape=%s " % (X.shape, )
                  + "min:max(data)=%f:%f" % (N.min(X), N.max(X)))


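    # Note on _norm_weight: with logprob=True it holds the additive
    # normalization term of the per-feature Gaussian log-density,
    #   log N(x_f; mu_cf, var_cf) = -0.5*log(2*pi*var_cf)
    #                               - (x_f - mu_cf)**2 / (2*var_cf),
    # so _predict() only has to add it to the scaled squared distances;
    # with logprob=False it holds 1/sqrt(2*pi*var_cf) and is multiplied in.
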
    def untrain(self):
        """Untrain classifier and reset all learnt params
        """
        self.means = None
        self.variances = None
        self.ulabels = None
        self.priors = None
        super(GNB, self).untrain()


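    # Decision rule applied below: under the naive per-feature independence
    # assumption the class log-posterior is, up to the constant evidence
    # term log P(data),
    #   log P(c|x) ~ log P(c) + sum_f log N(x_f; mu_cf, var_cf)
    # and the predicted label is the argmax over classes.
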
    def _predict(self, data):
        """Predict the output for the provided data.
        """
        params = self.params
        # argument of exponentiation
        scaled_distances = \
            -0.5 * (((data - self.means[:, N.newaxis, ...])**2) \
                    / self.variances[:, N.newaxis, ...])
        if params.logprob:
            # if self.params.common_variance:
            # XXX YOH:
            # For decision there is no need to actually compute
            # properly scaled p, ie 1/sqrt(2pi * sigma_i) could be
            # simply discarded since it is common across features AND
            # classes
            # For completeness -- computing everything now even in logprob
            lprob_csfs = self._norm_weight[:, N.newaxis, ...] + scaled_distances

            # XXX for now just cut/paste with different operators, but
            #     could just bind them and reuse in the same equations
            # Naive part -- just a product of probabilities across features
            ## First we need to reshape to get class x samples x features
            lprob_csf = lprob_csfs.reshape(
                lprob_csfs.shape[:2] + (-1,))
            ## Now -- sum across features
            lprob_cs = lprob_csf.sum(axis=2)

            # Incorporate class probabilities:
            prob_cs_cp = lprob_cs + N.log(self.priors[:, N.newaxis])

        else:
            # Just a regular Normal distribution with per
            # feature/class mean and variances
            prob_csfs = \
                self._norm_weight[:, N.newaxis, ...] * N.exp(scaled_distances)

            # Naive part -- just a product of probabilities across features
            ## First we need to reshape to get class x samples x features
            prob_csf = prob_csfs.reshape(
                prob_csfs.shape[:2] + (-1,))
            ## Now -- product across features
            prob_cs = prob_csf.prod(axis=2)

            # Incorporate class probabilities:
            prob_cs_cp = prob_cs * self.priors[:, N.newaxis]

        # Normalize by evidence P(data)
        if params.normalize:
            if params.logprob:
                prob_cs_cp_real = N.exp(prob_cs_cp)
            else:
                prob_cs_cp_real = prob_cs_cp
            prob_s_cp_marginals = N.sum(prob_cs_cp_real, axis=0)
            if params.logprob:
                prob_cs_cp -= N.log(prob_s_cp_marginals)
            else:
                prob_cs_cp /= prob_s_cp_marginals

        # Take the class with maximal (log)probability
        winners = prob_cs_cp.argmax(axis=0)
        predictions = [self.ulabels[c] for c in winners]


        self.values = prob_cs_cp.T      # set to the probabilities per class

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "predict on data.shape=%s min:max(data)=%f:%f " %
                  (data.shape, N.min(data), N.max(data)))

        return predictions


    # XXX Later come up with some
    #     could be a simple t-test maps using distributions
    #     per each class
    #def getSensitivityAnalyzer(self, **kwargs):
    #    """Returns a sensitivity analyzer for GNB."""
    #    return GNBWeights(self, **kwargs)


    # XXX Is there any reason to use properties?
    #means = property(lambda self: self.__biases)
    #variances = property(lambda self: self.__weights)



## class GNBWeights(Sensitivity):
##     """`SensitivityAnalyzer` that reports the weights GNB trained
##     on a given `Dataset`.

##     """

##     _LEGAL_CLFS = [ GNB ]

##     def _call(self, dataset=None):
##         """Extract weights from GNB classifier.

##         GNB always has weights available, so nothing has to be computed here.
##         """
##         clf = self.clf
##         means = clf.means
##         # XXX we can do something better ;)
##         return means
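
A minimal usage sketch (not part of the module source): it assumes the
PyMVPA 0.4-era `Dataset(samples=..., labels=...)` constructor from
`mvpa.datasets` and the generic `Classifier.train()`/`predict()` interface;
adjust names to the installed version.

import numpy as N
from mvpa.datasets import Dataset
from mvpa.clfs.gnb import GNB

# two Gaussian classes in 5 features (assumed Dataset constructor, see above)
samples = N.vstack((N.random.normal(0, 1, size=(20, 5)),
                    N.random.normal(1, 1, size=(20, 5))))
dataset = Dataset(samples=samples, labels=[0]*20 + [1]*20)

clf = GNB(prior='uniform')          # logprob=True is the default
clf.train(dataset)                  # estimates means, variances and priors
predictions = clf.predict(samples)  # per-class (log)probabilities are left
                                    # in the `values` state variable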