
Source Code for Module mvpa.datasets.miscfx

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Misc functions performing operations on datasets.

All the functions defined in this module must accept a dataset as the
first argument, since they are bound to the Dataset class in the trailer.
"""

__docformat__ = 'restructuredtext'

from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.base.dochelpers import table2string
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
    """Z-score the samples of a `Dataset` (in-place).

    `mean` and `std` can be used to pass custom values to the z-scoring.
    Both may be scalars or arrays.

    All computations are done *in place*. Data upcasting into `targetdtype`
    is done automatically if necessary.

    If `baselinelabels` is provided and `mean` or `std` are not, the
    corresponding measure is computed based only on samples with labels in
    `baselinelabels`.

    If `perchunk` is True, samples within the same chunk are z-scored
    independently of samples from other chunks, i.e. mean and standard
    deviation are calculated per chunk.
    """

    if __debug__ and perchunk \
       and N.array(dataset.samplesperchunk.values()).min() <= 2:
        warning("Z-scoring chunk-wise with a chunk containing less than "
                "three samples will set features in those samples to either "
                "zero (with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk).")

    # cast to floating point datatype if necessary
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal method."""

        if statsamples is None:
            # if nothing provided -- mean/std on all samples
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # calculate mean if necessary
        if mean is None:
            mean = statsamples.mean(**axisarg)

        # de-mean
        samples -= mean

        # calculate std-deviation if necessary
        # XXX YOH: is that actually what we want?
        #     Maybe we actually want an estimate of the deviation from the
        #     mean, which per se might not be statsamples.mean (see above)?
        #     If this logic is changed -- adjust ZScoreMapper as well.
        if std is None:
            std = statsamples.std(**axisarg)

        # do the z-scoring
        if pervoxel:
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = set(dataset.idsbylabels(baselinelabels))

    # for the sake of speed yoh didn't simply create a list
    # [True]*dataset.nsamples to provide easy selection of everything
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                statslicer = list(statids.intersection(set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])

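Illustrative usage (a minimal sketch, not part of the module source; the
dataset below is hypothetical random data). Since `zscore` is bound as a
dataset method via `@datasetmethod`, `ds.zscore(perchunk=True)` would work
as well:

    import numpy as N
    from mvpa.datasets.base import Dataset
    from mvpa.datasets.miscfx import zscore

    ds = Dataset(samples=N.random.randn(20, 5),
                 labels=[0, 1] * 10,
                 chunks=[0] * 10 + [1] * 10)
    zscore(ds, perchunk=True)                    # in-place, chunk-wise
    print N.round(ds.samples.mean(axis=0), 2)    # approximately zero per feature
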

@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in the
    way that NumPy uses it (e.g. `numpy.mean`, `numpy.var`).

    :Returns:
      a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=N.array(agg, ndmin=2).T,
                   labels=dataset.labels,
                   chunks=dataset.chunks)

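Illustrative usage (a sketch reusing the imports and hypothetical dataset
conventions of the `zscore` example above): each sample is collapsed to a
single aggregate feature.

    ds = Dataset(samples=N.random.randn(6, 4),
                 labels=[0, 1] * 3,
                 chunks=[0, 0, 1, 1, 2, 2])
    collapsed = aggregateFeatures(ds, fx=N.mean)
    print collapsed.nfeatures                    # 1 -- the mean over 4 features
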

@datasetmethod
def removeInvariantFeatures(dataset):
    """Returns a new dataset with all invariant features removed.
    """
    return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])

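Illustrative usage (hypothetical data, imports as above): a zero-variance
feature is dropped from the returned dataset.

    samples = N.random.randn(10, 3)
    samples[:, 2] = 5.0                          # make the last feature constant
    ds = Dataset(samples=samples, labels=range(10), chunks=range(10))
    print removeInvariantFeatures(ds).nfeatures  # 2
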

@datasetmethod
def coarsenChunks(source, nchunks=4):
    """Change the chunking of the dataset.

    Group chunks into groups to match the desired number of chunks. Makes
    sense if originally there was no strong grouping into chunks, or if
    each sample was independent and thus belonged to its own chunk.

    :Parameters:
      source : Dataset or list of chunk ids
        dataset or list of chunk ids to operate on. If Dataset, then its
        chunks get modified
      nchunks : int
        desired number of chunks
    """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # figure out the number of samples per chunk
    counts = dict(zip(chunks_unique, [0] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # now we need to group chunks to get a more or less equalized number
    # of samples per chunk. No sophistication is done -- just
    # consecutively group to get close to the desired number of samples
    # per chunk
    avg_chunk_size = N.sum(counts.values()) * 1.0 / nchunks
    chunks_groups = []
    cur_chunk = []
    nchunks = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # time to start a new chunk?
        if (samples_counted + cur_chunk_nsamples
            >= (nchunks+1)*avg_chunk_size) or i == nchunks_orig-1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            nchunks += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently the logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # remap chunks using the groups
    # create dictionary
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, len(chunks_new), source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new

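For illustration, coarsening a plain list of chunk ids (a sketch; when passed
a `Dataset` instead, its chunks attribute is modified in place):

    print coarsenChunks(range(8), nchunks=2)
    # [0, 0, 0, 0, 1, 1, 1, 1]
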

@datasetmethod
def getSamplesPerChunkLabel(dataset):
    """Returns an array with the number of samples per label in each chunk.

    Array shape is (chunks x labels).

    :Parameters:
      dataset : Dataset
        Source dataset.
    """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count

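A small made-up example with two chunks and an uneven label distribution
(imports as above):

    ds = Dataset(samples=N.zeros((6, 2)),
                 labels=[0, 1, 0, 1, 0, 1],
                 chunks=[0, 0, 0, 1, 1, 1])
    print getSamplesPerChunkLabel(ds)
    # [[2 1]
    #  [1 2]]
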

class SequenceStats(dict):
    """Simple helper to provide representation of sequence statistics

    Matlab analog:
    http://cfn.upenn.edu/aguirre/code/matlablib/mseq/mtest.m

    WARNING: Experimental -- API might change without warning!
    Current implementation is ugly!
    """

    def __init__(self, seq, order=2):  #, chunks=None, perchunk=False):
        """Initialize SequenceStats

        :Parameters:
          seq : list or ndarray
            Actual sequence of labels

        :Keywords:
          order : int
            Maximal order of counter-balancing check. For perfect
            counterbalancing all matrices should be identical.
        """
        """
        chunks : None or list or ndarray
          Chunks to use if `perchunk`=True
        perchunk .... TODO
        """
        dict.__init__(self)
        self.order = order
        self._seq = seq
        self.stats = None
        self._str_stats = None
        self.__compute()

    def __repr__(self):
        """Representation of SequenceStats
        """
        return "SequenceStats(%s, order=%d)" % (repr(self._seq), self.order)

    def __str__(self):
        return self._str_stats

    def __compute(self):
        """Compute stats and string representation
        """
        # Do actual computation
        order = self.order
        seq = list(self._seq)               # ensure a list
        nsamples = len(seq)                 # number of samples/labels
        ulabels = sorted(list(set(seq)))    # unique labels
        nlabels = len(ulabels)              # number of labels

        # mapping for labels
        labels_map = dict([(l, i) for i, l in enumerate(ulabels)])

        # map sequence first
        seqm = [labels_map[i] for i in seq]
        nperlabel = N.bincount(seqm)

        res = dict(ulabels=ulabels)
        # Estimate counter-balance
        cbcounts = N.zeros((order, nlabels, nlabels), dtype=int)
        for cb in xrange(order):
            for i, j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
                cbcounts[cb, i, j] += 1
        res['cbcounts'] = cbcounts

        """
        Let's compute relative counter-balancing.
        Ideally, nperlabel[i]/nlabels should precede each label.
        """
        # Autocorrelation
        corr = []
        # for all possible shifts:
        for shift in xrange(1, nsamples):
            shifted = seqm[shift:] + seqm[:shift]
            # ??? use pearsonr with a p-value maybe?
            corr += [N.corrcoef(seqm, shifted)[0, 1]]
        # ??? report high (anti)correlations?
        res['corrcoef'] = corr = N.array(corr)
        res['sumabscorr'] = sumabscorr = N.sum(N.abs(corr))
        self.update(res)

        # Assign textual summary
        # XXX move into a helper function and do on demand
        t = [[""] * (1 + self.order*(nlabels+1)) for i in xrange(nlabels+1)]
        t[0][0] = "Labels/Order"
        for i, l in enumerate(ulabels):
            t[i+1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1+cb*(nlabels+1)] = "O%d" % (cb+1)
            for i in xrange(nlabels+1):
                t[i][(cb+1)*(nlabels+1)] = " | "
            m = cbcounts[cb]
            # ??? there should be a better way to get the indexes
            ind = N.where(~N.isnan(m))
            for i, j in zip(*ind):
                t[1+i][1+cb*(nlabels+1)+j] = '%d' % m[i, j]

        sout = "Original sequence had %d entries from set %s\n" \
               % (len(seq), ulabels) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), N.mean(corr), sumabscorr)
        self._str_stats = sout

    def plot(self):
        """Plot correlation coefficients
        """
        externals.exists('pylab', raiseException=True)
        import pylab as P
        P.plot(self['corrcoef'])
        P.title('Auto-correlation of the sequence')
        P.xlabel('Offset')
        P.ylabel('Correlation Coefficient')
        P.show()
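
Illustrative usage with a made-up label sequence; printing the instance shows
the counter-balance table and autocorrelation summary computed above:

    ss = SequenceStats([0, 1, 0, 1, 1, 0, 0, 1], order=2)
    print ss
    print ss['sumabscorr']                       # sum of absolute autocorrelations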