9 """Collection of dataset splitters.
10
11 Module Description
12 ==================
13
14 Splitters are destined to split the provided dataset varous ways to
15 simplify cross-validation analysis, implement boosting of the
16 estimates, or sample null-space via permutation testing.
17
18 Most of the splitters at the moment split 2-ways -- conventionally
19 first part is used for training, and 2nd part for testing by
20 `CrossValidatedTransferError` and `SplitClassifier`.
21
22 Brief Description of Available Splitters
23 ========================================
24
25 * `NoneSplitter` - just return full dataset as the desired part (training/testing)
26 * `OddEvenSplitter` - 2 splits: (odd samples,even samples) and (even, odd)
27 * `HalfSplitter` - 2 splits: (first half, second half) and (second, first)
28 * `NFoldSplitter` - splits for N-Fold cross validation.
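
Example
=======

A minimal usage sketch (assumes ``dataset`` is a `Dataset` with a ``chunks``
sample attribute; the variable names are illustrative only)::

  splitter = NFoldSplitter(cvtype=1)
  for training_ds, testing_ds in splitter(dataset):
      # each iteration yields one (training, testing) pair of datasets
      pass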

Module Organization
===================

.. packagetree::
   :style: UML

"""

__docformat__ = 'restructuredtext'

import operator

import numpy as N

import mvpa.misc.support as support
from mvpa.base.dochelpers import enhancedDocString
from mvpa.datasets.miscfx import coarsenChunks

if __debug__:
    from mvpa.base import debug
52 """Base class of dataset splitters.
53
54 Each splitter should be initialized with all its necessary parameters. The
55 final splitting is done running the splitter object on a certain Dataset
56 via __call__(). This method has to be implemented like a generator, i.e. it
57 has to return every possible split with a yield() call.
58
59 Each split has to be returned as a sequence of Datasets. The properties
60 of the splitted dataset may vary between implementations. It is possible
61 to declare a sequence element as 'None'.
62
63 Please note, that even if there is only one Dataset returned it has to be
64 an element in a sequence and not just the Dataset object!
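
    A minimal subclass sketch (hypothetical, for illustration only; it
    yields a single split that places the first unique attribute value
    into the second dataset)::

      class FirstValueSplitter(Splitter):
          def _getSplitConfig(self, uniqueattrs):
              # one split: everything else vs. the first unique value
              return [(None, [uniqueattrs[0]])]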
65 """

    _STRATEGIES = ('first', 'random', 'equidistant')
    _NPERLABEL_STR = ['equal', 'all']

    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 count=None,
                 strategy='equidistant',
                 discard_boundary=None,
                 attr='chunks',
                 reverse=False):
        """Initialize splitter base.

        :Parameters:
          nperlabel : int or str (or list of them) or float
            Number of dataset samples per label to be included in each
            split. If given as a float, it must be in the [0, 1] range and
            means the ratio of selected samples per each label.
            Two special strings are recognized: 'all' uses all available
            samples (default) and 'equal' uses the maximum number of samples
            that can be provided by all of the classes. This value might be
            provided as a sequence whose length matches the number of
            datasets per split and indicates the configuration for the
            respective dataset in each split.
          nrunspersplit : int
            Number of times samples for each split are chosen. This
            is mostly useful if a subset of the available samples
            is used in each split and the subset is randomly
            selected for each run (see the `nperlabel` argument).
          permute : bool
            If set to `True`, the labels of each generated dataset
            will be permuted on a per-chunk basis.
          count : None or int
            Desired number of splits to be output. It is limited by the
            number of splits possible for a given splitter
            (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
            all splits are output (default).
          strategy : str
            If `count` is not None, the following strategies are possible:
             first
              First `count` splits are chosen
             random
              Random (without replacement) `count` splits are chosen
             equidistant
              Splits which are equidistant from each other
          discard_boundary : None or int or sequence of int
            If not `None`, how many samples on the boundaries between
            parts of the split to discard in the training part.
            If int, then discarded in all parts. If a sequence, the numbers
            to discard are given per part of the split.
            E.g. if a splitter splits only into (training, testing)
            parts, then `discard_boundary`=(2,0) would instruct to discard
            2 samples from training which are on the boundary with testing.
          attr : str
            Sample attribute used to determine splits.
          reverse : bool
            If True, the order of datasets in the split is reversed, e.g.
            instead of (training, testing), (testing, training) will be
            output.
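
        Example (an illustrative sketch; the number of available splits
        depends on the dataset at hand)::

          # keep only 2 of the possible N-fold splits, spread evenly
          splitter = NFoldSplitter(count=2, strategy='equidistant')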
        """

        self.__nperlabel = None
        self.__runspersplit = nrunspersplit
        self.__permute = permute
        self.__splitattr = attr
        self._reverse = reverse
        self.discard_boundary = discard_boundary

        self.count = count
        """Number (max) of splits to output on call"""

        self._setStrategy(strategy)
        self.setNPerLabel(nperlabel)

    __doc__ = enhancedDocString('Splitter', locals())


    def _setStrategy(self, strategy):
        """Set the strategy used to select splits from all available ones."""
        strategy = strategy.lower()
        if not strategy in self._STRATEGIES:
            raise ValueError, "Unknown strategy '%s'. Known ones are %s" \
                  % (strategy, str(self._STRATEGIES))
        self.__strategy = strategy


    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets the sample size to the highest number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        if isinstance(value, basestring):
            if not value in self._NPERLABEL_STR:
                raise ValueError, "Unsupported value '%s' for nperlabel." \
                      " Supported ones are %s or float or int" \
                      % (value, self._NPERLABEL_STR)
        self.__nperlabel = value


    def _getSplitConfig(self, uniqueattrs):
        """Each subclass has to implement this method. It gets a sequence
        with the unique attribute ids of a dataset and has to return a list
        of lists containing attribute ids to split into the second dataset.
        """
        raise NotImplementedError


    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.
        """
        # local bindings of dataset class methods for a bit of speedup
        ds_class = dataset.__class__
        DS_permuteLabels = ds_class.permuteLabels
        try:
            DS_getNSamplesPerLabel = ds_class._getNSamplesPerAttr
        except AttributeError:
            # some dataset classes might not provide it -- the binding is
            # only needed for the 'equal' and ratio modes of nperlabel
            pass
        DS_getRandomSamples = ds_class.getRandomSamples

        # get the list of split configurations from the actual splitter
        cfgs = self.splitcfg(dataset)

        # select only a subset of the splits if requested
        count, Ncfgs = self.count, len(cfgs)
        if count is not None and count < Ncfgs:
            if count < 1:
                # nothing to yield
                return
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # step through the available splits to pick `count`
                    # roughly equidistant ones
                    step = float(Ncfgs) / count
                    assert(step >= 1.0)
                    indexes = [int(round(step * i)) for i in xrange(count)]
                elif strategy == 'random':
                    indexes = N.random.permutation(range(Ncfgs))[:count]
                    # keep the chosen splits in their original order
                    indexes.sort()
                else:
                    # strategy was already validated in _setStrategy
                    raise RuntimeError, "Really should not happen"
                if __debug__:
                    debug("SPL", "For %s strategy selected %s splits "
                          "from %d total" % (strategy, indexes, Ncfgs))
                cfgs = [cfgs[i] for i in indexes]

        # generate the splits
        for split in cfgs:
            # determine the per-dataset nperlabel configuration
            # for this split
            if not operator.isSequenceType(self.__nperlabel) \
                   or isinstance(self.__nperlabel, str):
                nperlabelsplit = [self.__nperlabel] * len(split)
            else:
                nperlabelsplit = self.__nperlabel

            # split the dataset
            split_ds = self.splitDataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in xrange(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for ds, nperlabel in zip(split_ds, nperlabelsplit):
                    # permute the labels
                    if self.__permute:
                        DS_permuteLabels(ds, True, perchunk=True)

                    # select subset of samples if requested
                    if nperlabel == 'all' or ds is None:
                        finalized_datasets.append(ds)
                    else:
                        # determine the number of samples per label to
                        # select from this dataset
                        if nperlabel == 'equal':
                            # go for the maximum number of samples that
                            # every label can provide
                            npl = N.array(DS_getNSamplesPerLabel(
                                ds, attrib='labels').values()).min()
                        elif isinstance(nperlabel, float) or (
                            operator.isSequenceType(nperlabel) and
                            len(nperlabel) > 0 and
                            isinstance(nperlabel[0], float)):
                            # determine the number of samples per label
                            # and take the given ratio of them
                            counts = N.array(DS_getNSamplesPerLabel(
                                ds, attrib='labels').values())
                            npl = (counts * nperlabel).round().astype(int)
                        else:
                            npl = nperlabel

                        # finally, select a random subset of samples
                        finalized_datasets.append(
                            DS_getRandomSamples(ds, npl))

                if self._reverse:
                    yield finalized_datasets[::-1]
                else:
                    yield finalized_datasets


    def splitDataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        :Parameters:
          dataset : Dataset
            The source dataset.
          specs : sequence of sequences
            Contains ids of the sample attribute that shall be split into
            another dataset.
        :Returns: Tuple of split datasets.
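
        Example (an illustrative sketch; assumes the dataset has `chunks`
        values 0-3)::

          # place chunk 0 into the second dataset and the remaining
          # chunks into the first one
          first, second = splitter.splitDataset(dataset, ([1, 2, 3], [0]))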
        """

        # build a sample filter for each spec
        filters = []
        none_specs = 0
        cum_filter = None

        # prepare discard_boundary
        discard_boundary = self.discard_boundary
        if isinstance(discard_boundary, int):
            if discard_boundary != 0:
                discard_boundary = (discard_boundary,) * len(specs)
            else:
                discard_boundary = None

        splitattr_data = getattr(dataset, self.__splitattr)
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([ i in spec \
                                    for i in splitattr_data])
                filters.append(filter_)
                # accumulate the union of all explicitly specified parts,
                # so a `None` spec can be assigned the complement
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    cum_filter = N.logical_or(cum_filter, filter_)

        # need to turn possible Nones into proper id sequences
        if none_specs > 1:
            raise ValueError, "Splitter cannot handle more than one `None` " \
                              "split definition."

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

            # if requested, discard samples on the boundary to the
            # other parts of the split
            if discard_boundary is not None:
                ndiscard = discard_boundary[i]
                if ndiscard != 0:
                    # erode the filter by ndiscard samples on each side,
                    # i.e. discard the samples adjacent to a boundary
                    # with any other part
                    f, lenf = filters[i], len(filters[i])
                    f_pad = N.concatenate(([True]*ndiscard, f,
                                           [True]*ndiscard))
                    for d in xrange(2*ndiscard+1):
                        f = N.logical_and(f, f_pad[d:d+lenf])
                    filters[i] = f[:]


        # split the data; append None if no samples are left
        split_datasets = []

        # local binding for a bit of speedup
        dataset_selectSamples = dataset.selectSamples
        for filter_ in filters:
            if not filter_.any():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset_selectSamples(filter_))

        return split_datasets


    def __str__(self):
        """String summary over the object
        """
        return \
          "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
          % (self.__nperlabel, self.__runspersplit, self.__permute)


    def splitcfg(self, dataset):
        """Return the split configuration for a given dataset"""
        return self._getSplitConfig(getattr(dataset,
                                            'unique' + self.__splitattr))


    strategy = property(fget=lambda self: self.__strategy,
                        fset=_setStrategy)


class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply returns
    the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be 'None'.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **kwargs)

        if not mode in NoneSplitter._known_modes:
            raise ValueError, "Unknown mode %s for NoneSplitter" % mode
        self.__mode = mode


    __doc__ = enhancedDocString('NoneSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return just one split config, with the full dataset placed in
        either the first or the second part.
        """
        if self.__mode == 'second':
            return [([], None)]
        else:
            return [(None, [])]


    def __str__(self):
        """String summary over the object
        """
        return \
          "NoneSplitter / " + Splitter.__str__(self)



class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """
    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues : bool
            If True the values of the attribute used for splitting will be
            used to determine odd and even samples. If False odd and even
            chunks are defined by the order of attribute values, i.e. the
            first unique attribute is odd and the second is even, even
            though the corresponding values might indicate the opposite
            (e.g. in the case of [2, 3]).
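
        Example (illustrative; assumes unique chunk values [2, 3])::

          # with usevalues=True chunk 3 is "odd" and chunk 2 is "even";
          # with usevalues=False chunk 2 (the first) is "odd" instead
          splitter = OddEvenSplitter(usevalues=True)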
        """
        Splitter.__init__(self, **kwargs)

        self.__usevalues = usevalues


    __doc__ = enhancedDocString('OddEvenSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the odd/even split configurations.
        """
        if self.__usevalues:
            return [(None, uniqueattrs[(uniqueattrs % 2) == 1]),
                    (None, uniqueattrs[(uniqueattrs % 2) == 0])]
        else:
            return [(None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == 1]),
                    (None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == 0])]


    def __str__(self):
        """String summary over the object
        """
        return \
          "OddEvenSplitter / " + Splitter.__str__(self)



class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """
    def __init__(self, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)

    __doc__ = enhancedDocString('HalfSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the first-half/second-half split configurations.
        """
        return [(None, uniqueattrs[:len(uniqueattrs)/2]),
                (None, uniqueattrs[len(uniqueattrs)/2:])]


    def __str__(self):
        """String summary over the object
        """
        return \
          "HalfSplitter / " + Splitter.__str__(self)



class NGroupSplitter(Splitter):
    """Split a dataset into N groups of the sample attribute.

    For example, NGroupSplitter(2) is the same as the HalfSplitter and
    yields two splits: first (1st half, 2nd half) and second (2nd half,
    1st half).
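
    A minimal usage sketch (illustrative)::

      # split the chunks into 3 groups, yielding 3 (rest, group) splits
      splitter = NGroupSplitter(ngroups=3)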
    """
    def __init__(self, ngroups=4, **kwargs):
        """Initialize the N-group splitter.

        :Parameters:
          ngroups : int
            Number of groups to split the attribute into.
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        self.__ngroups = ngroups

    __doc__ = enhancedDocString('NGroupSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the N-group split configurations.
        """
        # make sure there are at least as many attributes as desired groups
        if len(uniqueattrs) < self.__ngroups:
            raise ValueError, "Number of groups (%d) " % (self.__ngroups) + \
                  "must be less than or equal to the number of " + \
                  "unique attributes (%d)" % (len(uniqueattrs))

        # use coarsenChunks to get the group assignment of each attribute
        split_ind = coarsenChunks(uniqueattrs, nchunks=self.__ngroups)
        split_ind = N.asarray(split_ind)

        # place each group into the second part of one split
        split_list = [(None, uniqueattrs[split_ind == i])
                      for i in range(self.__ngroups)]
        return split_list


    def __str__(self):
        """String summary over the object
        """
        return \
          "N-%d-GroupSplitter / " % self.__ngroups + Splitter.__str__(self)



class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Provides N-fold splitting. Given a dataset with N chunks and cvtype=1
    (the default), it generates N splits, where each chunk in turn is taken
    out for testing. For example, with 4 chunks the splits for cvtype=1 are:

      [[1, 2, 3], [0]]
      [[0, 2, 3], [1]]
      [[0, 1, 3], [2]]
      [[0, 1, 2], [3]]

    If cvtype > 1, then all possible combinations of cvtype chunks are
    taken out for testing, so for cvtype=2 in the previous example:

      [[2, 3], [0, 1]]
      [[1, 3], [0, 2]]
      [[1, 2], [0, 3]]
      [[0, 3], [1, 2]]
      [[0, 2], [1, 3]]
      [[0, 1], [2, 3]]
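
    A minimal usage sketch (illustrative only)::

      # leave-two-chunks-out, limited to the first 10 splits
      splitter = NFoldSplitter(cvtype=2, count=10, strategy='first')
      for training_ds, testing_ds in splitter(dataset):
          pass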
    """

    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameters:
          cvtype : int
            Type of cross-validation: N-(cvtype), i.e. the number of
            chunks taken out for testing in each split.
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


    def __str__(self):
        """String summary over the object
        """
        return \
          "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Return the proper split configurations for an N-M fold split.
        """
        return [(None, i) for i in \
                support.getUniqueLengthNCombinations(uniqueattrs,
                                                     self.__cvtype)]



class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence of
    split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is in turn a
    sequence of sample-id sequences, one for each dataset that shall be
    generated in the split.

    Example:

    * Generate two splits. In the first split the *second* dataset
      contains all samples with sample attributes corresponding to
      either 0, 1 or 2. The *first* dataset of the first split contains
      all samples which are not split into the second dataset.

      The second split yields three datasets. The first with all samples
      corresponding to sample attributes 1 and 2, the second dataset
      contains only samples with attribute 3 and the last dataset
      contains the samples with attributes 5 and 6.

      CustomSplitter([(None, [0, 1, 2]), ([1, 2], [3], [5, 6])])
    """
    def __init__(self, splitrule, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)

        self.__splitrule = splitrule


    __doc__ = enhancedDocString('CustomSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the custom split configurations.
        """
        return self.__splitrule


    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)