1
2
3
4
5
6
7
8
9 """Unit tests for PyMVPA dataset handling"""
10
11 import unittest
12 import random
13 import numpy as N
14 from mvpa.datasets import Dataset
15 from mvpa.datasets.miscfx import zscore, aggregateFeatures
16 from mvpa.mappers.mask import MaskMapper
17 from mvpa.misc.exceptions import DatasetError
18 from mvpa.support import copy
19
20 from tests_warehouse import datasets
21
23
25 """Test composition of new datasets by addition of existing ones
26 """
27 data = Dataset(samples=range(5), labels=1, chunks=1)
28
29 self.failUnlessEqual(
30 data.uniquelabels, [1],
31 msg="uniquelabels must be correctly recomputed")
32
33
34 self.failUnlessEqual( data.nsamples, 1)
35
36 self.failUnless(
37 (data.samples == N.array([[0, 1, 2, 3, 4]])).all() )
38
39
40 self.failUnless( (data.labels == N.array([1])).all() )
41 self.failUnless( (data.chunks == N.array([1])).all() )
42
43
44 self.failUnlessRaises( DatasetError,
45 data.__iadd__, Dataset(samples=N.ones((2,3)),
46 labels=1,
47 chunks=1))
48
49
50 dss = datasets['uni2large'].samples
51 data += Dataset(samples=dss[:2, :5], labels=2, chunks=2 )
52 self.failUnlessEqual( data.nfeatures, 5 )
53 self.failUnless((data.labels == N.array([1, 2, 2])).all() )
54 self.failUnless((data.chunks == N.array([1, 2, 2])).all() )
55
56
57 data += Dataset(samples=dss[3:5, :5], labels=3)
58 self.failUnless((data.chunks == N.array([1, 2, 2, 0, 1]) ).all())
59
60
61 self.failUnless((data.uniquelabels == N.array([1, 2, 3]) ).all())
62
63
64 self.failUnlessRaises(DatasetError,
65 Dataset,
66 samples=dss[:4, :5],
67 labels=[ 1, 2, 3 ],
68 chunks=2)
69
70
71 self.failUnlessRaises(DatasetError,
72 Dataset,
73 samples=dss[:4, :5],
74 labels=[ 1, 2, 3, 4 ],
75 chunks=[ 2, 2, 2 ])
76
77
79 """Testing feature selection: sorted/not sorted, feature groups
80 """
81 origdata = datasets['uni2large'].samples[:10, :20]
82 data = Dataset(samples=origdata, labels=2, chunks=2 )
83
84
85 data.defineFeatureGroups(N.repeat(range(4), 5))
86
87 unmasked = data.samples.copy()
88
89
90 self.failUnless( data.nfeatures == 20 )
91
92 features_to_select = [3, 0, 17]
93 features_to_select_copy = copy.deepcopy(features_to_select)
94 features_to_select_sorted = copy.deepcopy(features_to_select)
95 features_to_select_sorted.sort()
96
97 bsel = N.array([False]*20)
98 bsel[ features_to_select ] = True
99
100 for sel, issorted in \
101 [(data.selectFeatures( features_to_select, sort=False), False),
102 (data.selectFeatures( features_to_select, sort=True), True),
103 (data.select(slice(None), features_to_select), True),
104 (data.select(slice(None), N.array(features_to_select)), True),
105 (data.select(slice(None), bsel), True)
106 ]:
107 self.failUnless(sel.nfeatures == 3)
108
109
110 self.failUnless(sel.samples.shape == (10, 3))
111
112
113 fts = (features_to_select, features_to_select_sorted)[int(issorted)]
114 self.failUnless((unmasked[:, fts] == sel.samples).all())
115
116
117 self.failUnless((sel._dsattr['featuregroups'] == [0, 0, 3]).all())
118
119
120 self.failUnless(features_to_select==features_to_select_copy)
121
122
123 gsel = data.selectFeatures(groups=[2, 3])
124 self.failUnless(gsel.nfeatures == 10)
125 self.failUnless(set(gsel._dsattr['featuregroups']) == set([2, 3]))
126
127
129 origdata = datasets['uni2large'].samples[:100, :10].T
130 data = Dataset(samples=origdata, labels=2, chunks=2 )
131
132 self.failUnless( data.nsamples == 10 )
133
134
135 for sel in [ data.selectSamples(5),
136 data.select(5),
137 data.select(slice(5, 6)),
138 ]:
139 self.failUnless( sel.nsamples == 1 )
140 self.failUnless( data.nfeatures == 100 )
141 self.failUnless( sel.origids == [5] )
142
143
144 for sel in [ data.selectSamples([5, 5]),
145
146
147
148
149
150 ]:
151 self.failUnless( sel.nsamples == 2 )
152 self.failUnless( (sel.samples[0] == data.samples[5]).all() )
153 self.failUnless( (sel.samples[0] == sel.samples[1]).all() )
154 self.failUnless( len(sel.labels) == 2 )
155 self.failUnless( len(sel.chunks) == 2 )
156 self.failUnless((sel.origids == [5, 5]).all())
157
158 self.failUnless( sel.samples.shape == (2, 100) )
159
160
161 for sel in [ data.selectSamples(data.idsbylabels(2)),
162 data.select(labels=2),
163 data.select('labels', 2),
164 data.select('labels', [2]),
165 data['labels', [2]],
166 data['labels': [2], 'labels':2],
167 data['labels': [2]],
168 ]:
169 self.failUnless( sel.nsamples == data.nsamples )
170 self.failUnless( N.all(sel.samples == data.samples) )
171
172 for sel in [ data.selectSamples(data.idsbylabels(3)),
173 data.select(labels=3),
174 data.select('labels', 3),
175 data.select('labels', [3]),
176 ]:
177 self.failUnless( sel.nsamples == 0 )
178
179 data = Dataset(samples=origdata,
180 labels=[8, 9, 4, 3, 3, 3, 4, 2, 8, 9],
181 chunks=2)
182 for sel in [ data.selectSamples(data.idsbylabels([2, 3])),
183 data.select('labels', [2, 3]),
184 data.select('labels', [2, 3], labels=[1, 2, 3, 4]),
185 data.select('labels', [2, 3], chunks=[1, 2, 3, 4]),
186 data['labels':[2, 3], 'chunks':[1, 2, 3, 4]],
187 data['chunks':[1, 2, 3, 4], 'labels':[2, 3]],
188 ]:
189 self.failUnless(N.all(sel.origids == [ 3., 4., 5., 7.]))
190
191
192 self.failUnless( (data.uniquelabels == [2, 3, 4, 8, 9]).all() );
193
194
195
196 sel = data.selectSamples(data.idsbylabels([3, 4, 8, 9]))
197 self.failUnlessEqual(set(sel.uniquelabels), set([3, 4, 8, 9]))
198 self.failUnless((sel.origids == [0, 1, 2, 3, 4, 5, 6, 8, 9]).all())
199
200
202 """Test some obscure selections of samples via select() or __getitem__
203 """
204 origdata = datasets['uni2large'].samples[:100, :10].T
205 data = Dataset(samples=origdata,
206
207 labels=[8, 9, 4, 3, 3, 3, 3, 2, 8, 9],
208 chunks=[1, 2, 3, 2, 3, 1, 5, 6, 3, 6])
209
210
211 if __debug__:
212
213 self.failUnlessRaises(ValueError, data.__getitem__,
214 'labels', 'featu')
215
216
217 self.failUnlessRaises(ValueError, data.__getitem__, 1, 1, 1)
218
219
220 for sel in [ data.select('chunks', [2, 6], labels=[3, 2],
221 features=slice(None)),
222 data.select('all', 'all', labels=[2,3], chunks=[2, 6]),
223 data['chunks', [2, 6], 'labels', [3, 2]],
224 data[:, :, 'chunks', [2, 6], 'labels', [3, 2]],
225
226 data[3:8, 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
227 ]:
228 self.failUnless(N.all(sel.origids == [3, 7]))
229 self.failUnless(sel.nfeatures == 100)
230 self.failUnless(N.all(sel.samples == origdata[ [3, 7] ]))
231
232 target = origdata[ [3, 7] ]
233 target = target[:, [1, 3] ]
234
235 for sel in [ data.select('all', [1, 3],
236 'chunks', [2, 6], labels=[3, 2]),
237 data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
238 data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
239
240 data[3:8, [1, 1, 3, 1],
241 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
242 ]:
243 self.failUnless(N.all(sel.origids == [3, 7]))
244 self.failUnless(sel.nfeatures == 2)
245 self.failUnless(N.all(sel.samples == target))
246
247
248 self.failUnless(data.select(chunks=[23]).nsamples == 0)
249
250
251 self.failUnless(N.all(data.where(chunks=[2, 6])==[1, 3, 7, 9]))
252 self.failUnless(N.all(data.where(chunks=[2, 6], labels=[22, 3])==[3]))
253
254 idx = data.where('all', [1, 3, 10], labels=[2, 3, 4])
255 self.failUnless(N.all(idx[1] == [1, 3, 10]))
256 self.failUnless(N.all(idx[0] == range(2, 8)))
257
258 self.failUnless(data.where() is None)
259
260 self.failUnless(data.where(labels=[123]) == [])
261
262
276
277
279 data1 = Dataset(samples=N.ones((5, 5)), labels=1, chunks=1 )
280 data2 = Dataset(samples=N.ones((3, 5)), labels=2, chunks=1 )
281
282 merged = data1 + data2
283
284 self.failUnless( merged.nfeatures == 5 )
285 l12 = [1]*5 + [2]*3
286 l1 = [1]*8
287 self.failUnless( (merged.labels == l12).all() )
288 self.failUnless( (merged.chunks == l1).all() )
289
290 data1 += data2
291
292 self.failUnless( data1.nfeatures == 5 )
293 self.failUnless( (data1.labels == l12).all() )
294 self.failUnless( (data1.chunks == l1).all() )
295
296
298 """
299 """
300 data = Dataset(samples=N.ones((5, 1)), labels=range(5), chunks=1 )
301 data += Dataset(samples=N.ones((5, 1))+1, labels=range(5), chunks=2 )
302 data += Dataset(samples=N.ones((5, 1))+2, labels=range(5), chunks=3 )
303 data += Dataset(samples=N.ones((5, 1))+3, labels=range(5), chunks=4 )
304 data += Dataset(samples=N.ones((5, 1))+4, labels=range(5), chunks=5 )
305 self.failUnless( data.samplesperlabel == {0:5, 1:5, 2:5, 3:5, 4:5} )
306
307
308 sample = data.getRandomSamples( 2 )
309 self.failUnless( sample.samplesperlabel.values() == [ 2, 2, 2, 2, 2 ] )
310
311 self.failUnless( (data.uniquechunks == range(1, 6)).all() )
312
313
314 origlabels = data.labels.copy()
315
316 data.permuteLabels(True)
317
318 self.failIf( (data.labels == origlabels).all() )
319
320 data.permuteLabels(False)
321
322 self.failUnless( (data.labels == origlabels).all() )
323
324
325 data2 = Dataset(samples=data.samples,
326 labels=data.labels,
327 chunks=data.chunks )
328
329
330 self.failUnless( (data2.labels == origlabels).all() )
331
332
333 data2.permuteLabels( True )
334
335
336 self.failUnless( (data.labels == origlabels).all() )
337
338 self.failIf( (data2.labels == origlabels).all() )
339
340
342 """Test adding custom attributes to a dataset
343 """
344
345
346
347 ds = Dataset(samples=range(5), labels=1, chunks=1)
348 self.failUnlessRaises(AttributeError, lambda x:x.blobs, ds)
349 """Dataset.blobs should fail since .blobs wasn't yet registered"""
350
351
352 Dataset._registerAttribute("blobs", "_data", hasunique=True)
353 ds = Dataset(samples=range(5), labels=1, chunks=1)
354 self.failUnless(not ds.blobs != [ 0 ],
355 msg="By default new attributes supposed to get 0 as the value")
356
357 try:
358 ds.blobs = [1, 2]
359 self.fail(msg="Dataset.blobs=[1,2] should fail since "
360 "there is 5 samples")
361 except ValueError, e:
362 pass
363
364 try:
365 ds.blobs = [1]
366 except e:
367 self.fail(msg="We must be able to assign the attribute")
368
369
370
371
372
373
385
386
388 """Test z-scoring transformation
389 """
390
391 samples = N.array( (0,1,3,4,2,2,3,1,1,3,3,1,2,2,2,2) ).\
392 reshape((16, 1))
393 data = Dataset(samples=samples,
394 labels=range(16), chunks=[0]*16)
395 self.failUnlessEqual( data.samples.mean(), 2.0 )
396 self.failUnlessEqual( data.samples.std(), 1.0 )
397 zscore(data, perchunk=True)
398
399
400 check = N.array([-2,-1,1,2,0,0,1,-1,-1,1,1,-1,0,0,0,0],
401 dtype='float64').reshape(16,1)
402 self.failUnless( (data.samples == check).all() )
403
404 data = Dataset(samples=samples,
405 labels=range(16), chunks=[0]*16)
406 zscore(data, perchunk=False)
407 self.failUnless( (data.samples == check).all() )
408
409
410 data = Dataset(samples=samples,
411 labels=[0, 2, 2, 2, 1] + [2]*11,
412 chunks=[0]*16)
413 zscore(data, baselinelabels=[0, 1])
414 self.failUnless((samples == data.samples+1.0).all())
415
416
427
428
430 """Test creation of new dataset by applying a mapper"""
431 mapper = MaskMapper(N.array([1, 0, 1]))
432 dataset = Dataset(samples=N.arange(12).reshape( (4, 3) ),
433 labels=1,
434 chunks=1)
435 seldataset = dataset.applyMapper(featuresmapper=mapper)
436 self.failUnless( (dataset.selectFeatures([0, 2]).samples
437 == seldataset.samples).all() )
438
439
440
441 if __debug__:
442
443 self.failUnlessRaises(ValueError, mapper.reverse, [10, 20, 30])
444 self.failUnlessRaises(ValueError, mapper.forward, [10, 20])
445
446
447
448
449
450
451
452
454 """Test Dataset.idhash() if it gets changed if any of the
455 labels/chunks changes
456 """
457
458 dataset = Dataset(samples=N.arange(12).reshape( (4, 3) ),
459 labels=1,
460 chunks=1)
461 origid = dataset.idhash
462 dataset.labels = [3, 1, 2, 3]
463 self.failUnless(origid != dataset.idhash,
464 msg="Changing all labels should alter dataset's idhash")
465
466 origid = dataset.idhash
467
468 z = dataset.labels[1]
469 self.failUnlessEqual(origid, dataset.idhash,
470 msg="Accessing shouldn't change idhash")
471 z = dataset.chunks
472 self.failUnlessEqual(origid, dataset.idhash,
473 msg="Accessing shouldn't change idhash")
474 z[2] = 333
475 self.failUnless(origid != dataset.idhash,
476 msg="Changing value in attribute should change idhash")
477
478 origid = dataset.idhash
479 dataset.samples[1, 1] = 1000
480 self.failUnless(origid != dataset.idhash,
481 msg="Changing value in data should change idhash")
482
483
484 origid = dataset.idhash
485 dataset.permuteLabels(True)
486 self.failUnless(origid != dataset.idhash,
487 msg="Permutation also changes idhash")
488
489 dataset.permuteLabels(False)
490 self.failUnless(origid == dataset.idhash,
491 msg="idhash should be restored after "
492 "permuteLabels(False)")
493
494
511
512
519
520
522 """Test mapping of the labels from strings to numericals
523 """
524 od = {'apple':0, 'orange':1}
525 samples = [[3], [2], [3]]
526 labels_l = ['apple', 'orange', 'apple']
527
528
529 ds = Dataset(samples=samples, labels='orange')
530 self.failUnless(N.all(ds.labels == ['orange']*3))
531
532
533 for ds in [Dataset(samples=samples, labels=labels_l, labels_map=od),
534
535 Dataset(samples=samples, labels=labels_l, labels_map=True)]:
536 self.failUnless(N.all(ds.labels == [0, 1, 0]))
537 self.failUnless(ds.labels_map == od)
538 ds_ = ds[1]
539 self.failUnless(ds_.labels_map == od,
540 msg='selectSamples should provide full mapping preserved')
541
542
543 self.failUnlessRaises(ValueError, Dataset, samples=samples,
544 labels=labels_l, labels_map = {'apple':0})
545
546
547
548 ds2 = Dataset(samples=samples, labels=labels_l)
549 self.failUnlessEqual(ds2.labels_map, None)
550
551
552 od3 = {1:100, 2:101, 3:100}
553 ds3 = Dataset(samples=samples, labels=[1, 2, 3],
554 labels_map=od3)
555 self.failUnlessEqual(ds3.labels_map, od3)
556 self.failUnless(N.all(ds3.labels == [100, 101, 100]))
557
558 ds3_ = ds3[1]
559 self.failUnlessEqual(ds3.labels_map, od3)
560
561 ds4 = Dataset(samples=samples, labels=labels_l)
562
563
564 ds = Dataset(samples=samples, labels=labels_l, labels_map=od)
565
566 self.failUnlessRaises(ValueError, ds.setLabelsMap,
567 {'orange': 1, 'nonorange': 3})
568 new_map = {'tasty':0, 'crappy':1}
569 ds.labels_map = new_map.copy()
570 self.failUnlessEqual(ds.labels_map, new_map)
571
572
574 """Adding datasets needs special care whenever labels mapping
575 is used."""
576 samples = [[3], [2], [3]]
577 l1 = ['a', 'b', 'a']
578 l2 = ['b', 'a', 'c']
579 ds1 = Dataset(samples=samples, labels=l1,
580 labels_map={'a':1, 'b':2})
581 ds2 = Dataset(samples=samples, labels=l2,
582 labels_map={'c':1, 'a':4, 'b':2})
583
584
585 ds0 = Dataset(samples=samples, labels=l2)
586
587
588 lm1 = ds1.labels_map.copy()
589 lm2 = ds2.labels_map.copy()
590
591 ds3 = ds1 + ds2
592 self.failUnless(N.all(ds3.labels ==
593 N.hstack((ds1.labels, [2, 1, 5]))))
594 self.failUnless(ds1.labels_map == lm1)
595 self.failUnless(ds2.labels_map == lm2)
596
597
598 ds1 += ds2
599 self.failUnless(N.all(ds1.labels == ds3.labels))
600
601
602 self.failUnless(N.all(ds1.labels_map == ds3.labels_map))
603
604
605
606 self.failUnlessRaises(ValueError, ds1.__add__, ds0)
607 self.failUnlessRaises(ValueError, ds1.__iadd__, ds0)
608
609
611
612 ds = datasets['uni2small']
613
614 ds_ = ds.copy()
615
616 self.failUnless(N.all(ds.samples == ds_.samples))
617 self.failUnless(N.all(ds.labels == ds_.labels))
618 self.failUnless(N.all(ds.chunks == ds_.chunks))
619
620
621 ds_.samples[0, 0] = 1234
622 self.failUnless(N.any(ds.samples != ds_.samples))
623 self.failUnless(N.all(ds.labels == ds_.labels))
624 self.failUnless(N.all(ds.chunks == ds_.chunks))
625
626 ds_.labels = N.hstack(([123], ds_.labels[1:]))
627 self.failUnless(N.any(ds.samples != ds_.samples))
628 self.failUnless(N.any(ds.labels != ds_.labels))
629 self.failUnless(N.all(ds.chunks == ds_.chunks))
630
631 ds_.chunks = N.hstack(([1234], ds_.chunks[1:]))
632 self.failUnless(N.any(ds.samples != ds_.samples))
633 self.failUnless(N.any(ds.labels != ds_.labels))
634 self.failUnless(N.any(ds.chunks != ds_.chunks))
635
636 self.failUnless(N.any(ds.uniquelabels != ds_.uniquelabels))
637 self.failUnless(N.any(ds.uniquechunks != ds_.uniquechunks))
638
639
641 """Test detection of transition points
642
643 Shame on Yarik -- he didn't create unittests right away... damn me
644 """
645 ds = Dataset(samples=N.array(range(10), ndmin=2).T,
646 labels=[0,0,1,1,0,0,1,1,0,0],
647 chunks=[0,0,0,0,0,1,1,1,1,1])
648 self.failUnless(ds.idsonboundaries() == [0,2,4,5,6,8],
649 "We should have got ids whenever either chunk or "
650 "label changes")
651 self.failUnless(ds.idsonboundaries(attributes_to_track=['chunks'])
652 == [0, 5])
653
654 self.failUnless(ds.idsonboundaries(prior=1, post=-1,
655 attributes_to_track=['chunks'])
656 == [4, 9])
657 self.failUnless(ds.idsonboundaries(prior=2, post=-1,
658 attributes_to_track=['chunks'])
659 == [3, 4, 8, 9])
660 self.failUnless(ds.idsonboundaries(prior=2, post=-1,
661 attributes_to_track=['chunks'],
662 revert=True)
663 == [0, 1, 2, 5, 6, 7])
664 self.failUnless(ds.idsonboundaries(prior=1, post=1,
665 attributes_to_track=['chunks'])
666 == [0, 1, 4, 5, 6, 9])
667
668 self.failUnless(ds.idsonboundaries(prior=2) == range(10))
669
670
673
674
# Entry point: delegate to the project's shared test runner module.
if __name__ == '__main__':
    import runner
677