Package mvpa :: Package misc :: Module stats
[hide private]
[frames] | no frames]

Source Code for Module mvpa.misc.stats

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Little statistics helper""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  from mvpa.base import externals 
 14   
 15  if externals.exists('scipy', raiseException=True): 
 16      import scipy.stats as stats 
 17   
 18  import numpy as N 
 19  import copy 
 20   
def chisquare(obs, exp=None):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    If no expected frequencies are supplied, the total N is assumed to be
    equally distributed across all cells.

    :Parameters:
      obs : array-like
        Observed frequencies; any shape.
      exp : array-like or None
        Expected frequencies, broadcastable against `obs`. If None, a
        uniform distribution of the total count over all cells is used.

    Returns: chisquare-stats, associated p-value (upper tail)

    The p-value is taken from a chi-square distribution with
    (number of cells - 1) degrees of freedom.
    """
    obs = N.asarray(obs)

    # get total number of observations and number of cells
    nobs = N.sum(obs)
    ncells = N.prod(obs.shape)

    # if no expected values are supplied assume equal distribution
    # (identity test: `exp == None` would compare elementwise on arrays)
    if exp is None:
        exp = N.ones(obs.shape) * nobs / ncells

    # make sure to have floating point data (also accepts lists/tuples)
    exp = N.asarray(exp, dtype=float)

    # compute chisquare value
    chisq = N.sum((obs - exp) ** 2 / exp)

    # return chisq and probability (upper tail); chi2.sf is the survival
    # function, i.e. what the removed stats.chisqprob used to compute
    return chisq, stats.chi2.sf(chisq, ncells - 1)
47 48
class DSMatrix(object):
    """DSMatrix allows for the creation of dissimilarity matrices using
    arbitrary distance metrics.

    The full square matrix is computed in the constructor. The upper
    triangle and the vector form are computed on first request and cached.
    """

    @staticmethod
    def _euclidean(a, b):
        """Euclidean distance between two exemplars."""
        return N.linalg.norm(a - b)

    @staticmethod
    def _spearman(a, b):
        """1 - Spearman rank correlation of two exemplars."""
        return 1 - stats.spearmanr(a, b)[0]

    @staticmethod
    def _pearson(a, b):
        """1 - Pearson correlation of two exemplars."""
        return 1 - stats.pearsonr(a, b)[0]

    @staticmethod
    def _confusion(a, b):
        """0 if both exemplars are identical in every feature, 1 otherwise."""
        return 1 - int(N.all(a == b))

    # metric is a string
    def __init__(self, data_vectors, metric='spearman'):
        """Initialize DSMatrix

        :Parameters:
          data_vectors : ndarray
             m x n collection of vectors, where m is the number of exemplars
             and n is the number of features per exemplar. A 1-D input is
             treated as m exemplars with a single feature each.
          metric : string
             Distance metric to use ('euclidean', 'spearman', 'pearson',
             'confusion')

        Raises ValueError if `metric` is not one of the supported names.
        """
        pairwise = {
            'euclidean': self._euclidean,
            'spearman': self._spearman,
            'pearson': self._pearson,
            'confusion': self._confusion,
        }.get(metric)
        if pairwise is None:
            # an unknown metric used to yield a silent all-zero matrix
            raise ValueError("Unknown metric %r; expected one of 'euclidean', "
                             "'spearman', 'pearson', 'confusion'" % (metric,))

        # init members
        self.full_matrix = []
        self.u_triangle = None
        self.vector_form = None

        # this one we know straight away, so set it
        self.metric = metric

        # accept nested sequences as well as arrays
        data = N.asarray(data_vectors)
        if data.ndim == 1:
            # column vector only: one feature per exemplar
            data = data.reshape(-1, 1)
        num_exem = data.shape[0]

        # generate output (dissimilarity) matrix
        dsmatrix = N.asmatrix(N.zeros((num_exem, num_exem)))

        # all supported metrics are symmetric, so compute the upper half
        # (including the diagonal) and mirror it
        for i in range(num_exem):
            for j in range(i, num_exem):
                value = pairwise(data[i], data[j])
                dsmatrix[i, j] = value
                dsmatrix[j, i] = value

        self.full_matrix = dsmatrix

    def getTriangle(self):
        """Return the upper triangle (lower part zeroed) of the full matrix.

        The result is computed once and cached.
        """
        # identity test: comparing a cached array with `== None` is
        # elementwise and cannot be used as a condition
        if self.u_triangle is None:
            self.u_triangle = N.triu(self.full_matrix)

        return self.u_triangle

    def getVectorForm(self):
        """Return the upper triangle of the full matrix as a 1-D array.

        The diagonal is included and elements are listed row by row, so an
        m x m matrix yields m * (m + 1) / 2 values. Zero-valued and NaN
        entries are kept in place. The result is computed once and cached.
        """
        if self.vector_form is None:
            full = N.asarray(self.getFullMatrix())
            # select positions on or above the diagonal explicitly, rather
            # than filtering a zeroed-out triangle by value
            rows, cols = N.indices(full.shape)
            self.vector_form = full[cols >= rows]

        return self.vector_form

    # XXX is there any reason to have these get* methods
    # instead of plain access to full_matrix and method?
    def getFullMatrix(self):
        """Return the full dissimilarity matrix."""
        return self.full_matrix

    def getMetric(self):
        """Return the name of the metric this matrix was built with."""
        return self.metric
183