Skip to content

Commit 106ca36

Browse files
authored [author name missing from page extraction]
Merge pull request #221 from ljchang/fix_perm_tail
Fix perm tail Former-commit-id: 22df5e5
2 parents 7253dc8 + f29a9a1 commit 106ca36

File tree

4 files changed

+64
-36
lines changed

4 files changed

+64
-36
lines changed

.travis.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ sudo: false
55
python:
66
- "2.7"
77
- "3.6"
8-
8+
99
before_script:
1010
- "export DISPLAY=:99.0"
1111
- "sh -e /etc/init.d/xvfb start"
@@ -25,6 +25,7 @@ install:
2525
- pip install -r requirements.txt
2626
- pip install -r optional-dependencies.txt
2727
- python setup.py install
28+
- pip install git+https://github.com/nilearn/nilearn --upgrade
2829

2930
script: coverage run --source nltools -m py.test
3031

nltools/stats.py

Lines changed: 28 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -395,12 +395,32 @@ def _permute_group(data, random_state=None):
395395
return (np.mean(data.loc[perm_label==1, 'Values']) -
396396
np.mean(data.loc[perm_label==0, 'Values']))
397397

398-
def one_sample_permutation(data, n_permute=5000, n_jobs=-1, random_state=None):
398+
399+
def _calc_pvalue(all_p, stat, tail):
400+
"""Calculates p value based on distribution of correlations
401+
This function is called by the permutation functions
402+
all_p: list of correlation values from permutation
403+
stat: actual value being tested, i.e., stats['correlation'] or stats['mean']
404+
tail: (int) either 2 or 1 for two-tailed p-value or one-tailed
405+
"""
406+
if tail==2:
407+
p= np.mean( np.abs(all_p) >= np.abs(stat))
408+
elif tail==1:
409+
if stat >= 0:
410+
p = np.mean(all_p >= stat)
411+
else:
412+
p = np.mean(all_p <= stat)
413+
else:
414+
raise ValueError('tail must be either 1 or 2')
415+
return p
416+
417+
def one_sample_permutation(data, n_permute=5000, tail=2, n_jobs=-1, random_state=None):
399418
''' One sample permutation test using randomization.
400419
401420
Args:
402421
data: Pandas DataFrame or Series or numpy array
403422
n_permute: (int) number of permutations
423+
tail: (int) either 1 for one-tail or 2 for two-tailed test (default: 2)
404424
n_jobs: (int) The number of CPUs to use to do the computation.
405425
-1 means all CPUs.
406426
@@ -418,20 +438,18 @@ def one_sample_permutation(data, n_permute=5000, n_jobs=-1, random_state=None):
418438

419439
all_p = Parallel(n_jobs=n_jobs)(delayed(_permute_sign)(data,
420440
random_state=seeds[i]) for i in range(n_permute))
421-
if stats['mean'] >= 0:
422-
stats['p'] = np.mean(all_p >= stats['mean'])
423-
else:
424-
stats['p'] = np.mean(all_p <= stats['mean'])
441+
stats['p'] = _calc_pvalue(all_p,stats['mean'],tail)
425442
return stats
426443

427444
def two_sample_permutation(data1, data2, n_permute=5000,
428-
n_jobs=-1, random_state=None):
445+
tail=2, n_jobs=-1, random_state=None):
429446
''' Independent sample permutation test.
430447
431448
Args:
432449
data1: Pandas DataFrame or Series or numpy array
433450
data2: Pandas DataFrame or Series or numpy array
434451
n_permute: (int) number of permutations
452+
tail: (int) either 1 for one-tail or 2 for two-tailed test (default: 2)
435453
n_jobs: (int) The number of CPUs to use to do the computation.
436454
-1 means all CPUs.
437455
Returns:
@@ -451,14 +469,11 @@ def two_sample_permutation(data1, data2, n_permute=5000,
451469
all_p = Parallel(n_jobs=n_jobs)(delayed(_permute_group)(data,
452470
random_state=seeds[i]) for i in range(n_permute))
453471

454-
if stats['mean']>=0:
455-
stats['p'] = np.mean(all_p >= stats['mean'])
456-
else:
457-
stats['p'] = np.mean(all_p <= stats['mean'])
472+
stats['p'] = _calc_pvalue(all_p,stats['mean'],tail)
458473
return stats
459474

460475
def correlation_permutation(data1, data2, n_permute=5000, metric='spearman',
461-
n_jobs=-1, random_state=None):
476+
tail=2, n_jobs=-1, random_state=None):
462477
''' Permute correlation.
463478
464479
Args:
@@ -467,6 +482,7 @@ def correlation_permutation(data1, data2, n_permute=5000, metric='spearman',
467482
n_permute: (int) number of permutations
468483
metric: (str) type of association metric ['spearman','pearson',
469484
'kendall']
485+
tail: (int) either 1 for one-tail or 2 for two-tailed test (default: 2)
470486
n_jobs: (int) The number of CPUs to use to do the computation.
471487
-1 means all CPUs.
472488
@@ -504,10 +520,7 @@ def correlation_permutation(data1, data2, n_permute=5000, metric='spearman',
504520
for i in range(n_permute))
505521
all_p = [x[0] for x in all_p]
506522

507-
if stats['correlation'] >= 0:
508-
stats['p'] = np.mean(all_p >= stats['correlation'])
509-
else:
510-
stats['p'] = np.mean(all_p <= stats['correlation'])
523+
stats['p'] = _calc_pvalue(all_p,stats['correlation'],tail)
511524
return stats
512525

513526
def make_cosine_basis(nsamples, sampling_rate, filter_length, drop=0):

nltools/tests/test_stats.py

Lines changed: 32 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -7,38 +7,48 @@
77
upsample,
88
winsorize,
99
align,
10-
transform_pairwise)
10+
transform_pairwise, _calc_pvalue)
1111
from nltools.simulator import Simulator
1212
from nltools.mask import create_sphere
13+
# import pytest
1314

1415
def test_permutation():
1516
dat = np.random.multivariate_normal([2, 6], [[.5, 2], [.5, 3]], 1000)
1617
x = dat[:, 0]
1718
y = dat[:, 1]
18-
stats = two_sample_permutation(x, y)
19-
assert (stats['mean'] < -2) & (stats['mean'] > -6)
20-
assert stats['p'] < .001
21-
print(stats)
22-
stats = one_sample_permutation(x-y)
23-
assert (stats['mean'] < -2) & (stats['mean'] > -6)
24-
assert stats['p'] < .001
25-
print(stats)
26-
stats = correlation_permutation(x, y, metric='pearson')
27-
assert (stats['correlation'] > .4) & (stats['correlation']<.85)
28-
assert stats['p'] < .001
29-
stats = correlation_permutation(x, y, metric='spearman')
30-
assert (stats['correlation'] > .4) & (stats['correlation']<.85)
31-
assert stats['p'] < .001
32-
stats = correlation_permutation(x, y, metric='kendall')
33-
assert (stats['correlation'] > .4) & (stats['correlation']<.85)
34-
assert stats['p'] < .001
19+
stats = two_sample_permutation(x, y,tail=1)
20+
assert (stats['mean'] < -2) & (stats['mean'] > -6) & (stats['p'] < .001)
21+
stats = one_sample_permutation(x-y,tail=1)
22+
assert (stats['mean'] < -2) & (stats['mean'] > -6) & (stats['p'] < .001)
23+
stats = correlation_permutation(x, y, metric='pearson',tail=1)
24+
assert (stats['correlation'] > .4) & (stats['correlation']<.85) & (stats['p'] < .001)
25+
stats = correlation_permutation(x, y, metric='spearman',tail=1)
26+
assert (stats['correlation'] > .4) & (stats['correlation']<.85) & (stats['p'] < .001)
27+
stats = correlation_permutation(x, y, metric='kendall',tail=2)
28+
assert (stats['correlation'] > .4) & (stats['correlation']<.85) & (stats['p'] < .001)
29+
# with pytest.raises(ValueError):
30+
# correlation_permutation(x, y, metric='kendall',tail=3)
31+
# with pytest.raises(ValueError):
32+
# correlation_permutation(x, y, metric='doesntwork',tail=3)
33+
s = np.random.normal(0,1,10000)
34+
two_sided = _calc_pvalue(all_p = s, stat= 1.96, tail = 2)
35+
upper_p = _calc_pvalue(all_p = s, stat= 1.96, tail = 1)
36+
lower_p = _calc_pvalue(all_p = s, stat= -1.96, tail = 1)
37+
sum_p = upper_p + lower_p
38+
np.testing.assert_almost_equal(two_sided, sum_p)
3539

3640
def test_downsample():
3741
dat = pd.DataFrame()
3842
dat['x'] = range(0,100)
3943
dat['y'] = np.repeat(range(1,11),10)
4044
assert((dat.groupby('y').mean().values.ravel() == downsample(data=dat['x'],sampling_freq=10,target=1,target_type='hz',method='mean').values).all)
4145
assert((dat.groupby('y').median().values.ravel() == downsample(data=dat['x'],sampling_freq=10,target=1,target_type='hz',method='median').values).all)
46+
# with pytest.raises(ValueError):
47+
# downsample(data=list(dat['x']),sampling_freq=10,target=1,target_type='hz',method='median')
48+
# with pytest.raises(ValueError):
49+
# downsample(data=dat['x'],sampling_freq=10,target=1,target_type='hz',method='doesnotwork')
50+
# with pytest.raises(ValueError):
51+
# downsample(data=dat['x'],sampling_freq=10,target=1,target_type='doesnotwork',method='median')
4252

4353
def test_upsample():
4454
dat = pd.DataFrame()
@@ -50,6 +60,10 @@ def test_upsample():
5060
fs = 3
5161
us = upsample(dat,sampling_freq=1,target=fs,target_type='hz')
5262
assert(dat.shape[0]*fs-fs == us.shape[0])
63+
# with pytest.raises(ValueError):
64+
# upsample(dat,sampling_freq=1,target=fs,target_type='hz',method='doesnotwork')
65+
# with pytest.raises(ValueError):
66+
# upsample(dat,sampling_freq=1,target=fs,target_type='doesnotwork',method='linear')
5367

5468
def test_winsorize():
5569
outlier_test = pd.DataFrame([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13,

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
nibabel>=2.0.1
2-
scikit-learn>=0.18.1
2+
scikit-learn>=0.19.1
33
nilearn>=0.4
44
pandas>=0.20
55
numpy>=1.9
66
seaborn>=0.7.0
7-
matplotlib>=2.1
7+
matplotlib>=2.2.0
88
scipy
99
six
1010
pynv

0 commit comments

Comments (0)