-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
456 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/usr/bin/env python | ||
# coding: utf-8 | ||
|
||
# In[ ]: | ||
|
||
import pandas as pd | ||
import numpy as np | ||
from scipy.stats import chi2_contingency | ||
import os | ||
from collections import deque | ||
from os import listdir | ||
from os.path import isfile, join | ||
import argparse | ||
import warnings | ||
import sys | ||
|
||
|
||
def closest(lst, K): | ||
return lst[min(range(len(lst)), key=lambda i: abs(lst[i] - K))] | ||
|
||
|
||
def f(df): | ||
return df[df['total_coverage'] == np.max(df.total_coverage)] | ||
|
||
|
||
############################################################################################################################### | ||
|
||
def apply_power_real_data(threshold, power_simulation_file, data_dir, output_dir): | ||
power_combined = pd.read_csv(power_simulation_file) | ||
|
||
mypath = (data_dir) | ||
files = [f for f in listdir(mypath) if isfile(join(mypath, f))] | ||
|
||
for file in files: | ||
if (file.startswith('._')): | ||
continue | ||
print(file) | ||
hap_counts_all = pd.read_csv(mypath + file, sep='\t') # files should be tab-delimited | ||
|
||
hap_counts = hap_counts_all[(hap_counts_all['Alt_count'] >= 10) | (hap_counts_all['Ref_count'] >= 10)] | ||
|
||
hap_counts['combine'] = hap_counts['Gene'] + "_" + hap_counts['variantID'] | ||
|
||
gene_var_ids = hap_counts['combine'].unique() | ||
|
||
power_combined_subset = power_combined[(power_combined['cohen_w'] == threshold)] | ||
coverage_values = power_combined_subset.total_coverage_gene.unique() | ||
|
||
data = deque() | ||
for gene_var in gene_var_ids: | ||
|
||
hap_count_subset = hap_counts[(hap_counts['combine'] == gene_var)] | ||
number_of_isoforms = len(hap_count_subset) | ||
gene_id = gene_var.split('_')[0] | ||
var_id = gene_var.split('_', 1)[1] | ||
|
||
if (number_of_isoforms == 1): | ||
print("gene_dropped", gene_var) | ||
continue | ||
|
||
if ((hap_count_subset['Alt_count'] == 0).all() or (hap_count_subset['Ref_count'] == 0).all()): | ||
print("gene_dropped", gene_var) | ||
continue | ||
|
||
total_coverage = hap_count_subset['Ref_count'].sum() + hap_count_subset['Alt_count'].sum() | ||
|
||
closest_coverage = closest(coverage_values, total_coverage) | ||
|
||
power_row = power_combined_subset.loc[ | ||
(power_combined_subset['transcript_isoform'] == number_of_isoforms) & ( | ||
power_combined_subset['total_coverage_gene'] == closest_coverage), 'power'] | ||
|
||
if (len(power_row) == 0): | ||
power = np.nan | ||
else: | ||
power = power_row.values[0] | ||
|
||
real_counts = [[hap_count_subset['Ref_count']], [hap_count_subset['Alt_count']]] | ||
|
||
chi2, p_value_real, dof, expected = chi2_contingency(real_counts) | ||
|
||
data.append([gene_id, var_id, total_coverage, power, p_value_real]) | ||
|
||
output_power = pd.DataFrame( | ||
{'gene_id': [item[0] for item in data], 'variant_id': [item[1] for item in data], | ||
'total_coverage': [item[2] for item in data], 'power': [item[3] for item in data], | ||
'p_value': [item[4] for item in data]}) | ||
|
||
# select the SNPwith the highest sum count in case of multiple SNP for a gene | ||
output_power_filtered = (output_power.groupby("gene_id").apply(f)) | ||
output_power_filtered_indexed = output_power_filtered.set_index('gene_id') | ||
|
||
output_power_filtered_indexed.to_csv(output_dir + file, sep='\t') | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser() | ||
# REQUIRED | ||
parser.add_argument("--output_dir", "--o", required=True, help="Output dir") | ||
parser.add_argument("--threshold", type=float, required=True, help="The effect size threshold") | ||
parser.add_argument("--data_dir", required=True, help="input dir") | ||
parser.add_argument("--power_simulation_file", required=True, | ||
help="power simulation file which is the output of running power_stats_simulation") | ||
|
||
args = parser.parse_args() | ||
|
||
threshold = args.threshold | ||
output_dir = args.output_dir | ||
data_dir = args.data_dir | ||
power_simulation_file = args.power_simulation_file | ||
|
||
warnings.filterwarnings("ignore") | ||
script_path = sys.path[0] | ||
print(script_path) | ||
|
||
bashCommand = 'mkdir -p ' + output_dir | ||
os.system(bashCommand) | ||
|
||
print('...') | ||
apply_power_real_data(threshold, power_simulation_file, data_dir, output_dir) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
#!/usr/bin/env python | ||
# coding: utf-8 | ||
|
||
# In[5]: | ||
import matplotlib | ||
|
||
matplotlib.use('Agg') | ||
import pandas as pd | ||
import numpy as np | ||
from scipy.stats import chi2_contingency | ||
import os | ||
from collections import deque | ||
import matplotlib.pyplot as plt | ||
import math | ||
import seaborn as sns | ||
import argparse | ||
import warnings | ||
import sys | ||
|
||
|
||
def my_get_group(g, key): | ||
if key in g.groups: return True | ||
return False | ||
|
||
|
||
def produce_power_samples(transcript_isoform_max, total_coverage, n_samples, output): | ||
for num_of_transcripts in range(1, transcript_isoform_max): | ||
d = deque() | ||
|
||
for sqrt_coverage in range(2, math.ceil(np.sqrt(total_coverage)) + 1): | ||
coverage = sqrt_coverage ** 2 | ||
|
||
if ((coverage % 2) == 1): | ||
coverage = coverage + 1 | ||
coverage_eR = int(coverage / 2) | ||
coverage_eA = coverage - coverage_eR | ||
|
||
delta = np.log2(coverage_eA / coverage_eR) | ||
|
||
# produce N samples | ||
for samples in range(1, n_samples): | ||
|
||
p_i = np.random.random(num_of_transcripts) | ||
p_i /= p_i.sum() | ||
|
||
q_i = np.random.random(num_of_transcripts) | ||
q_i /= q_i.sum() | ||
|
||
obs_prob = [p_i, q_i] | ||
chi2_prob, p_prob, dof_prob, expected_prob = chi2_contingency(obs_prob, correction=False) | ||
cohen = round(np.sqrt(chi2_prob / transcript_isoform_max), 1) | ||
|
||
e_Ri = np.array(np.random.multinomial(coverage_eR, p_i, 1), dtype=float) | ||
e_Ai = np.array(np.random.multinomial(coverage_eA, q_i, 1), dtype=float) | ||
|
||
for j in range(0, len(e_Ri[0])): | ||
if (e_Ri[0][j] == 0 and e_Ai[0][j] == 0): | ||
e_Ri[0][j] = sys.float_info.epsilon | ||
e_Ai[0][j] = sys.float_info.epsilon | ||
|
||
obs = [e_Ri, e_Ai] | ||
|
||
chi2, p, dof, expected = chi2_contingency(obs) | ||
|
||
d.append([coverage, coverage_eR, coverage_eA, cohen, p, delta]) | ||
|
||
input_stats = pd.DataFrame({'total_count': [item[0] for item in d], 'ref_coverage': [item[1] for item in d], | ||
'alt_coverage': [item[2] for item in d], 'cohen_w': [item[3] for item in d], | ||
'p_value': [item[4] for item in d], 'delta_log2': [item[5] for item in d]}) | ||
|
||
########################################################################################################################### | ||
|
||
subset_input_stats = input_stats.groupby(['cohen_w', 'total_count']) | ||
|
||
dist_values = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1] # input_stats.cohen_w.unique() | ||
coverage_values = input_stats.total_count.unique() | ||
|
||
data = deque() | ||
|
||
for dist in dist_values: | ||
for count in coverage_values: | ||
|
||
if (my_get_group(subset_input_stats, (dist, count)) == False): | ||
data.append([dist, count, np.nan]) | ||
else: | ||
if (len(subset_input_stats.get_group((dist, count))) <= 1): | ||
print('warning: not enough samples for ' + ' effect size ' + str( | ||
dist) + ' and total coverage ' + str(count) + ' with ' + str( | ||
num_of_transcripts) + ' transcript isoforms') | ||
data.append([dist, count, np.nan]) | ||
else: | ||
rejected_null_hypothesis = subset_input_stats.get_group((dist, count))[ | ||
(subset_input_stats.get_group((dist, count))['p_value'] < 0.01)] | ||
power = len(rejected_null_hypothesis) / len(subset_input_stats.get_group((dist, count))) | ||
data.append([dist, count, power]) | ||
|
||
power_df = pd.DataFrame( | ||
{'cohen_w': [item[0] for item in data], 'total_coverage_gene': [item[1] for item in data], | ||
'power': [item[2] for item in data]}) | ||
|
||
power_df.to_csv(os.path.expanduser(output + '/transcript_isoform_' + str(num_of_transcripts) + '.csv'), | ||
index=False) | ||
|
||
|
||
def power_plot(power_stats, effect_size, output): | ||
power_combined = pd.read_csv(power_stats) | ||
power_combined_subset = power_combined[(power_combined['cohen_w'] == effect_size)] | ||
|
||
power_combined_subset_power = power_combined_subset.groupby(['transcript_isoform', 'total_coverage_gene']) | ||
trans_values = power_combined_subset.transcript_isoform.unique() | ||
trans_values = trans_values[(trans_values > 1)] | ||
coverage_values = power_combined_subset.total_coverage_gene.unique() | ||
data = deque() | ||
|
||
for trans in trans_values: | ||
for count in coverage_values: | ||
|
||
if (my_get_group(power_combined_subset_power, (trans, count)) == False): | ||
data.append([trans, count, np.nan]) | ||
else: | ||
power = power_combined_subset_power.get_group((trans, count))['power'].values[0] | ||
data.append([trans, count, power]) | ||
|
||
power_df_effectSize = pd.DataFrame( | ||
{'transcript isoforms': [item[0] for item in data], 'total_coverage_gene': [item[1] for item in data], | ||
'power': [item[2] for item in data]}) | ||
|
||
power_df_effectSize["total_coverage_gene"] = pd.Categorical(power_df_effectSize["total_coverage_gene"], | ||
power_df_effectSize.total_coverage_gene.unique()) | ||
data_matrix = power_df_effectSize.pivot("total_coverage_gene", "transcript isoforms", "power") | ||
|
||
fig = plt.figure(figsize=(8, 8)) | ||
sns.set(font_scale=1.4) | ||
r = sns.heatmap(data_matrix, cmap='BuPu', cbar=False) | ||
r.set_xticklabels(r.get_xticklabels(), rotation=270, horizontalalignment='center') | ||
|
||
plt.savefig(os.path.expanduser(output + '/power_' + str(effect_size) + '.pdf'), format='pdf', dpi=3000) | ||
|
||
|
||
if __name__ == "__main__": | ||
|
||
parser = argparse.ArgumentParser() | ||
# REQUIRED | ||
parser.add_argument("--output_path", "--o", required=True, help="Output dir") | ||
parser.add_argument("--isoform_max", type=int, required=True, help="max no. of transcript isoforms") | ||
parser.add_argument("--effect_size", type=float, required=True, help="effect size") | ||
parser.add_argument("--max_coverage", type=int, required=True, choices=range(2, 1024), help="max coverage <= 1024") | ||
parser.add_argument("--iterations", type=int, required=False, default=1000, help="number of simulations") | ||
|
||
args = parser.parse_args() | ||
|
||
iso_counts = args.isoform_max | ||
output_path = args.output_path | ||
effect_size = args.effect_size | ||
max_coverage = args.max_coverage | ||
num_samples = args.iterations | ||
|
||
warnings.filterwarnings("ignore") | ||
script_path = sys.path[0] | ||
print(script_path) | ||
|
||
bashCommand = 'mkdir -p ' + output_path | ||
os.system(bashCommand) | ||
|
||
bashCommand = 'mkdir ' + output_path + "/temp" | ||
os.system(bashCommand) | ||
|
||
bashCommand = 'mkdir ' + output_path + "/temp/sample_files/" | ||
os.system(bashCommand) | ||
|
||
bashCommand = 'mkdir ' + output_path + "/plot/" | ||
os.system(bashCommand) | ||
|
||
print('produce samples for simuation...') | ||
produce_power_samples(iso_counts + 1, max_coverage + 1, num_samples, output_path + "/temp/sample_files/") | ||
|
||
combined_power = pd.DataFrame() | ||
|
||
print('combining results...') | ||
for count in range(1, iso_counts + 1): | ||
input_stats = pd.read_csv(output_path + '/temp/sample_files/transcript_isoform_' + str(count) + '.csv') | ||
input_stats['transcript_isoform'] = count | ||
|
||
combined_power = combined_power.append(input_stats) | ||
|
||
combined_power.to_csv(os.path.expanduser(output_path + '/power_stats_simulation.csv'), index=False) | ||
|
||
print('saving plot...') | ||
power_plot(output_path + '/power_stats_simulation.csv', effect_size, output_path + "/plot/") |
Oops, something went wrong.