Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
DafniG authored May 8, 2022
1 parent afb9a8a commit 6643b5b
Show file tree
Hide file tree
Showing 3 changed files with 456 additions and 0 deletions.
120 changes: 120 additions & 0 deletions apply_power_estimation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import os
from collections import deque
from os import listdir
from os.path import isfile, join
import argparse
import warnings
import sys


def closest(lst, K):
return lst[min(range(len(lst)), key=lambda i: abs(lst[i] - K))]


def f(df):
return df[df['total_coverage'] == np.max(df.total_coverage)]


###############################################################################################################################

def apply_power_real_data(threshold, power_simulation_file, data_dir, output_dir):
power_combined = pd.read_csv(power_simulation_file)

mypath = (data_dir)
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

for file in files:
if (file.startswith('._')):
continue
print(file)
hap_counts_all = pd.read_csv(mypath + file, sep='\t') # files should be tab-delimited

hap_counts = hap_counts_all[(hap_counts_all['Alt_count'] >= 10) | (hap_counts_all['Ref_count'] >= 10)]

hap_counts['combine'] = hap_counts['Gene'] + "_" + hap_counts['variantID']

gene_var_ids = hap_counts['combine'].unique()

power_combined_subset = power_combined[(power_combined['cohen_w'] == threshold)]
coverage_values = power_combined_subset.total_coverage_gene.unique()

data = deque()
for gene_var in gene_var_ids:

hap_count_subset = hap_counts[(hap_counts['combine'] == gene_var)]
number_of_isoforms = len(hap_count_subset)
gene_id = gene_var.split('_')[0]
var_id = gene_var.split('_', 1)[1]

if (number_of_isoforms == 1):
print("gene_dropped", gene_var)
continue

if ((hap_count_subset['Alt_count'] == 0).all() or (hap_count_subset['Ref_count'] == 0).all()):
print("gene_dropped", gene_var)
continue

total_coverage = hap_count_subset['Ref_count'].sum() + hap_count_subset['Alt_count'].sum()

closest_coverage = closest(coverage_values, total_coverage)

power_row = power_combined_subset.loc[
(power_combined_subset['transcript_isoform'] == number_of_isoforms) & (
power_combined_subset['total_coverage_gene'] == closest_coverage), 'power']

if (len(power_row) == 0):
power = np.nan
else:
power = power_row.values[0]

real_counts = [[hap_count_subset['Ref_count']], [hap_count_subset['Alt_count']]]

chi2, p_value_real, dof, expected = chi2_contingency(real_counts)

data.append([gene_id, var_id, total_coverage, power, p_value_real])

output_power = pd.DataFrame(
{'gene_id': [item[0] for item in data], 'variant_id': [item[1] for item in data],
'total_coverage': [item[2] for item in data], 'power': [item[3] for item in data],
'p_value': [item[4] for item in data]})

# select the SNPwith the highest sum count in case of multiple SNP for a gene
output_power_filtered = (output_power.groupby("gene_id").apply(f))
output_power_filtered_indexed = output_power_filtered.set_index('gene_id')

output_power_filtered_indexed.to_csv(output_dir + file, sep='\t')


if __name__ == "__main__":
parser = argparse.ArgumentParser()
# REQUIRED
parser.add_argument("--output_dir", "--o", required=True, help="Output dir")
parser.add_argument("--threshold", type=float, required=True, help="The effect size threshold")
parser.add_argument("--data_dir", required=True, help="input dir")
parser.add_argument("--power_simulation_file", required=True,
help="power simulation file which is the output of running power_stats_simulation")

args = parser.parse_args()

threshold = args.threshold
output_dir = args.output_dir
data_dir = args.data_dir
power_simulation_file = args.power_simulation_file

warnings.filterwarnings("ignore")
script_path = sys.path[0]
print(script_path)

bashCommand = 'mkdir -p ' + output_dir
os.system(bashCommand)

print('...')
apply_power_real_data(threshold, power_simulation_file, data_dir, output_dir)
189 changes: 189 additions & 0 deletions power_stats_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/usr/bin/env python
# coding: utf-8

# In[5]:
import matplotlib

matplotlib.use('Agg')
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import os
from collections import deque
import matplotlib.pyplot as plt
import math
import seaborn as sns
import argparse
import warnings
import sys


def my_get_group(g, key):
if key in g.groups: return True
return False


def produce_power_samples(transcript_isoform_max, total_coverage, n_samples, output):
for num_of_transcripts in range(1, transcript_isoform_max):
d = deque()

for sqrt_coverage in range(2, math.ceil(np.sqrt(total_coverage)) + 1):
coverage = sqrt_coverage ** 2

if ((coverage % 2) == 1):
coverage = coverage + 1
coverage_eR = int(coverage / 2)
coverage_eA = coverage - coverage_eR

delta = np.log2(coverage_eA / coverage_eR)

# produce N samples
for samples in range(1, n_samples):

p_i = np.random.random(num_of_transcripts)
p_i /= p_i.sum()

q_i = np.random.random(num_of_transcripts)
q_i /= q_i.sum()

obs_prob = [p_i, q_i]
chi2_prob, p_prob, dof_prob, expected_prob = chi2_contingency(obs_prob, correction=False)
cohen = round(np.sqrt(chi2_prob / transcript_isoform_max), 1)

e_Ri = np.array(np.random.multinomial(coverage_eR, p_i, 1), dtype=float)
e_Ai = np.array(np.random.multinomial(coverage_eA, q_i, 1), dtype=float)

for j in range(0, len(e_Ri[0])):
if (e_Ri[0][j] == 0 and e_Ai[0][j] == 0):
e_Ri[0][j] = sys.float_info.epsilon
e_Ai[0][j] = sys.float_info.epsilon

obs = [e_Ri, e_Ai]

chi2, p, dof, expected = chi2_contingency(obs)

d.append([coverage, coverage_eR, coverage_eA, cohen, p, delta])

input_stats = pd.DataFrame({'total_count': [item[0] for item in d], 'ref_coverage': [item[1] for item in d],
'alt_coverage': [item[2] for item in d], 'cohen_w': [item[3] for item in d],
'p_value': [item[4] for item in d], 'delta_log2': [item[5] for item in d]})

###########################################################################################################################

subset_input_stats = input_stats.groupby(['cohen_w', 'total_count'])

dist_values = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1] # input_stats.cohen_w.unique()
coverage_values = input_stats.total_count.unique()

data = deque()

for dist in dist_values:
for count in coverage_values:

if (my_get_group(subset_input_stats, (dist, count)) == False):
data.append([dist, count, np.nan])
else:
if (len(subset_input_stats.get_group((dist, count))) <= 1):
print('warning: not enough samples for ' + ' effect size ' + str(
dist) + ' and total coverage ' + str(count) + ' with ' + str(
num_of_transcripts) + ' transcript isoforms')
data.append([dist, count, np.nan])
else:
rejected_null_hypothesis = subset_input_stats.get_group((dist, count))[
(subset_input_stats.get_group((dist, count))['p_value'] < 0.01)]
power = len(rejected_null_hypothesis) / len(subset_input_stats.get_group((dist, count)))
data.append([dist, count, power])

power_df = pd.DataFrame(
{'cohen_w': [item[0] for item in data], 'total_coverage_gene': [item[1] for item in data],
'power': [item[2] for item in data]})

power_df.to_csv(os.path.expanduser(output + '/transcript_isoform_' + str(num_of_transcripts) + '.csv'),
index=False)


def power_plot(power_stats, effect_size, output):
power_combined = pd.read_csv(power_stats)
power_combined_subset = power_combined[(power_combined['cohen_w'] == effect_size)]

power_combined_subset_power = power_combined_subset.groupby(['transcript_isoform', 'total_coverage_gene'])
trans_values = power_combined_subset.transcript_isoform.unique()
trans_values = trans_values[(trans_values > 1)]
coverage_values = power_combined_subset.total_coverage_gene.unique()
data = deque()

for trans in trans_values:
for count in coverage_values:

if (my_get_group(power_combined_subset_power, (trans, count)) == False):
data.append([trans, count, np.nan])
else:
power = power_combined_subset_power.get_group((trans, count))['power'].values[0]
data.append([trans, count, power])

power_df_effectSize = pd.DataFrame(
{'transcript isoforms': [item[0] for item in data], 'total_coverage_gene': [item[1] for item in data],
'power': [item[2] for item in data]})

power_df_effectSize["total_coverage_gene"] = pd.Categorical(power_df_effectSize["total_coverage_gene"],
power_df_effectSize.total_coverage_gene.unique())
data_matrix = power_df_effectSize.pivot("total_coverage_gene", "transcript isoforms", "power")

fig = plt.figure(figsize=(8, 8))
sns.set(font_scale=1.4)
r = sns.heatmap(data_matrix, cmap='BuPu', cbar=False)
r.set_xticklabels(r.get_xticklabels(), rotation=270, horizontalalignment='center')

plt.savefig(os.path.expanduser(output + '/power_' + str(effect_size) + '.pdf'), format='pdf', dpi=3000)


if __name__ == "__main__":

parser = argparse.ArgumentParser()
# REQUIRED
parser.add_argument("--output_path", "--o", required=True, help="Output dir")
parser.add_argument("--isoform_max", type=int, required=True, help="max no. of transcript isoforms")
parser.add_argument("--effect_size", type=float, required=True, help="effect size")
parser.add_argument("--max_coverage", type=int, required=True, choices=range(2, 1024), help="max coverage <= 1024")
parser.add_argument("--iterations", type=int, required=False, default=1000, help="number of simulations")

args = parser.parse_args()

iso_counts = args.isoform_max
output_path = args.output_path
effect_size = args.effect_size
max_coverage = args.max_coverage
num_samples = args.iterations

warnings.filterwarnings("ignore")
script_path = sys.path[0]
print(script_path)

bashCommand = 'mkdir -p ' + output_path
os.system(bashCommand)

bashCommand = 'mkdir ' + output_path + "/temp"
os.system(bashCommand)

bashCommand = 'mkdir ' + output_path + "/temp/sample_files/"
os.system(bashCommand)

bashCommand = 'mkdir ' + output_path + "/plot/"
os.system(bashCommand)

print('produce samples for simuation...')
produce_power_samples(iso_counts + 1, max_coverage + 1, num_samples, output_path + "/temp/sample_files/")

combined_power = pd.DataFrame()

print('combining results...')
for count in range(1, iso_counts + 1):
input_stats = pd.read_csv(output_path + '/temp/sample_files/transcript_isoform_' + str(count) + '.csv')
input_stats['transcript_isoform'] = count

combined_power = combined_power.append(input_stats)

combined_power.to_csv(os.path.expanduser(output_path + '/power_stats_simulation.csv'), index=False)

print('saving plot...')
power_plot(output_path + '/power_stats_simulation.csv', effect_size, output_path + "/plot/")
Loading

0 comments on commit 6643b5b

Please sign in to comment.