-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpar_rchip_gc.dmso.py
87 lines (76 loc) · 4.92 KB
/
par_rchip_gc.dmso.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Script with several functions that collects statistics for x (and y) arrays based on bin values for z (and w)
import numpy as np
import pandas as pd
chrs = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22']
constants = pd.read_csv('/home1/05664/pwoolley/work/scripts/bedgraph.standardize/standardize_constants.txt',sep='\t')
var = [
'/home1/05664/pwoolley/work/scripts/chip.smoothing/dna_vec/rolling_mean_100/gc_vec.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.abundance.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.par.atmi.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.par.dmso.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.atmi.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.dmso.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.atmi_nac.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.dmso_nac.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.tss.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.tts.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.promoter.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.terminator.'
]
def zw_bin_xy_stat_MODIFIED(array): # array[:,(z,x,w_y)]
array = array[array[:,0].argsort()] # sorting by GC value
zbin_no = len(array)//500_000
zbins = np.linspace(0,zbin_no*500_000,zbin_no+1) # Double check this line
zbins = np.digitize(np.linspace(0,len(array),len(array)),zbins) # Double check this line
out_array = []
for i in range(1,zbin_no+1):
tmp = array[zbins==i] # tmp[zbin==i,(z,x,w_y)]
tmp = tmp[tmp[:,2].argsort()] # sorting by RChIP value
wbin_no = len(tmp)//25_000 # number of 1k sized bins
wbins = np.linspace(0,wbin_no*25_000,wbin_no+1) # creating equally SIZED bins
wbins = np.digitize(np.linspace(0,len(tmp),len(tmp)),wbins) # assigning bins based on index value
u_z = np.array([np.mean(tmp[:,0])]*wbin_no)
u_x = np.array([np.mean(tmp[(wbins==j),1]) for j in range(1,wbin_no+1)])
# s_x = np.array([np.std(tmp[(wbins==j),1]) for j in range(1,wbin_no+1)])
# m_x = np.array([np.median(tmp[(wbins==j),1]) for j in range(1,wbin_no+1)])
# X_x = np.array([len(tmp[(wbins==j),1]) for j in range(1,wbin_no+1)])
u_y = np.array([np.mean(tmp[(wbins==j),2]) for j in range(1,wbin_no+1)])
# s_y = np.array([np.std(tmp[(wbins==j),2]) for j in range(1,wbin_no+1)])
# m_y = np.array([np.median(tmp[(wbins==j),2]) for j in range(1,wbin_no+1)])
# X_y = np.array([len(tmp[(wbins==j),2]) for j in range(1,wbin_no+1)])
out_array.append(np.vstack((u_z,u_x,u_y)).T)
#out_array.append(np.vstack((u_z,u_x,s_x,m_x,X_x,u_y,s_y,m_y,X_y)).T)
out_array = np.vstack(out_array)
return(out_array)
gc_abun = pd.DataFrame()
genome = []
for chrom in chrs:
z_chr = np.load(var[0]+chrom+'.npy').astype('float32') # GC
x_chr = np.load(var[3]+chrom+'.npy').astype('float32') # PAR
y_chr = np.load(var[5]+chrom+'.npy').astype('float32') # RChIP
locs = np.where((x_chr>0) & (y_chr>0) & (z_chr>0.45))
z_chr = z_chr[locs]
x_chr = x_chr[locs]
y_chr = y_chr[locs]
del locs
D = constants[constants['signal_type']==var[3].split('/')[-1].split('ppois')[0]+'ppois']['D_depth_normalization_factor'].item() # getting operation factor
x_chr = (x_chr*D).astype('float32') # operating on signal
D = constants[constants['signal_type']==var[5].split('/')[-1].split('ppois')[0]+'ppois']['D_depth_normalization_factor'].item() # getting operation factor
y_chr = (y_chr*D).astype('float32') # operating on signal
genome.append(np.vstack((z_chr,x_chr,y_chr))) # appending the chromosome signal to the array
del z_chr,x_chr,y_chr
genome = np.hstack(genome).T
bins = zw_bin_xy_stat_MODIFIED(genome) #
x_name = var[3].split('/')[-1] # ChIP name
y_name = var[5].split('/')[-1] # ChIP name
gc_abun['bin_gc_u'] = bins[:,0]
gc_abun[x_name+'global_gc.u'] = bins[:,1]
#gc_abun[x_name+'global_gc.s'] = bins[:,2]
#gc_abun[x_name+'global_gc.m'] = bins[:,3]
#gc_abun[x_name+'global_gc.X'] = bins[:,4]
gc_abun[y_name+'global_gc.u'] = bins[:,2]
#gc_abun[y_name+'global_gc.s'] = bins[:,6]
#gc_abun[y_name+'global_gc.m'] = bins[:,7]
#gc_abun[y_name+'global_gc.X'] = bins[:,8]
gc_abun.to_csv('par_x.rchip_wy.gc_z.dmso.figures.500K_25K_0.45GC.NAs.txt',sep='\t',index=False)
gc_abun.dropna().to_csv('par_x.rchip_wy.gc_z.dmso.figures.500K_25K_0.45GC.txt',sep='\t',index=False)