-
Notifications
You must be signed in to change notification settings - Fork 0
/
tss_gc.py
72 lines (65 loc) · 4.53 KB
/
tss_gc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Script with several functions that collects statistics for x (and y) arrays based on bin values for z (and w)
import numpy as np
import pandas as pd
chrs = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22']
constants = pd.read_csv('/home1/05664/pwoolley/work/scripts/bedgraph.standardize/standardize_constants.txt',sep='\t')
var = [
'/home1/05664/pwoolley/work/scripts/chip.smoothing/dna_vec/rolling_mean_100/gc_vec.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.abundance.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.par.atmi.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.par.dmso.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.atmi.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.dmso.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.atmi_nac.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rolling_mean_100/shsy5y.rchip.dmso_nac.ppois.rolling_mean_100.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.tss.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.tts.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.promoter.',
'/scratch/05664/pwoolley/SHSY5Y_ChIP/chip.smoothing/rna_seq_vec/sh.august2022.gene.terminator.'
]
def z_bin_x_stat(array,bin_no,range_bin): # array[:,(z,x)]
if bin_no == False:
array = array[array[:, 0].argsort()] # sort array by z vector
bin_no = len(array)//250_000 # Determining bin no
bins = np.linspace(0,bin_no*250_000,bin_no+1) # creating equally SIZED bins
bins = np.digitize(np.linspace(0,len(array),len(array)),bins) # assigning bins based on index value
u = np.array([np.mean(array[bins==i,1]) for i in range(1,bin_no+1)]) # mean x value in each bin
s = np.array([np.std(array[bins==i,1]) for i in range(1,bin_no+1)]) # standard deviation of x in each bin
m = np.array([np.median(array[bins==i,1]) for i in range(1,bin_no+1)])
X = np.array([len(array[bins==i,1]) for i in range(1,bin_no+1)])
u_z = np.array([np.mean(array[bins==i,0]) for i in range(1,bin_no+1)]) # mean z value in each bin
return(np.vstack((u_z, u,s,m,X)).T)
elif bin_no:
bins = np.linspace(range_bin[0],range_bin[1],bin_no+1) # creating equally spaced bins of a set number from min(x) to max(x)
array[:,0] = np.digitize(array[:,0],bins) # assigning bins to x array
u = np.array([np.mean(array[array[:,0]==i,1]) for i in range(1,bin_no+1)]) # mean y value in each x bin
s = np.array([np.std(array[array[:,0]==i,1]) for i in range(1,bin_no+1)]) # standard deviation of y in each x bin
m = np.array([np.median(array[array[:,0]==i,1]) for i in range(1,bin_no+1)])
X = np.array([len(array[array[:,0]==i,1]) for i in range(1,bin_no+1)])
return(np.vstack((u,s,m,X)).T)
for i in range(2,8):
genome = []
for chrom in chrs:
z_chr = np.load(var[0]+chrom+'.npy').astype('float32') # GC
x_chr = np.load(var[i]+chrom+'.npy').astype('float32') # ChIP
tss = np.load(var[8]+chrom+'.npy').astype('intc') # TSS
locs = np.where((tss>0) & (x_chr>0))
del tss
z_chr = z_chr[locs]
x_chr = x_chr[locs]
del locs
D = constants[constants['signal_type']==var[i].split('/')[-1].split('ppois')[0]+'ppois']['D_depth_normalization_factor'].item() # getting operation factor
x_chr = (x_chr*D).astype('float32') # operating on signal
genome.append(np.vstack((z_chr,x_chr))) # appending the chromosome signal to the array
del z_chr,x_chr
genome = np.hstack(genome).T
bins = z_bin_x_stat(genome,False,False) # Bins of equal size, not spacing
x_name = var[i].split('/')[-1] # ChIP name
gc_abun = pd.DataFrame()
gc_abun[x_name+'tss_gc_bin.u'] = bins[:,0]
gc_abun[x_name+'tss_gc.u'] = bins[:,1]
gc_abun[x_name+'tss_gc.s'] = bins[:,2]
gc_abun[x_name+'tss_gc.m'] = bins[:,3]
gc_abun[x_name+'tss_gc.X'] = bins[:,4]
gc_abun.to_csv('./outputs/tss_gc/'+x_name+'tss_gc.250K.NAs.txt',sep='\t',index=False)
gc_abun.dropna().to_csv('./outputs/tss_gc/'+x_name+'tss_gc.250K.txt',sep='\t',index=False)