-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathassemble_design.py
119 lines (99 loc) · 5.19 KB
/
assemble_design.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
### Make design of simulated ST datasets from single-cell data
import argparse
import pickle
import numpy as np
import pandas as pd
parser = argparse.ArgumentParser()
# parser.add_argument('lbl_gen_file', type=str,
# help='path to label generation pickle file')
# parser.add_argument('cnt_gen_file', type=str,
# help='path to label generation pickle file')
parser.add_argument('seed', type=int,
help='random seed of split')
parser.add_argument('--tot_spots', dest='tot_spots', type=int,
default=1000,
help='Total number of spots to simulate')
parser.add_argument('--mean_high', dest='mean_high', type=float,
default=2.5,
help='Mean cell density for high-density cell types')
parser.add_argument('--mean_low', dest='mean_low', type=float,
default=0.8,
help='Mean cell density for low-density cell types')
parser.add_argument('--percent_uniform', dest='percent_uniform', type=float,
default=80,
help='Sparsity of uniform cell types (% non-zero spots of total spots)')
parser.add_argument('--percent_sparse', dest='percent_sparse', type=float,
default=10,
help='Sparsity of sparse cell types (% non-zero spots of total spots)')
parser.add_argument('--annotation_col', dest='anno_col', type=str,
default="annotation_1",
help='Name of column to use in annotation file (default: annotation_1)')
parser.add_argument('--out_dir', dest='out_dir', type=str,
default='/nfs/team283/ed6/simulation/lowdens_synthetic_ST_fewcells/',
help='Output directory')
parser.add_argument('--assemble_id', dest='assemble_id', type=int,
default=1,
help='ID of ST assembly')
args = parser.parse_args()
# lbl_gen_file = args.lbl_gen_file
# count_gen_file = args.cnt_gen_file
seed = args.seed
tot_spots = args.tot_spots
mean_high = args.mean_high
mean_low = args.mean_low
percent_uniform = args.percent_uniform
percent_sparse = args.percent_sparse
out_dir = args.out_dir
assemble_id = args.assemble_id
anno_col = args.anno_col
### Load input data ###
lbl_gen_file = out_dir + "labels_generation_" + str(seed) + ".p"
count_gen_file = out_dir + "counts_generation_" + str(seed) + ".p"
lbl_generation = pickle.load(open(lbl_gen_file, "rb"))
cnt_generation = pickle.load(open(count_gen_file, "rb"))
uni_labels = lbl_generation[anno_col].unique()
labels = lbl_generation
cnt = cnt_generation
### Define uniform VS sparse cell types (w more sparse = 0)
uniform_ct = np.random.choice([0, 1], size=len(uni_labels), p=[0.8, 0.2])
#### Define low VS high density cell types (w more low density = 1)
uni_low = np.random.choice([0, 1], size=len(uni_labels[uniform_ct == 1]), p=[0.2, 0.8])
reg_low = np.random.choice([0, 1], size=len(uni_labels[uniform_ct == 0]), p=[0.2, 0.8])
design_df = pd.DataFrame({'uniform': uniform_ct}, index=uni_labels)
design_df['density'] = np.nan
design_df.loc[design_df.index[design_df.uniform == 1], 'density'] = uni_low
design_df.loc[design_df.index[design_df.uniform == 0], 'density'] = reg_low
### Generate no of spots per cell type
# Uniform ~ 60% of spots, sparse ~ 5% of spots
mean_unif = round((tot_spots / 100) * percent_uniform)
mean_sparse = round((tot_spots / 100) * percent_sparse)
sigma_unif = np.sqrt(mean_unif / 0.3)
sigma_sparse = np.sqrt(mean_sparse / 0.3)
shape_unif = mean_unif ** 2 / sigma_unif ** 2
scale_unif = sigma_unif ** 2 / mean_unif
shape_sparse = mean_sparse ** 2 / sigma_sparse ** 2
scale_sparse = sigma_sparse ** 2 / mean_sparse
unif_nspots = np.round(np.random.gamma(shape=shape_unif, scale=scale_unif, size=sum(design_df.uniform == 1)))
sparse_nspots = np.round(np.random.gamma(shape=shape_sparse, scale=scale_sparse, size=sum(design_df.uniform == 0)))
# if samples n spots is greater than total number of spots trim to the total
if (unif_nspots > tot_spots).sum() >= 1:
unif_nspots[unif_nspots > tot_spots] = tot_spots
if (sparse_nspots > tot_spots).sum() >= 1:
sparse_nspots[sparse_nspots > tot_spots] = tot_spots
design_df['nspots'] = np.nan
design_df.loc[design_df.index[design_df.uniform == 1], 'nspots'] = unif_nspots
design_df.loc[design_df.index[design_df.uniform == 0], 'nspots'] = sparse_nspots
### Generate avg density per spot per cell type
sigma_low = np.sqrt(mean_low / 2)
sigma_high = np.sqrt(mean_high / 2)
shape_low = mean_low ** 2 / sigma_low ** 2
scale_low = sigma_low ** 2 / mean_low
shape_high = mean_high ** 2 / sigma_high ** 2
scale_high = sigma_high ** 2 / mean_high
low_ncells_mean = np.random.gamma(shape=shape_low, scale=scale_low, size=sum(design_df.density == 1))
high_ncells_mean = np.random.gamma(shape=shape_high, scale=scale_high, size=sum(design_df.density == 0))
design_df['mean_ncells'] = np.nan
design_df.loc[design_df.index[design_df.density == 1], 'mean_ncells'] = low_ncells_mean
design_df.loc[design_df.index[design_df.density == 0], 'mean_ncells'] = high_ncells_mean
out_name = out_dir + "synthetic_ST_seed" + lbl_gen_file.split("_")[-1].rstrip(".p") + "_" + "design" + ".csv"
design_df.to_csv(out_name, sep=",", index=True, header=True)