forked from usatlas-ml-training/Anomaly-Detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_datasets.py
80 lines (65 loc) · 3.33 KB
/
create_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import argparse
import h5py
from sklearn.model_selection import train_test_split
import numpy as np
def create_datasets_dense(bkg_file, output_bkg_name, signals_files, output_signal_names, events=None, test_size=0.2, val_size=0.2, input_shape=57):
    """Write flattened background and signal datasets for a dense model.

    The background file's ``'Particles'`` dataset is read with the last entry
    of its third axis dropped, shuffled along the first axis, optionally
    truncated to ``events`` rows, split into train/test/validation parts and
    each part reshaped to ``(n_rows, input_shape)`` before being stored in
    ``<output_bkg_name>_dataset.h5`` under ``'X_train'``, ``'X_test'`` and
    ``'X_val'``.  Every signal file is read and reshaped the same way (no
    shuffling or splitting) and stored in
    ``<output_signal_names[i]>_dataset.h5`` under ``'Data'``.

    Args:
        bkg_file: Path of the background HDF5 file to read.
        output_bkg_name: Output path prefix for the background dataset file.
        signals_files: Paths of signal HDF5 files; falsy to skip signals.
        output_signal_names: Output path prefixes, one per signal file
            (matched by position).
        events: If truthy, keep only this many shuffled background rows.
        test_size: ``test_size`` passed to the first ``train_test_split``.
        val_size: ``test_size`` passed to the second ``train_test_split``,
            i.e. it is relative to what remains after the test split.
        input_shape: Length of each flattened row; must equal the product of
            the two trailing dimensions of the sliced data.

    Raises:
        ValueError: If ``signals_files`` is given but ``output_signal_names``
            has fewer entries than ``signals_files``.
    """
    # Fail before any expensive work if a signal file has no output name.
    if signals_files:
        n_names = len(output_signal_names) if output_signal_names else 0
        if n_names < len(signals_files):
            raise ValueError(
                'output_signal_names must provide one name per entry in '
                'signals_files (got %d names for %d files)'
                % (n_names, len(signals_files))
            )

    # read BACKGROUND data
    with h5py.File(bkg_file, 'r') as file:
        full_data = file['Particles'][:, :, :-1]
    # Shuffle before truncating so the kept subset is a random sample.
    np.random.shuffle(full_data)
    if events:
        full_data = full_data[:events, :, :]

    # define training, test and validation datasets
    X_train, X_test = train_test_split(full_data, test_size=test_size, shuffle=True)
    X_train, X_val = train_test_split(X_train, test_size=val_size)
    # Release the full array before the reshaped copies are written out.
    del full_data

    # flatten the data for model input
    X_train = X_train.reshape(X_train.shape[0], input_shape)
    X_test = X_test.reshape(X_test.shape[0], input_shape)
    X_val = X_val.reshape(X_val.shape[0], input_shape)

    with h5py.File(output_bkg_name + '_dataset.h5', 'w') as h5f:
        h5f.create_dataset('X_train', data=X_train)
        h5f.create_dataset('X_test', data=X_test)
        h5f.create_dataset('X_val', data=X_val)

    if signals_files:
        # read SIGNAL data
        for signal_file, output_name in zip(signals_files, output_signal_names):
            # The slice is materialised in memory, so the input file can be
            # closed as soon as it has been read.
            with h5py.File(signal_file, 'r') as signal_h5:
                signal_data = signal_h5['Particles'][:, :, :-1]
            signal_data = signal_data.reshape(signal_data.shape[0], input_shape)
            with h5py.File(output_name + '_dataset.h5', 'w') as h5f2:
                h5f2.create_dataset('Data', data=signal_data)
def create_datasets_convolutional(bkg_file, output_bkg_name, signals_files, output_signal_names, events=None, test_size=0.2, val_size=0.2, input_shape=57):
    """Write unflattened background and signal datasets for a convolutional model.

    The background file's ``'Particles'`` dataset is read with the last entry
    of its third axis dropped, shuffled along the first axis, optionally
    truncated to ``events`` rows and split into train/test/validation parts,
    which are stored with their 3-D layout intact in
    ``<output_bkg_name>_dataset.h5`` under ``'X_train'``, ``'X_test'`` and
    ``'X_val'``.  Every signal file is read with the same slice (no shuffling
    or splitting) and stored in ``<output_signal_names[i]>_dataset.h5`` under
    ``'Data'``.

    Args:
        bkg_file: Path of the background HDF5 file to read.
        output_bkg_name: Output path prefix for the background dataset file.
        signals_files: Paths of signal HDF5 files; falsy to skip signals.
        output_signal_names: Output path prefixes, one per signal file
            (matched by position).
        events: If truthy, keep only this many shuffled background rows.
        test_size: ``test_size`` passed to the first ``train_test_split``.
        val_size: ``test_size`` passed to the second ``train_test_split``,
            i.e. it is relative to what remains after the test split.
        input_shape: Not used by this function; accepted so the signature
            matches ``create_datasets_dense``.

    Raises:
        ValueError: If ``signals_files`` is given but ``output_signal_names``
            has fewer entries than ``signals_files``.
    """
    # Fail before any expensive work if a signal file has no output name.
    if signals_files:
        n_names = len(output_signal_names) if output_signal_names else 0
        if n_names < len(signals_files):
            raise ValueError(
                'output_signal_names must provide one name per entry in '
                'signals_files (got %d names for %d files)'
                % (n_names, len(signals_files))
            )

    # read BACKGROUND data
    with h5py.File(bkg_file, 'r') as file:
        full_data = file['Particles'][:, :, :-1]
    # Shuffle before truncating so the kept subset is a random sample.
    np.random.shuffle(full_data)
    if events:
        full_data = full_data[:events, :, :]

    # define training, test and validation datasets
    X_train, X_test = train_test_split(full_data, test_size=test_size, shuffle=True)
    X_train, X_val = train_test_split(X_train, test_size=val_size)
    # Release the full array; only the splits are needed from here on.
    del full_data

    with h5py.File(output_bkg_name + '_dataset.h5', 'w') as h5f:
        h5f.create_dataset('X_train', data=X_train)
        h5f.create_dataset('X_test', data=X_test)
        h5f.create_dataset('X_val', data=X_val)

    if signals_files:
        # read SIGNAL data
        for signal_file, output_name in zip(signals_files, output_signal_names):
            # The slice is materialised in memory, so the input file can be
            # closed as soon as it has been read.
            with h5py.File(signal_file, 'r') as signal_h5:
                signal_data = signal_h5['Particles'][:, :, :-1]
            with h5py.File(output_name + '_dataset.h5', 'w') as h5f2:
                h5f2.create_dataset('Data', data=signal_data)
if __name__ == '__main__':
    # Command-line entry point: every option maps one-to-one onto a keyword
    # argument of create_datasets_dense, so the parsed namespace is forwarded
    # unchanged.
    parser = argparse.ArgumentParser(
        description='Create train/test/validation and signal HDF5 datasets.'
    )
    # These two are used unconditionally to open/name files, so a missing
    # value is reported by argparse instead of failing later with None.
    parser.add_argument('--bkg_file', type=str, required=True,
                        help='input background HDF5 file')
    parser.add_argument('--output_bkg_name', type=str, required=True,
                        help="output prefix; '_dataset.h5' is appended")
    parser.add_argument('--signals_files', type=str, action='append',
                        help='input signal HDF5 file (may be repeated)')
    parser.add_argument('--output_signal_names', type=str, action='append',
                        help='output prefix for the matching --signals_files '
                             'entry (may be repeated, same order)')
    parser.add_argument('--events', type=int, default=None,
                        help='maximum number of background events to keep')
    parser.add_argument('--test_size', type=float, default=0.2,
                        help='test_size for the train/test split')
    parser.add_argument('--val_size', type=float, default=0.2,
                        help='test_size for the train/validation split')
    parser.add_argument('--input_shape', type=int, default=57,
                        help='length of each flattened event')
    args = parser.parse_args()
    create_datasets_dense(**vars(args))