-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathdivide_data.py
100 lines (77 loc) · 3.13 KB
/
divide_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import csv
from random import shuffle
from xml.etree import ElementTree
from utils import *
def _subdirectories(parent):
    # Return the sorted full paths of the directories directly inside `parent`.
    return sorted(
        os.path.join(parent, name)
        for name in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, name))
    )


def _read_label(xml_path):
    # Parse one label file: the first five children of the XML root are read,
    # in order, as x, y, w, h, a and converted to float32.
    root = ElementTree.parse(xml_path).getroot()
    return [np.float32(root[k].text) for k in range(5)]


def process_data(original_path, portion, max_coord=192):
    """
    Pair every image with its label file, drop samples whose label is out of
    bound, and write a shuffled train/validation split to CSV files.

    The folders walked are ``original_path/<location>/<subject>/``. Inside a
    subject folder each ``*.bmp`` image must have a label file with the same
    base name and the ``.xml`` extension. Each CSV row is
    ``[image_path, x, y, w, h, a]``.

    :param original_path: path to the original data
    :param portion: split fractions as a sequence, e.g. [0.85, 0.15]; only
        portion[0] (the train fraction) is read, everything after the train
        slice becomes the validation set
    :param max_coord: exclusive upper bound for the x and y label values;
        samples with x or y outside the open interval (0, max_coord) are
        reported and skipped
    :raises ValueError: if an image has no matching label file, or a label
        file has no matching image
    :return: None; writes train_data.csv and valid_data.csv into data/
    """
    # collect image and label paths, listing every subject folder only once
    images_list = []
    label_paths = set()
    for location in _subdirectories(original_path):
        for subject in _subdirectories(location):
            for name in os.listdir(subject):
                if name.endswith(".bmp"):
                    images_list.append(os.path.join(subject, name))
                elif name.endswith(".xml"):
                    label_paths.add(os.path.join(subject, name))
    images_list.sort()

    # Match each image to its label by base name. Matching two independently
    # sorted lists by index can silently attach the wrong label to an image
    # when a file is missing or when the sort order differs between the lists.
    expected = {img: os.path.splitext(img)[0] + ".xml" for img in images_list}
    missing = [img for img, xml in expected.items() if xml not in label_paths]
    orphans = sorted(label_paths.difference(expected.values()))
    if missing or orphans:
        raise ValueError(
            "images and labels do not match: {0} image(s) without label {1}, "
            "{2} label(s) without image {3}".format(
                len(missing), missing[:5], len(orphans), orphans[:5]))

    # contain both images and labels in one list
    data = []
    for img in images_list:
        x, y, w, h, a = _read_label(expected[img])
        # the chained comparison also rejects NaN coordinates
        if not (0 < x < max_coord and 0 < y < max_coord):
            print("label for {0} is out of bound".format(img))
            continue
        data.append([img, x, y, w, h, a])

    # shuffle data so the split below is random
    shuffle(data)

    # the first ceil(len * portion[0]) samples are train, the rest validation
    train_len = int(np.ceil(len(data) * portion[0]))
    train_data = data[:train_len]
    valid_data = data[train_len:]

    # save all lists to data/ folder
    saveCSV(train_data, "train_data.csv", "data/")
    saveCSV(valid_data, "valid_data.csv", "data/")
    print("There are {0} images in train set".format(len(train_data)))
    print("There are {0} images in validation set".format(len(valid_data)))
def saveCSV(data_list, output_name, save_path):
    """
    Save a list of rows into a CSV file.

    :param data_list: iterable of rows, each row an iterable of values
    :param output_name: file name of the CSV file to create
    :param save_path: folder the file is written into; it is handed to
        check_dir before the file is opened
    :return: None
    """
    # NOTE(review): check_dir is not defined here (it comes in through the
    # utils star import); presumably it creates save_path when missing - confirm
    check_dir(save_path)
    p = os.path.join(save_path, output_name)
    # newline="" hands line-ending control to the csv module; without it the
    # writer's "\r\n" gets translated again and yields blank rows on Windows
    with open(p, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(data_list)
    print("{0} has been successfully saved on {1}".format(output_name, save_path))
if __name__ == "__main__":
    # Script entry point: split the original data set, 90% train / 10% validation.
    source_dir = "data/Original-data"
    split_portion = [0.9, 0.1]
    process_data(source_dir, split_portion)
    print("done...")