-
Notifications
You must be signed in to change notification settings - Fork 15
/
load_mnist.py
123 lines (101 loc) · 3.81 KB
/
load_mnist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
import os
import pdb
import matplotlib.pyplot as plt
# Root directory under which mnist() looks for the 'mnist/' data folder.
# NOTE(review): this is a machine-specific absolute path — confirm or adjust
# it for the local checkout before running.
datasets_dir = '/home/kunal/Desktop/CSE569_Project/ScratchImplementation/'
def one_hot(x, n):
    """Return the one-hot encoding of a collection of class labels.

    Args:
        x: Class labels as a list, tuple or array of any shape. The input is
            flattened, and its values are cast to ``int`` so that float-typed
            labels such as ``3.0`` can be used as column indices (fractional
            parts are truncated).
        n: Number of classes, i.e. the number of columns in the result.

    Returns:
        A float array of shape ``(number_of_labels, n)`` in which row ``i``
        holds a single ``1`` in column ``x[i]`` and zeros elsewhere.

    Raises:
        IndexError: If a label is outside ``[-n, n)``.
    """
    # asarray accepts lists, tuples and arrays alike; the int cast is needed
    # because NumPy refuses float arrays (including empty ones) as indices.
    labels = np.asarray(x).flatten().astype(int)
    o_h = np.zeros((len(labels), n))
    o_h[np.arange(len(labels)), labels] = 1
    return o_h
def train_validation_split(noTrSamples=1000, noTsSamples=100,
                           digit_range=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                           noTrPerClass=100, noTsPerClass=10, trData=[], trLabels=[]):
    """Split a labelled training pool into class-balanced train and validation sets.

    For every digit in ``digit_range`` the first ``noTrPerClass`` matching
    rows of ``trData`` become training samples and the next ``noTsPerClass``
    matching rows become validation samples, so the two sets never share a
    row. The validation set is then shuffled with a fixed seed.

    Args:
        noTrSamples: Total number of training samples to return.
        noTsSamples: Total number of validation samples to return.
        digit_range: Class labels to include, in output order.
        noTrPerClass: Training samples taken per class.
        noTsPerClass: Validation samples taken per class.
        trData: Array of shape ``(num_samples, 28*28)``, one image per row.
        trLabels: Labels for the rows of ``trData``; flattened before use.

    Returns:
        ``(trX, trY, tsX, tsY)`` where ``trX`` is ``(28*28, noTrSamples)``,
        ``trY`` is ``(1, noTrSamples)``, ``tsX`` is ``(28*28, noTsSamples)``
        and ``tsY`` is ``(1, noTsSamples)``.

    Raises:
        ValueError: If a class has fewer than ``noTrPerClass + noTsPerClass``
            samples in ``trLabels``.
    """
    trData = np.asarray(trData)
    trLabels = np.asarray(trLabels).reshape(-1)

    tsX = np.zeros((noTsSamples, 28*28))
    trX = np.zeros((noTrSamples, 28*28))
    tsY = np.zeros(noTsSamples)
    trY = np.zeros(noTrSamples)

    per_class_total = noTrPerClass + noTsPerClass
    for count, ll in enumerate(digit_range):
        class_rows = np.where(trLabels == ll)[0]
        # Fail loudly: a short index array would otherwise either raise an
        # opaque broadcast error or silently repeat a single row.
        if len(class_rows) < per_class_total:
            raise ValueError(
                'class %r has %d samples, but %d are required'
                % (ll, len(class_rows), per_class_total))

        # Train data: the first noTrPerClass rows of this class.
        src = class_rows[:noTrPerClass]
        dst = slice(count*noTrPerClass, (count+1)*noTrPerClass)
        trX[dst, :] = trData[src, :]
        trY[dst] = trLabels[src]

        # Validation data: the following noTsPerClass rows of this class,
        # taken from the same pool but disjoint from the training rows.
        src = class_rows[noTrPerClass:per_class_total]
        dst = slice(count*noTsPerClass, (count+1)*noTsPerClass)
        tsX[dst, :] = trData[src, :]
        tsY[dst] = trLabels[src]

    # Fixed seed keeps the validation order reproducible between runs.
    np.random.seed(1)
    test_idx = np.random.permutation(tsX.shape[0])
    tsX = tsX[test_idx, :]
    tsY = tsY[test_idx]

    # Callers receive one sample per column and labels as a row vector.
    trX = trX.T
    tsX = tsX.T
    trY = trY.reshape(1, -1)
    tsY = tsY.reshape(1, -1)
    return trX, trY, tsX, tsY
def _read_idx_payload(path, header_bytes):
    """Return the bytes of an IDX file after its header as a flat uint8 array."""
    # Passing the path instead of an open handle lets NumPy open the file in
    # binary mode and close it again, so no file descriptor is leaked.
    return np.fromfile(path, dtype=np.uint8)[header_bytes:]


def _take_per_class(data, labels, digit_range, per_class, n_samples):
    """Copy the first `per_class` rows of each requested digit into class-ordered arrays."""
    X = np.zeros((n_samples, 28*28))
    Y = np.zeros(n_samples)
    for count, digit in enumerate(digit_range):
        src = np.where(labels == digit)[0][:per_class]
        if len(src) < per_class:
            raise ValueError(
                'digit %r has only %d samples, but %d were requested'
                % (digit, len(src), per_class))
        dst = slice(count*per_class, (count+1)*per_class)
        X[dst, :] = data[src, :]
        Y[dst] = labels[src]
    return X, Y


def mnist(noTrSamples=1000, noTsSamples=100,
          digit_range=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
          noTrPerClass=100, noTsPerClass=10, data_dir=None):
    """Load a class-balanced subset of MNIST from IDX files on disk.

    Reads the four files ``train-images-idx3-ubyte``,
    ``train-labels-idx1-ubyte``, ``t10k-images-idx3-ubyte`` and
    ``t10k-labels-idx1-ubyte``, scales pixel values to ``[0, 1]`` and keeps
    the first ``noTrPerClass`` / ``noTsPerClass`` samples of every digit in
    ``digit_range``. The test subset is shuffled with a fixed seed.

    Args:
        noTrSamples: Total number of training samples; must equal
            ``noTrPerClass * len(digit_range)``.
        noTsSamples: Total number of test samples; must equal
            ``noTsPerClass * len(digit_range)``.
        digit_range: Digits to include, in output order.
        noTrPerClass: Training samples taken per digit.
        noTsPerClass: Test samples taken per digit.
        data_dir: Directory holding the four IDX files. Defaults to the
            ``mnist/`` folder under the module-level ``datasets_dir``.

    Returns:
        ``(trX, trY, tsX, tsY)`` where ``trX`` is ``(28*28, noTrSamples)``,
        ``trY`` is ``(1, noTrSamples)``, ``tsX`` is ``(28*28, noTsSamples)``
        and ``tsY`` is ``(1, noTsSamples)``; all arrays are float.

    Raises:
        AssertionError: If the sample totals do not match the per-class counts.
        ValueError: If an image file and its label file disagree on the
            number of samples, or a digit has too few samples.
    """
    assert noTrSamples == noTrPerClass*len(digit_range), 'noTrSamples and noTrPerClass mismatch'
    assert noTsSamples == noTsPerClass*len(digit_range), 'noTsSamples and noTsPerClass mismatch'

    if data_dir is None:
        data_dir = os.path.join(datasets_dir, 'mnist/')

    # Image files carry a 16-byte header, label files an 8-byte header.
    # reshape(-1, ...) derives the sample count from the file size instead of
    # hard-coding 60000 / 10000.
    trData = _read_idx_payload(
        os.path.join(data_dir, 'train-images-idx3-ubyte'), 16
    ).reshape((-1, 28*28)).astype(float)
    trLabels = _read_idx_payload(
        os.path.join(data_dir, 'train-labels-idx1-ubyte'), 8
    ).reshape(-1).astype(float)
    tsData = _read_idx_payload(
        os.path.join(data_dir, 't10k-images-idx3-ubyte'), 16
    ).reshape((-1, 28*28)).astype(float)
    tsLabels = _read_idx_payload(
        os.path.join(data_dir, 't10k-labels-idx1-ubyte'), 8
    ).reshape(-1).astype(float)

    if len(trData) != len(trLabels):
        raise ValueError('training images and labels differ in length')
    if len(tsData) != len(tsLabels):
        raise ValueError('test images and labels differ in length')

    # Scale raw 0..255 pixel intensities to 0..1.
    trData = trData/255.
    tsData = tsData/255.

    trX, trY = _take_per_class(trData, trLabels, digit_range, noTrPerClass, noTrSamples)
    tsX, tsY = _take_per_class(tsData, tsLabels, digit_range, noTsPerClass, noTsSamples)

    # Fixed seed keeps the test order reproducible between runs.
    np.random.seed(1)
    test_idx = np.random.permutation(tsX.shape[0])
    tsX = tsX[test_idx, :]
    tsY = tsY[test_idx]

    # Callers receive one sample per column and labels as a row vector.
    trX = trX.T
    tsX = tsX.T
    trY = trY.reshape(1, -1)
    tsY = tsY.reshape(1, -1)
    return trX, trY, tsX, tsY
def main():
    """Load a small three-digit MNIST subset and display one training image.

    Prints the shape of the training matrix, then shows the training sample
    in column 5 as a 28-row image with its label as the plot title.
    """
    trX, trY, tsX, tsY = mnist(noTrSamples=30,
                               noTsSamples=15, digit_range=[0, 5, 8],
                               noTrPerClass=10, noTsPerClass=5)
    print(trX.shape)

    sample = 5
    plt.imshow(trX[:, sample].reshape(28, -1))
    # Show the sample's label as the title so the image can be checked
    # against it; a bare `trY[0, 5]` expression would have no effect.
    plt.title('label: %d' % trY[0, sample])
    plt.show()


if __name__ == "__main__":
    main()