import requests
import numpy as np
from time import sleep
import json
import os
import matplotlib.pyplot as plt
class NumpyException(Exception):
pass
class SubDictException(Exception):
pass
def pp(obj):
print(json.dumps(obj, indent=4))
class DotaData(object):
def __init__(self):
self.base_api = "https://api.opendota.com/api/"
def get(self, api):
'''
Uses the requests module to get data from the api
Returns the python object that corresponds to the api's json
'''
try:
r = requests.get("{}{}".format(self.base_api, api))
if r.status_code == 200:
return r.status_code, r.json()
else:
return r.status_code, None
except (requests.exceptions.RequestException, ValueError):
# treat request failures (connection errors, timeouts, bad json) as a 404 with no payload
return 404, None
def get_schema(self):
'''
The dota api has a schema endpoint. This method parses the json returned from it into a
dict with table names as keys and their columns as list values, because the format of
the endpoint itself is incomprehensible.
'''
status, schema = self.get('schema')
redone = {}
for s in schema:
if s['table_name'] not in redone:
redone[s['table_name']] = []
if s['column_name'] not in redone[s['table_name']]:
redone[s['table_name']].append(s['column_name'])
self.write_json_file('schema.json', redone)
def extract_base_features(self, data):
'''
Extracts keys (features) from a list of dicts
Returns a set of features that all dicts contain,
as well as a set of all the extra keys
(extra keys are returned to show how inconsistent opendota's api is;
only features in the base_feature_set should be used)
'''
if isinstance(data, list):
base_feature_set = set(data[0].keys())
extras = set()
for d in data:
f = set(d.keys())
extras = extras.union(base_feature_set.symmetric_difference(f))
base_feature_set = base_feature_set.intersection(f)
return base_feature_set, extras
elif isinstance(data, dict):
return data.keys(), set()
def np_ize(self, data, np_only=False):
'''
Turns a list of dicts into an np array (with np_only=True, the data is simply wrapped in np.array)
Returns the sorted subset of keys that belong to all dicts in the list, plus a numpy array
whose columns follow that sorted key order
'''
if np_only:
return np.array(data)
if isinstance(data, list):
if len(data) > 0 and isinstance(data[0], dict):
features = data[0].keys()
if all(d.keys() == features for d in data):
# rows are built in sorted-key order, so return the feature names sorted to match
rows = [[v for k, v in sorted(d.items())] for d in data]
return sorted(features), np.array(rows)
else:
base_feature_set, extra_features = self.extract_base_features(data)
rows = [[v for k, v in sorted(d.items()) if k in base_feature_set] for d in data]
return sorted(base_feature_set), np.array(rows)
raise NumpyException("Unable to transform data into numpy array")
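# Illustrative example (not from the original code): np_ize builds each row in sorted-key
# order, so the returned feature names line up with the array columns, e.g.
#   >>> d = DotaData()
#   >>> features, arr = d.np_ize([{'kills': 3, 'deaths': 1}, {'kills': 5, 'deaths': 2}])
#   >>> features
#   ['deaths', 'kills']
#   >>> arr
#   array([[1, 3],
#          [2, 5]])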
def sub_dicts(self, data, desired_keys):
'''
Transforms dictionaries into sub dictionaries
Returns a list of dictionaries comprised of the desired_keys
'''
base_feature_set, extras = self.extract_base_features(data)
if all(k in base_feature_set for k in desired_keys):
return [{k: d[k] for k in desired_keys} for d in data]
raise SubDictException("Unable to extract sub dict. Not all keys exist in every member of the data set.")
def read_json_file(self, filepath):
'''
Reads a json file
Returns a python object that corresponds to the file's json
'''
with open(filepath, 'r') as f:
return json.load(f)
def write_json_file(self, filepath, data):
'''
Writes a json file; writes json of python object
'''
with open(filepath, 'w') as f:
json.dump(data, f)
def shorten_data(self, data, desired_keys):
'''
Takes in data as a list of dicts, and desired_keys, which mimics the structure of the dicts to be returned
Returns a list of dicts with the structure of desired_keys (2 levels only)
'''
assert isinstance(desired_keys, dict)
if data is not None:
_data = self.sub_dicts(data, desired_keys.keys())
for d in _data:
for key, value in desired_keys.items():
if value is not None:
assert isinstance(value, list)
assert isinstance(d[key], list)
d[key] = self.sub_dicts(d[key], value)
return _data
return data
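# Illustrative example (not from the original code): desired_keys mirrors the structure to
# keep, two levels deep; None keeps a key's value as-is, while a list names the sub-keys to
# keep from each dict inside a nested list, e.g.
#   >>> d = DotaData()
#   >>> match = {'match_id': 1, 'radiant_win': True,
#   ...          'players': [{'hero_id': 7, 'isRadiant': True, 'gold': 900}]}
#   >>> d.shorten_data([match], {'players': ['isRadiant', 'hero_id'], 'radiant_win': None})
#   [{'players': [{'isRadiant': True, 'hero_id': 7}], 'radiant_win': True}]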
class BasicHeroData(DotaData):
'''
Basic usage: get dota data however you can, whether loaded from a file or directly from the api,
then call load_data on it.
load_data takes in a list of dicts in the format supplied by the matches/ endpoint, shortens the data if it
is not already shortened, and transforms it into a numpy-friendly format.
load_saved_hero_data loads an already numpy-friendly format.
A minimal usage sketch is included at the bottom of this file.
'''
def __init__(self):
super(BasicHeroData, self).__init__()
self.hero_features, self.hero_id_index_map = self.heroes()
self.target_labels = ['radiant_win']
def heroes(self):
'''
gets hero data from the heroes file (which in turn comes from the heroStats/ endpoint)
maps the supplied ids to indexes because some hero ids are skipped in the sequence (we don't know why)
returns a doubled list of hero features (radiant copies then dire copies) and the id_index_map
'''
heroes = self.read_json_file('./Data/heroes.json')
id_name_map = {h['id']: h['name'] for h in heroes}
ids = id_name_map.keys()
id_index_map = {x: i for i, x in enumerate(ids)}
hero_features = [''] * (2 * (len(id_index_map)))
for i, hero_id in enumerate(id_index_map):
hero_features[i] = '{}_{}'.format(id_name_map[hero_id], 'radiant')
hero_features[i + len(id_index_map)] = '{}_{}'.format(id_name_map[hero_id], 'dire')
return hero_features, id_index_map
def process_matches(self):
'''
Uses hero_id_index_map to create a list for each match that has ones
for a hero pick and zeroes for all other heroes
For both radiant and dire teams in the match
Returns an array of such matches and the outcome of each match in data, targets
'''
targets = []
data = []
for match in self.shortened_data:
datum = [0] * (2 * len(self.hero_id_index_map))
for player in match['players']:
try:
if player['isRadiant']:
index = self.hero_id_index_map[player['hero_id']]
else:
index = self.hero_id_index_map[player['hero_id']] + len(self.hero_id_index_map)
datum[index] = 1
except KeyError: # some matches contain hero_ids that aren't in heroes.json; zero out and skip the whole match
datum = [0] * len(datum)
break
if any([x != 0 for x in datum]):
data.append(datum)
targets.append([int(match['radiant_win'])])
return data, targets
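# Illustrative sketch of the encoding above (not from the original code): with three known
# heroes mapped as {1: 0, 2: 1, 3: 2}, a match where radiant picked heroes 1 and 3 and dire
# picked hero 2 becomes the 6-element row
#   [1, 0, 1, 0, 1, 0]   (columns 0-2 = radiant picks, columns 3-5 = dire picks)
# with target [1] if radiant won and [0] otherwise.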
def load_data(self, matches):
'''
Accepts input data in the format of the matches/ endpoint,
shortens data, processes it and sets the raw_data, raw_targets,
data, targets class variables
'''
self.shortened_data = self.shorten_data(matches, {'players': ['isRadiant', 'hero_id'], 'radiant_win': None})
data, targets = self.process_matches()
self.raw_data = data
self.raw_targets = targets
self.data = self.np_ize(data, True)
self.targets = self.np_ize(targets, True)
def _load_heroes(self):
'''
Calls the heroStats/ endpoint and returns a shortened list
'''
status, heroes = self.get('heroStats')
shortened_hero_data = self.shorten_data(heroes, {'id': None, 'name': None, 'localized_name': None})
self.write_json_file('./Data/heroes.json', shortened_hero_data)
def _chunk_match_ids(self):
'''
Separates match ids into distinct files so that they can be processed in chunks
We were having issues with the dota matches/ endpoint and large, repetitive queries
because at one call per second, 48,000+ queries was way too much
'''
amount = 1000
matches = self.read_json_file("./Data/Matches_By_Id/40000_plus_matches.json")
matches = [match['match_id'] for match in matches]
iterations = len(matches) // amount  # integer division: number of full chunks
remainder = len(matches) % amount
base_filepath = "./Data/Matches_By_Id/chunked/"
r = matches[:remainder]
self.write_json_file("{}{}".format(base_filepath, 'remainder.json'), r)
for i in range(iterations):
match_subset = matches[remainder + (i * amount):remainder + ((i + 1) * amount)]
self.write_json_file("{}{}".format(base_filepath, '{}.json'.format(str(i + 1))), match_subset)
def _chunk_matches(self, filename):
'''
Companion to _chunk_match_ids: reads a match id file from Data/Matches_By_Id/chunked, calls the matches/ endpoint on those ids,
and saves the shortened results into the corresponding chunked file in Data/Matches/chunked
This takes a while, so some explosions let you know when it is done ;) (if you have pygame installed)
Inputs: string filename (example: '1')
'''
match_ids = self.read_json_file("./Data/Matches_By_Id/chunked/{}.json".format(filename))
matches = self._get_match(match_ids)
matches = self.shorten_data(matches, {'players': ['isRadiant', 'hero_id'], 'radiant_win': None})
self.write_json_file("./Data/Matches/chunked/{}.json".format(filename), matches)
try:
import time
from pygame import mixer
mixer.init()
alert = mixer.Sound('boom.wav')
alert.play()
time.sleep(1)
alert.play()
time.sleep(1)
alert.play()
time.sleep(1)
except ImportError:
pass
def _get_match(self, match_ids):
'''
calls the dota matches endpoint with input match_ids
'''
matches = []
for mid in match_ids:
status, data = self.get("matches/{}".format(mid))
print(mid)
if status == 200:
matches.append(data)
else:
print("bad status: {}".format(status))
sleep(1.1) # the opendota api requests that this endpoint only be hit 1/s
return matches
def _gather_chunked_data(self, r_max, outfile='40k_matches_short.json'):
'''
Concatenates all the individual chunk files (1 through r_max - 1, plus the remainder) into one file
'''
matches = []
for i in range(1, r_max):
matches += self.read_json_file('./Data/Matches/chunked/{}.json'.format(i))
matches += self.read_json_file('./Data/Matches/chunked/remainder.json')
self.write_json_file('./Data/Matches/{}'.format(outfile), matches)
def _data(self):
return {
'raw_data': self.raw_data,
'raw_targets': self.raw_targets,
'features': self.hero_features,
'target_labels': self.target_labels
}
def _save_hero_data(self):
'''
saves data in an np-friendly format to be loaded into ML methods
(initial 40k-plus match dataset)
'''
matches = self.read_json_file('./Data/Matches/40k_matches_short.json')
self.load_data(matches)
data = self._data()
self.write_json_file('./Data/hero_data/full_40000_plus_data.json', data)
def load_saved_hero_data(self, filepath):
'''
loads np friendly version of the data for use in ML methods
'''
hero_data = self.read_json_file(filepath)
hero_data['data'] = np.array(hero_data['raw_data'])
hero_data['targets'] = np.array(hero_data['raw_targets'])
del hero_data['raw_targets']
del hero_data['raw_data']
return hero_data['data'], hero_data['targets'], hero_data['features'], hero_data['target_labels']
def _assess_hero_data(self, data):
'''
Counts the number of times a hero is used over the dataset
Calculates the percentages of each hero's use for use in _drop_features
'''
# helper for summing a column (named to avoid shadowing the builtin sum)
def column_sum(x):
return np.sum(x)
# difference between radiant and dire usage of each hero (currently unused; 113 = number of hero ids)
def sub(x):
return abs(x[:113] - x[113:])
# total number of times (between both teams) a hero is picked
def add(x):
return np.add(x[:113], x[113:])
# get the sum of each column (how often each hero/side combination is picked)
self.feature_details = np.apply_along_axis(column_sum, axis=0, arr=data)
self.summed_features = np.apply_along_axis(add, axis=0, arr=self.feature_details)
self.percentages = np.divide(self.summed_features.astype('float32'), np.sum(self.summed_features))
def _plot_summed(self):
'''
Plots the summed features from _assess_hero_data
'''
plt.figure(figsize=(20, 3)) # width:20, height:3
plt.bar(range(len(self.summed_features)), self.summed_features, align='center', alpha=0.5, width=0.3)
plt.xticks(range(0, len(self.summed_features), 10))
plt.ylabel('Usage')
plt.title('Dota 2 hero usages in 40k matches')
plt.show()
def _drop_features(self, data, targets, features, threshold):
'''
drops feature columns whose usage percentage (from _assess_hero_data) falls below the threshold,
along with any data rows (and their targets) that picked one of the dropped heroes
'''
column_drop_indexes = []
half = len(features) // 2  # integer division: first half of the columns is radiant, second half dire
for i, x in enumerate(self.percentages):
if x < threshold:
column_drop_indexes.append(i)
column_drop_indexes.append(i + half)
row_drop_indexes = []
for index, d in enumerate(data):
if any([d[i] == 1 for i in column_drop_indexes]):
row_drop_indexes.append(index)
data = np.delete(data, row_drop_indexes, 0)
targets = np.delete(targets, row_drop_indexes, 0)
features = np.delete(features, column_drop_indexes)
return data, targets, features
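# Illustrative sketch (not from the original code): with threshold 0.01 and percentages
# [0.02, 0.005, 0.03], hero index 1 falls below the threshold, so its radiant column (1) and
# its dire column (1 + len(features) // 2) are dropped, along with every match row that
# picked that hero on either side.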
def _save_data_dropped_features(self, threshold, name):
'''
Loads saved data, assesses the prevalence of heroes and drops features
and associated data from those that do not pass the threshold
'''
data, targets, features, target_labels = self.load_saved_hero_data('./Data/hero_data/full_40000_plus_data.json')
self._assess_hero_data(data)
#self._plot_summed()
data, targets, features = self._drop_features(data, targets, features, threshold)
d = {
'raw_data': data.tolist(),
'raw_targets': targets.tolist(),
'features': features.tolist(),
'target_labels': target_labels
}
self.write_json_file('./Data/hero_data/{}'.format(name), d)
def _match_id_dict_to_list(self, read_path, write_path):
matches = self.read_json_file(read_path)
self.write_json_file(write_path, sorted([m['match_id'] for m in matches]))
def get_player_rankings(self, infile, outfile):
'''
Inputs: infile of match ids, outfile path
gets each player's solo_competitive_rank, competitive_rank, and mmr_estimate for every match id in the input file
'''
print("in get_player_rankings")
match_ids = self.read_json_file(infile)
print(len(match_ids))
matches = []
for mid in match_ids:
print(mid)
status, match = self.get('matches/{}'.format(mid))
#pp(match)
M = self.shorten_data([match], {'players': ['account_id', 'hero_id'], 'match_id': None})
#pp(M)
_M = []
for player in M[0]['players']:
status, _player = self.get('players/{}'.format(player['account_id']))
pp(_player)
sleep(1.1)
#pp(player)
p = (self.shorten_data([_player], {'solo_competitive_rank':None, 'competitive_rank': None, 'mmr_estimate':None}))
#pp(p)
#p.update(player)
player.update(p[0])
#pp(M)
matches.append(M[0])
#pp(matches)
matches = {m['match_id']:m['players'] for m in matches}
#pp(matches)
self.write_json_file(outfile, matches)
def get_solo_player_rankings(self, infile, outfile):
'''
Inputs: infile of match ids, outfile path
Gets player solo_competitive_rank for each match id in input file
'''
match_ids = self.read_json_file(infile)
matches = []
for mid in match_ids:
status, match = self.get('matches/{}'.format(mid))
#pp(match)
M = self.shorten_data([match], {'players': ['account_id', 'hero_id', 'solo_competitive_rank'], 'match_id': None})
pp(M)
matches.append(M[0])
sleep(1.1)  # the opendota api requests that the matches/ endpoint only be hit once per second
pp(matches)
matches = {m['match_id']:m['players'] for m in matches}
self.write_json_file(outfile, matches)
def solo():
'''
Gets player rankings
'''
h = BasicHeroData()
dir_1 = './Data/Matches_By_Id/chunked/'
dir_2 = './Data/Matches/solo_chunked/'
if not os.path.isdir(dir_1):
os.makedirs(dir_1)  # makedirs creates the nested ./Data/... path; os.mkdir would fail if parents are missing
if not os.path.isdir(dir_2):
os.makedirs(dir_2)
for i in range(1,47):
h.get_player_rankings('{}{}.json'.format(dir_1, str(i)), '{}{}.json'.format(dir_2, str(i)))
h.get_player_rankings('{}remainder.json'.format(dir_1), '{}remainder.json'.format(dir_2))
def run_on_machine(low, high):
'''
Inputs: low and high file names (numbers in the chunk range); high is exclusive
Gets player rankings
Intended to be run on separate VMs
'''
h = BasicHeroData()
dir_1 = './Data/Matches_By_Id/chunked/'
dir_2 = './Data/Matches/chunked_players/'
if not os.path.isdir(dir_1):
os.makedirs(dir_1)
if not os.path.isdir(dir_2):
os.makedirs(dir_2)
for i in range(low,high):
h.get_player_rankings('{}{}.json'.format(dir_1, str(i)), '{}{}.json'.format(dir_2, str(i)))
h.get_player_rankings('{}remainder.json'.format(dir_1), '{}remainder.json'.format(dir_2))
def make_dummy_input_array(features, num_samples):
'''
used to make a (num_samples, features) array of random 0/1 elements, for quick testing
'''
X = np.empty((0, features))
for _ in range(num_samples):
arr = np.zeros(shape=(features, 1))
for i in range(features):
arr[i] = np.random.randint(0, 2)  # randint's upper bound is exclusive; random_integers is deprecated
arr = arr.T
X = np.append(X, arr, axis=0)
return X
def double_inverse_samples(original_arr):
'''
input is an array where each row is duplicated and the copy is inserted directly after the
original row with its 0/1 values flipped; works for both data and targets
'''
doubled_arr = np.zeros((original_arr.shape[0] * 2, original_arr.shape[1]))
j = 0
for i in range(0, doubled_arr.shape[0], 2):
doubled_arr[i] = np.copy(original_arr[j])
doubled_arr[i + 1] = [0 if x == 1 else 1 for x in original_arr[j]]  # flip each 0/1 value
j = j + 1
return doubled_arr
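# Minimal usage sketch (illustrative only, not part of the original module): assumes the
# data files referenced above (./Data/heroes.json and the saved datasets under ./Data/)
# already exist locally, since BasicHeroData reads heroes.json on construction.
if __name__ == '__main__':
    h = BasicHeroData()
    # Option 1: load an already numpy-friendly dataset saved by _save_hero_data
    data, targets, features, target_labels = h.load_saved_hero_data(
        './Data/hero_data/full_40000_plus_data.json')
    print(data.shape, targets.shape, len(features), target_labels)
    # Option 2: build the arrays from raw matches/-style data
    matches = h.read_json_file('./Data/Matches/40k_matches_short.json')
    h.load_data(matches)
    print(h.data.shape, h.targets.shape)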