from config import CONFIG

import logging
logging.basicConfig(format="%(levelname)-8s:%(filename)s.%(funcName)20s >> %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

import random
from collections import Counter
from pprint import pprint, pformat

from anikattu.utilz import tqdm
from anikattu.debug import memory_consumed
from anikattu.vocab import Vocab

def split_dataset(dataset, ratio=0.8):
    # Slice indices must be integers; the original float pivot would raise
    # a TypeError, so truncate it.
    pivot = int(ratio * len(dataset))
    return dataset[:pivot], dataset[pivot:]
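
# A minimal usage sketch of split_dataset, assuming a plain list of samples
# (`samples` is a hypothetical name, for illustration only):
#
#     samples = list(range(10))
#     train, test = split_dataset(samples, ratio=0.8)
#     assert train == [0, 1, 2, 3, 4, 5, 6, 7]  # first 80%
#     assert test == [8, 9]                     # remaining 20%
#
# The split is positional, so shuffle `samples` first if a random split
# is wanted.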

class DatasetList:
    """Groups multiple Dataset objects into a single train/test view."""
    def __init__(self, name, datasets, portion_percent=1.0, sort_key=None):
        self.name = name
        self.portion_percent = portion_percent
        self.datasets = list(datasets)

        # Pool a portion of every member dataset's train and test splits.
        self.trainset, self.testset = [], []
        for dataset in self.datasets:
            self.trainset.extend(self.portion(dataset.trainset))
            self.testset.extend(self.portion(dataset.testset))

        # Index samples by id for O(1) lookup.
        self.trainset_dict = {i.id: i for i in self.trainset}
        self.testset_dict = {i.id: i for i in self.testset}

        random.shuffle(self.trainset)
        random.shuffle(self.testset)

        # Optional ordering, e.g. by sequence length for batching.
        if sort_key:
            self.trainset = sorted(self.trainset, key=sort_key, reverse=True)
            self.testset = sorted(self.testset, key=sort_key, reverse=True)

    def portion(self, dataset, percent=None):
        percent = percent if percent else self.portion_percent
        return dataset[:int(len(dataset) * percent)]
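
# A minimal usage sketch, assuming two already-built member datasets
# (`ds_a` and `ds_b` are hypothetical names, not defined in this file):
#
#     combined = DatasetList('combined', [ds_a, ds_b], portion_percent=0.5)
#
# combined.trainset then holds the first half of each member's trainset,
# pooled and shuffled; passing sort_key (e.g. a sequence-length function)
# re-orders the pooled samples, longest first, for length-based batching.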

class NLPDatasetList(DatasetList):
    def __init__(self, name, datasets, portion_percent=1.0, sort_key=None):
        super().__init__(name, datasets, portion_percent, sort_key)

        # Merge input vocabularies: sum token frequencies and collect
        # special tokens without duplicates.
        input_vocab = Counter()
        special_tokens = []
        for dataset in self.datasets:
            input_vocab += dataset.input_vocab.freq_dict
            for token in dataset.input_vocab.special_tokens:
                if token not in special_tokens:
                    special_tokens.append(token)
        self.input_vocab = Vocab(input_vocab, special_tokens)

        # Merge output vocabularies the same way (de-duplicating special
        # tokens here too, for consistency with the input merge).
        output_vocab = Counter()
        special_tokens = []
        for dataset in self.datasets:
            output_vocab += dataset.output_vocab.freq_dict
            for token in dataset.output_vocab.special_tokens:
                if token not in special_tokens:
                    special_tokens.append(token)
        self.output_vocab = Vocab(output_vocab, special_tokens)

        log.info('built dataset: {}'.format(name))
        log.info('  trainset size: {}'.format(len(self.trainset)))
        log.info('  testset size: {}'.format(len(self.testset)))
        log.info('  input_vocab size: {}'.format(len(self.input_vocab)))
        log.info('  output_vocab size: {}'.format(len(self.output_vocab)))

    def __iter__(self):
        # The original body dropped the iterator; return it.
        return iter(self.datasets)
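
# Sketch of the vocabulary merge above, assuming anikattu's Vocab exposes a
# frequency Counter via .freq_dict and a list via .special_tokens (as the
# code relies on). With hypothetical member datasets nlp_a and nlp_b:
#
#     corpus = NLPDatasetList('corpus', [nlp_a, nlp_b])
#
# corpus.input_vocab then counts token frequencies summed across both
# members, so its size is roughly the union of the member vocabularies
# plus the de-duplicated special tokens.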

class Dataset:
    def __init__(self, name, dataset):
        self.name = name
        log.info('building dataset: {}'.format(name))

        # Accept either a pre-split (trainset, testset) tuple or a flat
        # iterable, which gets an 80/20 split.
        if not isinstance(dataset, tuple):
            dataset = split_dataset(list(dataset))
        self.trainset, self.testset = dataset

        self.trainset_dict = {i.id: i for i in self.trainset}
        self.testset_dict = {i.id: i for i in self.testset}

        log.info('built dataset: {}'.format(name))
        log.info('  trainset size: {}'.format(len(self.trainset)))
        log.info('  testset size: {}'.format(len(self.testset)))

class NLPDataset(Dataset):
    def __init__(self, name, dataset, input_vocab, output_vocab):
        # Splitting, id-indexing and size logging are identical to Dataset,
        # so delegate instead of duplicating that code.
        super().__init__(name, dataset)
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab
        log.info('  input_vocab size: {}'.format(len(self.input_vocab)))
        log.info('  output_vocab size: {}'.format(len(self.output_vocab)))
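
if __name__ == '__main__':
    # A minimal smoke test, assuming nothing beyond this file. The Example
    # record is hypothetical; the real project supplies its own sample
    # type, but any object with an `.id` attribute satisfies the classes
    # above.
    from collections import namedtuple
    Example = namedtuple('Example', ['id', 'text', 'label'])

    samples = [Example(i, 'text {}'.format(i), i % 2) for i in range(100)]
    ds = Dataset('toy', samples)  # list input -> 80/20 split
    assert len(ds.trainset) == 80 and len(ds.testset) == 20
    print(ds.trainset_dict[0])    # lookup by sample id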