forked from EleutherAI/gpt-neo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtasks.py
116 lines (97 loc) · 3.91 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os.path
import json
import requests
import numpy as np
import ftfy
from data.encoders import fetch_encoder, encode
import tensorflow as tf
import re
from functools import partial
# URL of the LAMBADA test data; lambada_create_tokens_data downloads it and
# parses each line of the response as one JSON object with a 'text' field.
lambada_src_uri = 'http://eaidata.bmk.sh/data/lambada_test.jsonl'
# Unicode normalization form handed to ftfy.fix_text before tokenization.
normalization = 'NFKC'
# Note: this task is called "lambada" but it really refers to OpenAI's version
# of the task, which actually differs in some ways from the task described in
# the original paper. So, strictly speaking, accuracy values from this task
# should not be compared to accuracy values from the original lambada task.
# For more information, see
# https://github.com/openai/gpt-2/issues/131
def lambada_create_tokens_data(params, path):
    """Download the LAMBADA data, tokenize it, and cache the tokens at *path*.

    Args:
        params: model/run configuration; passed to ``fetch_encoder`` to pick
            the tokenizer.
        path: file path where the tokenized examples are written as JSON.

    Returns:
        A list with one entry per example, each entry being whatever
        ``encode`` returns for that example's text.

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    # Do all the fallible work (network, JSON parsing, tokenization) BEFORE
    # opening `path` for writing. Opening first would leave an empty file
    # behind on failure, and lambada_read_or_create_tokens_data would then
    # treat that empty file as a valid cache on every later run.
    req = requests.get(lambada_src_uri)
    req.raise_for_status()
    # Skip blank lines: json.loads would fail on an empty payload line.
    jsons = [json.loads(line) for line in req.iter_lines() if line]
    texts = [ftfy.fix_text(j['text'], normalization=normalization) for j in jsons]
    enc = fetch_encoder(params)
    arrays = [encode(enc, t) for t in texts]
    with open(path, 'w') as f:
        json.dump(arrays, f)
    return arrays
def lambada_read_or_create_tokens_data(params, path):
    """Return the tokenized LAMBADA examples, building the cache if needed.

    When *path* already exists its JSON content is loaded and returned.
    Otherwise the work is delegated to ``lambada_create_tokens_data``, which
    creates the file at *path* and returns the freshly tokenized data.
    """
    if os.path.exists(path):
        with open(path) as cached:
            return json.load(cached)
    # No cache yet: tell us where the file should go and it gets created.
    return lambada_create_tokens_data(params, path)
def bin_pack(params, tokens_data):
    """Greedily pack tokenized examples into fixed-width rows.

    Examples are appended, in order, to the current row, each followed by the
    EOS token. A new row is started whenever the next example (plus its EOS)
    would not fit. Empty rows are then appended until the row count is a
    multiple of the eval batch size, and unused slots are filled with a
    dummy token (1).

    Args:
        params: dict providing ``'eos_id'``, ``'n_ctx'`` (row width) and
            ``'eval_batch_size'``.
        tokens_data: iterable of token-id sequences, one per example.

    Returns:
        A 2-D NumPy array of shape ``(num_rows, n_ctx)``. The dtype is
        ``uint16`` when every id fits in 16 bits, otherwise ``int32`` so that
        large ids are not silently wrapped.

    Raises:
        ValueError: if a single example plus its EOS token exceeds ``n_ctx``.
    """
    eos_token = params['eos_id']
    n_ctx = params['n_ctx']
    dummy_token = 1
    pad_batch_size = params['eval_batch_size']

    bins = []
    largest_id = max(eos_token, dummy_token)
    for idx, tokens in enumerate(tokens_data):
        needed = len(tokens) + 1  # the example plus its trailing EOS
        if needed > n_ctx:
            # Fail early with a clear message; otherwise the row assignment
            # below dies with an opaque NumPy broadcasting error.
            raise ValueError(
                'example {} needs {} slots (tokens + EOS) but n_ctx is {}'
                .format(idx, needed, n_ctx)
            )
        if not bins or len(bins[-1]) + needed > n_ctx:
            bins.append([])
        bins[-1].extend(tokens)
        bins[-1].append(eos_token)
        if needed > 1:
            largest_id = max(largest_id, max(tokens))

    # Pad with empty rows so the rows split evenly into eval batches.
    while len(bins) % pad_batch_size != 0:
        bins.append([])

    # Keep uint16 whenever it is lossless; widen only for ids above 65535.
    fits_uint16 = largest_id <= np.iinfo(np.uint16).max
    dtype = np.uint16 if fits_uint16 else np.int32
    bins_array = np.full((len(bins), n_ctx), dummy_token, dtype=dtype)
    for i, b in enumerate(bins):
        bins_array[i, 0:len(b)] = b
    return bins_array
def lambada_init(params):
    """Prepare the LAMBADA task and record its settings in *params*.

    Resolves the token-cache path from the dataset configs (defaulting to
    ``./lambada.json``), loads or creates the tokenized data, packs it, and
    stores two entries in *params*:

    * ``'lambada_tokens_path'``: the resolved cache path.
    * ``'lambada_n_steps'``: number of packed rows divided by
      ``params['eval_batch_size']``.

    Args:
        params: dict providing ``'dataset_configs'``, ``'datasets'`` (an
            iterable of 4-tuples whose first item is a dataset id) and
            ``'eval_batch_size'``, plus whatever ``bin_pack`` and the
            tokenizer lookup need. It is mutated in place.

    Raises:
        AssertionError: if ``params['datasets']`` is empty, or if the
            resolved path does not end in ``.json``.
    """
    ds_configs = params['dataset_configs']
    token_paths = [
        ds_configs[ds_id].get('lambada_tokens_path', "./lambada.json")
        for ds_id, _, _, _ in params['datasets']
    ]
    # Explicit raises instead of `assert` so the validation still runs under
    # `python -O`. AssertionError is kept so existing callers see the same
    # exception type and message as before.
    if not token_paths:
        raise AssertionError('lambada_tokens_path not found in the dataset config')
    # Only the first dataset's path is used.
    lt_path = token_paths[0]
    if not lt_path.endswith('.json'):
        raise AssertionError('lambada_tokens_path must have extension json')
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    params['lambada_tokens_path'] = lt_path
    params['lambada_n_steps'] = len(bins_array) // params['eval_batch_size']
def lambada_get_task_info(params):
    """Return the task summary: a dict with the eval step count.

    The step count is read from ``params['lambada_n_steps']``, which
    ``lambada_init`` stores.
    """
    n_steps = params['lambada_n_steps']
    return dict(n_steps=n_steps)
# The LAMBADA evaluation code looks at the logits of each position just before an eos_token
def lambada_input(params):
    """Build the repeating, batched tf.data pipeline for LAMBADA evaluation.

    Each dataset element is a pair ``(tokens, labels)`` of int32 vectors of
    length ``n_ctx``. ``labels[i]`` is the token at position ``i + 1`` when
    the token at position ``i + 2`` equals ``eos_token``; every other
    position holds ``eos_token``. Indices wrap modulo ``n_ctx``, so the last
    two positions look at the start of the row.

    NOTE(review): ``eos_token`` here is derived from ``params['n_vocab']``,
    whereas ``bin_pack`` separates examples with ``params['eos_id']``. The
    two need to agree for the label positions to be found -- confirm against
    the configs in use.
    """
    eos_token = 50256 if params['n_vocab'] >= 50257 else 0
    n_ctx = params['n_ctx']
    lt_path = params['lambada_tokens_path']
    packed_rows = bin_pack(
        params,
        lambada_read_or_create_tokens_data(params, lt_path),
    )

    def _get_output(row):
        row = tf.cast(row, dtype=tf.int32)
        positions = tf.range(n_ctx)
        # Token one step ahead of each position (the candidate label).
        next_ids = tf.gather(row, (positions + 1) % n_ctx)
        # Token two steps ahead; an EOS there marks a label position.
        after_next_ids = tf.gather(row, (positions + 2) % n_ctx)
        is_label_position = tf.math.equal(after_next_ids, eos_token)
        filler = tf.constant(eos_token, shape=[n_ctx])
        labels = tf.where(is_label_position, next_ids, filler)
        tokens = tf.cast(tf.reshape(row, [n_ctx]), dtype=tf.int32)
        labels = tf.cast(tf.reshape(labels, [n_ctx]), dtype=tf.int32)
        return tokens, labels

    return (
        tf.data.Dataset.from_tensor_slices(packed_rows)
        .map(_get_output)
        .batch(params['eval_batch_size'], drop_remainder=True)
        .repeat()
    )
# Registry of evaluation tasks, keyed by task name. Each entry maps hook
# names to the functions that implement them for that task.
task_descriptors = {
    'lambada': dict(
        init_fn=lambada_init,
        get_task_info_fn=lambada_get_task_info,
        input_fn=lambada_input,
    ),
}