-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
123 lines (99 loc) · 3.84 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os, sys, logging, argparse, yaml, easydict
import numpy as np
import torch
from transformers import (
TrainingArguments,
Trainer,
)
from transformers.trainer import Trainer
from peft import (
LoraConfig,
get_peft_model,
)
from accelerate import Accelerator
from torchdrug.utils import comm, pretty
from llm import *
from collector import *
from preprocess import *
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='data preprocessing')
parser.add_argument("--config", "-c", type=str,
default='config/fb15k237.yaml')
parser.add_argument("--version", "-v", type=str,
default='')
parser.add_argument("--seed", "-s", type=str,
default=42)
args = parser.parse_args()
with open(args.config, "r") as f:
cfg = easydict.EasyDict(yaml.safe_load(f))
if args.version:
cfg.dataset.version = args.version
torch.manual_seed(args.seed + comm.get_rank())
config_name = args.config.split('/')[-1].split('.')[0]
if hasattr(cfg.dataset, 'version'):
config_name += '_' + cfg.dataset.version
args.config_name = config_name
cfg.trainer.output_dir += config_name
if comm.get_rank() == 0:
print("Config file: %s" % args.config)
print(pretty.format(cfg))
saved_dir = 'data/preprocessed/'
file_path = saved_dir+args.config_name+'.pkl'
if 'ind' in args.config_name:
dataset = InductiveKGCDataset.load(file_path)
else:
dataset = KGCDataset.load(file_path)
tokenizer = dataset.tokenizer
cfg.context_retriever.kg_encoder.base_layer.num_relation = int(
dataset.kgdata.num_relation)
cfg.score_retriever.kg_encoder.base_layer.num_relation = int(
dataset.kgdata.num_relation)
torch.nn.Module = torch.nn._Module
config = MKGLConfig.from_pretrained(**cfg.mkglconfig)
model = MKGL.from_pretrained(
**cfg.mkgl, device_map={"": Accelerator().process_index}, config=config)
lora_config = LoraConfig(**cfg.loraconfig)
model = get_peft_model(model, lora_config)
kgl2token = torch.tensor(np.stack(dataset.vocab_df.text_token_ids)[:, :cfg.kgl_token_length])
model.init_kg_specs(kgl2token, tokenizer.vocab_size, cfg)
if comm.get_rank() == 0:
print(model.print_trainable_parameters())
print(model)
if 'ind' in args.config:
task = KGL4IndKGC(cfg.mkgl4kgc, llmodel=model, dataset=dataset)
else:
task = KGL4KGC(cfg.mkgl4kgc, llmodel=model, dataset=dataset)
data_loader = MKGLDataCollector(dataset)
training_args = TrainingArguments(**cfg.trainer)
if comm.get_rank() == 0:
print(training_args)
def compute_metrics(predictions):
ranking = predictions[0].astype(float)
metric = ("mr", "mrr", "hits@1", "hits@3", "hits@10")
results = {}
for _metric in metric:
if _metric == "mr":
score = ranking.mean()
elif _metric == "mrr":
score = (1 / ranking).mean()
elif _metric.startswith("hits@"):
threshold = int(_metric[5:])
score = (ranking <= threshold).mean()
else:
raise ValueError("Unknown metric `%s`" % _metric)
results[_metric] = score
if comm.get_rank() == 0:
print(results)
return results
removed_columns = ['h_raw', 't_raw', 'r_raw', 'h_fine', 't_fine', 'r_fine', 'inv_r_fine']
trainer = Trainer(
model=task,
args=training_args,
eval_dataset=dataset.test_data.remove_columns(
removed_columns),
train_dataset=dataset.train_data.remove_columns(removed_columns),
data_collator=data_loader,
compute_metrics=compute_metrics
)
trainer.evaluate()
trainer.train()