-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmia_dataset.py
82 lines (71 loc) · 3.07 KB
/
mia_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from torch.utils.data import Dataset
from datasets import load_dataset
class MIADataset(Dataset):
    """Common base for the membership-inference datasets in this module.

    Stores a human-readable dataset name and exposes an ``_init_dataset``
    hook that subclasses override to load their data.
    """

    def __init__(self, name):
        """Remember the dataset name; no data is loaded here.

        Args:
            name: Identifier stored on the instance as ``self.name``.
        """
        self.name = name

    def _init_dataset(self):
        """Loading hook; the base implementation does nothing."""
        return None
class TextDataset(Dataset):
    """Expose an indexable collection of texts through the Dataset protocol."""

    def __init__(self, texts):
        """Keep a reference to ``texts`` (not copied).

        Args:
            texts: Any object supporting ``len()`` and ``[]`` indexing.
        """
        self.texts = texts

    def __len__(self):
        """Return the number of stored items."""
        items = self.texts
        return len(items)

    def __getitem__(self, idx):
        """Return the item at ``idx``, delegating to the wrapped collection."""
        items = self.texts
        return items[idx]
class WikiMIA(MIADataset):
    """WikiMIA benchmark partitioned into member and non-member texts.

    After construction the instance exposes:

    * ``dataset``    -- the raw split returned by ``load_dataset``.
    * ``member``     -- ``{"WikiMIA": TextDataset}`` of rows with label 1.
    * ``non_member`` -- ``{"WikiMIA": TextDataset}`` of all other rows.
    """

    def __init__(self, name, length=64):
        """Load the WikiMIA split for the given snippet length.

        Args:
            name: Identifier stored as ``self.name``.
            length: Used to build the split name ``WikiMIA_length{length}``.
        """
        super().__init__(name)
        self.length = length
        self._init_dataset()

    def _init_dataset(self):
        """Download the split and bucket each row's ``input`` by its ``label``."""
        self.dataset = load_dataset(
            "swj0419/WikiMIA", split=f"WikiMIA_length{self.length}"
        )

        members = []
        non_members = []
        # The WikiMIA dataset card defines label 1 as data seen during
        # pretraining (member) and label 0 as unseen data (non-member).
        # The buckets used to be swapped, which inverted every downstream
        # membership score.
        # Iterating rows directly avoids two indexed lookups per example.
        for example in self.dataset:
            bucket = members if example["label"] == 1 else non_members
            bucket.append(example["input"])

        self.member = {"WikiMIA": TextDataset(members)}
        self.non_member = {"WikiMIA": TextDataset(non_members)}
class TemporalArxiv(MIADataset):
    """MIMIR ``temporal_arxiv`` configuration wrapped as member / non-member sets."""

    def __init__(self):
        """Register the dataset name and load the data immediately."""
        super().__init__("TemporalArixivMIA")
        self._init_dataset()

    def _init_dataset(self):
        """Fetch the ``temporal_arxiv`` config and wrap its two partitions.

        Sets ``self.dataset`` (raw ``load_dataset`` result), ``self.member``
        and ``self.non_member`` (both ``TextDataset``).
        """
        # NOTE(review): no ``split`` is passed, so ``load_dataset`` presumably
        # returns a mapping keyed by split name -- confirm that "member" and
        # "non_member" are valid keys for this config.
        raw = load_dataset("iamgroot42/mimir", "temporal_arxiv")
        self.dataset = raw
        self.member = TextDataset(raw["member"])
        self.non_member = TextDataset(raw["non_member"])
class TemporalWiki(MIADataset):
    """MIMIR ``temporal_wiki`` configuration wrapped as member / non-member sets."""

    def __init__(self):
        """Register the dataset name and load the data immediately."""
        super().__init__("TemporalWikiMIA")
        self._init_dataset()

    def _init_dataset(self):
        """Fetch the ``temporal_wiki`` config and wrap its two partitions.

        Sets ``self.dataset`` (raw ``load_dataset`` result), ``self.member``
        and ``self.non_member`` (both ``TextDataset``).
        """
        # NOTE(review): no ``split`` is passed, so ``load_dataset`` presumably
        # returns a mapping keyed by split name -- confirm that "member" and
        # "non_member" are valid keys for this config.
        raw = load_dataset("iamgroot42/mimir", "temporal_wiki")
        self.dataset = raw
        self.member = TextDataset(raw["member"])
        self.non_member = TextDataset(raw["non_member"])
class MIMIR(MIADataset):
    """MIMIR benchmark, loaded for one domain or for every known domain.

    After construction the instance exposes three dicts keyed by domain name:

    * ``dataset``    -- the raw split returned by ``load_dataset``.
    * ``member``     -- ``TextDataset`` over the split's ``"member"`` field.
    * ``non_member`` -- ``TextDataset`` over the split's ``"non_member"`` field.
    """

    # Configurations loaded when ``domain == "all"``, in load order.
    ALL_DOMAINS = (
        "arxiv",
        "dm_mathematics",
        "github",
        "hackernews",
        "pile_cc",
        "pubmed_central",
        "wikipedia_(en)",
        "full_pile",
        "c4",
    )
    # Split requested for every configuration.
    SPLIT = "ngram_13_0.8"

    def __init__(self, domain="all"):
        """Load the requested MIMIR domain(s).

        Args:
            domain: ``"all"`` to load every entry of ``ALL_DOMAINS``, or the
                name of a single configuration to load only that one.
        """
        super().__init__("MIMIR")
        # ``_init_dataset`` reads ``self.domain``, so it must be assigned
        # first; the previous order raised AttributeError on every
        # construction.
        self.domain = domain
        self._init_dataset()

    def _init_dataset(self):
        """Populate ``dataset``, ``member`` and ``non_member`` for the domain(s)."""
        if self.domain == "all":
            domains = self.ALL_DOMAINS
        else:
            domains = (self.domain,)

        self.dataset = {
            name: load_dataset("iamgroot42/mimir", name, split=self.SPLIT)
            for name in domains
        }
        self.member = {
            name: TextDataset(split["member"])
            for name, split in self.dataset.items()
        }
        self.non_member = {
            name: TextDataset(split["non_member"])
            for name, split in self.dataset.items()
        }