-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstanford_dataset.py
56 lines (49 loc) · 1.56 KB
/
stanford_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Description : Loads the test dataset and provides the methods needed to access
# and explore it
# Importing libraries
import json
class StanfordDataset:
    # Wrapper around a JSON dataset file. The parsed document is expected to
    # hold a top-level 'data' list of topics; each topic has a 'title' and a
    # list of 'paragraphs', and each paragraph has a 'context' string plus a
    # list of 'qas' entries carrying a 'question'.

    # Load the dataset file and cache the title of every topic
    # Input:
    #   dataPath(str) : Path of the JSON dataset file (defaults to the
    #                   location that was previously hard-coded)
    def __init__(self, dataPath='data/testingData.json'):
        # json.load parses the whole file, so the JSON may span several lines
        # (readline() only ever saw the first one). The with-block closes the
        # file even when reading or parsing fails.
        with open(dataPath, 'r', encoding='utf-8') as dataFile:
            self.trainingDataJson = json.load(dataFile)
        # Titles are kept in the same order as trainingDataJson['data'] so a
        # position in this list is also a position in the topic list.
        self.titles = [topic['title'] for topic in self.trainingDataJson['data']]

    # Get Dataset topic by name
    # Input:
    #   topicName(str) : Name of topic
    # Output:
    #   devData(dict) : JSON of data on that topic (first match if the title
    #                   occurs more than once)
    # Raises:
    #   KeyError : No topic with that title exists
    def getTopic(self, topicName):
        try:
            index = self.titles.index(topicName)
        except ValueError:
            # Fail loudly instead of silently handing back the last topic.
            raise KeyError(topicName) from None
        return self.trainingDataJson['data'][index]

    # Get All listed question
    # Input:
    #   topicName(str) : Name of topic
    # Output:
    #   questions(list) : List of Questions, in paragraph order
    # Raises:
    #   KeyError : No topic with that title exists
    def getAllQuestions(self, topicName):
        devData = self.getTopic(topicName)
        return [qa['question']
                for paragraph in devData['paragraphs']
                for qa in paragraph['qas']]

    # Get paragraphs for that topic
    # Input:
    #   topicName(str) : Name of topic
    # Output:
    #   paragraphs(list) : List of paragraphs (the 'context' text of each)
    # Raises:
    #   KeyError : No topic with that title exists
    def getParagraph(self, topicName):
        devData = self.getTopic(topicName)
        return [paragraph['context'] for paragraph in devData['paragraphs']]