-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtesting.py
56 lines (43 loc) · 1.6 KB
/
testing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Ari Boyarsky ([email protected])
from feature_extraction import featureExtractor
import nltk
import json
from collections import defaultdict
import itertools
from pprint import pprint
# Load the article records: data/data5.json is JSON Lines (one JSON object
# per line), so each line is parsed independently rather than with json.load.
# currently temp file to speed up debugging
# call data[article #]['type']
# type = [category, text]
# NOTE: explicit UTF-8 avoids a platform-dependent default encoding that
# could break on non-ASCII article text.
with open(r'data/data5.json', encoding='utf-8') as data_file:
    data = [json.loads(line) for line in data_file]
# pprint(data)
# create word list and labels list for each article
# either feed this to Josh line by line, or create document list of word and label
# corpora object contains a list of words and categories for each article
# call data[article #]['words'] for a list of words
# call data[article #]['labels'] for a list of categories
# corpora maps article index -> {'words': token list, 'labels': category}.
# enumerate replaces the original manual i = 0 / i += 1 counter; keys are
# still the sequential ints 0..len(data)-1 that downstream code indexes by.
corpora = {}
for idx, article in enumerate(data):
    corpora[idx] = {
        # nltk.word_tokenize splits the raw article text into word tokens
        'words': nltk.word_tokenize(article['text']),
        'labels': article['category'],
    }
#########################################
###Josh's Feature Extraction#############
#########################################
myFeatureExtractor = featureExtractor()
# Feed every tokenised article to the extractor: each token list is first
# rebuilt into a document string, then registered with the extractor.
# (dicts preserve insertion order, so articles are added in index order.)
for entry in corpora.values():
    myFeatureExtractor.addDocument(
        myFeatureExtractor.createStringObject(entry['words']))
# With all documents registered, fit the model once.
myFeatureExtractor.trainModel()
#print(myFeatureExtractor.model.docvecs.index_to_doctag(0))
#print(myFeatureExtractor.model.most_similar('central'))
#print(myFeatureExtractor.fetchFeatureMatrix())
#Confirm that I can take test data and output a feature vector
#print(myFeatureExtractor.getFeatures(corpora[3]['words']))