ari_preproccessing.py
# Ari Boyarsky ([email protected])
import nltk
import json
from collections import defaultdict
import itertools
from pprint import pprint

# nltk.word_tokenize() needs the "punkt" tokenizer models; download them once if missing:
# nltk.download('punkt')

data = []
# currently a temp file to speed up debugging
# call data[article #]['type']
# type = [category, text]
with open(r'data\data5.json') as data_file:
    for line in data_file:
        data.append(json.loads(line))
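
# Expected input format (a sketch; field names inferred from the usage below,
# the real contents of data5.json may differ): one JSON object per line, e.g.
# {"category": "politics", "text": "Full text of the article ..."}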
# pprint(data)

# create a word list and a label for each article
# either feed this to Josh line by line, or create a document list of words and labels
# the corpora object contains a list of words and the category for each article
# call corpora[article #]['words'] for a list of words
# call corpora[article #]['labels'] for the article's category
corpora = {}
for i, article in enumerate(data):
    corpora[i] = {}
    # to stay within the Bluemix guidelines, limit text to 1024 chars
    text = article['text'][:1024]
    # drop the last, possibly cut-off, word left by the character truncation
    text = text.rsplit(' ', 1)[0]
    words = nltk.word_tokenize(text)
    corpora[i]['words'] = words
    corpora[i]['labels'] = article['category']
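
# Quick sanity check (a minimal sketch, assuming data5.json loaded successfully):
# print the label and first few tokens of the first few preprocessed articles.
for idx in list(corpora)[:3]:
    print(idx, corpora[idx]['labels'], corpora[idx]['words'][:10])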