Text_Preprocessing.py

import re
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize the input sentences
# An earlier regex-based tokenizer, kept for reference:
# def tokenize(sentence):
#     return [x.strip() for x in re.split(r'(\W+)', sentence) if x.strip()]
def tokenize(sentence):
    return word_tokenize(sentence)
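
# Example (assumes NLTK's 'punkt' tokenizer data is available, e.g. via nltk.download('punkt')):
#   tokenize('Where is Mary?')  ->  ['Where', 'is', 'Mary', '?']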

# Vectorize the text
# Convert Subtext, Questions to Vector Form
def vectorize_ques(data, word_id, test_max_length, ques_max_length):
    X = []
    Xq = []
    for subtext, question in data:
        x = [word_id[w] for w in subtext]
        xq = [word_id[w] for w in question]
        # let's not forget that index 0 is reserved
        X.append(x)
        Xq.append(xq)
    return (pad_sequences(X, maxlen=test_max_length),
            pad_sequences(Xq, maxlen=ques_max_length))
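
# Usage sketch (hypothetical word_id mapping; in practice it is built from the full
# vocabulary, with IDs starting at 1 so that 0 stays free as the padding value):
#   word_id = {'Mary': 1, 'Where': 2, 'bathroom': 3, 'is': 4}
#   tX, tXq = vectorize_ques([(subtext_tokens, question_tokens)], word_id, 20, 5)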

# Vectorize the text
# Convert subtext, questions and answers to vector form
# Y: zero array with a "1" at the index of the word representing the correct answer
def vectorize_text(data, word_id, text_max_length, ques_max_length):
    X = []
    Xq = []
    Y = []
    for subtext, question, answer in data:
        # Map the subtext and question words to their IDs
        x = [word_id[w] for w in subtext]
        xq = [word_id[w] for w in question]
        # One-hot encode the answer over the vocabulary (index 0 is reserved for padding)
        y = np.zeros(len(word_id) + 1)
        y[word_id[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=text_max_length),
            pad_sequences(Xq, maxlen=ques_max_length),
            np.array(Y))
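
# Shape sketch (illustrative): for N (subtext, question, answer) triples, the returned
# arrays are X: (N, text_max_length), Xq: (N, ques_max_length), Y: (N, len(word_id) + 1)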

# Read the story text line by line from standard input
def read_text():
    text = []
    input_line = input('Story, Empty to stop: ')
    while input_line != '':
        # For now, each line has to be a full sentence
        if not input_line.endswith('.'):
            input_line += '.'
        text.extend(tokenize(input_line))
        input_line = input('Story, Empty to stop: ')
    return text
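
# Minimal end-to-end sketch (an illustrative assumption, not part of the original pipeline;
# requires the NLTK 'punkt' data): build a toy vocabulary and vectorize one
# (subtext, question, answer) triple. IDs start at 1 so 0 remains the padding index.
if __name__ == '__main__':
    subtext = tokenize('Mary moved to the bathroom.')
    question = tokenize('Where is Mary?')
    answer = 'bathroom'
    vocab = sorted(set(subtext) | set(question) | {answer})
    word_id = {w: i + 1 for i, w in enumerate(vocab)}
    tX, tXq, tY = vectorize_text([(subtext, question, answer)], word_id,
                                 text_max_length=10, ques_max_length=5)
    print(tX.shape, tXq.shape, tY.shape)  # expected: (1, 10) (1, 5) (1, len(word_id) + 1)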
# -------------------- EOC ------------------------