// matrix.js
const fs = require('fs');
const lowerCase = require('lower-case');
const natural = require('natural');
const HashMap = require('hashmap');

// Read the stop-word list once and return it as an array of words, so the
// checks below are exact matches rather than raw-buffer substring searches.
function extractStopWord() {
    const stopWordsFile = './stopWords.txt';
    return fs.readFileSync(stopWordsFile, 'utf8')
        .split(/\r?\n/)
        .map(w => w.trim())
        .filter(Boolean);
}
module.exports = {
    // Build an inverted index: term -> list of document ids (1..50) that contain it.
    generateInvertedIndex: async function () {
        const punctuations = ['.', ',', "'", "\"", ":", ";", "?", "\r\n", "!", "--", "-", "(", ")", "\r\n\r\n", "\r\n\r\n\r\n", "]", "["];
        const map = new HashMap();
        const stopWords = extractStopWord();

        function getData(file) {
            return new Promise((resolve, reject) => {
                fs.readFile(file, 'utf8', function read(err, data) {
                    if (err) return reject(err);
                    const tokenizer = new natural.WordPunctTokenizer();
                    const words = tokenizer.tokenize(data);
                    words.forEach(word => {
                        word = lowerCase(word);
                        // Strip stray dots, commas and whitespace left by the tokenizer.
                        word = word.replace(/[.,\s]/g, '');
                        if (word && !stopWords.includes(word) && !punctuations.includes(word)) {
                            if (map.get(word) === undefined) {
                                // First occurrence of the term: start its posting list.
                                map.set(word, [i]);
                            }
                            else if (!map.get(word).includes(i)) {
                                // The posting list is stored by reference, so pushing is enough.
                                map.get(word).push(i);
                            }
                        }
                    });
                    resolve();
                });
            });
        }

        let i = 1;
        while (i <= 50) {
            const file = './stories/' + i + '.txt';
            await getData(file);
            i++;
        }
        return map;
    },
    // Build a positional index: term -> (document id -> positions of the term in
    // that document, where positions are counted over the kept, non-stop-word tokens).
    generatePositionalIndex: async function () {
        const punctuations = ['.', ',', "'", "\"", ":", ";", "?", "\r\n", "!", "--", "-", "(", ")", "\r\n\r\n", "\r\n\r\n\r\n", "]", "["];
        const map = new HashMap();
        const stopWords = extractStopWord();

        // The file is read synchronously here, so no Promise wrapper is needed.
        function getData(file) {
            const data = fs.readFileSync(file, 'utf8');
            const tokenizer = new natural.WordPunctTokenizer();
            const words = tokenizer.tokenize(data);
            let j = 0;
            words.forEach(word => {
                word = lowerCase(word);
                word = word.replace(/[.,\s]/g, '');
                if (word && !stopWords.includes(word) && !punctuations.includes(word)) {
                    if (map.get(word) === undefined) {
                        // First occurrence of the term anywhere: create its per-document map.
                        const wordMap = new HashMap();
                        wordMap.set(i, [j]);
                        map.set(word, wordMap);
                    }
                    else {
                        const wordMap = map.get(word);
                        if (!wordMap.get(i)) {
                            // First occurrence of the term in this document.
                            wordMap.set(i, [j]);
                        }
                        else {
                            // Subsequent occurrence: append the position in place.
                            wordMap.get(i).push(j);
                        }
                    }
                    j++;
                }
            });
        }

        let i = 1;
        while (i <= 50) {
            const file = './stories/' + i + '.txt';
            getData(file);
            i++;
        }
        return map;
    }
};
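
/*
 * Usage sketch (an illustration, not part of the module): it assumes this file
 * is saved as matrix.js next to ./stopWords.txt and a ./stories/ directory
 * holding 1.txt .. 50.txt, as the functions above expect. The term "castle"
 * below is a hypothetical example key.
 *
 *   const matrix = require('./matrix');
 *
 *   (async () => {
 *     const inverted = await matrix.generateInvertedIndex();
 *     // Posting list of document ids containing the term, if it was indexed.
 *     console.log(inverted.get('castle'));
 *
 *     const positional = await matrix.generatePositionalIndex();
 *     const entry = positional.get('castle');
 *     // Positions of the term within document 3, if it occurs there.
 *     if (entry) console.log(entry.get(3));
 *   })();
 */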