package com.test.document.parser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Parses a set of text documents, builds a TF-IDF term vector for each one,
 * and compares the documents pairwise by cosine similarity.
 *
 * @author Sandeep Naik
 */
public class DocumentParser
{
    // Tokenized terms of each document, one String[] per document.
    private List<String[]> termsDocsArray = null;
    // Every distinct term seen across all documents (the vocabulary).
    private List<String> allTerms = null;
    // TF-IDF vector of each document, aligned index-for-index with allTerms.
    private List<double[]> tfidfDocsVector = null;
    // Document-index pairs recorded by getCosineSimilarity().
    private Map<Integer, Integer> map = null;
    /**
     * Reads every file in the given directory and stores each file's
     * tokenized terms.
     *
     * @param filePath : path of the directory that holds the source files
     * @throws FileNotFoundException if the path cannot be listed
     * @throws IOException if a file cannot be read
     */
    public void parseFiles(String filePath) throws FileNotFoundException, IOException
    {
        File[] allfiles = new File(filePath).listFiles();
        if (allfiles == null)
        {
            throw new FileNotFoundException(filePath + " is not a readable directory");
        }
        termsDocsArray = new ArrayList<String[]>();
        allTerms = new ArrayList<String>();
        for (File f : allfiles)
        {
            StringBuilder sb = new StringBuilder();
            try (BufferedReader in = new BufferedReader(new FileReader(f)))
            {
                String s;
                while ((s = in.readLine()) != null)
                {
                    sb.append(s).append(' '); // keep a separator so words on adjacent lines do not merge
                }
            }
            // Strip punctuation (every non-word, non-space character), then split into individual terms.
            String[] tokenizedTerms = sb.toString().replaceAll("[\\W&&[^\\s]]", "").split("\\W+");
            for (String term : tokenizedTerms)
            {
                if (!term.isEmpty() && !allTerms.contains(term))
                {
                    allTerms.add(term); // avoid duplicate vocabulary entries
                }
            }
            termsDocsArray.add(tokenizedTerms);
        }
    }
    /**
     * Builds a TF-IDF term vector for every parsed document.
     */
    public void tfIdfCalculator()
    {
        double tf;    // term frequency
        double idf;   // inverse document frequency
        double tfidf; // term frequency * inverse document frequency
        tfidfDocsVector = new ArrayList<double[]>();
        TfIdf calculator = new TfIdf(); // the tf/idf calls carry no state, so one instance suffices
        for (String[] docTermsArray : termsDocsArray)
        {
            double[] tfidfvectors = new double[allTerms.size()];
            int count = 0;
            for (String term : allTerms)
            {
                tf = calculator.tfCalculator(docTermsArray, term);
                idf = calculator.idfCalculator(termsDocsArray, term);
                tfidf = tf * idf;
                tfidfvectors[count] = tfidf;
                count++;
            }
            tfidfDocsVector.add(tfidfvectors); // store this document's vector
        }
    }
    /**
     * Prints the cosine similarity between every pair of document vectors
     * and records which pairs were compared.
     */
    public Map<Integer, Integer> getCosineSimilarity()
    {
        map = new HashMap<>();
        for (int i = 0; i < tfidfDocsVector.size(); i++)
        {
            for (int j = i + 1; j < tfidfDocsVector.size(); j++) // visit each pair once, skip self-comparisons
            {
                double cosineSimilarity = new CosineSimilarity().cosineSimilarity(tfidfDocsVector.get(i),
                        tfidfDocsVector.get(j));
                System.out.println("between " + i + " and " + j + " = " + cosineSimilarity);
                map.put(i, j); // record the compared pair; the score itself is only printed
            }
        }
        return map;
    }
}
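/*
 * The TfIdf and CosineSimilarity helpers used above are defined elsewhere in
 * this package and are not part of this file. The sketches below are
 * assumptions reconstructed only from how the helpers are called here, using
 * the textbook formulas (raw-frequency tf, log-ratio idf, dot-product cosine);
 * the repository's actual implementations may differ.
 */
class TfIdf
{
    // tf: fraction of the document's terms that equal termToCheck.
    public double tfCalculator(String[] totalTerms, String termToCheck)
    {
        double count = 0;
        for (String s : totalTerms)
        {
            if (s.equalsIgnoreCase(termToCheck))
            {
                count++;
            }
        }
        return count / totalTerms.length;
    }

    // idf: log(total documents / documents that contain termToCheck).
    public double idfCalculator(List<String[]> allDocuments, String termToCheck)
    {
        double count = 0;
        for (String[] docTerms : allDocuments)
        {
            for (String s : docTerms)
            {
                if (s.equalsIgnoreCase(termToCheck))
                {
                    count++;
                    break;
                }
            }
        }
        return Math.log(allDocuments.size() / count);
    }
}

class CosineSimilarity
{
    // cosine = dot(A, B) / (|A| * |B|); both vectors must have the same length.
    public double cosineSimilarity(double[] docVector1, double[] docVector2)
    {
        double dotProduct = 0.0;
        double magnitude1 = 0.0;
        double magnitude2 = 0.0;
        for (int i = 0; i < docVector1.length; i++)
        {
            dotProduct += docVector1[i] * docVector2[i];
            magnitude1 += docVector1[i] * docVector1[i];
            magnitude2 += docVector2[i] * docVector2[i];
        }
        magnitude1 = Math.sqrt(magnitude1);
        magnitude2 = Math.sqrt(magnitude2);
        return (magnitude1 != 0.0 && magnitude2 != 0.0)
                ? dotProduct / (magnitude1 * magnitude2)
                : 0.0;
    }
}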
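/*
 * Minimal usage sketch. The directory name "testfiles" is a placeholder, not
 * something taken from the original code.
 */
class DocumentParserDemo
{
    public static void main(String[] args) throws IOException
    {
        DocumentParser parser = new DocumentParser();
        parser.parseFiles("testfiles"); // directory of plain-text documents
        parser.tfIdfCalculator();       // one TF-IDF vector per document
        parser.getCosineSimilarity();   // prints pairwise similarities
    }
}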