Skip to content

Commit 2e2ac7f

Browse files
committed
fix: strip only the HTML tags, not the text inside them, when computing the Jaro-Winkler (JW) score
closes #48
1 parent 51dc45c commit 2e2ac7f

File tree

2 files changed

+57
-2
lines changed

2 files changed

+57
-2
lines changed

src/main/java/bugs/stackoverflow/belisarius/utils/CheckUtils.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ public static boolean checkIfNoCodeBlock(String target) {
100100
}
101101

102102
public static double getJaroWinklerScore(String original, String target, double percentage) {
103-
String originalBody = stripTags(original);
104-
String targetBody = stripTags(target);
103+
String targetBody = removeHtml(target);
104+
String originalBody = removeHtml(original);
105105
double score = 1.0;
106106

107107
if (targetBody.length() < originalBody.length() * percentage) {

src/test/java/bugs/stackoverflow/belisarius/utils/CheckUtilsTest.java

+55
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import org.junit.jupiter.api.Test;
99

1010
public class CheckUtilsTest {
11+
private final double percentage = 0.8;
1112

1213
// Blacklisted and offensive words strings
1314
private final String blacklistedQuestionWords = "problem now solved problem has been now fixed found my solution "
@@ -67,4 +68,58 @@ public void repeatedWordTest() {
6768
assertEquals(CheckUtils.checkRepeatedWords(repeatedWords).size(), 1);
6869
assertEquals(CheckUtils.checkRepeatedWords(notRepeatedWords).size(), 6);
6970
}
71+
72+
@Test
73+
public void jaroWinklerScoreTest() {
74+
double score1 = CheckUtils.getJaroWinklerScore(
75+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
76+
+ "sed do eiusmod tempor incididunt ut labore et dolore magna "
77+
+ "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
78+
+ "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
79+
+ "aute irure dolor in reprehenderit in voluptate velit esse cillum "
80+
+ "dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat "
81+
+ "non proident, sunt in culpa qui officia deserunt mollit anim id "
82+
+ "est laborum.",
83+
"This text has nothing to do with the above one.",
84+
percentage
85+
);
86+
assertTrue(score1 < 0.6); // this should be caught
87+
88+
double score2 = CheckUtils.getJaroWinklerScore(
89+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
90+
+ "sed do <code>eiusmod tempor incididunt ut</code> labore et dolore magna "
91+
+ "aliqua. <blockquote>Ut enim</blockquote> ad minim veniam, quis nostrud exercitation "
92+
+ "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
93+
+ "<pre>aute irure dolor in reprehenderit in voluptate velit esse cillum</pre> "
94+
+ "dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat "
95+
+ "non <a href=\"#\">proident</a>, sunt in culpa qui officia deserunt mollit anim id "
96+
+ "est laborum.",
97+
"This text has nothing to do with the above one.",
98+
percentage
99+
);
100+
assertTrue(score2 < 0.6); // this should also be caught
101+
102+
// both scores must be equal, because the HTML tags are stripped before comparison
103+
assertEquals(score1, score2);
104+
105+
double score3 = CheckUtils.getJaroWinklerScore(
106+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
107+
+ "sed do eiusmod tempor incididunt ut labore et dolore magna "
108+
+ "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
109+
+ "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
110+
+ "aute irure dolor in reprehenderit in voluptate velit esse cillum "
111+
+ "dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat "
112+
+ "non proident, sunt in culpa qui officia deserunt mollit anim id "
113+
+ "est laborum.",
114+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, "
115+
+ "sed do eiusmoda tempor incididunt ut labore et dolore magna "
116+
+ "aliqua. Ut enim ad minim veniamd, quis nostrud exercitation "
117+
+ "ullamco laboris nisi ut aliquixp ex ea commodo consequat. Duis "
118+
+ "aute irure dolor in reprehendxerit in voluptate velit esse cillum ",
119+
percentage
120+
);
121+
// the edit removes some text, but does not completely change the body
122+
// => should not be reported
123+
assertTrue(score3 > 0.6);
124+
}
70125
}

0 commit comments

Comments
 (0)