Skip to content

Commit 61e5413

Browse files
authored
fix: test against RepeatedWordFilter with post's text
simplify the initialisation of `FEW_UNIQUES_THRESHOLD`
1 parent b62daa2 commit 61e5413

File tree

1 file changed

+15
-20
lines changed

1 file changed

+15
-20
lines changed

src/main/java/bugs/stackoverflow/belisarius/utils/CheckUtils.java

+15 -20
Original file line number | Diff line number | Diff line change
@@ -14,24 +14,19 @@
1414
import info.debatty.java.stringsimilarity.JaroWinkler;
1515

1616
public class CheckUtils {
17-
// Map with the threshold for the FewUniqueCharacters reason.
18-
// Stolen from https://github.com/Charcoal-SE/SmokeDetector/blob/84099eecffbd85b15de90a1ea41a7c8776ac0903/findspam.py#L592-L595
19-
// Key is the number of unique characters, value is a list with lengths which correspond to the key
20-
private static final Map<Integer, List<Integer>> FEW_UNIQUES_THRESHOLD = new HashMap<Integer, List<Integer>>() {
21-
private static final long serialVersionUID = -2936670361661267625L;
22-
23-
{
24-
put(6, Arrays.asList(30, 36));
25-
put(7, Arrays.asList(36, 42));
26-
put(8, Arrays.asList(42, 48));
27-
put(9, Arrays.asList(48, 54));
28-
put(10, Arrays.asList(54, 60));
29-
put(11, Arrays.asList(60, 70));
30-
put(12, Arrays.asList(70, 80));
31-
put(13, Arrays.asList(80, 90));
32-
put(14, Arrays.asList(90, 100));
33-
}
34-
};
17+
// Stolen from https://github.com/Charcoal-SE/SmokeDetector/blob/078d8237f31ddc9b914cacd045667d1921473032/findspam.py#L869-L872
18+
// key: number of unique characters, value: [lower bound, upper bound]
19+
private static final Map<Integer, List<Integer>> FEW_UNIQUES_THRESHOLD = Map.of(
20+
6, Arrays.asList(30, 36),
21+
7, Arrays.asList(36, 42),
22+
8, Arrays.asList(42, 48),
23+
9, Arrays.asList(48, 54),
24+
10, Arrays.asList(54, 60),
25+
11, Arrays.asList(60, 70),
26+
12, Arrays.asList(70, 80),
27+
13, Arrays.asList(80, 90),
28+
14, Arrays.asList(90, 100)
29+
);
3530

3631
public static Map<Integer, String> checkForBlackListedWords(String target, String lastTarget, String postType) {
3732
Map<Integer, String> blacklistedWords = DatabaseUtils.getBlacklistedWordsByType(postType);
@@ -126,7 +121,7 @@ public static String checkForFewUniqueCharacters(String target) {
126121
&& length < lengths.get(1)
127122
&& uniquesCount <= threshold.getKey()
128123
) {
129-
return body.codePoints() // Intstream of codePoints
124+
return body.codePoints() // Intstream of codePoints
130125
.distinct()
131126
.collect(
132127
StringBuilder::new, // collect to a StringBuilder
@@ -171,7 +166,7 @@ public static boolean checkIfBodyContainsWord(Pattern pattern, String target) {
171166
}
172167

173168
public static Set<String> checkRepeatedWords(String target) {
174-
String[] words = target.split("\\W");
169+
String[] words = removeHtml(target).split("\\W");
175170

176171
return new HashSet<>(Arrays.asList(words));
177172
}

0 commit comments

Comments
 (0)