|
14 | 14 | import info.debatty.java.stringsimilarity.JaroWinkler;
|
15 | 15 |
|
16 | 16 | public class CheckUtils {
|
17 |
| - // Map with the threshold for the FewUniqueCharacters reason. |
18 |
| - // Stolen from https://github.com/Charcoal-SE/SmokeDetector/blob/84099eecffbd85b15de90a1ea41a7c8776ac0903/findspam.py#L592-L595 |
19 |
| - // Key is the number of unique characters, value is a list with lengths which correspond to the key |
20 |
| - private static final Map<Integer, List<Integer>> FEW_UNIQUES_THRESHOLD = new HashMap<Integer, List<Integer>>() { |
21 |
| - private static final long serialVersionUID = -2936670361661267625L; |
22 |
| - |
23 |
| - { |
24 |
| - put(6, Arrays.asList(30, 36)); |
25 |
| - put(7, Arrays.asList(36, 42)); |
26 |
| - put(8, Arrays.asList(42, 48)); |
27 |
| - put(9, Arrays.asList(48, 54)); |
28 |
| - put(10, Arrays.asList(54, 60)); |
29 |
| - put(11, Arrays.asList(60, 70)); |
30 |
| - put(12, Arrays.asList(70, 80)); |
31 |
| - put(13, Arrays.asList(80, 90)); |
32 |
| - put(14, Arrays.asList(90, 100)); |
33 |
| - } |
34 |
| - }; |
| 17 | + // Stolen from https://github.com/Charcoal-SE/SmokeDetector/blob/078d8237f31ddc9b914cacd045667d1921473032/findspam.py#L869-L872 |
| 18 | + // key: number of unique characters, value: [lower bound, upper bound] |
| 19 | + private static final Map<Integer, List<Integer>> FEW_UNIQUES_THRESHOLD = Map.of( |
| 20 | + 6, Arrays.asList(30, 36), |
| 21 | + 7, Arrays.asList(36, 42), |
| 22 | + 8, Arrays.asList(42, 48), |
| 23 | + 9, Arrays.asList(48, 54), |
| 24 | + 10, Arrays.asList(54, 60), |
| 25 | + 11, Arrays.asList(60, 70), |
| 26 | + 12, Arrays.asList(70, 80), |
| 27 | + 13, Arrays.asList(80, 90), |
| 28 | + 14, Arrays.asList(90, 100) |
| 29 | + ); |
35 | 30 |
|
36 | 31 | public static Map<Integer, String> checkForBlackListedWords(String target, String lastTarget, String postType) {
|
37 | 32 | Map<Integer, String> blacklistedWords = DatabaseUtils.getBlacklistedWordsByType(postType);
|
@@ -126,7 +121,7 @@ public static String checkForFewUniqueCharacters(String target) {
|
126 | 121 | && length < lengths.get(1)
|
127 | 122 | && uniquesCount <= threshold.getKey()
|
128 | 123 | ) {
|
129 |
| - return body.codePoints() // Intstream of codePoints |
| 124 | + return body.codePoints() // IntStream of code points |
130 | 125 | .distinct()
|
131 | 126 | .collect(
|
132 | 127 | StringBuilder::new, // collect to a StringBuilder
|
@@ -171,7 +166,7 @@ public static boolean checkIfBodyContainsWord(Pattern pattern, String target) {
|
171 | 166 | }
|
172 | 167 |
|
173 | 168 | public static Set<String> checkRepeatedWords(String target) {
|
174 |
| - String[] words = target.split("\\W"); |
| 169 | + String[] words = removeHtml(target).split("\\W"); |
175 | 170 |
|
176 | 171 | return new HashSet<>(Arrays.asList(words));
|
177 | 172 | }
|
|
0 commit comments