languagetool-org · ljo · Jul 25, 2023 · Jul 26, 2023
diff --git a/languagetool-language-modules/sv/src/main/java/org/languagetool/language/Swedish.java b/languagetool-language-modules/sv/src/main/java/org/languagetool/language/Swedish.java
@@ -23,6 +23,7 @@
 import org.languagetool.Language;
 import org.languagetool.LanguageMaintainedState;
 import org.languagetool.UserConfig;
+import org.languagetool.languagemodel.LanguageModel;
 import org.languagetool.rules.*;
 import org.languagetool.rules.spelling.SpellingCheckRule;
 import org.languagetool.rules.spelling.hunspell.HunspellRule;
@@ -35,16 +36,24 @@
 import org.languagetool.tagging.sv.SwedishTagger;
 import org.languagetool.tokenizers.SRXSentenceTokenizer;
 import org.languagetool.tokenizers.SentenceTokenizer;
+//import org.languagetool.tokenizers.sv.SwedishWordTokenizer;
 
+import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.*;
 
+import static java.util.Arrays.asList;
+
 /**
- * @deprecated this language is unmaintained in LT and might be removed in a future release if we cannot find contributors for it (deprecated since 3.6)
- * Actively maintained since v6.2+
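+ * Support for Swedish.
+ * Was deprecated in 3.6 as unmaintained, but is actively maintained again since v6.2+.
+ * NOTE(review): the class is still annotated {@code @Deprecated} - confirm whether that should be removed.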
  */
 @Deprecated
-public class Swedish extends Language {
+public class Swedish extends Language implements AutoCloseable {
+
+  private LanguageModel languageModel;
 
   @Override
   public String getName() {
@@ -72,6 +81,56 @@ public SentenceTokenizer createDefaultSentenceTokenizer() {
     return new SRXSentenceTokenizer(this);
   }
 
+/*
+  @Override
+  public Tokenizer createDefaultWordTokenizer() {
+    return new SwedishWordTokenizer();
+  }
+*/
+
+  /**
+   * Returns the language model for the given index directory.
+   * Delegates to {@code initLanguageModel}, passing the model cached in this
+   * instance (if any), and caches whatever that call returns.
+   *
+   * @param indexDir directory handed on to {@code initLanguageModel}
+   * @return the model returned by {@code initLanguageModel}
+   */
+  @Override
+  public synchronized LanguageModel getLanguageModel(File indexDir) throws IOException {
+    LanguageModel model = initLanguageModel(indexDir, languageModel);
+    languageModel = model;
+    return model;
+  }
+
+  /**
+   * Returns the rules that need a language model. Currently this is a single
+   * {@link SwedishConfusionProbabilityRule}; {@code userConfig} is not used yet.
+   */
+  @Override
+  public List<Rule> getRelevantLanguageModelRules(ResourceBundle messages, LanguageModel languageModel, UserConfig userConfig) throws IOException {
+    // Not enabled (yet): UpperCaseNgramRule(messages, languageModel, this, userConfig)
+    //                    SwedishNgramProbabilityRule(messages, languageModel, this)
+    Rule confusionRule = new SwedishConfusionProbabilityRule(messages, languageModel, this);
+    return asList(confusionRule);
+  }
+
+/*
+  @Override
+  public List<Rule> getRelevantLanguageModelCapableRules(ResourceBundle messages, @Nullable LanguageModel lm, GlobalConfig globalConfig, UserConfig userConfig, Language motherTongue, List<Language> altLanguages) throws IOException {
+    if (lm != null && motherTongue != null) {
+      if ("en".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForEnglishNativesFalseFriendRule(messages, lm, motherTongue, this));
+      } else if ("de".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForGermansFalseFriendRule(messages, lm, motherTongue, this));
+      } else if ("da".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForDanesFalseFriendRule(messages, lm, motherTongue, this));
+      } else if ("no".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForNorwegiansFalseFriendRule(messages, lm, motherTongue, this));
+      }
+    }
+    return asList();
+  }
+
+  @Override
+  public boolean hasNGramFalseFriendRule(Language motherTongue) {
+    return motherTongue != null && (
+      "en".equals(motherTongue.getShortCode()) ||
+      "de".equals(motherTongue.getShortCode()) ||
+      "da".equals(motherTongue.getShortCode()) ||
+      "no".equals(motherTongue.getShortCode()));
+  }
+
+*/
+
   @Override
   public Disambiguator createDefaultDisambiguator() {
     return new SwedishHybridDisambiguator();
@@ -118,4 +177,41 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
   protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) throws IOException {
     return new HunspellRule(messages, this, null, null);
   }
+
+//  /** @since 6.2+ */
+//  @Override
+//  public String getOpeningDoubleQuote() {
+//    return "”";
+//  }
+
+//  /** @since 6.2+ */
+//  @Override
+//  public String getClosingDoubleQuote() {
+//    return "”";
+//  }
+
+//  /** @since 6.2+ */
+//  @Override
+//  public String getOpeningSingleQuote() {
+//    return "’";
+//  }
+
+//  /** @since 6.2+ */
+//  @Override
+//  public String getClosingSingleQuote() {
+//    return "’";
+//  }
+
+  /**
+   * Closes the language model, if any, and clears the cached reference so that
+   * a closed model is not kept in this instance. Calling this method again
+   * afterwards does nothing.
+   *
+   * @throws Exception if closing the language model fails
+   * @since 6.2+
+   */
+  @Override
+  public void close() throws Exception {
+    LanguageModel model;
+    // Same monitor as the synchronized getLanguageModel(), which also writes this field.
+    synchronized (this) {
+      model = languageModel;
+      languageModel = null;
+    }
+    // Close outside the lock; the field is already cleared even if close() throws.
+    if (model != null) {
+      model.close();
+    }
+  }
+
 }
+
diff --git a/...e-modules/sv/src/main/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRule.java b/...e-modules/sv/src/main/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRule.java
@@ -0,0 +1,68 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.rules.sv;
+
+import org.languagetool.Language;
+import org.languagetool.languagemodel.LanguageModel;
+import org.languagetool.rules.Example;
+import org.languagetool.rules.ngrams.ConfusionProbabilityRule;
+import org.languagetool.rules.patterns.PatternToken;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.posRegex;
+import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.token;
+import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.tokenRegex;
+
+/**
+ * Confusion probability rule for Swedish. This class supplies the Swedish
+ * exception phrases, anti-patterns and an example sentence pair to
+ * {@link ConfusionProbabilityRule}.
+ *
+ * @since 6.2
+ */
+public class SwedishConfusionProbabilityRule extends ConfusionProbabilityRule {
+
+  // Phrases handed to the superclass as exceptions.
+  private static final List<String> EXCEPTIONS = Arrays.asList(
+      // Use all-lowercase, matches will be case-insensitive.
+      "god sak"
+    );
+
+  // Token sequences handed to the superclass as anti-patterns.
+  private static final List<List<PatternToken>> ANTI_PATTERNS = Arrays.asList(
+    Arrays.asList(
+      // "De små öronen" "Dessa små öron"
+      tokenRegex("de|dessa|dom"),
+      token("små"),
+      posRegex("NN:PLU")
+    )
+  );
+
+  // Compiled once instead of on every isCommonWord() call.
+  private static final Pattern COMMON_WORD = Pattern.compile("[\\wåäöüßÅÄÖÜ]+");
+
+  /**
+   * Creates the rule with {@code grams} set to 3.
+   */
+  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
+    this(messages, languageModel, language, 3);
+  }
+
+  /**
+   * Creates the rule and registers the streck/sträck example pair.
+   *
+   * @param grams passed through to the superclass (presumably the n-gram size - confirm there)
+   */
+  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language, int grams) {
+    super(messages, languageModel, language, grams, EXCEPTIONS, ANTI_PATTERNS);
+    addExamplePair(Example.wrong("Ett <marker>streck</marker> mot horisonten."),
+                   Example.fixed("Ett <marker>sträck</marker> mot horisonten."));
+  }
+
+  /**
+   * Returns true if the whole token consists only of word characters and the
+   * accented letters listed in the pattern.
+   * NOTE(review): declared without {@code @Override}; confirm whether the superclass defines this hook.
+   */
+  protected boolean isCommonWord(String token) {
+    return COMMON_WORD.matcher(token).matches();
+  }
+
+}
diff --git a/...ol-language-modules/sv/src/main/resources/org/languagetool/resource/sv/confusion_sets.txt b/...ol-language-modules/sv/src/main/resources/org/languagetool/resource/sv/confusion_sets.txt
@@ -0,0 +1,17 @@
+# Swedish confusion sets
+# Line format:
+# <word1>|<description1>; <word2>|<description2>; <factor>   # optional comment
+#   <word1> and <word2> are words that can easily be confused
+#   <description> will be used in the error message to explain the word (optional)
+#   <factor> is how many times more probable the other word must be
+#            for the text to be considered potentially incorrect.
+#            Use a higher value for better precision but lower recall.
+#   Precision (p) and recall (r) values in the comments come from ConfusionRuleEvaluator
+#   The number after recall is the number of sentences used for evaluation.
+# Order is relevant for ambiguous cases like 'know' ('no' or 'now') where the match
+# is used whose pair comes first in this file.
+# Alphabetical order on each line is also important!
+#
+dem; dom; 100
+streck; sträck; 25
+
diff --git a/...dules/sv/src/test/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRuleTest.java b/...dules/sv/src/test/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRuleTest.java
@@ -0,0 +1,108 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.rules.sv;
+
+import org.apache.commons.lang3.StringUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.languagetool.*;
+import org.languagetool.languagemodel.LuceneLanguageModel;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.rules.ngrams.FakeLanguageModel;
+
+import java.io.File;
+import java.io.IOException;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.*;
+
+/**
+ * Tests for {@link SwedishConfusionProbabilityRule}.
+ */
+public class SwedishConfusionProbabilityRuleTest {
+
+  private final Language swedish = Languages.getLanguageForShortCode("sv");
+  private final JLanguageTool lt = new JLanguageTool(swedish);
+
+  // Set by testRule() before the assertMatch() helpers are used.
+  private SwedishConfusionProbabilityRule rule;
+
+  /** Only checks that the rule can be constructed with a fake language model. */
+  @SuppressWarnings("ResultOfObjectAllocationIgnored")
+  @Test
+  public void testConstructor() {
+    new SwedishConfusionProbabilityRule(TestTools.getEnglishMessages(), new FakeLanguageModel(), swedish);
+  }
+
+  /**
+   * Runs the rule against a Lucene index on the local disk. Ignored by default
+   * because it needs that index; fails with a RuntimeException if it is missing.
+   */
+  @Test
+  @Ignore("needs local ngram data, see indexDir below")
+  public void testRule() throws IOException {
+    File indexDir = new File("/data/ngram-index/sv");
+    if (!indexDir.exists()) {
+      throw new RuntimeException("ngram data not found at " + indexDir + ", get more info at https://dev.languagetool.org/finding-errors-using-n-gram-data");
+    }
+    rule = new SwedishConfusionProbabilityRule(TestTools.getMessages("sv"), new LuceneLanguageModel(indexDir), swedish);
+
+    Replacement majMaj = new Replacement("Maj", "maj");
+    assertMatch("Från 15 Maj är det eldningsförbud.", majMaj);
+    assertMatch("15 Maj är sista dag för antagningen.", majMaj);
+
+    Replacement majsMajs = new Replacement("majs", "Majs");
+    assertMatch("Det är majs födelsedag idag.", majsMajs);
+
+    Replacement strackStreck = new Replacement("Sträck", "Streck");
+    assertMatch("Sträcket är rakt.", strackStreck);
+
+    Replacement streckStrack = new Replacement("streck", "sträck");
+    assertMatch("I ett streck.", streckStrack);
+
+    Replacement stSankt = new Replacement("S:t", "Sankt");
+    assertMatch("S:t Johannesgatan är mitt i stan.", stSankt);
+
+    Replacement saStora = new Replacement("S:a", "Stora");
+    assertMatch("S:a Karlsö blev naturreservat 1970.", saStora);
+
+    Replacement saSankta = new Replacement("S:a", "Sankta");
+    assertMatch("S:a Klara kloster.", saSankta);
+
+  }
+
+  /**
+   * Asserts exactly one match for {@code errorInput}, and no match once the first
+   * occurrence of {@code rep.oldString} has been replaced by {@code rep.newString}.
+   * Throws a RuntimeException if the replacement does not change the sentence.
+   */
+  private void assertMatch(String errorInput, Replacement rep) throws IOException {
+    assertMatch(errorInput, 1);
+    String fixedInput = StringUtils.replaceOnce(errorInput, rep.oldString, rep.newString);
+    if (fixedInput.equals(errorInput)) {
+      throw new RuntimeException("Could not fix sentence: '" + errorInput  + "' with " + rep);
+    }
+    assertMatch(fixedInput, 0);
+  }
+
+  /** Asserts that the rule under test yields {@code expectedMatches} matches for {@code input}. */
+  private void assertMatch(String input, int expectedMatches) throws IOException {
+    AnalyzedSentence sentence = lt.getAnalyzedSentence(input);
+    RuleMatch[] matches = rule.match(sentence);
+    assertThat("Got " + matches.length + " match(es) for: " + input, matches.length, is(expectedMatches));
+  }
+
+  /** A text fragment and the text that should replace it to fix a sentence. */
+  static class Replacement {
+    final String oldString;
+    final String newString;
+    Replacement(String oldString, String newString) {
+      this.oldString = oldString;
+      this.newString = newString;
+    }
+    @Override
+    public String toString() {
+      return oldString + "/" + newString;
+    }
+  }
+}
diff --git a/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishTest.java b/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishTest.java
@@ -20,9 +20,11 @@
 
 import org.junit.Test;
 import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
 import org.languagetool.LanguageSpecificTest;
 import org.languagetool.language.Swedish;
 
+import java.io.File;
 import java.io.IOException;
 
 import static org.junit.Assert.assertThat;
@@ -32,7 +34,11 @@ public class SwedishTest extends LanguageSpecificTest {
 
   @Test
   public void testLanguage() throws IOException {
-    runTests(new Swedish());
+    // Creates a default JLanguageTool for Swedish and activates its language-model
+    // rules from a fixed directory, then runs the generic language tests on the Language.
+    // NOTE(review): ltSwedish itself is not passed to runTests() - confirm that the
+    // activation has any effect on the run; the directory is hard-coded.
+    Language sv = new Swedish();
+    JLanguageTool ltSwedish = sv.createDefaultJLanguageTool();
+    File svNgramsIndex = new File("/data/ngram-index");
+    ltSwedish.activateLanguageModelRules(svNgramsIndex);
+    runTests(sv);
   }
 
   @Test