diff --git a/languagetool-language-modules/sv/src/main/java/org/languagetool/language/Swedish.java b/languagetool-language-modules/sv/src/main/java/org/languagetool/language/Swedish.java
index 5798dbde397c..00902727fde0 100644
--- a/languagetool-language-modules/sv/src/main/java/org/languagetool/language/Swedish.java
+++ b/languagetool-language-modules/sv/src/main/java/org/languagetool/language/Swedish.java
@@ -23,6 +23,7 @@
 import org.languagetool.Language;
 import org.languagetool.LanguageMaintainedState;
 import org.languagetool.UserConfig;
+import org.languagetool.languagemodel.LanguageModel;
 import org.languagetool.rules.*;
 import org.languagetool.rules.spelling.SpellingCheckRule;
 import org.languagetool.rules.spelling.hunspell.HunspellRule;
@@ -35,16 +36,24 @@
 import org.languagetool.tagging.sv.SwedishTagger;
 import org.languagetool.tokenizers.SRXSentenceTokenizer;
 import org.languagetool.tokenizers.SentenceTokenizer;
+//import org.languagetool.tokenizers.sv.SwedishWordTokenizer;
 
+import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.*;
 
+import static java.util.Arrays.asList;
+
 /**
- * @deprecated this language is unmaintained in LT and might be removed in a future release if we cannot find contributors for it (deprecated since 3.6)
- * Actively maintained since v6.2+
+ * Language support for Swedish.
+ * Deprecated in 3.6, but actively maintained again since v6.2+
+ * NOTE(review): the {@code @Deprecated} annotation below contradicts this - consider removing it.
  */
 @Deprecated
-public class Swedish extends Language {
+public class Swedish extends Language implements AutoCloseable {
+
+  private LanguageModel languageModel;
 
   @Override
   public String getName() {
@@ -72,6 +81,64 @@ public SentenceTokenizer createDefaultSentenceTokenizer() {
     return new SRXSentenceTokenizer(this);
   }
 
+/*
+  @Override
+  public Tokenizer createDefaultWordTokenizer() {
+    return new SwedishWordTokenizer();
+  }
+*/
+
+  /**
+   * Delegates to {@code initLanguageModel} with the previously stored model and
+   * keeps the result in a field so that {@link #close()} can release it later.
+   */
+  @Override
+  public synchronized LanguageModel getLanguageModel(File indexDir) throws IOException {
+    languageModel = initLanguageModel(indexDir, languageModel);
+    return languageModel;
+  }
+
+  /**
+   * Returns the rules that need an n-gram language model; only
+   * {@link SwedishConfusionProbabilityRule} is enabled so far.
+   */
+  @Override
+  public List<Rule> getRelevantLanguageModelRules(ResourceBundle messages, LanguageModel languageModel, UserConfig userConfig) throws IOException {
+    return asList(
+      //new UpperCaseNgramRule(messages, languageModel, this, userConfig),
+      new SwedishConfusionProbabilityRule(messages, languageModel, this)
+      //new SwedishNgramProbabilityRule(messages, languageModel, this)
+    );
+  }
+
+/*
+  @Override
+  public List<Rule> getRelevantLanguageModelCapableRules(ResourceBundle messages, @Nullable LanguageModel lm, GlobalConfig globalConfig, UserConfig userConfig, Language motherTongue, List<Language> altLanguages) throws IOException {
+    if (lm != null && motherTongue != null) {
+      if ("en".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForEnglishNativesFalseFriendRule(messages, lm, motherTongue, this));
+      } else if ("de".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForGermansFalseFriendRule(messages, lm, motherTongue, this));
+      } else if ("da".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForDanesFalseFriendRule(messages, lm, motherTongue, this));
+      } else if ("no".equals(motherTongue.getShortCode())) {
+        return asList(new SwedishForNorwegiansFalseFriendRule(messages, lm, motherTongue, this));
+      }
+    }
+    return asList();
+  }
+
+  @Override
+  public boolean hasNGramFalseFriendRule(Language motherTongue) {
+    return motherTongue != null && (
+      "en".equals(motherTongue.getShortCode()) ||
+      "de".equals(motherTongue.getShortCode()) ||
+      "da".equals(motherTongue.getShortCode()) ||
+      "no".equals(motherTongue.getShortCode()));
+  }
+
+*/
+
   @Override
   public Disambiguator createDefaultDisambiguator() {
     return new SwedishHybridDisambiguator();
@@ -118,4 +185,41 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
   protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) throws IOException {
     return new HunspellRule(messages, this, null, null);
   }
+
+  /**
@since 6.2+ */
+//  @Override
+//  public String getOpeningDoubleQuote() {
+//    return "”";
+//  }
+
+  /** @since 6.2+ */
+//  @Override
+//  public String getClosingDoubleQuote() {
+//    return "”";
+//  }
+
+  /** @since 6.2+ */
+//  @Override
+//  public String getOpeningSingleQuote() {
+//    return "’";
+//  }
+
+  /** @since 6.2+ */
+//  @Override
+//  public String getClosingSingleQuote() {
+//    return "’";
+//  }
+
+  /**
+   * Closes the language model, if any.
+   * @since 6.2+
+   */
+  @Override
+  public void close() throws Exception {
+    if (languageModel != null) {
+      languageModel.close();
+    }
+  }
+
 }
+
diff --git a/languagetool-language-modules/sv/src/main/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRule.java b/languagetool-language-modules/sv/src/main/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRule.java
new file mode 100644
index 000000000000..ccf5259b5370
--- /dev/null
+++ b/languagetool-language-modules/sv/src/main/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRule.java
@@ -0,0 +1,82 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.rules.sv;
+
+import org.languagetool.Language;
+import org.languagetool.languagemodel.LanguageModel;
+import org.languagetool.rules.ngrams.ConfusionProbabilityRule;
+import org.languagetool.rules.Example;
+import org.languagetool.rules.patterns.PatternToken;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.posRegex;
+import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.token;
+import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.tokenRegex;
+
+/**
+ * Confusion rule for Swedish word pairs such as "streck"/"sträck", decided by
+ * the n-gram probabilities of the given {@link LanguageModel}.
+ *
+ * @since 6.2
+ */
+public class SwedishConfusionProbabilityRule extends ConfusionProbabilityRule {
+
+  // Compiled once instead of on every isCommonWord() call.
+  private static final Pattern COMMON_WORD = Pattern.compile("[\\wåäöüßÅÄÖÜ]+");
+
+  private static final List<String> EXCEPTIONS = Arrays.asList(
+    // Use all-lowercase, matches will be case-insensitive.
+    "god sak"
+  );
+
+  private static final List<List<PatternToken>> ANTI_PATTERNS = Arrays.asList(
+    Arrays.asList(
+      // "De små öronen" "Dessa små öron"
+      tokenRegex("de|dessa|dom"),
+      token("små"),
+      posRegex("NN:PLU")
+    )
+  );
+
+  /** Creates the rule with an n-gram size of 3. */
+  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
+    this(messages, languageModel, language, 3);
+  }
+
+  /** @param grams n-gram size handed to the superclass constructor */
+  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language, int grams) {
+    super(messages, languageModel, language, grams, EXCEPTIONS, ANTI_PATTERNS);
+    // LanguageTool examples mark the error span with <marker>...</marker>.
+    addExamplePair(Example.wrong("Ett <marker>streck</marker> mot horisonten."),
+                   Example.fixed("Ett <marker>sträck</marker> mot horisonten."));
+  }
+
+  /**
+   * Tells whether the token consists solely of {@code \w} characters and the listed non-ASCII letters.
+   * NOTE(review): presumably overrides a ConfusionProbabilityRule hook - add @Override if so.
+   */
+  protected boolean isCommonWord(String token) {
+    return COMMON_WORD.matcher(token).matches();
+  }
+
+}
diff --git a/languagetool-language-modules/sv/src/main/resources/org/languagetool/resource/sv/confusion_sets.txt b/languagetool-language-modules/sv/src/main/resources/org/languagetool/resource/sv/confusion_sets.txt
new file mode 100644
index 000000000000..3a5f13d5b3e2
--- /dev/null
+++ b/languagetool-language-modules/sv/src/main/resources/org/languagetool/resource/sv/confusion_sets.txt
@@ -0,0 +1,17 @@
+# Swedish confusion sets
+# Line format:
+# <word1>|<description1>; <word2>|<description2>; <factor> # optional comment
+# <word1> and <word2> are words that can easily be confused
+# <description> will be used in the error message to explain the word (optional)
+# <factor> is the factor of how much more the other word must be more
+# probable so the text is considered potentially incorrect.
+# Use a higher value for better precision but lower recall.
+# Precision (p) and recall (r) values in the comments come from ConfusionRuleEvaluator
+# The number after recall is the number of sentences used for evaluation.
+# Order is relevant for ambiguous cases like 'know' ('no' or 'now') where the match
+# is used whose pair comes first in this file.
+# Alphabetical order on each line is also important!
+#
+dem; dom; 100
+streck; sträck; 25
+
diff --git a/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRuleTest.java b/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRuleTest.java
new file mode 100644
index 000000000000..096ddda7325d
--- /dev/null
+++ b/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishConfusionProbabilityRuleTest.java
@@ -0,0 +1,117 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.rules.sv;
+
+import org.apache.commons.lang3.StringUtils;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.languagetool.*;
+import org.languagetool.languagemodel.LuceneLanguageModel;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.rules.ngrams.FakeLanguageModel;
+
+import java.io.File;
+import java.io.IOException;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.*;
+
+/**
+ * Tests for {@link SwedishConfusionProbabilityRule}.
+ */
+public class SwedishConfusionProbabilityRuleTest {
+
+  private final Language swedish = Languages.getLanguageForShortCode("sv");
+  private final JLanguageTool lt = new JLanguageTool(swedish);
+
+  private SwedishConfusionProbabilityRule rule;
+
+  @SuppressWarnings("ResultOfObjectAllocationIgnored")
+  @Test
+  public void testConstructor() {
+    new SwedishConfusionProbabilityRule(TestTools.getEnglishMessages(), new FakeLanguageModel(), swedish);
+  }
+
+  // Requires the Swedish n-gram index at /data/ngram-index/sv.
+  // NOTE(review): confusion_sets.txt only lists dem/dom and streck/sträck, so the
+  // Maj/maj and S:t/S:a cases presumably cannot match yet - confirm before enabling.
+  @Test
+  @Ignore
+  public void testRule() throws IOException {
+    File indexDir = new File("/data/ngram-index/sv");
+    if (!indexDir.exists()) {
+      throw new RuntimeException("ngram data not found at " + indexDir + ", get more info at https://dev.languagetool.org/finding-errors-using-n-gram-data");
+    }
+    rule = new SwedishConfusionProbabilityRule(TestTools.getMessages("sv"), new LuceneLanguageModel(indexDir), swedish);
+
+    Replacement majMaj = new Replacement("Maj", "maj");
+    assertMatch("Från 15 Maj är det eldningsförbud.", majMaj);
+    assertMatch("15 Maj är sista dag för antagningen.", majMaj);
+
+    Replacement mmajMaj = new Replacement("majs", "Majs");
+    assertMatch("Det är majs födelsedag idag.", mmajMaj);
+
+    Replacement streStra = new Replacement("Sträck", "Streck");
+    assertMatch("Sträcket är rakt.", streStra);
+
+    Replacement straStre = new Replacement("streck", "sträck");
+    assertMatch("I ett streck.", straStre);
+
+    Replacement sanktSankta = new Replacement("S:t", "Sankt");
+    assertMatch("S:t Johannesgatan är mitt i stan.", sanktSankta);
+
+    Replacement sanktaStora = new Replacement("S:a", "Stora");
+    assertMatch("S:a Karlsö blev naturreservat 1970.", sanktaStora);
+
+    Replacement saSankta = new Replacement("S:a", "Sankta");
+    assertMatch("S:a Klara kloster.", saSankta);
+
+  }
+
+  /** Expects exactly one match for errorInput and none once rep has been applied to it. */
+  private void assertMatch(String errorInput, Replacement rep) throws IOException {
+    assertMatch(errorInput, 1);
+    String fixedInput = StringUtils.replaceOnce(errorInput, rep.oldString, rep.newString);
+    if (fixedInput.equals(errorInput)) {
+      throw new RuntimeException("Could not fix sentence: '" + errorInput + "' with " + rep);
+    }
+    assertMatch(fixedInput, 0);
+  }
+
+  /** Runs the rule on input and checks the number of matches. */
+  private void assertMatch(String input, int expectedMatches) throws IOException {
+    AnalyzedSentence errorSentence = lt.getAnalyzedSentence(input);
+    RuleMatch[] matches = rule.match(errorSentence);
+    assertThat("Got " + matches.length + " match(es) for: " + input, matches.length, is(expectedMatches));
+  }
+
+  /** A single textual substitution: oldString is replaced by newString. */
+  static class Replacement {
+    final String oldString;
+    final String newString;
+    Replacement(String oldString, String newString) {
+      this.oldString = oldString;
+      this.newString = newString;
+    }
+    @Override
+    public String toString() {
+      return oldString + "/" + newString;
+    }
+  }
+}
diff --git a/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishTest.java b/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishTest.java
index 6e2501d713bf..852de9bb2e39 100644
--- a/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishTest.java
+++ b/languagetool-language-modules/sv/src/test/java/org/languagetool/rules/sv/SwedishTest.java
@@ -20,9 +20,11 @@
 
 import org.junit.Test;
 import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
 import org.languagetool.LanguageSpecificTest;
 import org.languagetool.language.Swedish;
 
+import java.io.File;
 import java.io.IOException;
 
 import static org.junit.Assert.assertThat;
@@ -32,7 +34,15 @@ public class SwedishTest extends LanguageSpecificTest {
 
   @Test
   public void testLanguage() throws IOException {
-    runTests(new Swedish());
+    Language sv = new Swedish();
+    // The n-gram index is optional: only activate the language model rules when the
+    // per-language directory exists (presumably what gets opened - confirm), so the test also runs without it.
+    File svNgramsIndex = new File("/data/ngram-index");
+    if (new File(svNgramsIndex, sv.getShortCode()).isDirectory()) {
+      JLanguageTool ltSwedish = sv.createDefaultJLanguageTool();
+      ltSwedish.activateLanguageModelRules(svNgramsIndex);
+    }
+    runTests(sv);
   }
 
   @Test