Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[sv] Adding initial ngram support for Swedish #9042

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.languagetool.Language;
import org.languagetool.LanguageMaintainedState;
import org.languagetool.UserConfig;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.*;
import org.languagetool.rules.spelling.SpellingCheckRule;
import org.languagetool.rules.spelling.hunspell.HunspellRule;
Expand All @@ -35,16 +36,24 @@
import org.languagetool.tagging.sv.SwedishTagger;
import org.languagetool.tokenizers.SRXSentenceTokenizer;
import org.languagetool.tokenizers.SentenceTokenizer;
//import org.languagetool.tokenizers.sv.SwedishWordTokenizer;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

import static java.util.Arrays.asList;

/**
* @deprecated this language is unmaintained in LT and might be removed in a future release if we cannot find contributors for it (deprecated since 3.6)
* Actively maintained since v6.2+
*
* Deprecated in 3.6, but actively maintained again since v6.2+
*
*/
@Deprecated
public class Swedish extends Language {
public class Swedish extends Language implements AutoCloseable {

private LanguageModel languageModel;

@Override
public String getName() {
Expand Down Expand Up @@ -72,6 +81,56 @@ public SentenceTokenizer createDefaultSentenceTokenizer() {
return new SRXSentenceTokenizer(this);
}

/*
@Override
public Tokenizer createDefaultWordTokenizer() {
return new SwedishWordTokenizer();
}
*/

/**
 * Returns the language model for the given index directory.
 * The currently cached model (possibly {@code null}) is handed to
 * {@code initLanguageModel}, and whatever that call returns becomes the new cached model.
 */
@Override
public synchronized LanguageModel getLanguageModel(File indexDir) throws IOException {
  LanguageModel model = initLanguageModel(indexDir, languageModel);
  languageModel = model;
  return model;
}

/**
 * Builds the list of rules that need a language model.
 * Only {@code SwedishConfusionProbabilityRule} is enabled at the moment.
 */
@Override
public List<Rule> getRelevantLanguageModelRules(ResourceBundle messages, LanguageModel languageModel, UserConfig userConfig) throws IOException {
  // Not enabled yet:
  //   new UpperCaseNgramRule(messages, languageModel, this, userConfig)
  //   new SwedishNgramProbabilityRule(messages, languageModel, this)
  Rule confusionRule = new SwedishConfusionProbabilityRule(messages, languageModel, this);
  return asList(confusionRule);
}

/*
@Override
public List<Rule> getRelevantLanguageModelCapableRules(ResourceBundle messages, @Nullable LanguageModel lm, GlobalConfig globalConfig, UserConfig userConfig, Language motherTongue, List<Language> altLanguages) throws IOException {
if (lm != null && motherTongue != null) {
if ("en".equals(motherTongue.getShortCode())) {
return asList(new SwedishForEnglishNativesFalseFriendRule(messages, lm, motherTongue, this));
} else if ("de".equals(motherTongue.getShortCode())) {
return asList(new SwedishForGermansFalseFriendRule(messages, lm, motherTongue, this));
} else if ("da".equals(motherTongue.getShortCode())) {
return asList(new SwedishForDanesFalseFriendRule(messages, lm, motherTongue, this));
} else if ("no".equals(motherTongue.getShortCode())) {
return asList(new SwedishForNorwegiansFalseFriendRule(messages, lm, motherTongue, this));
}
}
return asList();
}

@Override
public boolean hasNGramFalseFriendRule(Language motherTongue) {
return motherTongue != null && (
"en".equals(motherTongue.getShortCode()) ||
"de".equals(motherTongue.getShortCode()) ||
"da".equals(motherTongue.getShortCode()) ||
"no".equals(motherTongue.getShortCode()));
}

*/

@Override
public Disambiguator createDefaultDisambiguator() {
return new SwedishHybridDisambiguator();
Expand Down Expand Up @@ -118,4 +177,41 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
protected SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) throws IOException {
  // The default spell checker for Swedish is a HunspellRule bound to this language.
  // NOTE(review): the meaning of the two trailing null arguments is not visible here —
  // confirm against the HunspellRule constructor.
  SpellingCheckRule spellingRule = new HunspellRule(messages, this, null, null);
  return spellingRule;
}

/** @since 6.2+ */
// @Override
// public String getOpeningDoubleQuote() {
// return "”";
// }

/** @since 6.2+ */
// @Override
// public String getClosingDoubleQuote() {
// return "”";
// }

/** @since 6.2+ */
// @Override
// public String getOpeningSingleQuote() {
// return "’";
// }

/** @since 6.2+ */
// @Override
// public String getClosingSingleQuote() {
// return "’";
// }

/**
 * Closes the language model held by this instance.
 * Does nothing when no language model has been set.
 * @since 6.2+
 */
@Override
public void close() throws Exception {
  if (languageModel == null) {
    return;
  }
  languageModel.close();
}

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.sv;

import org.languagetool.Language;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.Example;
import org.languagetool.rules.ngrams.ConfusionProbabilityRule;
import org.languagetool.rules.patterns.PatternToken;

import java.util.Arrays;
import java.util.List;
import java.util.ResourceBundle;
import java.util.regex.Pattern;

import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.posRegex;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.token;
import static org.languagetool.rules.patterns.PatternRuleBuilderHelper.tokenRegex;

/**
 * Swedish variant of {@link ConfusionProbabilityRule}. It supplies the Swedish
 * exception phrases, anti-patterns and an example sentence pair to the base rule.
 *
 * @since 6.2
 */
public class SwedishConfusionProbabilityRule extends ConfusionProbabilityRule {

  /** Exception phrases handed to the base rule. */
  private static final List<String> EXCEPTIONS = Arrays.asList(
    // Use all-lowercase, matches will be case-insensitive.
    "god sak"
  );

  /** Token patterns handed to the base rule as anti-patterns. */
  private static final List<List<PatternToken>> ANTI_PATTERNS = Arrays.asList(
    Arrays.asList(
      // "De små öronen" "Dessa små öron"
      tokenRegex("de|dessa|dom"),
      token("små"),
      posRegex("NN:PLU")
    )
  );

  /** Compiled once so that isCommonWord does not recompile the regex on every call. */
  private static final Pattern COMMON_WORD = Pattern.compile("[\\wåäöüßÅÄÖÜ]+");

  /**
   * Creates the rule with a default n-gram size of 3.
   *
   * @param messages resource bundle passed on to the base rule
   * @param languageModel language model passed on to the base rule
   * @param language language passed on to the base rule
   */
  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language) {
    this(messages, languageModel, language, 3);
  }

  /**
   * Creates the rule with an explicit n-gram size and registers one wrong/fixed example pair.
   *
   * @param messages resource bundle passed on to the base rule
   * @param languageModel language model passed on to the base rule
   * @param language language passed on to the base rule
   * @param grams n-gram size passed on to the base rule
   */
  public SwedishConfusionProbabilityRule(ResourceBundle messages, LanguageModel languageModel, Language language, int grams) {
    super(messages, languageModel, language, grams, EXCEPTIONS, ANTI_PATTERNS);
    addExamplePair(Example.wrong("Ett <marker>streck</marker> mot horisonten."),
                   Example.fixed("Ett <marker>sträck</marker> mot horisonten."));
  }

  /**
   * Tells whether the whole token consists only of word characters
   * ({@code \w}) and the letters å, ä, ö, ü, ß in either case where applicable.
   *
   * @param token the token to test, must not be {@code null}
   * @return {@code true} if the entire token matches that character class
   */
  // NOTE(review): there is no @Override here in the original; confirm whether the
  // superclass declares isCommonWord and add the annotation if it does.
  protected boolean isCommonWord(String token) {
    return COMMON_WORD.matcher(token).matches();
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Swedish confusion sets
# Line format:
# <word1>|<description1>; <word2>|<description2>; <factor> # optional comment
# <word1> and <word2> are words that can easily be confused
# <description> will be used in the error message to explain the word (optional)
# <factor> is how many times more probable the other word must be
# for the text to be considered potentially incorrect.
# Use a higher value for better precision but lower recall.
# Precision (p) and recall (r) values in the comments come from ConfusionRuleEvaluator
# The number after recall is the number of sentences used for evaluation.
# Order is relevant for ambiguous cases like 'know' ('no' or 'now') where the match
# is used whose pair comes first in this file.
# Alphabetical order on each line is also important!
#
dem; dom; 100
streck; sträck; 25

Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.sv;

import org.apache.commons.lang3.StringUtils;
import org.junit.Ignore;
import org.junit.Test;
import org.languagetool.*;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.ngrams.FakeLanguageModel;

import java.io.File;
import java.io.IOException;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.*;

/**
 * Tests for {@link SwedishConfusionProbabilityRule}. The constructor test runs with a
 * fake language model; the match test needs a real n-gram index on local disk and is
 * therefore ignored by default.
 */
public class SwedishConfusionProbabilityRuleTest {

  private final Language swedish = Languages.getLanguageForShortCode("sv");
  private final JLanguageTool lt = new JLanguageTool(swedish);

  // Assigned in testRule() before the first assertMatch call.
  private SwedishConfusionProbabilityRule rule;

  @SuppressWarnings("ResultOfObjectAllocationIgnored")
  @Test
  public void testConstructor() {
    new SwedishConfusionProbabilityRule(TestTools.getEnglishMessages(), new FakeLanguageModel(), swedish);
  }

  @Test
  @Ignore("requires a local Swedish n-gram index at /data/ngram-index/sv")
  public void testRule() throws IOException {
    File indexDir = new File("/data/ngram-index/sv");
    if (!indexDir.exists()) {
      throw new RuntimeException("ngram data not found at " + indexDir + ", get more info at https://dev.languagetool.org/finding-errors-using-n-gram-data");
    }
    // try-with-resources so the Lucene index is released even when an assertion fails
    try (LuceneLanguageModel languageModel = new LuceneLanguageModel(indexDir)) {
      rule = new SwedishConfusionProbabilityRule(TestTools.getMessages("sv"), languageModel, swedish);

      Replacement upperMajToLower = new Replacement("Maj", "maj");
      assertMatch("Från 15 Maj är det eldningsförbud.", upperMajToLower);
      assertMatch("15 Maj är sista dag för antagningen.", upperMajToLower);

      Replacement lowerMajsToUpper = new Replacement("majs", "Majs");
      assertMatch("Det är majs födelsedag idag.", lowerMajsToUpper);

      Replacement strackToStreck = new Replacement("Sträck", "Streck");
      assertMatch("Sträcket är rakt.", strackToStreck);

      Replacement streckToStrack = new Replacement("streck", "sträck");
      assertMatch("I ett streck.", streckToStrack);

      Replacement stToSankt = new Replacement("S:t", "Sankt");
      assertMatch("S:t Johannesgatan är mitt i stan.", stToSankt);

      Replacement saToStora = new Replacement("S:a", "Stora");
      assertMatch("S:a Karlsö blev naturreservat 1970.", saToStora);

      Replacement saToSankta = new Replacement("S:a", "Sankta");
      assertMatch("S:a Klara kloster.", saToSankta);
    }
  }

  /**
   * Asserts that the rule reports exactly one match for {@code errorInput} and none
   * after the first occurrence of the replacement's old string has been swapped for the new one.
   */
  private void assertMatch(String errorInput, Replacement rep) throws IOException {
    assertMatch(errorInput, 1);
    String fixedInput = StringUtils.replaceOnce(errorInput, rep.oldString, rep.newString);
    if (fixedInput.equals(errorInput)) {
      // the replacement did not apply, so the test data itself is wrong
      throw new RuntimeException("Could not fix sentence: '" + errorInput + "' with " + rep);
    }
    assertMatch(fixedInput, 0);
  }

  /** Asserts that the rule reports exactly {@code expectedMatches} matches for {@code input}. */
  private void assertMatch(String input, int expectedMatches) throws IOException {
    AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence(input);
    RuleMatch[] matches = rule.match(analyzedSentence);
    assertThat("Got " + matches.length + " match(es) for: " + input, matches.length, is(expectedMatches));
  }

  /** Immutable pair of a string to be replaced and its replacement. */
  static class Replacement {
    private final String oldString;
    private final String newString;

    Replacement(String oldString, String newString) {
      this.oldString = oldString;
      this.newString = newString;
    }

    @Override
    public String toString() {
      return oldString + "/" + newString;
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@

import org.junit.Test;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.LanguageSpecificTest;
import org.languagetool.language.Swedish;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.assertThat;
Expand All @@ -32,7 +34,11 @@ public class SwedishTest extends LanguageSpecificTest {

@Test
public void testLanguage() throws IOException {
// NOTE(review): this span comes from a rendered diff hunk; the next line may be the
// pre-change line that the following five lines replace — confirm against the real file.
runTests(new Swedish());
Language sv = new Swedish();
JLanguageTool ltSwedish = sv.createDefaultJLanguageTool();
// NOTE(review): hard-coded absolute path; behaviour when the directory is missing
// is not visible here — verify on machines without the n-gram data.
File svNgramsIndex = new File("/data/ngram-index");
ltSwedish.activateLanguageModelRules(svNgramsIndex);
// NOTE(review): ltSwedish is not passed to runTests, only sv is — confirm that the
// language-model rules activated above actually take part in runTests.
runTests(sv);
}

@Test
Expand Down