From 5eb02545c5113366af5a2951c2974489e61045ca Mon Sep 17 00:00:00 2001
From: Kalafatis Kwstas
Date: Sat, 21 Sep 2024 11:49:42 +0300
Subject: [PATCH] Add Optimal String Alignment (OSA) Distance Algorithm (#464)

---
 .../Similarity/OptimalStringAlignmentTests.cs |  86 ++++++++++
 .../Similarity/OptimalStringAlignment.cs      | 157 ++++++++++++++++++
 README.md                                     |   1 +
 3 files changed, 244 insertions(+)
 create mode 100644 Algorithms.Tests/Strings/Similarity/OptimalStringAlignmentTests.cs
 create mode 100644 Algorithms/Strings/Similarity/OptimalStringAlignment.cs

diff --git a/Algorithms.Tests/Strings/Similarity/OptimalStringAlignmentTests.cs b/Algorithms.Tests/Strings/Similarity/OptimalStringAlignmentTests.cs
new file mode 100644
index 00000000..48199013
--- /dev/null
+++ b/Algorithms.Tests/Strings/Similarity/OptimalStringAlignmentTests.cs
@@ -0,0 +1,86 @@
+using Algorithms.Strings.Similarity;
+using FluentAssertions;
+using NUnit.Framework;
+using System;
+
+namespace Algorithms.Tests.Strings.Similarity
+{
+    /// <summary>
+    /// Tests for <see cref="OptimalStringAlignment.Calculate"/>: trivial inputs, each single edit operation, and combined edits.
+    /// </summary>
+    [TestFixture]
+    public class OptimalStringAlignmentTests
+    {
+        [Test]
+        public void Calculate_IdenticalStrings_ReturnsZero()
+        {
+            var result = OptimalStringAlignment.Calculate("example", "example");
+            result.Should().Be(0.0);
+        }
+
+        [Test]
+        public void Calculate_FirstStringEmpty_ReturnsLengthOfSecondString()
+        {
+            var result = OptimalStringAlignment.Calculate("", "example");
+            result.Should().Be("example".Length);
+        }
+
+        [Test]
+        public void Calculate_SecondStringEmpty_ReturnsLengthOfFirstString()
+        {
+            var result = OptimalStringAlignment.Calculate("example", "");
+            result.Should().Be("example".Length);
+        }
+
+        [Test]
+        public void Calculate_BothStringsEmpty_ReturnsZero()
+        {
+            var result = OptimalStringAlignment.Calculate("", "");
+            result.Should().Be(0.0);
+        }
+
+        [Test]
+        public void Calculate_OneInsertion_ReturnsOne()
+        {
+            var result = OptimalStringAlignment.Calculate("example", "examples");
+            result.Should().Be(1.0);
+        }
+
+        [Test]
+        public void Calculate_OneDeletion_ReturnsOne()
+        {
+            var result = OptimalStringAlignment.Calculate("examples", "example");
+            result.Should().Be(1.0);
+        }
+
+        [Test]
+        public void Calculate_OneSubstitution_ReturnsOne()
+        {
+            var result = OptimalStringAlignment.Calculate("example", "exbmple");
+            result.Should().Be(1.0);
+        }
+
+        [Test]
+        public void Calculate_OneTransposition_ReturnsOne()
+        {
+            var result = OptimalStringAlignment.Calculate("example", "exmaple");
+            result.Should().Be(1.0);
+        }
+
+        [Test]
+        public void Calculate_MultipleOperations_ReturnsCorrectDistance()
+        {
+            var result = OptimalStringAlignment.Calculate("kitten", "sitting");
+            result.Should().Be(3.0);
+        }
+
+        [Test]
+        public void Calculate_TransposedPairNeedsFurtherEdit_ReturnsRestrictedDistance()
+        {
+            // Unrestricted Damerau-Levenshtein gives 2 here (CA -> AC -> ABC); OSA may not edit a
+            // transposed pair again, so the cheapest alignment costs 3.
+            var result = OptimalStringAlignment.Calculate("CA", "ABC");
+            result.Should().Be(3.0);
+        }
+    }
+}
diff --git a/Algorithms/Strings/Similarity/OptimalStringAlignment.cs b/Algorithms/Strings/Similarity/OptimalStringAlignment.cs
new file mode 100644
index 00000000..743c4ce1
--- /dev/null
+++ b/Algorithms/Strings/Similarity/OptimalStringAlignment.cs
@@ -0,0 +1,157 @@
+using System;
+
+namespace Algorithms.Strings.Similarity
+{
+    /// <summary>
+    /// Provides methods to calculate the Optimal String Alignment distance between two strings.
+    ///
+    /// The Optimal String Alignment distance, also known as the restricted Damerau-Levenshtein distance,
+    /// is a string metric used to measure the difference between two sequences. It is similar to the
+    /// Levenshtein distance, but it also considers transpositions (swapping of two adjacent characters)
+    /// as a single operation. This metric is particularly useful when adjacent characters are commonly
+    /// transposed, such as in typographical errors.
+    ///
+    /// The OSA distance between two strings is defined as the minimum number of operations required to
+    /// transform one string into the other, where the operations include:
+    ///
+    /// 1. Insertion: Adding a single character.
+    /// 2. Deletion: Removing a single character.
+    /// 3. Substitution: Replacing one character with another.
+    /// 4. Transposition: Swapping two adjacent characters (this is what distinguishes OSA from the
+    ///    traditional Levenshtein distance).
+    ///
+    /// The OSA distance is computed under the restriction that no substring is edited more than once
+    /// (for example, "CA" to "ABC" costs 3 here but 2 under unrestricted Damerau-Levenshtein). This is
+    /// the main difference between OSA and the more general Damerau-Levenshtein distance.
+    ///
+    /// <example>
+    /// Example Usage:
+    /// <code>
+    /// double distance = OptimalStringAlignment.Calculate("example", "exmaple");
+    /// Console.WriteLine(distance); // Output: 1
+    /// </code>
+    /// In this example, the strings "example" and "exmaple" differ by one transposition of adjacent characters ('a' and 'm'),
+    /// so the OSA distance is 1.
+    ///
+    /// <code>
+    /// double distance = OptimalStringAlignment.Calculate("kitten", "sitting");
+    /// Console.WriteLine(distance); // Output: 3
+    /// </code>
+    /// Here, the strings "kitten" and "sitting" have three differences (substitutions 'k' to 's', 'e' to 'i', and insertion of 'g'),
+    /// resulting in an OSA distance of 3.
+    /// </example>
+    /// </summary>
+    /// <remarks>
+    /// This algorithm has a time complexity of O(n * m), where n and m are the lengths of the two input strings.
+    /// It is efficient for moderate-sized strings but may become computationally expensive for very long strings.
+    /// </remarks>
+    public static class OptimalStringAlignment
+    {
+        /// <summary>
+        /// Calculates the Optimal String Alignment distance between two strings.
+        /// </summary>
+        /// <param name="firstString">The first string.</param>
+        /// <param name="secondString">The second string.</param>
+        /// <returns>The Optimal String Alignment distance between the two strings.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when either of the input strings is null.</exception>
+        public static double Calculate(string firstString, string secondString)
+        {
+            ArgumentNullException.ThrowIfNull(firstString); // the helper captures the parameter name itself
+            ArgumentNullException.ThrowIfNull(secondString);
+
+            if (firstString == secondString)
+            {
+                return 0.0; // equal strings need no edits
+            }
+
+            if (firstString.Length == 0)
+            {
+                return secondString.Length; // every character of the second string is inserted
+            }
+
+            if (secondString.Length == 0)
+            {
+                return firstString.Length; // every character of the first string is deleted
+            }
+
+            var distanceMatrix = GenerateDistanceMatrix(firstString.Length, secondString.Length);
+            distanceMatrix = CalculateDistance(firstString, secondString, distanceMatrix);
+
+            return distanceMatrix[firstString.Length, secondString.Length];
+        }
+
+        /// <summary>
+        /// Generates the initial distance matrix for the given lengths of the two strings.
+        /// </summary>
+        /// <param name="firstLength">The length of the first string.</param>
+        /// <param name="secondLength">The length of the second string.</param>
+        /// <returns>A (firstLength + 1) x (secondLength + 1) matrix with column 0 set to 0..firstLength and row 0 set to 0..secondLength.</returns>
+        private static int[,] GenerateDistanceMatrix(int firstLength, int secondLength)
+        {
+            var distanceMatrix = new int[firstLength + 1, secondLength + 1]; // highest index used is the string length
+
+            for (var i = 0; i <= firstLength; i++)
+            {
+                distanceMatrix[i, 0] = i;
+            }
+
+            for (var j = 0; j <= secondLength; j++)
+            {
+                distanceMatrix[0, j] = j;
+            }
+
+            return distanceMatrix;
+        }
+
+        /// <summary>
+        /// Calculates the distance matrix for the given strings using the Optimal String Alignment algorithm.
+        /// </summary>
+        /// <param name="firstString">The first string.</param>
+        /// <param name="secondString">The second string.</param>
+        /// <param name="distanceMatrix">The initial distance matrix.</param>
+        /// <returns>The calculated distance matrix.</returns>
+        private static int[,] CalculateDistance(string firstString, string secondString, int[,] distanceMatrix)
+        {
+            for (var row = 1; row <= firstString.Length; row++)
+            {
+                var current = firstString[row - 1];
+                for (var col = 1; col <= secondString.Length; col++)
+                {
+                    var other = secondString[col - 1];
+                    var cost = current == other ? 0 : 1;
+
+                    // Cheapest way to reach this cell by deleting, inserting, or substituting/matching.
+                    var best = Minimum(
+                        distanceMatrix[row - 1, col] + 1,
+                        distanceMatrix[row, col - 1] + 1,
+                        distanceMatrix[row - 1, col - 1] + cost);
+
+                    var isSwappedPair = row > 1 && col > 1
+                        && current == secondString[col - 2]
+                        && firstString[row - 2] == other;
+                    if (isSwappedPair)
+                    {
+                        // The last two characters appear in reverse order: continue from two cells back.
+                        best = Math.Min(best, distanceMatrix[row - 2, col - 2] + cost);
+                    }
+
+                    distanceMatrix[row, col] = best;
+                }
+            }
+
+            return distanceMatrix;
+        }
+
+        /// <summary>
+        /// Returns the smallest of three integers.
+        /// </summary>
+        /// <param name="a">The first candidate.</param>
+        /// <param name="b">The second candidate.</param>
+        /// <param name="c">The third candidate.</param>
+        /// <returns>The smallest value among the three arguments.</returns>
+        private static int Minimum(int a, int b, int c)
+        {
+            return Math.Min(Math.Min(a, b), c);
+        }
+    }
+}
diff --git a/README.md b/README.md
index ef5a1cfd..23ae8a18 100644
--- a/README.md
+++ b/README.md
@@ -185,6 +185,7 @@ find more than one implementation for the same objective but using different alg
       * [Hamming Distance](./Algorithms/Strings/Similarity/HammingDistance.cs)
       * [Jaro Similarity](./Algorithms/Strings/Similarity/JaroSimilarity.cs)
       * [Jaro-Winkler Distance](./Algorithms/Strings/Similarity/JaroWinklerDistance.cs)
+      * [Optimal String Alignment](./Algorithms/Strings/Similarity/OptimalStringAlignment.cs)
     * [Pattern Matching](./Algorithms/Strings/PatternMatching/)
       * [Bitop Pattern Matching](./Algorithms/Strings/PatternMatching/Bitap.cs)
       * [Naive String Search](./Algorithms/Strings/PatternMatching/NaiveStringSearch.cs)