diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..4e3229d --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,22 @@ +AllCops: + TargetRubyVersion: 2.4 +Metrics/AbcSize: + Enabled: false +Metrics/CyclomaticComplexity: + Enabled: false +Metrics/MethodLength: + Enabled: false +Metrics/PerceivedComplexity: + Enabled: false +Metrics/BlockLength: + Exclude: + - "tasks/**/*.rake" + +Style/StringLiterals: + EnforcedStyle: double_quotes +Layout/AlignParameters: + EnforcedStyle: with_fixed_indentation +Layout/MultilineMethodCallIndentation: + EnforcedStyle: indented +Layout/MultilineOperationIndentation: + EnforcedStyle: indented diff --git a/.yardopts b/.yardopts new file mode 100644 index 0000000..0a5cb3f --- /dev/null +++ b/.yardopts @@ -0,0 +1 @@ +-m markdown - LICENSE.txt diff --git a/Gemfile b/Gemfile index 4382df8..c6ed0c0 100644 --- a/Gemfile +++ b/Gemfile @@ -1,6 +1,8 @@ +# frozen_string_literal: true + source "https://rubygems.org" -git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } +git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in edits.gemspec gemspec diff --git a/README.md b/README.md index 8d40748..b09f749 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Edits -Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/edits`. To experiment with that code, run `bin/console` for an interactive prompt. +A collection of edit distance algorithms in Ruby. -TODO: Delete this and the text above, and describe your gem +Includes Levenshtein, Restricted Edit (Optimal Alignment) and Damerau-Levenshtein distances, and Jaro and Jaro-Winkler similarity. ## Installation @@ -22,7 +22,66 @@ Or install it yourself as: ## Usage -TODO: Write usage instructions here +### Levenshtein + +Edit distance, accounting for deletion, addition and substitution. 
+ +```ruby +Edits::Levenshtein.distance "raked", "bakers" +# => 3 +Edits::Levenshtein.distance "iota", "atom" +# => 4 +Edits::Levenshtein.distance "acer", "earn" +# => 4 + +# Max distance +Edits::Levenshtein.distance_with_max "iota", "atom", 2 +# => 2 +Edits::Levenshtein.most_similar "atom", %w[tram atlas rota racer] +# => "tram" +``` + +### Restricted Edit (Optimal Alignment) + +Edit distance, accounting for deletion, addition, substitution and swapped +characters. + +```ruby +Edits::RestrictedEdit.distance "raked", "bakers" +# => 3 +Edits::RestrictedEdit.distance "iota", "atom" +# => 3 +Edits::RestrictedEdit.distance "acer", "earn" +# => 4 +``` + +### Damerau-Levenshtein + +Edit distance, accounting for deletion, addition, substitution and +transposition. + +```ruby +Edits::DamerauLevenshtein.distance "raked", "bakers" +# => 3 +Edits::DamerauLevenshtein.distance "iota", "atom" +# => 3 +Edits::DamerauLevenshtein.distance "acer", "earn" +# => 3 +``` + +### Jaro & Jaro-Winkler + +```ruby +Edits::Jaro.similarity "information", "informant" +# => 0.9023569023569024 +Edits::Jaro.distance "information", "informant" +# => 0.09764309764309764 + +Edits::JaroWinkler.similarity "information", "informant" +# => 0.9414141414141414 +Edits::JaroWinkler.distance "information", "informant" +# => 0.05858585858585863 +``` ## Development @@ -32,7 +91,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To ## Contributing -Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/edits. +Bug reports and pull requests are welcome on GitHub at https://github.com/tcrouch/edits. 
## License diff --git a/Rakefile b/Rakefile index b7e9ed5..a95a7da 100644 --- a/Rakefile +++ b/Rakefile @@ -1,6 +1,10 @@ +# frozen_string_literal: true + require "bundler/gem_tasks" require "rspec/core/rake_task" +Dir["tasks/**/*.rake"].each { |t| load t } + RSpec::Core::RakeTask.new(:spec) -task :default => :spec +task default: :spec diff --git a/bin/console b/bin/console index f51b317..89d2992 100755 --- a/bin/console +++ b/bin/console @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# frozen_string_literal: true require "bundler/setup" require "edits" diff --git a/edits.gemspec b/edits.gemspec index 80e2ab2..b4c1e77 100644 --- a/edits.gemspec +++ b/edits.gemspec @@ -1,4 +1,5 @@ -# coding: utf-8 +# frozen_string_literal: true + lib = File.expand_path("../lib", __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require "edits/version" @@ -9,21 +10,12 @@ Gem::Specification.new do |spec| spec.authors = ["Tom Crouch"] spec.email = ["tom.crouch@gmail.com"] - spec.summary = %q{TODO: Write a short summary, because Rubygems requires one.} - spec.description = %q{TODO: Write a longer description or delete this line.} - spec.homepage = "TODO: Put your gem's website or public repo URL here." + spec.summary = "A collection of edit distance algorithms." + # spec.description = "TODO: Write a longer description or delete this line." + spec.homepage = "https://github.com/tcrouch/edits" spec.license = "MIT" - # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host' - # to allow pushing to a single host or delete this section to allow pushing to any host. - if spec.respond_to?(:metadata) - spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'" - else - raise "RubyGems 2.0 or newer is required to protect against " \ - "public gem pushes." 
module Edits
  # Implementation of the Damerau/Levenshtein distance algorithm.
  #
  # Determines distance between two strings by counting edits, identifying:
  # * Insertion
  # * Deletion
  # * Substitution
  # * Transposition
  module DamerauLevenshtein
    # Calculate the Damerau/Levenshtein distance of two sequences.
    #
    # @example
    #   DamerauLevenshtein.distance("acer", "earn")
    #   # => 3
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer]
    def self.distance(seq1, seq2)
      # the shorter sequence always indexes the rows of the cost matrix
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # array of Integer codepoints outperforms String
      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      rows = seq1.length
      cols = seq2.length
      return cols if rows.zero?
      return rows if cols.zero?

      # 'infinite' edit distance for padding cost matrix.
      # Can be any value greater than max[rows, cols]
      inf = rows + cols

      # Initialize first two rows of cost matrix.
      # The full initial state where cols=3, rows=2 (inf=5) would be:
      #   [[5, 5, 5, 5, 5],
      #    [5, 0, 1, 2, 3],
      #    [5, 1, 0, 0, 0],
      #    [5, 2, 0, 0, 0]]
      matrix = [Array.new(cols + 2, inf)]
      matrix << 0.upto(cols).to_a.unshift(inf)

      # element => last row (1-based) of seq1 in which it was seen;
      # unseen elements map to 0, i.e. the 'infinite' padding row
      item_history = Hash.new(0)

      1.upto(rows) do |row|
        # generate next row of cost matrix
        new_row = Array.new(cols + 2, 0)
        new_row[0] = inf
        new_row[1] = row
        matrix << new_row

        # last column (1-based) of this row where the items matched
        last_match_col = 0
        seq1_item = seq1[row - 1]

        1.upto(cols) do |col|
          seq2_item = seq2[col - 1]
          last_match_row = item_history[seq2_item]

          sub_cost = seq1_item == seq2_item ? 0 : 1

          # cost of transposing with the most recent matching pair, plus
          # the items that must be removed/added between the two positions
          transposition = 1 + matrix[last_match_row][last_match_col]
          transposition += row - last_match_row - 1
          transposition += col - last_match_col - 1

          # insertion/deletion are evaluated even when the items match;
          # they never beat a zero-cost substitution, so the result is the
          # same and the inner loop stays branch-free
          deletion = matrix[row][col + 1] + 1
          insertion = matrix[row + 1][col] + 1
          substitution = matrix[row][col] + sub_cost

          # step cost is min of operation costs
          cost = substitution < insertion ? substitution : insertion
          cost = deletion if deletion < cost
          cost = transposition if transposition < cost

          matrix[row + 1][col + 1] = cost

          last_match_col = col if sub_cost.zero?
        end

        item_history[seq1_item] = row
      end

      matrix[rows + 1][cols + 1]
    end
  end
end
module Edits
  # @see https://en.wikipedia.org/wiki/Hamming_distance
  module Hamming
    # Calculate the Hamming distance between two sequences.
    #
    # Counts the positions at which the two sequences differ. Sequences of
    # unequal length are compared over their common prefix length, and every
    # surplus position of the longer sequence counts as one difference.
    #
    # @note A true distance metric, satisfies triangle inequality.
    #
    # @example
    #   Edits::Hamming.distance("toned", "roses")
    #   # => 3
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer] Hamming distance
    def self.distance(seq1, seq2)
      # Integer codepoints index in constant time, whereas String#[] may scan
      # multibyte strings. Only convert when both are Strings so that mixed
      # String/Array comparisons keep comparing one-character strings.
      if seq1.is_a?(String) && seq2.is_a?(String)
        seq1 = seq1.codepoints
        seq2 = seq2.codepoints
      end

      shorter, longer = seq1.length < seq2.length ? [seq1, seq2] : [seq2, seq1]
      overhang = longer.length - shorter.length

      overhang + shorter.length.times.count { |i| shorter[i] != longer[i] }
    end
  end
end
module Edits
  # @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
  module Jaro
    # Calculate Jaro similarity
    #
    # `Sj = 1/3 * ((m / |A|) + (m / |B|) + ((m - t) / m))`
    #
    # Where `m` is #matches and `t` is #transposes
    #
    # @example
    #   Edits::Jaro.similarity("information", "informant")
    #   # => 0.9023569023569024
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Float] 1.0 for equal sequences, 0.0 when nothing matches
    def self.similarity(seq1, seq2)
      return 1.0 if seq1 == seq2
      return 0.0 if seq1.empty? || seq2.empty?

      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      m, t = jaro_matches(seq1, seq2)
      return 0.0 if m.zero?

      # force Float arithmetic for the ratios below
      m = m.to_f
      (1.0 / 3) * ((m / seq1.length) + (m / seq2.length) + ((m - t) / m))
    end

    # Calculate Jaro distance, the complement of {.similarity}
    #
    # @example
    #   Edits::Jaro.distance("information", "informant")
    #   # => 0.09764309764309764
    # @param str1 [String, Array]
    # @param str2 [String, Array]
    # @return [Float]
    def self.distance(str1, str2)
      1.0 - similarity(str1, str2)
    end

    # Calculate number of Jaro matches and transpositions
    #
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [(Integer, Integer)] matches and transpositions
    def self.jaro_matches(seq1, seq2)
      # seq1 is always the shorter sequence
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # search range: (max(|A|, |B|) / 2) - 1
      range = (seq2.length / 2) - 1
      range = 0 if range.negative?

      seq1_flags = Array.new(seq1.length, false)
      seq2_flags = Array.new(seq2.length, false)

      matches = 0
      last2 = seq2.length - 1

      # Pass 1:
      # - determine number of matches
      # - initialize transposition flags
      seq1.length.times do |i|
        min_bound = i >= range ? i - range : 0
        max_bound = (i + range) <= last2 ? (i + range) : last2

        min_bound.upto(max_bound) do |j|
          # each item of seq2 can be matched at most once
          next if seq2_flags[j] || seq2[j] != seq1[i]

          seq2_flags[j] = true
          seq1_flags[i] = true
          matches += 1
          break
        end
      end

      return [0, 0] if matches.zero?

      transposes = 0
      j = 0

      # Pass 2: determine number of half-transpositions
      seq1.length.times do |i|
        # find a match in first string
        next unless seq1_flags[i]

        # go to location of next match on second string
        j += 1 until seq2_flags[j]

        # transposition if not the current match
        transposes += 1 if seq1[i] != seq2[j]
        j += 1
      end

      # half-transpositions -> transpositions
      transposes /= 2

      [matches, transposes]
    end
    private_class_method :jaro_matches
  end
end
module Edits
  # @see https://en.wikipedia.org/wiki/Jaro-Winkler_distance
  module JaroWinkler
    # Prefix scaling factor for jaro-winkler metric. Default is 0.1
    # Should not exceed 0.25 or metric range will leave 0..1
    WINKLER_PREFIX_WEIGHT = 0.1

    # Threshold for boosting Jaro with winkler prefix multiplier.
    # Default is 0.7
    WINKLER_THRESHOLD = 0.7

    # Calculate Jaro-Winkler similarity of given strings
    #
    # Adds weight to Jaro similarity according to the length of a common
    # prefix of up to 4 letters, where exists. The additional weighting is
    # only applied when the original similarity passes a threshold.
    #
    # `Sw = Sj + (l * p * (1 - Sj))`
    #
    # Where `Sj` is Jaro similarity, `l` is prefix length, and `p` is prefix
    # weight
    #
    # @example
    #   Edits::JaroWinkler.similarity("information", "informant")
    #   # => 0.9414141414141414
    #
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @param threshold [Float] threshold for applying Winkler prefix weighting
    # @param weight [Float] weighting for common prefix, should not exceed 0.25
    # @return [Float]
    def self.similarity(
      seq1, seq2,
      threshold: WINKLER_THRESHOLD,
      weight: WINKLER_PREFIX_WEIGHT
    )
      sj = Jaro.similarity(seq1, seq2)

      # prefix boost only applies to sufficiently similar sequences
      return sj unless sj > threshold

      # size of common prefix, max 4
      max_bound = seq1.length > seq2.length ? seq2.length : seq1.length
      max_bound = 4 if max_bound > 4

      l = 0
      l += 1 until seq1[l] != seq2[l] || l >= max_bound

      l < 1 ? sj : sj + (l * weight * (1 - sj))
    end

    # Calculate Jaro-Winkler distance, the complement of {.similarity}
    #
    # @note Not a true distance metric, fails to satisfy triangle inequality.
    #
    # @example
    #   Edits::JaroWinkler.distance("information", "informant")
    #   # => 0.05858585858585863
    # @param (see .similarity)
    # @return [Float]
    def self.distance(
      seq1, seq2,
      threshold: WINKLER_THRESHOLD,
      weight: WINKLER_PREFIX_WEIGHT
    )
      1.0 - similarity(seq1, seq2, threshold: threshold, weight: weight)
    end
  end
end
module Edits
  # Implementation of Levenshtein distance algorithm.
  #
  # Determines distance between two strings by counting edits, identifying:
  # - Insertion
  # - Deletion
  # - Substitution
  module Levenshtein
    # Calculate the Levenshtein (edit) distance of two sequences.
    #
    # @note A true distance metric, satisfies triangle inequality.
    # @example
    #   Levenshtein.distance('sand', 'hands')
    #   # => 2
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer]
    def self.distance(seq1, seq2)
      # the shorter sequence always indexes the rows
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # array of Integer codepoints outperforms String
      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      rows = seq1.length
      cols = seq2.length
      return cols if rows.zero?
      return rows if cols.zero?

      # Initialize first row of cost matrix.
      # The full initial state where cols=3, rows=2 would be:
      #   [[0, 1, 2, 3],
      #    [1, 0, 0, 0],
      #    [2, 0, 0, 0]]
      last_row = 0.upto(cols).to_a

      rows.times do |row|
        last_col = row + 1

        seq1_item = seq1[row]

        cols.times do |col|
          deletion = last_row[col + 1] + 1
          insertion = last_col + 1
          substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)

          # step cost is min of operation costs
          cost = deletion < insertion ? deletion : insertion
          cost = substitution if substitution < cost

          # overwrite previous row as we progress
          last_row[col] = last_col
          last_col = cost
        end
        last_row[cols] = last_col
      end

      last_row[cols]
    end

    # Calculate the Levenshtein (edit) distance of two sequences, bounded by
    # a maximum value. The result never exceeds `max`.
    #
    # @example
    #   Edits::Levenshtein.distance("cloud", "crayon")
    #   # => 5
    #   Edits::Levenshtein.distance_with_max("cloud", "crayon", 2)
    #   # => 2
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @param max [Integer] maximum distance
    # @return [Integer] the distance, or `max` if the distance is greater
    def self.distance_with_max(seq1, seq2, max)
      # the shorter sequence always indexes the rows
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      rows = seq1.length
      cols = seq2.length

      # The length difference is a lower bound for the distance. Checking it
      # first keeps the bound intact when one sequence is empty.
      return max if cols - rows >= max
      # rows <= cols, so this also covers two empty sequences
      return cols if rows.zero?

      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      last_row = 0.upto(cols).to_a

      rows.times do |row|
        last_col_cost = row + 1
        seq1_item = seq1[row]

        # only cells within `max` of the main diagonal can stay under the
        # bound; everything outside this band is skipped
        min_col = row > max ? row - max : 0
        max_col = row + max
        max_col = cols - 1 if max_col > cols - 1
        # column of the diagonal that ends in the bottom-right cell
        diagonal = cols - rows + row

        cols.times do |col|
          # costs along that diagonal never decrease, so once it reaches
          # max the final distance cannot be lower
          return max if diagonal == col && last_row[col] >= max

          col_cost =
            if col < min_col || col > max_col
              max + 1
            else
              # step cost is min of operation costs
              deletion = last_row[col + 1] + 1
              insertion = last_col_cost + 1
              substitution = last_row[col] + (seq1_item == seq2[col] ? 0 : 1)

              cost = deletion < insertion ? deletion : insertion
              substitution < cost ? substitution : cost
            end

          last_row[col] = last_col_cost
          last_col_cost = col_cost
        end

        last_row[cols] = last_col_cost
      end

      last_row[cols] > max ? max : last_row[cols]
    end

    # Given a prototype string and an array of strings, determines which
    # string is most similar to the prototype.
    #
    # `Levenshtein.most_similar("foo", strings)` is functionally equivalent to
    # `strings.min_by { |s| Levenshtein.distance("foo", s) }`, leveraging
    # {.distance_with_max}. When several strings share the lowest distance,
    # the first of them is returned.
    #
    # @example
    #   Edits::Levenshtein.most_similar("atom", %w[tram atlas rota racer])
    #   # => "tram"
    # @param prototype [String]
    # @param strings [<String>]
    # @return [String, nil] most similar string, or nil for empty array
    def self.most_similar(prototype, strings)
      return nil if strings.empty?

      min_s = strings[0]
      min_d = distance(prototype, min_s)

      strings[1..-1].each do |s|
        # nothing can beat an exact match
        return min_s if min_d.zero?

        # bounding by the best distance so far allows early termination
        d = distance_with_max(prototype, s, min_d)
        if d < min_d
          min_d = d
          min_s = s
        end
      end

      min_s
    end
  end
end
module Edits
  # Implements Restricted Damerau-Levenshtein distance (Optimal Alignment)
  # algorithm.
  #
  # Determines distance between two strings by counting edits, identifying:
  # * Insertion
  # * Deletion
  # * Substitution
  # * Swapped items
  module RestrictedEdit
    # Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
    # of two sequences.
    #
    # @note Not a true distance metric, fails to satisfy triangle inequality.
    # @example
    #   RestrictedEdit.distance("iota", "atom")
    #   # => 3
    # @param seq1 [String, Array]
    # @param seq2 [String, Array]
    # @return [Integer]
    def self.distance(seq1, seq2)
      # the shorter sequence always indexes the rows
      seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

      # array of Integer codepoints outperforms String
      seq1 = seq1.codepoints if seq1.is_a? String
      seq2 = seq2.codepoints if seq2.is_a? String

      rows = seq1.length
      cols = seq2.length
      return cols if rows.zero?
      return rows if cols.zero?

      # Previous two rows of cost matrix are retained. They are declared
      # outside the loop so that they persist between iterations.
      lastlast_row = nil
      last_row = nil
      # Initialize first row of cost matrix.
      # The full initial state where cols=3, rows=2 would be:
      #   [[0, 1, 2, 3],
      #    [1, 0, 0, 0],
      #    [2, 0, 0, 0]]
      curr_row = 0.upto(cols).to_a

      rows.times do |row|
        lastlast_row = last_row
        last_row = curr_row

        # generate next row of cost matrix
        curr_row = Array.new(cols + 1, 0)
        curr_row[0] = row + 1

        curr_item = seq1[row]

        cols.times do |col|
          sub_cost = curr_item == seq2[col] ? 0 : 1
          # adjacent items are swapped when each equals the other's neighbour
          is_swap = sub_cost == 1 &&
            row.positive? && col.positive? &&
            curr_item == seq2[col - 1] &&
            seq1[row - 1] == seq2[col]

          deletion = last_row[col + 1] + 1
          insertion = curr_row[col] + 1
          substitution = last_row[col] + sub_cost

          # step cost is min of operation costs
          cost = deletion < insertion ? deletion : insertion
          cost = substitution if substitution < cost

          if is_swap
            # is_swap implies row >= 1, so lastlast_row has been assigned
            swap = lastlast_row[col - 1] + 1
            cost = swap if swap < cost
          end

          curr_row[col + 1] = cost
        end
      end

      curr_row[cols]
    end
  end
end
+RSpec.describe Edits::Hamming do + describe ".distance" do + subject { described_class.distance a, b } + + [ + ["", "", 0], + ["a", "a", 0], + + ["1011101", "1001001", 2], + ["2173896", "2233796", 3], + ["foo", "bar", 3], + ["toned", "roses", 3], + + ["", "abc", 3], + ["abc", "", 3], + ["foo", "barbaz", 6] + ].each do |a, b, distance| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq distance } + end + end + end +end diff --git a/spec/edits/jaro_spec.rb b/spec/edits/jaro_spec.rb new file mode 100644 index 0000000..cc86319 --- /dev/null +++ b/spec/edits/jaro_spec.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +require "spec_helper" + +RSpec.describe Edits::Jaro do + describe ".jaro_matches" do + { + ["", ""] => [0, 0], + ["", "a"] => [0, 0], + ["a", ""] => [0, 0], + + %w[abc abc] => [3, 0], + %w[abc def] => [0, 0], + %w[abcvwxyz cabvwxyz] => [8, 1], + %w[dixon dicksonx] => [4, 0], + %w[dwayne duane] => [4, 0], + %w[information informant] => [9, 1], + %w[iota atom] => [2, 1], + %w[jones johnson] => [4, 0], + %w[martha marhta] => [6, 1], + %w[necessary nessecary] => [9, 2] + }.each do |(a, b), result| + context "with '#{a}', '#{b}'" do + it "returns #{result.first} matches" do + matches = Edits::Jaro.send(:jaro_matches, a, b).first + expect(matches).to eq result.first + end + + it "returns #{result.last} transposes" do + transposes = Edits::Jaro.send(:jaro_matches, a, b).last + expect(transposes).to eq result.last + end + end + end + end + + describe ".similarity" do + subject { described_class.similarity(a, b).round(3) } + + { + ["", ""] => 1, + ["", "a"] => 0, + ["a", ""] => 0, + + %w[abc abc] => 1, + %w[abc def] => 0, + %w[abcvwxyz cabvwxyz] => 0.958, + %w[dixon dicksonx] => 0.767, + %w[dwayne duane] => 0.822, + %w[information informant] => 0.902, + %w[iota atom] => 0.5, + %w[jones johnson] => 0.790, + %w[martha marhta] => 0.944, + %w[necessary nessecary] => 0.926 + }.each do |(a, b), expected| + context "with 
'#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq expected } + end + end + end +end diff --git a/spec/edits/jaro_winkler_spec.rb b/spec/edits/jaro_winkler_spec.rb new file mode 100644 index 0000000..cc10eb3 --- /dev/null +++ b/spec/edits/jaro_winkler_spec.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require "spec_helper" + +RSpec.describe Edits::JaroWinkler do + describe ".similarity" do + subject { described_class.similarity(a, b).round(3) } + + { + ["", ""] => 1, + ["", "a"] => 0, + ["a", ""] => 0, + + %w[abc abc] => 1, + %w[abc def] => 0, + %w[abcvwxyz cabvwxyz] => 0.958, + %w[dixon dicksonx] => 0.813, + %w[dwayne duane] => 0.84, + %w[information informant] => 0.941, + %w[iota atom] => 0.5, + %w[jones johnson] => 0.832, + %w[martha marhta] => 0.961, + %w[necessary nessecary] => 0.941 + }.each do |(a, b), expected| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq expected } + end + end + end +end diff --git a/spec/edits/levenshtein_shared.rb b/spec/edits/levenshtein_shared.rb new file mode 100644 index 0000000..d8511aa --- /dev/null +++ b/spec/edits/levenshtein_shared.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +RSpec.shared_examples "levenshtein" do + [ + ["", "", 0], + ["", "abc", 3], + ["abc", "", 3], + + ["a", "a", 0], + ["one", "one", 0], + + ["abc", "d", 3], + ["bar", "foo", 3], + ["d", "abc", 3], + ["foo", "bar", 3], + + # insertion + ["fog", "frog", 1], + ["mitten", "mittens", 1], + ["mitten", "smitten", 1], + ["pit", "pint", 1], + ["tom", "thom", 1], + ["tom", "thomas", 3], + + # deletion + ["frog", "fog", 1], + ["mittens", "mitten", 1], + ["pint", "pit", 1], + ["smitten", "mitten", 1], + ["thom", "tom", 1], + ["thomas", "tom", 3], + + # substitution + ["book", "back", 2], + ["raked", "baker", 2], + ["ran", "fan", 1], + ["rat", "ran", 1], + ["saturday", "caturday", 1], + + # multiple edits + ["kitten", "sitting", 3], + ["phish", "fish", 2], + ["raked", "bakers", 
3], + ["sittings", "kitting", 2], + ["sunday", "saturday", 3] + ].each do |(a, b, distance)| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq distance } + end + end +end diff --git a/spec/edits/levenshtein_spec.rb b/spec/edits/levenshtein_spec.rb new file mode 100644 index 0000000..b978192 --- /dev/null +++ b/spec/edits/levenshtein_spec.rb @@ -0,0 +1,109 @@ +# frozen_string_literal: true + +require "spec_helper" +require "edits/levenshtein_shared" + +RSpec.describe Edits::Levenshtein do + cases = [ + # swaps + ["a cat", "an act", 3], + ["abc", "acb", 2], + ["abc", "bac", 2], + ["abcdef", "abcdfe", 2], + ["abcdefghij", "acbdegfhji", 6], + ["acre", "acer", 2], + ["art", "ran", 3], + ["caned", "acned", 2], + ["iota", "atom", 4], + ["minion", "noir", 5], + + # complex transpositions + ["a cat", "a tc", 3], + ["a cat", "an abct", 4], + ["acer", "earn", 4], + ["craned", "read", 4], + ["information", "informant", 4], + ["raced", "dear", 5], + ["roam", "art", 4], + ["tram", "rota", 4] + ] + + describe ".distance" do + subject { described_class.distance a, b } + + include_examples "levenshtein" + + cases.each do |(a, b, distance)| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq distance } + end + end + end + + describe ".distance_with_max" do + subject { described_class.distance_with_max a, b, max } + + context "when max is 100" do + let(:max) { 100 } + + include_examples "levenshtein" + + cases.each do |(a, b, distance)| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq distance } + end + end + end + + context "with 'foo', 'barbaz'" do + let(:a) { "foo" } + let(:b) { "barbaz" } + + context "when distance is 4" do + let(:max) { 4 } + + it { is_expected.to eq 4 } + end + + context "when distance is 2" do + let(:max) { 2 } + + it { is_expected.to eq 2 } + end + end + end + + describe ".most_similar" do + let(:prototype) { "atom" } + + subject 
{ described_class.most_similar prototype, words } + + context "with empty array" do + let(:words) { [] } + + it { is_expected.to be_nil } + end + + context "when a single word has the lowest distance" do + let(:words) { %w[light at atlas beer iota train] } + + it "returns the word with lowest distance from prototype" do + expect(subject).to eq "at" + end + end + + context "when two words share the lowest distance" do + let(:words) { %w[light beer iota train] } + + it "returns the first with lowest distance from prototype" do + expect(subject).to eq "beer" + end + end + end +end diff --git a/spec/edits/restricted_edit_spec.rb b/spec/edits/restricted_edit_spec.rb new file mode 100644 index 0000000..ae0cad6 --- /dev/null +++ b/spec/edits/restricted_edit_spec.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require "spec_helper" +require "edits/levenshtein_shared" + +RSpec.describe Edits::RestrictedEdit do + describe ".distance" do + subject { described_class.distance a, b } + + include_examples "levenshtein" + + [ + # swaps + ["a cat", "an act", 2], + ["abc", "acb", 1], + ["abc", "bac", 1], + ["abcdef", "abcdfe", 1], + ["abcdefghij", "acbdegfhji", 3], + ["acre", "acer", 1], + ["art", "ran", 2], + ["caned", "acned", 1], + ["iota", "atom", 3], + ["minion", "noir", 4], + + # complex transpositions + ["a cat", "a tc", 3], + ["a cat", "an abct", 4], + ["acer", "earn", 4], + ["craned", "read", 4], + ["information", "informant", 4], + ["raced", "dear", 5], + ["roam", "art", 4], + ["tram", "rota", 4] + ].each do |(a, b, distance)| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq distance } + end + end + end +end diff --git a/spec/edits_spec.rb b/spec/edits_spec.rb index 0c62bac..4ab35b0 100644 --- a/spec/edits_spec.rb +++ b/spec/edits_spec.rb @@ -1,11 +1,9 @@ +# frozen_string_literal: true + require "spec_helper" RSpec.describe Edits do it "has a version number" do expect(Edits::VERSION).not_to be nil end - - it "does 
something useful" do - expect(false).to eq(true) - end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index e65a4e8..801e3f7 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require "bundler/setup" require "edits" diff --git a/tasks/benchmark.rake b/tasks/benchmark.rake new file mode 100644 index 0000000..b299531 --- /dev/null +++ b/tasks/benchmark.rake @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +require "benchmark" +require "edits" + +desc "Compare metrics" +task :benchmark do + words = File.read("/usr/share/dict/words") + .split(/\n/).compact.shuffle(random: Random.new(1)) + + Benchmark.bm(20) do |x| + x.report("Hamming") do + words.each_cons(2) do |a, b| + Edits::Hamming.distance a, b + end + end + + x.report("Levenshtein") do + words.each_cons(2) do |a, b| + Edits::Levenshtein.distance a, b + end + end + + x.report("RestrictedEdit") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance a, b + end + end + + x.report("DamerauLevenshtein") do + words.each_cons(2) do |a, b| + Edits::DamerauLevenshtein.distance a, b + end + end + + x.report("Jaro") do + words.each_cons(2) do |a, b| + Edits::Jaro.distance a, b + end + end + + x.report("JaroWinkler") do + words.each_cons(2) do |a, b| + Edits::JaroWinkler.distance a, b + end + end + end +end diff --git a/tasks/benchmark/levenshtein.rake b/tasks/benchmark/levenshtein.rake new file mode 100644 index 0000000..6c07d46 --- /dev/null +++ b/tasks/benchmark/levenshtein.rake @@ -0,0 +1,145 @@ +# frozen_string_literal: true + +require "benchmark" +require "benchmark/ips" +require "edits" + +namespace :benchmark do + desc "distance vs. 
# frozen_string_literal: true

require "benchmark"
require "benchmark/ips"
require "edits"

# Micro-benchmarks for the Levenshtein implementation and for the row
# generation strategies considered for the cost matrices.
namespace :benchmark do
  desc "distance vs. distance_with_max (x100)"
  task :lev_max do
    # 101 words give 100 consecutive pairs; fixed seed keeps runs comparable
    words = File.read("/usr/share/dict/words")
      .split(/\n/).compact.shuffle(random: Random.new(1))
      .take(101)

    Benchmark.ips do |x|
      x.report("distance") do
        words.each_cons(2) do |a, b|
          Edits::Levenshtein.distance a, b
        end
      end

      # each label states the bound that is actually benchmarked
      [1, 2, 3, 4, 6, 8, 100].each do |max|
        x.report("with max #{max}") do
          words.each_cons(2) do |a, b|
            Edits::Levenshtein.distance_with_max a, b, max
          end
        end
      end

      x.compare!
    end
  end

  desc "most_similar vs. min_by (100 words)"
  task :lev_similar do
    words = File.read("/usr/share/dict/words")
      .split(/\n/).compact.shuffle(random: Random.new(1))
      .take(100)

    Benchmark.ips do |x|
      x.report("most_similar") do
        Edits::Levenshtein.most_similar("wxyz", words)
      end

      x.report("min_by") do
        words.min_by { |s| Edits::Levenshtein.distance("wxyz", s) }
      end

      x.compare!
    end
  end

  # Compares ways of building a row with one leading value
  task :rowgen1 do
    cols = 5
    rows = 3

    Benchmark.ips do |x|
      x.report "new, unshift" do
        Array.new(cols, 0).unshift(rows)
      end

      x.report "new, []=" do
        curr_row = Array.new(cols + 1, 0)
        curr_row[0] = rows
      end

      x.report "literal, concat" do
        [rows].concat(Array.new(cols, 0))
      end

      x.report "literal, +" do
        m = []
        m << [rows] + Array.new(cols, 0)
      end

      x.compare!
    end
  end

  # Compares ways of building a row with two leading values and appending it
  # to a matrix
  task :rowgen2 do
    cols = 5
    rows = 3
    inf = cols + rows

    Benchmark.ips do |x|
      x.report "new, unshift" do
        m = []
        m << Array.new(cols, 0).unshift(rows, inf)
      end

      x.report "new, []=" do
        m = []
        curr_row = Array.new(cols + 2, 0)
        curr_row[0] = rows
        curr_row[1] = inf
        m << curr_row
      end

      x.report "literal, concat" do
        m = []
        m << [rows, inf].concat(Array.new(cols, 0))
      end

      x.report "literal, +" do
        m = []
        m << [rows, inf] + Array.new(cols, 0)
      end

      x.compare!
    end
  end
end