diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index f146a97..81ad450 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -8,10 +8,15 @@ include("distances/qgram.jl")
 include("modifiers.jl")
 include("normalize.jl")
 include("pairwise.jl")
+include("find_partial.jl")
+
 
 # Distances API
 Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
 Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
+# ambiguity fix: array arguments are resolved through their element types
+Distances.result_type(dist::StringDistance, s1::AbstractArray, s2::AbstractArray) = result_type(dist, eltype(s1), eltype(s2))
+
 
 
 
@@ -49,7 +54,8 @@ compare,
 result_type,
 qgrams,
 findnearest,
+findnearest_partial,
+findall_partial,
 pairwise,
 pairwise!
 end
-
diff --git a/src/distances/utils.jl b/src/distances/utils.jl
index 4c826cb..2a7831b 100755
--- a/src/distances/utils.jl
+++ b/src/distances/utils.jl
@@ -42,6 +42,27 @@ function reorder(s1, s2)
     (length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
 end
 
+# Throw an `ArgumentError` if `needle` is longer than `haystack`;
+# otherwise return both arguments unchanged.
+function _enforce_shorter_first(needle, haystack)
+    if length(needle) > length(haystack)
+        throw(ArgumentError("Shorter argument must come first; used to find matches in the longer second argument."))
+    end
+    return needle, haystack
+end
+
+# Generic collections are checked as they are.
+enforce_shorter_first(needle, haystack) = _enforce_shorter_first(needle, haystack)
+
+# Strings are wrapped with `string_with_length` before the length check, and
+# the wrapped values are what gets returned.
+function enforce_shorter_first(needle::AbstractString, haystack::AbstractString)
+    needle = string_with_length(needle)
+    haystack = string_with_length(haystack)
+    return _enforce_shorter_first(needle, haystack)
+end
+
+
 function common_prefix(s1, s2)
     l = 0
     for (ch1, ch2) in zip(s1, s2)
@@ -77,4 +98,8 @@ function _slice(s::AbstractString, n1::Integer, n2::Integer)
     SubString(s, nextind(s, 0, n1 + 1), nextind(s, 0, n2))
 end
 
-
+# Like `_slice`, but return the index range instead of the slice itself (for indexable
+# collections). `n1` and `n2` are the positions of the first and last element to cover.
+_slice_inds(::Any, n1::Integer, n2::Integer) = n1:n2
+# For strings, the `n1`-th and `n2`-th characters are located with `nextind`.
+_slice_inds(s::AbstractString, n1::Integer, n2::Integer) = nextind(s, 0, n1):nextind(s, 0, n2)
diff --git a/src/find_partial.jl b/src/find_partial.jl
new file mode 100644
index 0000000..9703f96
--- /dev/null
+++ b/src/find_partial.jl
@@ -0,0 +1,149 @@
+"""
+    update_max_dist(dist, max_dist) -> StringDistance
+
+Given a `StringDistance` `dist` which supports the `max_dist` field, return
+a similar object with the field updated to the value passed in. If the distance does
+not support this field, `dist` is returned.
+"""
+update_max_dist
+
+# Why not use `update_max_dist(::T, max_dist) where T <: Union{DamerauLevenshtein, Levenshtein, Hamming}`
+# instead of `@eval`? Because then `T` will be e.g. `DamerauLevenshtein{Nothing}`, and we cannot then
+# do `T(1)` to create a new one.
+for T in (:DamerauLevenshtein, :Levenshtein, :Hamming)
+    @eval function update_max_dist(::$T, max_dist)
+        return $T(max_dist)
+    end
+end
+
+for T in (:Normalized, :TokenMax)
+    @eval function update_max_dist(dist::$T, max_dist)
+        return $T(dist.dist, max_dist)
+    end
+end
+
+for T in (:Partial, :TokenSort, :TokenSet)
+    @eval function update_max_dist(dist::$T, max_dist)
+        return $T(update_max_dist(dist.dist, max_dist))
+    end
+end
+update_max_dist(d::Any, max_dist) = d
+
+"""
+    get_max_dist(dist) -> Number
+
+Given a `StringDistance` `dist` which supports the `max_dist` field, return
+the value of the field. If the object does not support `max_dist`, then
+return `nothing`.
+"""
+get_max_dist
+
+get_max_dist(dist::Union{Levenshtein, DamerauLevenshtein, Hamming, Normalized, TokenMax}) = dist.max_dist
+get_max_dist(dist::Union{Partial, TokenSort, TokenSet}) = get_max_dist(dist.dist)
+get_max_dist(::Any) = nothing
+
+"""
+    findnearest_partial(needle, haystack, dist) -> (d, inds)
+
+`Partial(dist)(needle, haystack)` returns
+the closest distance `d` between `needle` and any segment of `haystack` (of equal length to that of `needle`). `findnearest_partial` returns the same value, but also
+returns the first set of indices at which an optimal partial match was found. If `dist` supports a `max_dist`
+field, and no match was found with distance at most `max_dist`, then returns an empty range (`1:0`) for
+the indices. Requires `haystack` to be indexable (e.g. an `AbstractString`).
+
+Throws an `ArgumentError` if `needle` is longer than `haystack`.
+
+See also [`Partial`](@ref) and [`findall_partial`](@ref).
+"""
+findnearest_partial
+
+# unwrap `Partial`s since we compare as partials anyway.
+findnearest_partial(s1, s2, dist::Partial) = findnearest_partial(s1, s2, dist.dist)
+
+function findnearest_partial(s1, s2, dist)
+    max_dist = get_max_dist(dist)
+    s1, s2 = enforce_shorter_first(s1, s2)
+
+    if max_dist === nothing
+        # return something larger than any possible distance,
+        # but not e.g. `typemax(Int)` which will lead to overflows,
+        # and with an integer type, since we need to be able to
+        # construct e.g. `Levenshtein` distances with this parameter.
+        max_dist = 10*length(s2)
+    end
+
+    len1, len2 = length(s1), length(s2)
+    if len1 == len2
+        # The whole haystack is the only candidate segment. As documented, report
+        # an empty range when even that candidate exceeds `max_dist`.
+        d = dist(s1, s2)
+        return d <= max_dist ? (d, firstindex(s2):lastindex(s2)) : (d, 1:0)
+    end
+    out = max_dist + 1
+    len1 == 0 && return out, 1:0
+    out_idx = 0
+    for (i, x) in enumerate(qgrams(s2, len1))
+        curr = dist(s1, x)
+        out_idx = ifelse(curr < out, i, out_idx)
+        out = min(out, curr)
+        # Tighten the bound to the best distance found so far (`max_dist` is
+        # never `nothing` at this point).
+        max_dist = min(out, max_dist)
+        dist = update_max_dist(dist, max_dist)
+    end
+
+    if out_idx == 0
+        # return more obvious invalid range if a match isn't found without exceeding `max_dist`
+        return out, 1:0
+    else
+        return out, _slice_inds(s2, out_idx, out_idx + len1 - 1)
+    end
+end
+
+
+"""
+    findall_partial(needle, haystack, dist; max_dist = StringDistances.get_max_dist(dist)) -> Vector{Tuple{T,UnitRange}}
+
+Searches for occurrences of `needle` in `haystack` that differ by at most `max_dist` according to the distance measure `dist`. Only considers sequential segments of `haystack` of equal length to that of `needle`.
+
+Returns a vector of tuples, each corresponding to a match found. The first entry gives the distance of the match, and the second entry gives the indices of `haystack` corresponding to the match. Matches may overlap. Requires `haystack` to be indexable (e.g. an `AbstractString`).
+
+Throws an `ArgumentError` if `needle` is longer than `haystack`, or if `max_dist` is `nothing`
+(i.e. `dist` has no `max_dist` set and none was passed).
+
+See also [`Partial`](@ref) and [`findnearest_partial`](@ref).
+"""
+findall_partial
+
+findall_partial(s1, s2, dist::Partial; max_dist = get_max_dist(dist)) = findall_partial(s1, s2, dist.dist; max_dist = max_dist)
+
+function findall_partial(s1, s2, dist; max_dist = get_max_dist(dist))
+    if max_dist === nothing
+        throw(ArgumentError("`dist` does not have a `max_dist` set and one was not passed to `findall_partial`."))
+    end
+
+    s1, s2 = enforce_shorter_first(s1, s2)
+    T = Distances.result_type(dist, s1, s2)
+
+    dist = update_max_dist(dist, max_dist)
+    len1, len2 = length(s1), length(s2)
+    matches = Tuple{T,UnitRange}[]
+    len1 == 0 && return matches
+
+    if len1 == len2
+        curr = dist(s1, s2)
+        if curr <= max_dist
+            push!(matches, (curr, firstindex(s2):lastindex(s2)))
+        end
+        return matches
+    end
+
+    for (i, x) in enumerate(qgrams(s2, len1))
+        curr = dist(s1, x)
+        if curr <= max_dist
+            inds = _slice_inds(s2, i, i + len1 - 1)
+            push!(matches, (curr, inds))
+        end
+    end
+    return matches
+end
diff --git a/src/modifiers.jl b/src/modifiers.jl
index 0ee8338..6067036 100755
--- a/src/modifiers.jl
+++ b/src/modifiers.jl
@@ -7,6 +7,9 @@ Creates the `Partial{dist}` distance.
 
 See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 
+See also [`findnearest_partial`](@ref) and [`findall_partial`](@ref).
+
+
 ### Examples
 ```julia-repl
 julia> s1 = "New York Mets vs Atlanta Braves"
@@ -124,5 +127,3 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
     score_12 = dist.dist(s1, s2)
     min(score_01, score_02, score_12)
 end
-
-
diff --git a/test/find_partial.jl b/test/find_partial.jl
new file mode 100644
index 0000000..06bb421
--- /dev/null
+++ b/test/find_partial.jl
@@ -0,0 +1,177 @@
+using StringDistances: get_max_dist, update_max_dist
+
+@testset "`find*_partial`" begin
+    @testset "`get_max_dist` and `update_max_dist`" begin
+        d = 2
+        @testset "$T" for T in (DamerauLevenshtein, Levenshtein, Hamming)
+            dist = T(d)
+            @test get_max_dist(dist) == d
+            @test update_max_dist(dist, 2*d) == T(2*d)
+
+            d1 = 0.5
+            @testset "$mod" for mod in (StringDistances.Normalized, StringDistances.TokenMax)
+                mod_dist = mod(dist, d1)
+                @test get_max_dist(mod_dist) == d1
+                @test update_max_dist(mod_dist, 2*d1) == mod(dist, 2*d1)
+            end
+
+            @testset "$mod" for mod in (Partial, StringDistances.TokenSort, StringDistances.TokenSet)
+                modified_dist = mod(dist)
+                @test get_max_dist(modified_dist) == d
+                @test update_max_dist(modified_dist, 2*d) == mod(T(2*d))
+            end
+        end
+
+    end
+
+
+    @testset "`findnearest_partial` and `findall_partial` correctness with DamerauLevenshtein" begin
+        ## Equal length cases
+
+        # `d` replaced by `x`; 1 away
+        str1 = "abcd"
+        str2 = "abcx"
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == Partial(DamerauLevenshtein(1))(str1, str2)
+        @test matches == [(1, 1:4)] == [(d, inds)]
+
+        # `cd` replaced by `xy`; 2 away
+        str1 = "abcd"
+        str2 = "abxy"
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == 2 # `max_dist + 1`
+        @test inds == 1:0 # no match within `max_dist`, also for equal lengths
+        @test isempty(matches)
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        @test d == Partial(DamerauLevenshtein(2))(str1, str2)
+        @test matches == [(2, 1:4)] == [(d, inds)]
+
+        ## Nonequal length cases
+
+        # `d` replaced by `x`; 1 away
+        str1 = "abcdef"
+        str2 = "1234abcxef1234"
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == Partial(DamerauLevenshtein(1))(str1, str2)
+        @test matches == [(1, 5:10)] == [(d, inds)]
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        @test d == Partial(DamerauLevenshtein(2))(str1, str2)
+        @test matches == [(1, 5:10)] == [(d, inds)]
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        @test d == Partial(DamerauLevenshtein(3))(str1, str2)
+        @test d == 1
+        @test inds == 5:10
+        @test matches == [(3, 4:9), (1, 5:10), (3, 6:11)]
+
+        # `cde` replaced by `xyz`; 3 away
+        str1 = "abcdef"
+        str2 = "1234abxyzf1234"
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == Partial(DamerauLevenshtein(1))(str1, str2)
+        @test d == 2 # max_dist + 1
+        @test inds == 1:0
+        @test isempty(matches)
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        @test d == Partial(DamerauLevenshtein(2))(str1, str2)
+        @test d == 3 # max_dist + 1
+        @test inds == 1:0
+        @test isempty(matches)
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        @test d == Partial(DamerauLevenshtein(3))(str1, str2)
+        @test matches == [(3, 5:10)] == [(d, inds)]
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(4)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(4)))
+        @test d == Partial(DamerauLevenshtein(4))(str1, str2)
+        @test matches == [(3, 5:10)] == [(d, inds)]
+
+        # In the first case, `cde` replaced by `xyz` (3 away); in the second, only `e` is replaced by `x` (one away)
+        str1 = "abcdef"
+        str2 = "1234abxyzf1234abcdxf123"
+        for max_dist in (1, 2)
+            d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(max_dist)))
+            matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(max_dist)))
+            @test d == Partial(DamerauLevenshtein(max_dist))(str1, str2)
+            @test matches == [(1, 15:20)] == [(d, inds)]
+        end
+        # Now at 3, we find the other match.
+        # We also match at "4abcdx": delete '4', substitute 'x' => 'e', and insert 'f' at the end
+        # as well as "bcdxf1": insert 'a' at the start, substitute 'x' => 'e', and delete '1' at the end
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        @test d == Partial(DamerauLevenshtein(3))(str1, str2)
+        @test (d, inds) == (1, 15:20)
+        @test matches == [(3, 5:10), (3, 14:19), (1, 15:20), (3, 16:21)]
+    end
+
+    @testset "`findnearest_partial` and `findall_partial`: test other distances" begin
+        # `d` replaced by `x`; 1 away
+        str1 = "abcdef"
+        str2 = "1234abcxef1234"
+
+        # `Partial` unwrapping
+        for d in (1,2,3), T in (DamerauLevenshtein, Levenshtein, Hamming), f in (findnearest_partial, findall_partial)
+            results1 = f(str1, str2, T(d))
+            results2 = f(str1, str2, Partial(T(d)))
+            @test results1 == results2
+        end
+
+        @testset "$dist" for dist in (DamerauLevenshtein(), Levenshtein(), Hamming(), Jaro(), JaroWinkler(), RatcliffObershelp())
+            d, inds = findnearest_partial(str1, str2, dist)
+            @test inds == 5:10
+            matches = findall_partial(str1, str2, dist; max_dist = d)
+            @test matches == [(d, inds)]
+
+            d1, inds1 = findnearest_partial(str1, str2, Partial(dist))
+            @test d ≈ d1
+            @test inds == inds1
+
+            matches1 = findall_partial(str1, str2, Partial(dist); max_dist = d)
+            @test matches1 == [(d1, inds1)]
+
+            d2, inds2 = findnearest_partial(str1, str2, StringDistances.Normalized(dist, 1.0))
+            if dist ∈ (Jaro(), JaroWinkler(), RatcliffObershelp())
+                @test d2 ≈ d
+
+                matches2 = findall_partial(str1, str2, StringDistances.Normalized(dist, 1.0); max_dist = d)
+                @test matches2 == [(d2, inds2)]
+                matches3 = findall_partial(str1, str2, StringDistances.Normalized(dist, d))
+                @test matches3 == matches2
+            else
+                @test d2 * length(str1) ≈ d
+
+                matches2 = findall_partial(str1, str2, StringDistances.Normalized(dist, 1.0); max_dist = d/length(str1) + eps())
+                @test matches2 == [(d2, inds2)]
+                matches3 = findall_partial(str1, str2, StringDistances.Normalized(dist, d/length(str1) + eps()))
+                @test matches3 == matches2
+            end
+            @test inds == inds2
+        end
+    end
+
+    @testset "Non-string tests" begin
+        v = [6,4,1,3,2,6]
+        @test findnearest_partial(1:3, v, DamerauLevenshtein()) == (1,3:5)
+
+        matches = [(2, 2:4), (2, 3:5), (2, 4:6)]
+        @test findnearest_partial(1:3, v, Hamming()) ∈ matches
+        @test findall_partial(1:3, v, Hamming(), max_dist = 2) == matches
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index f8a7f7d..9dfceb6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,3 +4,4 @@ using Test
 include("distances.jl")
 include("pairwise.jl")
 include("modifiers.jl")
+include("find_partial.jl")