matthieugomez · ericphanson · Dec 8, 2020 · Dec 8, 2020
diff --git a/src/StringDistances.jl b/src/StringDistances.jl
@@ -8,10 +8,15 @@ include("distances/qgram.jl")
 include("modifiers.jl")
 include("normalize.jl")
 include("pairwise.jl")
+include("find_partial.jl")
+
 # Distances API
 Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
 Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
 
+# Ambiguity fix: array arguments are resolved through their element types. NOTE(review): presumably needed to disambiguate against a generic array method of `Distances.result_type` - confirm.
+Distances.result_type(dist::StringDistance, s1::AbstractArray, s2::AbstractArray) = result_type(dist, eltype(s1), eltype(s2))
+
 
 
 
@@ -49,7 +54,8 @@ compare,
 result_type,
 qgrams,
 findnearest,
+findnearest_partial,
+findall_partial,
 pairwise,
 pairwise!
 end
-
diff --git a/src/distances/utils.jl b/src/distances/utils.jl
@@ -42,6 +42,22 @@ function reorder(s1, s2)
     (length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
 end
 
+function _enforce_shorter_first(needle, haystack)
+    # Guard clause: a valid pair is handed back untouched, in the same order.
+    length(needle) <= length(haystack) && return needle, haystack
+    msg = "Shorter argument must come first; used to find matches in the longer second argument."
+    throw(ArgumentError(msg))
+end
+
+# Generic fallback: validate the arguments exactly as they were passed in.
+enforce_shorter_first(needle, haystack) = _enforce_shorter_first(needle, haystack)
+
+# Strings: both arguments go through `string_with_length` before being validated.
+function enforce_shorter_first(needle::AbstractString, haystack::AbstractString)
+    return _enforce_shorter_first(string_with_length(needle), string_with_length(haystack))
+end
+
+
 function common_prefix(s1, s2)
     l = 0
     for (ch1, ch2) in zip(s1, s2)
@@ -77,4 +93,6 @@ function _slice(s::AbstractString, n1::Integer, n2::Integer)
    SubString(s, nextind(s, 0, n1 + 1),  nextind(s, 0, n2))
 end
 
-
+# Like `_slice`, but returns the index range covering elements `lo` through `hi` rather than the elements themselves (for strings, positions are mapped to string indices with `nextind`).
+_slice_inds(::Any, lo::Integer, hi::Integer) = lo:hi
+_slice_inds(str::AbstractString, lo::Integer, hi::Integer) = nextind(str, 0, lo):nextind(str, 0, hi)
diff --git a/src/find_partial.jl b/src/find_partial.jl
@@ -0,0 +1,137 @@
+"""
+    update_max_dist(dist, max_dist) -> StringDistance
+
+Given a `StringDistance` `dist` which supports the `max_dist` field, return
+a similar object with the field updated to the value passed in. If the distance does
+not support this field, `dist` is returned.
+"""
+function update_max_dist end
+
+# The methods below are generated with `@eval` over the bare type names. Dispatching with
+# `update_max_dist(::T, max_dist) where T <: Union{DamerauLevenshtein, Levenshtein, Hamming}`
+# would not work: `T` would then be e.g. `DamerauLevenshtein{Nothing}`, and `T(1)` cannot
+# create a new distance from that.
+for D in (:DamerauLevenshtein, :Levenshtein, :Hamming)
+    # Distances constructed directly from a `max_dist`.
+    @eval update_max_dist(::$D, max_dist) = $D(max_dist)
+end
+
+for D in (:Normalized, :TokenMax)
+    # Wrappers constructed from the wrapped distance plus a `max_dist`.
+    @eval update_max_dist(dist::$D, max_dist) = $D(dist.dist, max_dist)
+end
+
+for D in (:Partial, :TokenSort, :TokenSet)
+    # Wrappers constructed from the wrapped distance alone: update that one instead.
+    @eval update_max_dist(dist::$D, max_dist) = $D(update_max_dist(dist.dist, max_dist))
+end
+
+# Fallback for everything else: there is no `max_dist` to update.
+update_max_dist(dist::Any, max_dist) = dist
+
+"""
+    get_max_dist(dist) -> Union{Number, Nothing}
+
+Given a `StringDistance` `dist` which supports the `max_dist` field, return
+the value of the field. If the object does not support `max_dist`, then
+return nothing.
+"""
+function get_max_dist end
+
+get_max_dist(::Any) = nothing  # no `max_dist` support
+get_max_dist(d::Union{Partial, TokenSort, TokenSet}) = get_max_dist(d.dist)  # ask the wrapped distance
+get_max_dist(d::Union{Levenshtein, DamerauLevenshtein, Hamming, Normalized, TokenMax}) = d.max_dist
+
+"""
+    findnearest_partial(needle, haystack, dist) -> (d, inds)
+
+`Partial(dist)(needle, haystack)` returns
+the closest distance `d` between `needle` and any segment of `haystack` (of equal length to that of `needle`). `findnearest_partial` returns the same value, but also
+returns the first set of indices at which an optimal partial match was found. If `dist` supports a `max_dist`
+field, and no match was found with distance at most `max_dist`, then returns an empty range (`1:0`) for
+the indices. Requires `haystack` to be indexable (e.g. an `AbstractString`).
+
+See also [`Partial`](@ref) and [`findall_partial`](@ref).
+"""
+function findnearest_partial end
+
+# unwrap `Partial`s since we compare as partials anyway.
+findnearest_partial(s1, s2, dist::Partial) = findnearest_partial(s1, s2, dist.dist)
+
+function findnearest_partial(s1, s2, dist)
+    max_dist = get_max_dist(dist)
+    s1, s2 = enforce_shorter_first(s1, s2)
+    if max_dist === nothing
+        # Stand-in limit meant to exceed any possible distance. Not `typemax(Int)`, which
+        # would lead to overflows, and an integer, since it may be used to construct
+        # e.g. `Levenshtein` distances.
+        max_dist = 10 * length(s2)
+    end
+
+    len1, len2 = length(s1), length(s2)
+    if len1 == len2
+        # The whole haystack is the only candidate. It counts as a match only when it is
+        # within `max_dist`; otherwise the indices are empty, as in the general case.
+        d = dist(s1, s2)
+        return d, (d <= max_dist ? (firstindex(s2):lastindex(s2)) : (1:0))
+    end
+    out = max_dist + 1  # distance reported when no segment is within `max_dist`
+    len1 == 0 && return out, 1:0
+    out_idx = 0  # position of the best segment so far; 0 means none found yet
+    for (i, x) in enumerate(qgrams(s2, len1))
+        curr = dist(s1, x)
+        # `<= max_dist` rejects values over the limit (`< max_dist + 1` would let fractional
+        # distances slip through); the strict `< out` keeps the first optimum on ties.
+        (curr <= max_dist && curr < out) || continue
+        out, out_idx = curr, i
+        # Only a strictly better segment matters from now on: tighten the cutoff of `dist`.
+        dist = update_max_dist(dist, out)
+    end
+    out_idx == 0 && return out, 1:0
+    return out, _slice_inds(s2, out_idx, out_idx + len1 - 1)
+end
+
+
+"""
+    findall_partial(needle, haystack, dist; max_dist = StringDistances.get_max_dist(dist)) -> Vector{Tuple{T,UnitRange}}
+
+Searches for occurrences of `needle` in `haystack` that differ by at most `max_dist` according to the distance measure `dist`. Only considers sequential segments of `haystack` of equal length to that of `needle`.
+
+Returns a vector of tuples, each corresponding to a match found. The first entry gives the distance of the match, and the second entry gives the indices of `haystack` corresponding to the match. Matches may overlap. Requires `haystack` to be indexable (e.g. an `AbstractString`).
+
+See also [`Partial`](@ref) and [`findnearest_partial`](@ref).
+"""
+function findall_partial end
+
+findall_partial(s1, s2, dist::Partial; max_dist = get_max_dist(dist)) = findall_partial(s1, s2, dist.dist; max_dist = max_dist)
+
+function findall_partial(s1, s2, dist; max_dist = get_max_dist(dist))
+    max_dist === nothing && throw(ArgumentError(
+        "`dist` does not have a `max_dist` set and one was not passed to `findall_partial`."
+    ))
+
+    s1, s2 = enforce_shorter_first(s1, s2)
+    # Distance type of the result entries, determined from `dist` as it was passed in.
+    D = Distances.result_type(dist, s1, s2)
+    found = Tuple{D,UnitRange}[]
+
+    # From here on `dist` carries the requested `max_dist`, where it supports one.
+    dist = update_max_dist(dist, max_dist)
+    needle_len, haystack_len = length(s1), length(s2)
+    needle_len == 0 && return found
+
+    if needle_len == haystack_len
+        # Equal lengths: the whole haystack is the single candidate segment.
+        d = dist(s1, s2)
+        d <= max_dist && push!(found, (d, firstindex(s2):lastindex(s2)))
+        return found
+    end
+
+    for (pos, segment) in enumerate(qgrams(s2, needle_len))
+        d = dist(s1, segment)
+        d <= max_dist || continue
+        # `pos` counts segments from 1; `_slice_inds` turns the span into indices of `s2`.
+        push!(found, (d, _slice_inds(s2, pos, pos + needle_len - 1)))
+    end
+    return found
+end
diff --git a/src/modifiers.jl b/src/modifiers.jl
@@ -7,6 +7,9 @@ Creates the `Partial{dist}` distance.
 
 See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 
+See also [`findnearest_partial`](@ref) and [`findall_partial`](@ref).
+
+
 ### Examples
 ```julia-repl
 julia> s1 = "New York Mets vs Atlanta Braves"
@@ -124,5 +127,3 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
     score_12 = dist.dist(s1, s2)
     min(score_01, score_02, score_12)
 end
-
-
diff --git a/test/find_partial.jl b/test/find_partial.jl
@@ -0,0 +1,179 @@
+using StringDistances: get_max_dist, update_max_dist
+
+@testset "`find*_partial`" begin
+    @testset "`get_max_dist` and `update_max_dist`" begin
+        d = 2
+        @testset "$T" for T in (DamerauLevenshtein, Levenshtein, Hamming)
+            # Plain distances take `max_dist` as their constructor argument.
+            dist = T(d)
+            @test get_max_dist(dist) == d
+            @test update_max_dist(dist, 2*d) == T(2*d)
+
+            # Wrappers constructed from a distance plus a `max_dist` of their own.
+            d1 = 0.5
+            @testset "$wrapper" for wrapper in (StringDistances.Normalized, StringDistances.TokenMax)
+                wrapped = wrapper(dist, d1)
+                @test get_max_dist(wrapped) == d1
+                @test update_max_dist(wrapped, 2*d1) == wrapper(dist, 2*d1)
+            end
+
+            # Wrappers constructed from a distance alone: `max_dist` is read from / updated on the wrapped distance.
+            @testset "$wrapper" for wrapper in (Partial, StringDistances.TokenSort, StringDistances.TokenSet)
+                wrapped = wrapper(dist)
+                @test get_max_dist(wrapped) == d
+                @test update_max_dist(wrapped, 2*d) == wrapper(T(2*d))
+            end
+        end
+
+    end
+
+
+    @testset "`findnearest_partial` and `findall_partial` correctness with DamerauLevenshtein" begin
+        ## Equal length cases
+
+        # `d` replaced by `x`; 1 away
+        str1 = "abcd"
+        str2 = "abcx"
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == Partial(DamerauLevenshtein(1))(str1, str2)
+        @test matches == [(1, 1:4)] == [(d, inds)]
+
+        # `cd` replaced by `xy`; 2 away
+        str1 = "abcd"
+        str2 = "abxy"
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == 2 # `max_dist + 1`
+        @test isempty(matches)
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        @test d == Partial(DamerauLevenshtein(2))(str1, str2)
+        @test matches == [(2, 1:4)] == [(d, inds)]
+
+        ## Nonequal length cases
+
+        # `d` replaced by `x`; 1 away
+        str1 = "abcdef"
+        str2 = "1234abcxef1234"
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == Partial(DamerauLevenshtein(1))(str1, str2)
+        @test matches == [(1, 5:10)] == [(d, inds)]
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        @test d == Partial(DamerauLevenshtein(2))(str1, str2)
+        @test matches == [(1, 5:10)] == [(d, inds)]
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        @test d == Partial(DamerauLevenshtein(3))(str1, str2)
+        @test d == 1
+        @test inds == 5:10
+        @test matches == [(3, 4:9), (1, 5:10), (3, 6:11)]
+
+        # `cde` replaced by `xyz`; 3 away
+        str1 = "abcdef"
+        str2 = "1234abxyzf1234"
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
+        @test d == Partial(DamerauLevenshtein(1))(str1, str2)
+        @test d == 2 # max_dist + 1
+        @test inds == 1:0
+        @test isempty(matches)
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
+        @test d == Partial(DamerauLevenshtein(2))(str1, str2)
+        @test d == 3 # max_dist + 1
+        @test inds == 1:0
+        @test isempty(matches)
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        @test d == Partial(DamerauLevenshtein(3))(str1, str2)
+        @test matches == [(3, 5:10)] == [(d, inds)]
+
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(4)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(4)))
+        @test d == Partial(DamerauLevenshtein(4))(str1, str2)
+        @test matches == [(3, 5:10)] == [(d, inds)]
+
+        # In the first case, `cde` replaced by `xyz` (3 away); in the second, only `e` is replaced by `x` (one away)
+        str1 = "abcdef"
+        str2 = "1234abxyzf1234abcdxf123"
+        for max_dist in (1, 2)
+            d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(max_dist)))
+            matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(max_dist)))
+            @test d == Partial(DamerauLevenshtein(max_dist))(str1, str2)
+            @test matches == [(1, 15:20)] == [(d, inds)]
+        end
+        # Now at 3, we find the other match.
+        # We also match at "4abcdx": delete '4', substitute 'x' => 'e', and insert 'f' at the end
+        # as well as "bcdxf1": insert 'a' at the start, substitute 'x' => 'e', and delete '1' at the end
+        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
+        @test d == Partial(DamerauLevenshtein(3))(str1, str2)
+        @test (d, inds) == (1, 15:20)
+        @test matches == [(3, 5:10), (3, 14:19), (1, 15:20), (3, 16:21)]
+    end
+
+    @testset "`findnearest_partial` and `findall_partial`: test other distances" begin
+        # `d` replaced by `x`; 1 away
+        str1 = "abcdef"
+        str2 = "1234abcxef1234"
+
+        # `Partial` unwrapping: searching with `T(d)` or with `Partial(T(d))` must give the same results
+        for d in (1,2,3), T in (DamerauLevenshtein, Levenshtein, Hamming), f in (findnearest_partial, findall_partial)
+            results1 = f(str1, str2, T(d))
+            results2 = f(str1, str2, Partial(T(d)))
+            @test results1 == results2
+        end
+        # Distances constructed without a `max_dist` argument: the nearest segment must be the one at 5:10.
+        @testset "$dist" for dist in (DamerauLevenshtein(), Levenshtein(), Hamming(), Jaro(), JaroWinkler(), RatcliffObershelp())
+            d, inds = findnearest_partial(str1, str2, dist)
+            @test inds == 5:10
+            matches = findall_partial(str1, str2, dist; max_dist = d)
+            @test matches == [(d, inds)]
+            # Wrapping in `Partial` must lead to the same nearest match ...
+            d1, inds1 = findnearest_partial(str1, str2, Partial(dist))
+            @test d ≈ d1
+            @test inds == inds1
+            # ... and to the same `findall_partial` result when `max_dist` is passed as a keyword.
+            matches1 = findall_partial(str1, str2, Partial(dist); max_dist = d)
+            @test matches1 == [(d1, inds1)]
+            # Same search through `Normalized(dist, 1.0)`.
+            d2, inds2 = findnearest_partial(str1, str2, StringDistances.Normalized(dist, 1.0))
+            if dist ∈ (Jaro(), JaroWinkler(), RatcliffObershelp())
+                @test d2 ≈ d # for these distances the normalized value is expected to equal the plain one
+                # `max_dist` passed as keyword and stored in the `Normalized` wrapper must agree.
+                matches2 = findall_partial(str1, str2, StringDistances.Normalized(dist, 1.0); max_dist = d)
+                @test matches2 == [(d2, inds2)]
+                matches3 = findall_partial(str1, str2, StringDistances.Normalized(dist, d))
+                @test matches3 == matches2
+            else
+                @test d2 * length(str1) ≈ d # expected: normalized value times `length(str1)` gives the plain distance
+                # NOTE(review): the `eps()` slack presumably guards against rounding when rescaling `d` - confirm.
+                matches2 = findall_partial(str1, str2, StringDistances.Normalized(dist, 1.0); max_dist = d/length(str1) + eps())
+                @test matches2 == [(d2, inds2)]
+                matches3 = findall_partial(str1, str2, StringDistances.Normalized(dist, d/length(str1) + eps()))
+                @test matches3 == matches2
+            end
+            @test inds == inds2
+        end
+    end
+
+    @testset "Non-string tests" begin
+        needle, haystack = 1:3, [6,4,1,3,2,6]
+        @test findnearest_partial(needle, haystack, DamerauLevenshtein()) == (1, 3:5)
+        # Under Hamming three segments are expected at distance 2; any of them may be reported as nearest.
+        hamming_hits = [(2, 2:4), (2, 3:5), (2, 4:6)]
+        @test findnearest_partial(needle, haystack, Hamming()) in hamming_hits
+        @test findall_partial(needle, haystack, Hamming(); max_dist = 2) == hamming_hits
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,3 +4,4 @@ using Test
 include("distances.jl")
 include("pairwise.jl")
 include("modifiers.jl")
+include("find_partial.jl")