Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add findnearest_partial and findall_partial #44

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/StringDistances.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,15 @@ include("distances/qgram.jl")
include("modifiers.jl")
include("normalize.jl")
include("pairwise.jl")
include("find_partial.jl")

# Distances API
# The argument types are ignored: the result type is the type of the value `dist`
# returns when it is evaluated on two empty strings.
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
    return typeof(dist("", ""))
end

# Values (rather than types) are forwarded to the type-based method.
function Distances.result_type(dist::StringDistance, s1, s2)
    return result_type(dist, typeof(s1), typeof(s2))
end

# ambiguity fix
# Array arguments are forwarded using their element types.
function Distances.result_type(dist::StringDistance, s1::AbstractArray, s2::AbstractArray)
    return result_type(dist, eltype(s1), eltype(s2))
end




Expand Down Expand Up @@ -49,7 +54,8 @@ compare,
result_type,
qgrams,
findnearest,
findnearest_partial,
findall_partial,
pairwise,
pairwise!
end

20 changes: 19 additions & 1 deletion src/distances/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ function reorder(s1, s2)
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
end

# Check that `needle` is no longer than `haystack` and return both unchanged as a tuple.
# Throws an `ArgumentError` when `needle` is the longer of the two.
function _enforce_shorter_first(needle, haystack)
    length(needle) <= length(haystack) && return (needle, haystack)
    throw(ArgumentError("Shorter argument must come first; used to find matches in the longer second argument."))
end

# Generic collections: validate the argument order directly.
function enforce_shorter_first(needle, haystack)
    return _enforce_shorter_first(needle, haystack)
end

# Strings: wrap both arguments with `string_with_length` before validating; the wrapped
# values (not the originals) are what gets returned.
function enforce_shorter_first(needle::AbstractString, haystack::AbstractString)
    wrapped_needle = string_with_length(needle)
    wrapped_haystack = string_with_length(haystack)
    return _enforce_shorter_first(wrapped_needle, wrapped_haystack)
end


function common_prefix(s1, s2)
l = 0
for (ch1, ch2) in zip(s1, s2)
Expand Down Expand Up @@ -77,4 +93,6 @@ function _slice(s::AbstractString, n1::Integer, n2::Integer)
SubString(s, nextind(s, 0, n1 + 1), nextind(s, 0, n2))
end


# like `_slice` but get the indices (for indexable collections)
# Generic collections: positions `n1` through `n2` are returned as the range `n1:n2`.
function _slice_inds(::Any, n1::Integer, n2::Integer)
    return n1:n2
end

# Strings: the `n1`-th and `n2`-th character positions are converted to string indices
# with `nextind`, counting from index 0.
function _slice_inds(s::AbstractString, n1::Integer, n2::Integer)
    start = nextind(s, 0, n1)
    stop = nextind(s, 0, n2)
    return start:stop
end
137 changes: 137 additions & 0 deletions src/find_partial.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""
update_max_dist(dist, max_dist) -> StringDistance

Given a `StringDistance` `dist` which supports the `max_dist` field, return
a similar object with the field updated to the value passed in. If the distance does
not support this field, `dist` is returned.
"""
update_max_dist

# Why not use `update_max_dist(::T, max_dist) where T <: Union{DamerauLevenshtein, Levenshtein, Hamming}`
# instead of `@eval`? Because then `T` will be e.g. `DamerauLevenshtein{Nothing}`, and we cannot then
# do `T(1)` to create a new one.
for T in (:DamerauLevenshtein, :Levenshtein, :Hamming)
@eval function update_max_dist(::$T, max_dist)
return $T(max_dist)
end
end

for T in (:Normalized, :TokenMax)
@eval function update_max_dist(dist::$T, max_dist)
return $T(dist.dist, max_dist)
end
end

for T in (:Partial, :TokenSort, :TokenSet)
@eval function update_max_dist(dist::$T, max_dist)
return $T(update_max_dist(dist.dist, max_dist))
end
end
update_max_dist(d::Any, max_dist) = d

"""
get_max_dist(dist) -> Number

Given a `StringDistance` `dist` which supports the `max_dist` field, return
the value of the field. If the object does not support `max_dist`, then
return nothing.
"""
get_max_dist

get_max_dist(dist::Union{Levenshtein, DamerauLevenshtein, Hamming, Normalized, TokenMax}) = dist.max_dist
get_max_dist(dist::Union{Partial, TokenSort, TokenSet}) = get_max_dist(dist.dist)
get_max_dist(::Any) = nothing

"""
findnearest_partial(needle, haystack, dist) -> (d, inds)

`Partial(dist)(needle, haystack)` returns
the closest distance `d` between `needle` and any segment of `haystack` (of equal length to that of `needle`). `findnearest_partial` returns the same value, but also
returns the first set of indices at which an optimal partial match was found. If `dist` supports a `max_dist`
field, and no match was found with distance at most `max_dist`, then returns an empty range (`1:0`) for
the indices. Requires `haystack` to be indexable (e.g. an `AbstractString`).

See also [`Partial`](@ref) and [`findall_partial`](@ref).
"""
findnearest_partial

# unwrap `Partial`s since we compare as partials anyway.
findnearest_partial(s1, s2, dist::Partial) = findnearest_partial(s1, s2, dist.dist)

function findnearest_partial(s1, s2, dist)
max_dist = get_max_dist(dist)
s1, s2 = enforce_shorter_first(s1, s2)

if max_dist === nothing
# return something larger than any possible distance,
# but not e.g. `typemax(Int)` which will lead to overflows,
# and with an integer type, since we need to be able to
# construct e.g. `Levenshtein` distances with this parameter.
max_dist = 10*length(s2)
end

len1, len2 = length(s1), length(s2)
len1 == len2 && return dist(s1, s2), firstindex(s2):lastindex(s2)
out = max_dist + 1
len1 == 0 && return out, 1:0
out_idx = 0
for (i, x) in enumerate(qgrams(s2, len1))
curr = dist(s1, x)
out_idx = ifelse(curr < out, i, out_idx)
out = min(out, curr)
max_dist = max_dist === nothing ? out : min(out, max_dist)
dist = update_max_dist(dist, max_dist)
end

if out_idx == 0
# return more obvious invalid range if a match isn't found without exceeding `max_dist`
return out, 1:0
else
return out, _slice_inds(s2, out_idx, out_idx + len1 - 1)
end
end


"""
findall_partial(needle, haystack, dist; max_dist = StringDistances.get_max_dist(dist)) -> Vector{Tuple{T,UnitRange}}

Searches for occurrences of `needle` in `haystack` that differ by at most `max_dist` according to the distance measure `dist`. Only considers sequential segments of `haystack` of equal length to that of `needle`.

Returns a vector of tuples, each corresponding to a match found. The first entry gives the distance of the match, and the second entry gives the indices of `haystack` corresponding to the match. Matches may overlap. Requires `haystack` to be indexable (e.g. an `AbstractString`).

See also [`Partial`](@ref) and [`findnearest_partial`](@ref).
"""
findall_partial

findall_partial(s1, s2, dist::Partial; max_dist = get_max_dist(dist)) = findall_partial(s1, s2, dist.dist; max_dist = max_dist)

function findall_partial(s1, s2, dist; max_dist = get_max_dist(dist))
if max_dist === nothing
throw(ArgumentError("`dist` does not have a `max_dist` set and one was not passed to `findall_partial`."))
end

s1, s2 = enforce_shorter_first(s1, s2)
T = Distances.result_type(dist, s1, s2)

dist = update_max_dist(dist, max_dist)
len1, len2 = length(s1), length(s2)
matches = Tuple{T,UnitRange}[]
len1 == 0 && return matches

if len1 == len2
curr = dist(s1, s2)
if curr <= max_dist
push!(matches, (curr, firstindex(s2):lastindex(s2)))
end
return matches
end

for (i, x) in enumerate(qgrams(s2, len1))
curr = dist(s1, x)
if curr <= max_dist
inds = _slice_inds(s2, i, i + len1 - 1)
push!(matches, (curr, inds))
end
end
return matches
end
5 changes: 3 additions & 2 deletions src/modifiers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ Creates the `Partial{dist}` distance.

See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

See also [`findnearest_partial`](@ref) and [`findall_partial`](@ref).


### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
Expand Down Expand Up @@ -124,5 +127,3 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
score_12 = dist.dist(s1, s2)
min(score_01, score_02, score_12)
end


179 changes: 179 additions & 0 deletions test/find_partial.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
using StringDistances: get_max_dist, update_max_dist

@testset "`find*_partial`" begin
# Checks that `get_max_dist` reads back the bound a distance was built with, and that
# `update_max_dist` produces the same object as constructing the distance with the new bound.
@testset "`get_max_dist` and `update_max_dist`" begin
    d = 2
    @testset "$T" for T in (DamerauLevenshtein, Levenshtein, Hamming)
        dist = T(d)
        @test get_max_dist(dist) == d
        @test update_max_dist(dist, 2*d) == T(2*d)

        # modifiers constructed from `(dist, max_dist)`
        d1 = 0.5
        @testset "$modifier" for modifier in (StringDistances.Normalized, StringDistances.TokenMax)
            mod_dist = modifier(dist, d1)
            @test get_max_dist(mod_dist) == d1
            @test update_max_dist(mod_dist, 2*d1) == modifier(dist, 2*d1)
        end

        # modifiers constructed from a distance only; they report the wrapped bound
        @testset "$modifier" for modifier in (Partial, StringDistances.TokenSort, StringDistances.TokenSet)
            modified_dist = modifier(dist)
            @test get_max_dist(modified_dist) == d
            @test update_max_dist(modified_dist, 2*d) == modifier(T(2*d))
        end
    end
end


# Compares `findnearest_partial` and `findall_partial` against `Partial(DamerauLevenshtein(k))`
# and against hand-computed matches, for several bounds `k`.
@testset "`findnearest_partial` and `findall_partial` correctness with DamerauLevenshtein" begin
    ## Equal length cases

    # `d` replaced by `x`; 1 away
    str1 = "abcd"
    str2 = "abcx"
    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    @test d == Partial(DamerauLevenshtein(1))(str1, str2)
    @test matches == [(1, 1:4)] == [(d, inds)]

    # `cd` replaced by `xy`; 2 away
    str1 = "abcd"
    str2 = "abxy"

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    @test d == 2 # `max_dist + 1`
    @test isempty(matches)

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
    # compare against the same bound that was used for the search
    @test d == Partial(DamerauLevenshtein(2))(str1, str2)
    @test matches == [(2, 1:4)] == [(d, inds)]

    ## Nonequal length cases

    # `d` replaced by `x`; 1 away
    str1 = "abcdef"
    str2 = "1234abcxef1234"

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    @test d == Partial(DamerauLevenshtein(1))(str1, str2)
    @test matches == [(1, 5:10)] == [(d, inds)]

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
    @test d == Partial(DamerauLevenshtein(2))(str1, str2)
    @test matches == [(1, 5:10)] == [(d, inds)]

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
    @test d == Partial(DamerauLevenshtein(3))(str1, str2)
    @test d == 1
    @test inds == 5:10
    @test matches == [(3, 4:9), (1, 5:10), (3, 6:11)]

    # `cde` replaced by `xyz`; 3 away
    str1 = "abcdef"
    str2 = "1234abxyzf1234"

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(1)))
    @test d == Partial(DamerauLevenshtein(1))(str1, str2)
    @test d == 2 # max_dist + 1
    @test inds == 1:0
    @test isempty(matches)

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(2)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(2)))
    @test d == Partial(DamerauLevenshtein(2))(str1, str2)
    @test d == 3 # max_dist + 1
    @test inds == 1:0
    @test isempty(matches)

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
    @test d == Partial(DamerauLevenshtein(3))(str1, str2)
    @test matches == [(3, 5:10)] == [(d, inds)]

    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(4)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(4)))
    @test d == Partial(DamerauLevenshtein(4))(str1, str2)
    @test matches == [(3, 5:10)] == [(d, inds)]

    # In the first case, `cde` replaced by `xyz` (3 away); in the second, only `e` is replaced by `x` (one away)
    str1 = "abcdef"
    str2 = "1234abxyzf1234abcdxf123"
    for max_dist in (1, 2)
        d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(max_dist)))
        matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(max_dist)))
        @test d == Partial(DamerauLevenshtein(max_dist))(str1, str2)
        @test matches == [(1, 15:20)] == [(d, inds)]
    end
    # Now at 3, we find the other match.
    # We also match at "4abcdx": delete '4', substitute 'x' => 'e', and insert 'f' at the end
    # as well as "bcdxf1": insert 'a' at the start, substitute 'x' => 'e', and delete '1' at the end
    d, inds = findnearest_partial(str1, str2, Partial(DamerauLevenshtein(3)))
    matches = findall_partial(str1, str2, Partial(DamerauLevenshtein(3)))
    @test d == Partial(DamerauLevenshtein(3))(str1, str2)
    @test (d, inds) == (1, 15:20)
    @test matches == [(3, 5:10), (3, 14:19), (1, 15:20), (3, 16:21)]
end

# Runs both search functions with several distance types, with and without the `Partial`
# and `Normalized` wrappers, and checks that the results agree with each other.
@testset "`findnearest_partial` and `findall_partial`: test other distances" begin
    # `d` replaced by `x`; 1 away
    needle = "abcdef"
    haystack = "1234abcxef1234"

    # `Partial` unwrapping
    for d in (1,2,3), T in (DamerauLevenshtein, Levenshtein, Hamming), f in (findnearest_partial, findall_partial)
        plain = f(needle, haystack, T(d))
        wrapped = f(needle, haystack, Partial(T(d)))
        @test plain == wrapped
    end

    @testset "$dist" for dist in (DamerauLevenshtein(), Levenshtein(), Hamming(), Jaro(), JaroWinkler(), RatcliffObershelp())
        d, inds = findnearest_partial(needle, haystack, dist)
        @test inds == 5:10
        matches = findall_partial(needle, haystack, dist; max_dist = d)
        @test matches == [(d, inds)]

        d1, inds1 = findnearest_partial(needle, haystack, Partial(dist))
        @test d ≈ d1
        @test inds == inds1

        matches1 = findall_partial(needle, haystack, Partial(dist); max_dist = d)
        @test matches1 == [(d1, inds1)]

        d2, inds2 = findnearest_partial(needle, haystack, StringDistances.Normalized(dist, 1.0))
        # For the last three distances the normalized result is compared with `d` directly;
        # for the others it is compared after scaling by the needle length.
        unscaled = dist ∈ (Jaro(), JaroWinkler(), RatcliffObershelp())
        if unscaled
            @test d2 ≈ d
        else
            @test d2 * length(needle) ≈ d
        end

        bound = unscaled ? d : d/length(needle) + eps()
        matches2 = findall_partial(needle, haystack, StringDistances.Normalized(dist, 1.0); max_dist = bound)
        @test matches2 == [(d2, inds2)]
        matches3 = findall_partial(needle, haystack, StringDistances.Normalized(dist, bound))
        @test matches3 == matches2

        @test inds == inds2
    end
end

# The search functions also accept non-string collections: here a range is searched for
# inside a vector of integers.
@testset "Non-string tests" begin
    haystack = [6,4,1,3,2,6]
    needle = 1:3
    @test findnearest_partial(needle, haystack, DamerauLevenshtein()) == (1,3:5)

    # every length-3 window is at Hamming distance 2, so any of them may be the nearest
    hamming_matches = [(2, 2:4), (2, 3:5), (2, 4:6)]
    @test findnearest_partial(needle, haystack, Hamming()) ∈ hamming_matches
    @test findall_partial(needle, haystack, Hamming(), max_dist = 2) == hamming_matches
end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ using Test
include("distances.jl")
include("pairwise.jl")
include("modifiers.jl")
include("find_partial.jl")