WIP: Add cooksdistance(::GeneralizedLinearModel) #510

Draft · wants to merge 3 commits into base: master
3 changes: 3 additions & 0 deletions Project.toml
@@ -3,12 +3,15 @@ uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
version = "1.8.1"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1 change: 1 addition & 0 deletions src/GLM.jl
@@ -120,6 +120,7 @@ module GLM
include("glmfit.jl")
include("ftest.jl")
include("negbinfit.jl")
include("influence.jl")
include("deprecated.jl")

end # module
93 changes: 93 additions & 0 deletions src/influence.jl
@@ -0,0 +1,93 @@
#####
##### Measures of influence
#####

StatsBase.leverage(model::LinPredModel) = _leverage(model, model.pp)

# Leverage hᵢ is the iᵗʰ diagonal element of the hat matrix X(XᵀX)⁻¹Xᵀ. For X = Q*R
# with thin Q (n×r, r = rank), that matrix equals QQᵀ, so hᵢ = ‖Q[i, 1:r]‖².
function _leverage(_, pred::DensePredQR)
    Q = pred.qr.Q
    r = linpred_rank(pred)
    y = diagm(size(Q, 1), r, trues(r))  # n×r padded identity; Q * y is Q's first r columns
    Z = Q * y
    return vec(sum(abs2, Z; dims=2))
end

function _leverage(model, pred::DensePredChol{<:Any,<:Cholesky})
    X = weightedmodelmatrix(model)
    choldiv!(pred.chol, X)
    return vec(sum(abs2, X; dims=2))
end

function _leverage(model, pred::DensePredChol{<:Any,<:CholeskyPivoted})
    X = weightedmodelmatrix(model)
    C = pred.chol
    if any(x -> isapprox(x, zero(x)), diag(C.L))
        Q = qr!(X).Q
        r = rank(C)
        y = diagm(size(Q, 1), r, trues(r))
        X = Q * y
    else
        choldiv!(C, X)
    end
    return vec(sum(abs2, X; dims=2))
end

function _leverage(model, pred::SparsePredChol)
    X = weightedmodelmatrix(model)
    Z = pred.chol.L \ X' # can't be done in-place for SuiteSparse factorizations
    return vec(sum(abs2, Z; dims=1))
end

# Overwrite `X` with the solution `Z` to `L*Z = Xᵀ`, where `L` is the lower triangular
# factor from the Cholesky factorization of `XᵀX`.
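# Rationale: if XᵀX = LLᵀ, then X(XᵀX)⁻¹Xᵀ = (L⁻¹Xᵀ)ᵀ(L⁻¹Xᵀ) = ZᵀZ, so the leverage
# values hᵢ are the columnwise squared norms of Z.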
choldiv!(C::Cholesky, X) = ldiv!(C.L, X')
choldiv!(C::CholeskyPivoted, X) = ldiv!(C.L, view(X, :, invperm(C.p))')

# `X` for unweighted models, `√W*X` for weighted, where `W` is a diagonal matrix
# of the prior weights. A copy of `X` is made so that it can be mutated downstream
# without affecting the underlying model object.
function weightedmodelmatrix(model::LinearModel)
    X = copy(modelmatrix(model))
    priorwt = model.rr.wts
    if !isempty(priorwt)
        X .*= sqrt.(priorwt)
    end
    return X
end

# `√W*X`, where `W` is a diagonal matrix of the working weights from the final IRLS
# iteration. This handles GLMs with and without prior weights, as the prior weights
# simply become part of the working weights for IRLS. No explicit copy of `X` needs
# to be made since we're always doing a multiplication, unlike for the method above.
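# Equivalently: with A = √W*X, the GLM hat matrix W^(1/2)X(XᵀWX)⁻¹XᵀW^(1/2) is just
# A(AᵀA)⁻¹Aᵀ, so the weighted case reduces to the unweighted computation applied to A.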
weightedmodelmatrix(model::GeneralizedLinearModel) =
    modelmatrix(model) .* sqrt.(model.rr.wrkwt)

@noinline function _checkrankdeficient(model)
    if linpred_rank(model.pp) < size(modelmatrix(model), 2)
        throw(ArgumentError("models with collinear terms are not currently supported"))
    end
    return nothing
end

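# Cook's distance for a GLM, expressed via Pearson residuals r_{P,i} = (yᵢ - ŷᵢ)/√V(ŷᵢ):
#     Dᵢ = r_{P,i}² hᵢ / (φ̂ (k + 1) (1 - hᵢ)²)
# where V is the variance function, h the leverage, and φ̂ the estimated dispersion;
# this is what the broadcast below computes.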
function StatsBase.cooksdistance(model::GeneralizedLinearModel)
    _checkrankdeficient(model)
    y = response(model)
    ŷ = fitted(model)
    k = dof(model) - hasintercept(model)
    φ̂ = dispersion(model)
    h = leverage(model)
    D = model.rr.d
    return @. (y - ŷ)^2 / glmvar(D, ŷ) * (h / (1 - h)^2) / (φ̂ * (k + 1))
end

function StatsBase.cooksdistance(model::LinearModel)
    _checkrankdeficient(model)
    u = residuals(model)
    #if !isempty(model.rr.wts)
    #    u .*= sqrt.(model.rr.wts)
    #end
    mse = dispersion(model, true)
    k = dof(model) - 1
    h = leverage(model)
    return @. u^2 * (h / (1 - h)^2) / (k * mse)
end
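
For reference, a minimal usage sketch of the API this file adds (illustrative data; assumes `leverage` and `cooksdistance` are accessible after `using GLM`, as the tests below call them unqualified):

using GLM, DataFrames

df = DataFrame(x = 1:10, y = [0, 0, 1, 0, 1, 1, 0, 1, 1, 1])
m = glm(@formula(y ~ x), df, Binomial(), LogitLink())
h = leverage(m)        # diagonal of √W*X(XᵀWX)⁻¹Xᵀ*√W from the final IRLS iteration
d = cooksdistance(m)   # per-observation influence estimates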
26 changes: 0 additions & 26 deletions src/lm.jl
@@ -325,29 +325,3 @@ function confint(obj::LinearModel; level::Real=0.95)
    hcat(coef(obj),coef(obj)) + stderror(obj) *
    quantile(TDist(dof_residual(obj)), (1. - level)/2.) * [1. -1.]
end

"""
cooksdistance(obj::LinearModel)

Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance)
for each observation in linear model `obj`, giving an estimate of the influence
of each data point.
Currently only implemented for linear models without weights.
"""
function StatsBase.cooksdistance(obj::LinearModel)
    u = residuals(obj)
    mse = dispersion(obj,true)
    k = dof(obj)-1
    d_res = dof_residual(obj)
    X = modelmatrix(obj)
    XtX = crossmodelmatrix(obj)
    k == size(X,2) || throw(ArgumentError("Models with collinear terms are not currently supported."))
    wts = obj.rr.wts
    if isempty(wts)
        hii = diag(X * inv(XtX) * X')
    else
        throw(ArgumentError("Weighted models are not currently supported."))
    end
    D = @. u^2 * (hii / (1 - hii)^2) / (k*mse)
    return D
end
29 changes: 28 additions & 1 deletion test/runtests.jl
@@ -41,6 +41,12 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y
@test isapprox(aic(lm1), -36.409684288095946)
@test isapprox(aicc(lm1), -24.409684288095946)
@test isapprox(bic(lm1), -37.03440588041178)
# Comparator values for `leverage` and `cooksdistance` are from R's `hatvalues` and
# `cooks.distance`, respectively
@test leverage(lm1) ≈ [0.5918367347, 0.2816326531, 0.1673469388, 0.1836734694,
0.2489795918, 0.5265306122] atol=1e-10
@test cooksdistance(lm1) ≈ [1.0705381016, 0.0038594655, 0.0123926845, 0.0940006684,
0.1666111600, 2.1649894169] atol=1e-10
lm2 = fit(LinearModel, hcat(ones(6), 10form.Carb), form.OptDen, true)
@test isa(lm2.pp.chol, CholeskyPivoted)
@test lm2.pp.chol.piv == [2, 1]
Expand Down Expand Up @@ -100,7 +106,13 @@ end
@test isapprox(loglikelihood(lm_model), -4353.946729075838)
@test isapprox(loglikelihood(glm_model), -4353.946729075838)
@test isapprox(nullloglikelihood(lm_model), -4984.892139711452)
@test isapprox(mean(residuals(lm_model)), -5.412966629787718)
@test leverage(lm_model) ≈ leverage(glm_model)
# Values computed from R
@test leverage(lm_model)[1:10] ≈ [0.0031577628, 0.0050424090, 0.0044471988, 0.0084538844,
0.0088088189, 0.0014421478, 0.0031762479, 0.0042745349,
0.0073786323, 0.0126819467] atol=1e-8
@test_broken cooksdistance(lm_model) ≈ cooksdistance(glm_model)
end

@testset "rankdeficient" begin
Expand Down Expand Up @@ -154,6 +166,7 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]

model = lm(@formula(y ~ 0 + x), df)
ct = coeftable(model)
@@ -165,6 +178,8 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]
@test all(isnan, cooksdistance(model))

model = glm(@formula(y ~ x), df, Normal(), IdentityLink())
ct = coeftable(model)
@@ -176,6 +191,7 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]

model = glm(@formula(y ~ 0 + x), df, Normal(), IdentityLink())
ct = coeftable(model)
@@ -187,6 +203,8 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]
@test all(isnan, cooksdistance(model))

# Saturated and rank-deficient model
df = DataFrame(x1=["a", "b", "c"], x2=["a", "b", "c"], y=[1, 2, 3])
@@ -203,6 +221,15 @@ end
Inf 0.0 1.0 -Inf Inf
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN])
if model isa LinearModel
# This is a very loose tolerance; the difference may just come down to how models
# are fit in this package versus in R (Cholesky vs. QR factorization), but the
# exact cause is unclear.
@test leverage(model) ≈ [1, 1, 1] atol=1e-4
else
# Currently these come out to roughly [1.05, 1.49, 1.11]
@test_broken leverage(model) ≈ [1, 1, 1] atol=1e-4
end
end
end
