WIP: Add cooksdistance(::GeneralizedLinearModel) #510

Draft · wants to merge 3 commits into base: master
3 changes: 3 additions & 0 deletions Project.toml
@@ -3,12 +3,15 @@ uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
version = "1.8.1"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1 change: 1 addition & 0 deletions src/GLM.jl
@@ -120,6 +120,7 @@ module GLM
include("glmfit.jl")
include("ftest.jl")
include("negbinfit.jl")
include("influence.jl")
include("deprecated.jl")

end # module
93 changes: 93 additions & 0 deletions src/influence.jl
@@ -0,0 +1,93 @@
#####
##### Measures of influence
#####

StatsBase.leverage(model::LinPredModel) = _leverage(model, model.pp)

# Leverage hᵢ is the iᵗʰ diagonal element of the hat matrix X(XᵀX)⁻¹Xᵀ. For X = Q*R
# with thin Q (n×r, r = rank), that matrix equals QQᵀ, so hᵢ = ‖Q[i, 1:r]‖².
function _leverage(_, pred::DensePredQR)
    Q = pred.qr.Q
    r = linpred_rank(pred)
    y = diagm(size(Q, 1), r, trues(r))  # n×r padded identity; Q * y is Q's first r columns
    Z = Q * y
    return vec(sum(abs2, Z; dims=2))
end

function _leverage(model, pred::DensePredChol{<:Any,<:Cholesky})
    X = weightedmodelmatrix(model)
    choldiv!(pred.chol, X)
    return vec(sum(abs2, X; dims=2))
end

function _leverage(model, pred::DensePredChol{<:Any,<:CholeskyPivoted})
    X = weightedmodelmatrix(model)
    C = pred.chol
    if any(x -> isapprox(x, zero(x)), diag(C.L))
        Q = qr!(X).Q
        r = rank(C)
        y = diagm(size(Q, 1), r, trues(r))
        X = Q * y
    else
        choldiv!(C, X)
    end
    return vec(sum(abs2, X; dims=2))
end

function _leverage(model, pred::SparsePredChol)
    X = weightedmodelmatrix(model)
    Z = pred.chol.L \ X' # can't be done in-place for SuiteSparse factorizations
    return vec(sum(abs2, Z; dims=1))
end

# Overwrite `X` with the solution `Z` to `L*Z = Xᵀ`, where `L` is the lower triangular
# factor from the Cholesky factorization of `XᵀX`.
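# Rationale: if XᵀX = LLᵀ, then X(XᵀX)⁻¹Xᵀ = (L⁻¹Xᵀ)ᵀ(L⁻¹Xᵀ) = ZᵀZ, so the leverage
# values hᵢ are the columnwise squared norms of Z.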
choldiv!(C::Cholesky, X) = ldiv!(C.L, X')
choldiv!(C::CholeskyPivoted, X) = ldiv!(C.L, view(X, :, invperm(C.p))')

# `X` for unweighted models, `√W*X` for weighted, where `W` is a diagonal matrix
# of the prior weights. A copy of `X` is made so that it can be mutated downstream
# without affecting the underlying model object.
function weightedmodelmatrix(model::LinearModel)
    X = copy(modelmatrix(model))
    priorwt = model.rr.wts
    if !isempty(priorwt)
        X .*= sqrt.(priorwt)
    end
    return X
end

# `√W*X`, where `W` is a diagonal matrix of the working weights from the final IRLS
# iteration. This handles GLMs with and without prior weights, as the prior weights
# simply become part of the working weights for IRLS. No explicit copy of `X` needs
# to be made since we're always doing a multiplication, unlike for the method above.
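# Equivalently: with A = √W*X, the GLM hat matrix W^(1/2)X(XᵀWX)⁻¹XᵀW^(1/2) is just
# A(AᵀA)⁻¹Aᵀ, so the weighted case reduces to the unweighted computation applied to A.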
weightedmodelmatrix(model::GeneralizedLinearModel) =
    modelmatrix(model) .* sqrt.(model.rr.wrkwt)

@noinline function _checkrankdeficient(model)
    if linpred_rank(model.pp) < size(modelmatrix(model), 2)
        throw(ArgumentError("models with collinear terms are not currently supported"))
    end
    return nothing
end

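# Cook's distance for a GLM, expressed via Pearson residuals r_{P,i} = (yᵢ - ŷᵢ)/√V(ŷᵢ):
#     Dᵢ = r_{P,i}² hᵢ / (φ̂ (k + 1) (1 - hᵢ)²)
# where V is the variance function, h the leverage, and φ̂ the estimated dispersion;
# this is what the broadcast below computes.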
function StatsBase.cooksdistance(model::GeneralizedLinearModel)
    _checkrankdeficient(model)
    y = response(model)
    ŷ = fitted(model)
    k = dof(model) - hasintercept(model)
    φ̂ = dispersion(model)
    h = leverage(model)
    D = model.rr.d
    return @. (y - ŷ)^2 / glmvar(D, ŷ) * (h / (1 - h)^2) / (φ̂ * (k + 1))
end

function StatsBase.cooksdistance(model::LinearModel)
    _checkrankdeficient(model)
    u = residuals(model)
    #if !isempty(model.rr.wts)
    #    u .*= sqrt.(model.rr.wts)
    #end
    mse = dispersion(model, true)
    k = dof(model) - 1
    h = leverage(model)
    return @. u^2 * (h / (1 - h)^2) / (k * mse)
end
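
For reference, a minimal usage sketch of the API this file adds (illustrative data; assumes `leverage` and `cooksdistance` are accessible after `using GLM`, as the tests below call them unqualified):

using GLM, DataFrames

df = DataFrame(x = 1:10, y = [0, 0, 1, 0, 1, 1, 0, 1, 1, 1])
m = glm(@formula(y ~ x), df, Binomial(), LogitLink())
h = leverage(m)        # diagonal of √W*X(XᵀWX)⁻¹Xᵀ*√W from the final IRLS iteration
d = cooksdistance(m)   # per-observation influence estimates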
26 changes: 0 additions & 26 deletions src/lm.jl
@@ -325,29 +325,3 @@ function confint(obj::LinearModel; level::Real=0.95)
    hcat(coef(obj),coef(obj)) + stderror(obj) *
    quantile(TDist(dof_residual(obj)), (1. - level)/2.) * [1. -1.]
end

"""
cooksdistance(obj::LinearModel)

Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance)
for each observation in linear model `obj`, giving an estimate of the influence
of each data point.
Currently only implemented for linear models without weights.
"""
function StatsBase.cooksdistance(obj::LinearModel)
    u = residuals(obj)
    mse = dispersion(obj,true)
    k = dof(obj)-1
    d_res = dof_residual(obj)
    X = modelmatrix(obj)
    XtX = crossmodelmatrix(obj)
    k == size(X,2) || throw(ArgumentError("Models with collinear terms are not currently supported."))
    wts = obj.rr.wts
    if isempty(wts)
        hii = diag(X * inv(XtX) * X')
    else
        throw(ArgumentError("Weighted models are not currently supported."))
    end
    D = @. u^2 * (hii / (1 - hii)^2) / (k*mse)
    return D
end
29 changes: 28 additions & 1 deletion test/runtests.jl
@@ -41,6 +41,12 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y
@test isapprox(aic(lm1), -36.409684288095946)
@test isapprox(aicc(lm1), -24.409684288095946)
@test isapprox(bic(lm1), -37.03440588041178)
# Comparator values for `leverage` and `cooksdistance` are from R's `hatvalues` and
# `cooks.distance`, respectively
@test leverage(lm1) ≈ [0.5918367347, 0.2816326531, 0.1673469388, 0.1836734694,
0.2489795918, 0.5265306122] atol=1e-10
@test cooksdistance(lm1) ≈ [1.0705381016, 0.0038594655, 0.0123926845, 0.0940006684,
0.1666111600, 2.1649894169] atol=1e-10
lm2 = fit(LinearModel, hcat(ones(6), 10form.Carb), form.OptDen, true)
@test isa(lm2.pp.chol, CholeskyPivoted)
@test lm2.pp.chol.piv == [2, 1]
Expand Down Expand Up @@ -100,7 +106,13 @@ end
@test isapprox(loglikelihood(lm_model), -4353.946729075838)
@test isapprox(loglikelihood(glm_model), -4353.946729075838)
@test isapprox(nullloglikelihood(lm_model), -4984.892139711452)
@test isapprox(mean(residuals(lm_model)), -5.412966629787718)
@test leverage(lm_model) ≈ leverage(glm_model)
# Values computed from R
@test leverage(lm_model)[1:10] ≈ [0.0031577628, 0.0050424090, 0.0044471988, 0.0084538844,
0.0088088189, 0.0014421478, 0.0031762479, 0.0042745349,
0.0073786323, 0.0126819467] atol=1e-8
@test_broken cooksdistance(lm_model) ≈ cooksdistance(glm_model)
end

@testset "rankdeficient" begin
Expand Down Expand Up @@ -154,6 +166,7 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]

model = lm(@formula(y ~ 0 + x), df)
ct = coeftable(model)
@@ -165,6 +178,8 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]
@test all(isnan, cooksdistance(model))

model = glm(@formula(y ~ x), df, Normal(), IdentityLink())
ct = coeftable(model)
@@ -176,6 +191,7 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]

model = glm(@formula(y ~ 0 + x), df, Normal(), IdentityLink())
ct = coeftable(model)
@@ -187,6 +203,8 @@ end
[Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf
Inf 0.0 1.0 -Inf Inf])
@test leverage(model) ≈ [1, 1, 1]
@test all(isnan, cooksdistance(model))

# Saturated and rank-deficient model
df = DataFrame(x1=["a", "b", "c"], x2=["a", "b", "c"], y=[1, 2, 3])
@@ -203,6 +221,15 @@ end
Inf 0.0 1.0 -Inf Inf
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN])
if model isa LinearModel
# This is a very loose tolerance; the difference may just come down to how models
# are fit in this package versus in R (Cholesky vs. QR factorization), but the
# exact cause is unclear.
@test leverage(model) ≈ [1, 1, 1] atol=1e-4
else
# Currently these come out to roughly [1.05, 1.49, 1.11]
@test_broken leverage(model) ≈ [1, 1, 1] atol=1e-4
end
end
end
