Merge pull request #67 from ShouvikGhosh2048/adding_numerical_tests

Adding frequentist numerical tests + Integrate StatsAPI
xKDR · Dec 29, 2022 · c51b875 · c51b875
2 parents 39ff832 + 77db266
commit c51b875
Show file tree

Hide file tree

Showing 11 changed files with 127 additions and 14 deletions.
diff --git a/Project.toml b/Project.toml
@@ -12,6 +12,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NLSolversBase = "d41bc354-129a-5804-8e4c-c37616107c6c"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"

diff --git a/src/CRRao.jl b/src/CRRao.jl
@@ -3,6 +3,7 @@ module CRRao
 using DataFrames, GLM, Turing, StatsModels, StatsBase
 using StatsBase, Distributions, LinearAlgebra
 using Optim, NLSolversBase, Random
+import StatsBase: coef, coeftable, r2, adjr2, loglikelihood, aic, bic, predict, residuals, cooksdistance, fit
 
 """
 ```julia

diff --git a/src/fitmodel.jl b/src/fitmodel.jl
@@ -5,7 +5,7 @@ FrequentistRegression{RegressionType}
 
 Type to represent frequentist regression models returned by `fit` functions. This type is used internally by the package to represent all frequentist regression models. `RegressionType` is a `Symbol` representing the model class.
 """
-struct FrequentistRegression{RegressionType}
+struct FrequentistRegression{RegressionType} <: RegressionModel
     model
     formula::FormulaTerm
     link
@@ -27,7 +27,7 @@ BayesianRegression{RegressionType}
 
 Type to represent bayesian regression models returned by `fit` functions. This type is used internally by the package to represent all bayesian regression models. `RegressionType` is a `Symbol` representing the model class.
 """
-struct BayesianRegression{RegressionType}
+struct BayesianRegression{RegressionType} <: RegressionModel
     chain
     formula::FormulaTerm
     link

diff --git a/src/frequentist/getter.jl b/src/frequentist/getter.jl
@@ -247,7 +247,7 @@ function predict(container::FrequentistRegression{:LogisticRegression}, newdata:
         return Probit_Link.(z)
     elseif (container.link == GLM.CauchitLink)
         return Cauchit_Link.(z)
-    elseif (container.link == GLM.Cloglog)
+    elseif (container.link == GLM.CloglogLink)
         return Cloglog_Link.(z)
     end
 end

diff --git a/test/Project.toml b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"

diff --git a/test/numerical/frequentist/LinearRegression.jl b/test/numerical/frequentist/LinearRegression.jl
@@ -0,0 +1,16 @@
+mtcars = dataset("datasets", "mtcars")
+
+tests = [
+    (@formula(MPG ~ HP + WT + Gear), 157.052778719219475, 164.381458233218098),
+    (@formula(MPG ~ 0 + HP + WT + Gear), 186.905400588726366, 192.768344199925281),
+    (@formula(MPG ~ HP + HP^2 + WT + WT * HP), 145.922971789355415, 154.717387206153774),
+    (@formula(log(MPG) ~ log(HP) + log(WT)), -48.353276320549490, -42.490332709350582)
+]
+
+for (test_formula, test_aic, test_bic) in tests
+    crrao_model = fit(test_formula, mtcars, LinearRegression())
+    glm_model = lm(test_formula, mtcars)
+    compare_models(crrao_model, glm_model, mtcars)
+    @test isapprox(aic(crrao_model), test_aic)
+    @test isapprox(bic(crrao_model), test_bic)
+end
diff --git a/test/numerical/frequentist/LogisticRegression.jl b/test/numerical/frequentist/LogisticRegression.jl
@@ -0,0 +1,37 @@
+turnout = dataset("Zelig", "turnout")
+
+tests = [
+    (Logit(), LogitLink(), [
+        (@formula(Vote ~ Age + Race + Income + Educate), 2033.981263703107970, 2061.985776000818532),
+        (@formula(Vote ~ 0 + Age + Race + Income + Educate), 2033.981263703107970, 2061.985776000818532),
+        (@formula(Vote ~ Age + Age^2 + Race + Income * Educate), 2023.429129040224325, 2062.635446257018884),
+        (@formula(Vote ~ log(Age) + Educate), 2069.570124645007581, 2086.372832023633691)
+    ]),
+    (Probit(), ProbitLink(), [
+        (@formula(Vote ~ Age + Race + Income + Educate), 2034.196513858159960, 2062.201026155870295),
+        (@formula(Vote ~ 0 + Age + Race + Income + Educate), 2034.196513858159960, 2062.201026155870295),
+        (@formula(Vote ~ Age + Age^2 + Race + Income * Educate), 2023.024253890667069, 2062.230571107461856),
+        (@formula(Vote ~ log(Age) + Educate), 2068.082640703962625, 2084.885348082588735)
+    ]),
+    (Cloglog(), CloglogLink(), [
+        (@formula(Vote ~ Age + Race + Income + Educate), 2036.690120606579512, 2064.694632904289847),
+        (@formula(Vote ~ 0 + Age + Race + Income + Educate), 2036.690120606579512, 2064.694632904289847),
+        (@formula(Vote ~ log(Age) + Educate), 2067.047234348701750, 2083.849941727327860)
+    ]),
+    (Cauchit(), CauchitLink(), [
+        (@formula(Vote ~ Age + Race + Income + Educate), 2050.941937867712113, 2078.946450165422448),
+        (@formula(Vote ~ 0 + Age + Race + Income + Educate), 2050.941937867712113, 2078.946450165422448),
+        (@formula(Vote ~ Age + Age^2 + Race + Income * Educate), 2036.876992804688371, 2076.083310021483157),
+        (@formula(Vote ~ log(Age) + Educate), 2085.365660144347203, 2102.168367522973313)
+    ])
+]
+
+for (crrao_link, glm_link, formulae_and_values) in tests
+    for (test_formula, test_aic, test_bic) in formulae_and_values
+        crrao_model = fit(test_formula, turnout, LogisticRegression(), crrao_link)
+        glm_model = glm(test_formula, turnout, Binomial(), glm_link)
+        compare_models(crrao_model, glm_model, turnout)
+        @test isapprox(aic(crrao_model), test_aic)
+        @test isapprox(bic(crrao_model), test_bic)
+    end
+end
diff --git a/test/numerical/frequentist/NegBinomialRegression.jl b/test/numerical/frequentist/NegBinomialRegression.jl
@@ -0,0 +1,16 @@
+sanction = dataset("Zelig", "sanction")
+
+tests = [
+    (@formula(Num ~ Target + Coop + NCost), 344.8795500751855, 361.37651186201265),
+    (@formula(Num ~ 0 + Target + Coop + NCost), 344.87955007518536, 361.37651186201253),
+    (@formula(Num ~ Target + Target^2 + Coop + Coop * Target), 362.262357097934, 376.40261005807156),
+    (@formula(Num ~ log(Target) + log(Coop)), 367.1949690199781, 376.6218043267365)
+]
+
+for (test_formula, test_aic, test_bic) in tests
+    crrao_model = fit(test_formula, sanction, NegBinomRegression())
+    glm_model = negbin(test_formula, sanction, LogLink())
+    compare_models(crrao_model, glm_model, sanction)
+    @test isapprox(aic(crrao_model), test_aic)
+    @test isapprox(bic(crrao_model), test_bic)
+end
diff --git a/test/numerical/frequentist/PoissonRegression.jl b/test/numerical/frequentist/PoissonRegression.jl
@@ -0,0 +1,16 @@
+sanction = dataset("Zelig", "sanction")
+
+tests = [
+    (@formula(Num ~ Target + Coop + NCost), 580.673868966946884, 594.814121927084443),
+    (@formula(Num ~ 0 + Target + Coop + NCost), 580.673868966946770, 594.814121927084329),
+    (@formula(Num ~ Target + Target^2 + Coop + Coop * Target), 871.418230125844502, 883.201774259292506),
+    (@formula(Num ~ log(Target) + log(Coop)), 944.326386272138961, 951.396512752207741)
+]
+
+for (test_formula, test_aic, test_bic) in tests
+    crrao_model = fit(test_formula, sanction, PoissonRegression())
+    glm_model = glm(test_formula, sanction, Poisson(), LogLink())
+    compare_models(crrao_model, glm_model, sanction)
+    @test isapprox(aic(crrao_model), test_aic)
+    @test isapprox(bic(crrao_model), test_bic)
+end
diff --git a/test/numerical/frequentist/tests.jl b/test/numerical/frequentist/tests.jl
@@ -0,0 +1,20 @@
+function compare_models(crrao_model, glm_model, df)
+    @test isapprox(coeftable(crrao_model).cols, coeftable(glm_model).cols)
+    @test isapprox(predict(crrao_model, df), predict(glm_model, df))
+end
+
+@testset "Linear Regression" begin
+    include("LinearRegression.jl")
+end
+
+@testset "Logistic Regression" begin
+    include("LogisticRegression.jl")
+end
+
+@testset "Poisson Regression" begin
+    include("PoissonRegression.jl")
+end
+
+@testset "Negative Binomial Regression" begin
+    include("NegBinomialRegression.jl")
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,4 +1,4 @@
-using CRRao, Test, StableRNGs, Logging, RDatasets, StatsModels, Statistics
+using CRRao, Test, StableRNGs, Logging, RDatasets, StatsModels, GLM, Statistics
 
 Logging.disable_logging(Logging.Warn)
 
@@ -25,20 +25,25 @@ CRRao.set_rng(StableRNG(123))
     end
 
     @testset "Numerical Tests" begin
-        @testset "Linear Regression" begin
-            include("numerical/bayesian/LinearRegression.jl")
+        @testset "Frequentist" begin
+            include("numerical/frequentist/tests.jl")
         end
+        @testset "Bayesian" begin
+            @testset "Linear Regression" begin
+                include("numerical/bayesian/LinearRegression.jl")
+            end
 
-        @testset "Logistic Regression" begin
-            include("numerical/bayesian/LogisticRegression.jl")
-        end
+            @testset "Logistic Regression" begin
+                include("numerical/bayesian/LogisticRegression.jl")
+            end
 
-        @testset "Poisson Regression" begin
-            include("numerical/bayesian/PoissonRegression.jl")
-        end
+            @testset "Poisson Regression" begin
+                include("numerical/bayesian/PoissonRegression.jl")
+            end
 
-        @testset "Negative Binomial Regression" begin
-            include("numerical/bayesian/NegBinomialRegression.jl")
+            @testset "Negative Binomial Regression" begin
+                include("numerical/bayesian/NegBinomialRegression.jl")
+            end
         end
     end
 end