JuliaStats · mapi1 · Jul 17, 2023 · Jul 17, 2023 · Aug 1, 2023 · nalimilan
diff --git a/src/correlation.jl b/src/correlation.jl
@@ -2,8 +2,11 @@
 # TODO: Implement `CorrelationTest` for two arguments
 
 using Statistics: clampcor
+using StatsBase: corspearman
 
-export CorrelationTest
+export CorrelationTest, SpearmanCorrelationTest
+
+abstract type AbstractCorrelationTest <: HypothesisTest end
 
 """
     CorrelationTest(x, y)
@@ -17,6 +20,7 @@ Perform a t-test for the hypothesis that ``\\text{Cor}(x,y|Z=z) = 0``, i.e. the
 correlation of vectors `x` and `y` given the matrix `Z` is zero.
 
 Implements `pvalue` for the t-test.
+
 Implements `confint` using an approximate confidence interval based on Fisher's
 ``z``-transform.
 
@@ -29,7 +33,7 @@ See also `partialcor` from StatsBase.
 * [Section testing using Student's t-distribution from Pearson correlation coefficient on Wikipedia](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Testing_using_Student's_t-distribution)
 * [K.J. Levy and S.C. Narula (1978): Testing Hypotheses concerning Partial Correlations: Some Methods and Discussion. International Statistical Review 46(2).](https://www.jstor.org/stable/1402814)
 """
-struct CorrelationTest{T<:Real} <: HypothesisTest
+struct CorrelationTest{T<:Real} <: AbstractCorrelationTest
     r::T
     n::Int
     k::Int
@@ -60,15 +64,15 @@ struct CorrelationTest{T<:Real} <: HypothesisTest
 end
 
 testname(p::CorrelationTest) =
-    string("Test for nonzero ", p.k != 0 ? "partial " : "", "correlation")
+    string("Test for nonzero ", p.k != 0 ? "partial " : "", "Pearson correlation")
 
 function population_param_of_interest(p::CorrelationTest)
-    param = p.k != 0 ? "Partial correlation" : "Correlation"
+    param = p.k != 0 ? "Partial Pearson correlation" : "Pearson correlation"
-    param = p.k != 0 ? "Partial Pearson correlation" : "Pearson correlation"
+    param = p.k != 0 ? "Partial correlation" : "Pearson correlation"
-    param = p.k != 0 ? "Partial Pearson correlation" : "Pearson correlation"
+    param = p.k != 0 ? "Partial correlation" : "Pearson correlation"
     (param, zero(p.r), p.r)
 end
 
-StatsAPI.nobs(p::CorrelationTest) = p.n
-StatsAPI.dof(p::CorrelationTest) = p.n - 2 - p.k
+StatsAPI.nobs(p::AbstractCorrelationTest) = p.n
+StatsAPI.dof(p::AbstractCorrelationTest) = p.n - 2 - p.k
 
 function StatsAPI.confint(test::CorrelationTest{T}, level::Float64=0.95) where T
     dof(test) > 1 || return (-one(T), one(T))  # Otherwise we can get NaNs
@@ -80,12 +84,65 @@ function StatsAPI.confint(test::CorrelationTest{T}, level::Float64=0.95) where T
     return (elo, ehi)
 end
 
-default_tail(::CorrelationTest) = :both
-StatsAPI.pvalue(test::CorrelationTest; tail=:both) = pvalue(TDist(dof(test)), test.t, tail=tail)
+default_tail(::AbstractCorrelationTest) = :both
+StatsAPI.pvalue(test::AbstractCorrelationTest; tail=:both) = pvalue(TDist(dof(test)), test.t, tail=tail)
 
-function show_params(io::IO, test::CorrelationTest, indent="")
+function show_params(io::IO, test::AbstractCorrelationTest, indent="")
     println(io, indent, "number of observations:          ", nobs(test))
     println(io, indent, "number of conditional variables: ", test.k)
     println(io, indent, "t-statistic:                     ", test.t)
     println(io, indent, "degrees of freedom:              ", dof(test))
 end
+
+"""
+    SpearmanCorrelationTest(x, y)
+
+Perform a t-test for the hypothesis that ``\\text{Cor}(x,y) = 0``, i.e. the rank-based Spearman correlation 
+of vectors `x` and `y` is zero.
+
+Implements `pvalue` for the t-test.
-Implements `pvalue` for the t-test.
+Implements `pvalue` for the t-test using the Fisher transformation.
-Implements `pvalue` for the t-test.
+Implements `pvalue` for the t-test using the Fisher transformation.
+
+Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation and which performs insufficient in the case of:
+
+* small sample sizes n < 25
-Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation and which performs insufficient in the case of:
-
-* small sample sizes n < 25
+Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation, which performs insufficiently in the case of:
+* sample sizes below 25
-Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation and which performs insufficient in the case of:
-
-* small sample sizes n < 25
+Implements `confint` using an approximate confidence interval adjusting for the non-normality of the ranks based on [1]. This is still an approximation, which performs insufficiently in the case of:
+* sample sizes below 25
+* a high true population Spearman correlation
-* a high true population Spearman correlation
+* a true population Spearman correlation above 0.95
-* a high true population Spearman correlation
+* a true population Spearman correlation above 0.95
+
+In these cases a bootstrap confidence interval can perform better [2].
+
+# External resources
+[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating pearson, kendall and spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183.
-[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating pearson, kendall and spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183.
+[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating Pearson, Kendall and Spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183.
-[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating pearson, kendall and spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183.
+[1] D. G. Bonett and T. A. Wright, “Sample size requirements for estimating Pearson, Kendall and Spearman correlations,” Psychometrika, vol. 65, no. 1, pp. 23–28, Mar. 2000, doi: 10.1007/BF02294183.
+
+[2] A. J. Bishara and J. B. Hittner, “Confidence intervals for correlations when data are not normal,” Behav Res, vol. 49, no. 1, pp. 294–309, Feb. 2017, doi: 10.3758/s13428-016-0702-8.
+
-
-
+"""
+struct SpearmanCorrelationTest{T<:Real} <: AbstractCorrelationTest
+    r::T
+    n::Int
+    k::Int
+    t::T
+
+    # Error checking is done in `corspearman`
+    function SpearmanCorrelationTest(x::AbstractVector, y::AbstractVector)
+        r = corspearman(x, y)
+        n = length(x)
+        t = r * sqrt((n - 2) / (1 - r^2))
+        return new{typeof(r)}(r, n, 0, t)
+    end
+end
+
+testname(p::SpearmanCorrelationTest) =  "Spearman correlation"
-testname(p::SpearmanCorrelationTest) =  "Spearman correlation"
+testname(p::SpearmanCorrelationTest) = "Spearman correlation"
-testname(p::SpearmanCorrelationTest) =  "Spearman correlation"
+testname(p::SpearmanCorrelationTest) = "Spearman correlation"
+
+function population_param_of_interest(p::SpearmanCorrelationTest)
+    param = "Spearman correlation"
+    (param, zero(p.r), p.r)
+end
+
+function StatsAPI.confint(test::SpearmanCorrelationTest{T}, level::Float64=0.95) where T
+    dof(test) > 1 || return (-one(T), one(T))  # Otherwise we can get NaNs
+    q = quantile(Normal(), 1 - (1-level) / 2)
+    fisher = atanh(test.r)
+    bound = sqrt((1 + test.r^2 / 2) / (dof(test)-1)) * q # Estimates variance as in Bonnet et al. (2000)
-    bound = sqrt((1 + test.r^2 / 2) / (dof(test)-1)) * q # Estimates variance as in Bonnet et al. (2000)
+    # Estimates variance as in Bonett and Wright (2000)
+    bound = sqrt((1 + test.r^2 / 2) / (dof(test)-1)) * q
-    bound = sqrt((1 + test.r^2 / 2) / (dof(test)-1)) * q # Estimates variance as in Bonnet et al. (2000)
+    # Estimates variance as in Bonett and Wright (2000)
+    bound = sqrt((1 + test.r^2 / 2) / (dof(test)-1)) * q
+    elo = clampcor(tanh(fisher - bound))
+    ehi = clampcor(tanh(fisher + bound))
+    return (elo, ehi)
+end
diff --git a/test/correlation.jl b/test/correlation.jl
@@ -26,6 +26,30 @@ using DelimitedFiles
     @test pvalue(x) ≈ 0.2948405 atol=1e-6
 end
 
+@testset "SpearmanCorrelation" begin
+    # Columns are line number, calcium, iron
+    nutrient = readdlm(joinpath(@__DIR__, "data", "nutrient.txt"))[:, 1:3]
+    w = SpearmanCorrelationTest(nutrient[:,2], nutrient[:,3])
+    let out = sprint(show, w)
+        @test occursin("reject h_0", out) && !occursin("fail to", out)
+    end
+    # let ci = confint(w)
+    #     @test first(ci) ≈ 0.3327138 atol=1e-6
+    #     @test last(ci) ≈ 0.4546639 atol=1e-6
+    # end
+    @test nobs(w) == 737
+    @test dof(w) == 735
+    @test pvalue(w) < 1e-25
+
+    x = SpearmanCorrelationTest(nutrient[:,1], nutrient[:,2])
+    @test occursin("fail to reject", sprint(show, x))
+    # let ci = confint(x)
+    #     @test first(ci) ≈ -0.1105478 atol=1e-6
+    #     @test last(ci) ≈ 0.0336730 atol=1e-6
+    # end
+    @test pvalue(x) ≈ 0.09275 atol=1e-2 # value from R's cor.test(..., method="spearman") which does not use a t test algorithm AS 89 
+end
+
 @testset "Partial correlation" begin
     # Columns are information, similarities, arithmetic, picture completion
     wechsler = readdlm(joinpath(@__DIR__, "data", "wechsler.txt"))[:,2:end]