PDQ Functions via Gram Charlier, Edgeworth, and Cornish Fisher Approximations
– Steven E. Pav, shabbychef@gmail.com
This package may be installed from CRAN; the latest development version may be installed via drat, or built from github:
# install via CRAN:
install.packages("PDQutils")
# get latest dev release via drat:
if (require(drat)) {
:::add("shabbychef")
dratinstall.packages("PDQutils")
}
# get snapshot from github (may be buggy)
if (require(devtools)) {
install_github("shabbychef/PDQutils")
}
Approximating the distribution of a random variable via the Gram Charlier, Edgeworth, or Cornish Fisher expansions is most convenient when the random variable can be decomposed as the sum of a small number of independent random variables whose cumulants can be computed. For example, suppose \(Y = \sum_{1 \le i \le k} \sqrt{X_i / \nu_i}\) where the \(X_i\) are independent central chi-square random variables with degrees of freedom \(\nu_1,\nu_2,...,\nu_k\). I will call this a ‘snak’ distribution, for ‘sum of Nakagami’, since each summand follows a Nakagami distribution. We can easily write code that generates variates from this distribution given a vector of the degrees of freedom:
<- function(n, dfs) {
rsnak <- Reduce("+", lapply(dfs, function(k) {
samples sqrt(rchisq(n, df = k)/k)
})) }
Let’s take one hundred thousand draws from this distribution and see whether it is approximately normal, by performing a q-q plot against a normal distribution.
<- 1e+05
n.samp <- c(8, 15, 4000, 10000)
dfs set.seed(18181)
# now draw from the distribution
<- rsnak(n.samp, dfs)
rvs
<- data.frame(draws = rvs)
data <- mean(rvs)
mu <- sd(rvs)
sigma library(ggplot2)
<- ggplot(data, aes(sample = draws)) + stat_qq(distribution = function(p) {
ph qnorm(p, mean = mu, sd = sigma)
+ geom_abline(slope = 1, intercept = 0, colour = "red") +
}) theme(text = element_text(size = 8)) + labs(title = "Q-Q plot (against normality)")
print(ph)
While this is very nearly normal, we can get a better approximation. Using the additivity property of cumulants, we can compute the cumulants of \(Y\) easily if we have the cumulants of the \(X_i\). These in turn can be computed from the raw moments. See wikipedia for the raw moments of the Chi distribution. The following function computes the cumulants:
# for the moment2cumulant function:
library(PDQutils)
# compute the first ord.max raw cumulants of the
# sum of Nakagami variates
<- function(dfs, ord.max = 10) {
snak_cumulants # first compute the raw moments
<- lapply(dfs, function(k) {
moms <- 1:ord.max
ords <- 2^(ords/2) * exp(lgamma((k + ords)/2) -
moms lgamma(k/2))
# we are dividing the chi by sqrt the d.f.
<- moms/(k^(ords/2))
moms
moms
})# turn moments into cumulants
<- lapply(moms, moment2cumulant)
cumuls # sum the cumulants
<- Reduce("+", cumuls)
tot.cumul return(tot.cumul)
}
We can now implement the ‘dpq’ functions trivially using the Edgeworth and Cornish Fisher approximations, as follows:
library(PDQutils)
<- function(x, dfs, ord.max = 6, ...) {
dsnak <- snak_cumulants(dfs, ord.max)
raw.cumul <- dapx_edgeworth(x, raw.cumul, support = c(0,
retval Inf), ...)
return(retval)
}<- function(q, dfs, ord.max = 6, ...) {
psnak <- snak_cumulants(dfs, ord.max)
raw.cumul <- papx_edgeworth(q, raw.cumul, support = c(0,
retval Inf), ...)
return(retval)
}<- function(p, dfs, ord.max = 10, ...) {
qsnak <- snak_cumulants(dfs, ord.max)
raw.cumul <- qapx_cf(p, raw.cumul, support = c(0,
retval Inf), ...)
return(retval)
}
The density and distribution functions could also have been implemented via the Gram Charlier expansion, although there seems to be little justification for so doing, as the Edgeworth expansion is often a better approximation.
<- function(x, dfs, ord.max = 10, ...) {
dsnak_2 <- cumulant2moment(snak_cumulants(dfs,
raw.moment
ord.max))<- dapx_gca(x, raw.moment, support = c(0,
retval Inf), ...)
return(retval)
}<- function(q, dfs, ord.max = 10, ...) {
psnak_2 <- cumulant2moment(snak_cumulants(dfs,
raw.moment
ord.max))<- papx_gca(q, raw.moment, support = c(0,
retval Inf), ...)
return(retval)
}
The q-q plot looks better now:
<- data.frame(draws = rvs)
data library(ggplot2)
<- ggplot(data, aes(sample = draws)) + stat_qq(distribution = function(p) {
ph qsnak(p, dfs = dfs)
+ geom_abline(slope = 1, intercept = 0, colour = "red") +
}) theme(text = element_text(size = 8)) + labs(title = "Q-Q against qsnak (C-F appx.)")
print(ph)
Note that the q-q plot uses the approximate quantile function, qsnak. If we compute the approximate CDF of the random draws, we hope it will be nearly uniform, and indeed it is:
<- psnak(rvs, dfs = dfs)
apx.p if (require(ggplot2)) {
# qplot(apx.p, stat='ecdf', geom='step')
ggplot(data.frame(pv = apx.p), aes(x = pv)) + stat_ecdf(geom = "step")
}
Blinnikov and Moessner note that the Gram Charlier expansion will actually diverge for some distributions when more terms in the expansion are considered, behaviour which is not seen for the Edgeworth expansion. We will consider the case of a chi-square distribution with 5 degrees of freedom. The 2 and 6 term Gram Charlier expansions are shown, along with the true value of the PDF, replicating figure 1 of Blinnikov and Moessner:
# compute moments and cumulants:
<- 5
df <- 20
max.ord <- 0:(max.ord - 1)
subords <- df * (2^subords) * factorial(subords)
raw.cumulants <- cumulant2moment(raw.cumulants)
raw.moments
# compute the PDF of the rescaled variable:
<- seq(-sqrt(df/2) * 0.99, 6, length.out = 1001)
xvals <- sqrt(2 * df) * xvals + df
chivals <- sqrt(2 * df) * dchisq(chivals, df = df)
pdf.true
<- sqrt(2 * df) * dapx_gca(chivals, raw.moments = raw.moments[1:2],
pdf.gca2 support = c(0, Inf))
<- sqrt(2 * df) * dapx_gca(chivals, raw.moments = raw.moments[1:6],
pdf.gca6 support = c(0, Inf))
<- data.frame(x = xvals, true = pdf.true, gca2 = pdf.gca2,
all.pdf gca6 = pdf.gca6)
# plot it by reshaping and ggplot'ing
require(reshape2)
<- melt(all.pdf, id.vars = "x", variable.name = "pdf",
arr.data value.name = "density")
require(ggplot2)
<- ggplot(arr.data, aes(x = x, y = density, group = pdf,
ph colour = pdf)) + geom_line()
print(ph)
Compare this with the 2 and 4 term Edgeworth expansions, replicating figure 6 of Blinnikov and Moessner:
# compute the PDF of the rescaled variable:
<- seq(-sqrt(df/2) * 0.99, 6, length.out = 1001)
xvals <- sqrt(2 * df) * xvals + df
chivals <- sqrt(2 * df) * dchisq(chivals, df = df)
pdf.true
<- sqrt(2 * df) * dapx_edgeworth(chivals,
pdf.edgeworth2 raw.cumulants = raw.cumulants[1:4], support = c(0,
Inf))
<- sqrt(2 * df) * dapx_edgeworth(chivals,
pdf.edgeworth4 raw.cumulants = raw.cumulants[1:6], support = c(0,
Inf))
<- data.frame(x = xvals, true = pdf.true, edgeworth2 = pdf.edgeworth2,
all.pdf edgeworth4 = pdf.edgeworth4)
# plot it by reshaping and ggplot'ing
require(reshape2)
<- melt(all.pdf, id.vars = "x", variable.name = "pdf",
arr.data value.name = "density")
require(ggplot2)
<- ggplot(arr.data, aes(x = x, y = density, group = pdf,
ph colour = pdf)) + geom_line()
print(ph)
In one of a series of papers, Chernozhukov et. al. demonstrate the use of monotonic rearrangements in Edgeworth and Cornish-Fisher expansions of the CDF and quantile functions, which are, by definition, non-decreasing. It is shown that monotone rearrangement reduces the error of an initial approximation. This is easy enough to code with tools readily available in R. First, let us compute the 8 term Gram Charlier approximation to the CDF of the Chi-square with 5 degrees of freedom. This should display non-monotonicity. Then we compute the monotonic rearrangement:
<- 5
df <- 20
max.ord <- 0:(max.ord - 1)
subords <- df * (2^subords) * factorial(subords)
raw.cumulants <- cumulant2moment(raw.cumulants)
raw.moments
# compute the CDF of the rescaled variable:
<- seq(-sqrt(df/2) * 0.99, 6, length.out = 1001)
xvals <- sqrt(2 * df) * xvals + df
chivals <- pchisq(chivals, df = df)
cdf.true <- papx_gca(chivals, raw.moments = raw.moments[1:8],
cdf.gca8 support = c(0, Inf))
# it is this simple:
require(quantreg)
<- stepfun(xvals, c(0, cdf.gca8))
in.fn <- rearrange(in.fn)
out.fn <- out.fn(xvals)
cdf.rearranged
<- data.frame(x = xvals, true = cdf.true, gca8 = cdf.gca8,
all.cdf rearranged = cdf.rearranged)
# plot it by reshaping and ggplot'ing
require(reshape2)
<- melt(all.cdf, id.vars = "x", variable.name = "cdf",
arr.data value.name = "density")
require(ggplot2)
<- ggplot(arr.data, aes(x = x, y = density, group = cdf,
ph colour = cdf)) + geom_line()
print(ph)