Load libraries and data as before
library("ggplot2")
library("dplyr")
library("tidyr")
library("knitcitations")
library("rgpdd")
library("FKF")
Parallel version of dplyr::do
do_parallel <- function(df, f, ...){
  # Apply `f` to each group of a grouped data frame in parallel and
  # row-bind the results, restoring the grouping column from the list
  # names. Supports only a single grouping variable for now.
  #
  # Args:
  #   df:  a grouped tbl (exactly one grouping variable).
  #   f:   a function taking one group's data frame, returning a data frame.
  #   ...: further arguments forwarded to `f`.
  #
  # Returns: a tbl with one row set per group; the grouping column is
  # character (it is recovered from list names), matching the behaviour
  # of the previous reshape2::melt-based implementation. Note the group
  # column now appears first rather than last.
  grp <- as.character(groups(df)[[1]])
  ids <- unique(df[[grp]])
  names(ids) <- as.character(ids)
  ## split the grouped data frame into one data frame per group id
  list_data <- lapply(ids, function(id) df[df[[grp]] == id, , drop = FALSE])
  ## Actually do the fitting in parallel; mc.cores is passed explicitly
  ## rather than mutating the global options() state.
  list_out <- parallel::mclapply(list_data, f, ...,
                                 mc.cores = parallel::detectCores())
  ## row-bind the outputs, recovering the group id from the list names
  dplyr::bind_rows(list_out, .id = grp) %>%
    as_data_frame()
}
Prepare data
Prepare data, as before: we filter on the stated criteria
# Keep only count-type series sampled annually with at least 15 points,
# and record their MainIDs in sorted order.
filtered <- gpdd_main %>%
  filter(SamplingProtocol == "Count",
         SourceDimension %in% c("Count", "Index"),
         SamplingFrequency == "1",
         DatasetLength >= 15) %>%
  select(MainID) %>%
  arrange(MainID)
and select data matching this filter. We add a column for the log of the population size and group by data ID:
# Pull the observations for the filtered series, keep the columns we
# need, group by series id, and add the log population size.
df <- gpdd_data %>%
  filter(MainID %in% filtered$MainID) %>%
  select(MainID, Population, SampleYear) %>%
  group_by(MainID) %>%
  mutate(logN = log(Population))
Lastly, we replace -Inf
(introduced from log(0)
terms) with one less than the smallest finite value observed. (This choice is arbitrary; the authors do not specify how these values were handled.)
# Replace -Inf (from log(0)) with one less than the smallest finite value.
# Guarded: with no -Inf entries the original unconditional statement
# evaluated min() on an empty vector (warning, value Inf).
i <- which(df$logN == -Inf)
if (length(i) > 0) {
  df$logN[i] <- min(df$logN[-i]) - 1
}
We may test on a subset of the data first:
# Optional (not run by default): restrict to a random sample of ten
# series to test the pipeline quickly.
keep <- sample(unique(df$MainID), 10)
df <- filter(df, MainID %in% keep)
Import function definitions
We import our previous model definitions:
# Fetch the shared model-definition script (fitting functions used below,
# e.g. robust_fit / kalman) from the rgpdd repository, source it into the
# session, then delete the temporary local copy.
downloader::download("https://github.com/ropensci/rgpdd/raw/master/inst/scripts/knape-de-valpine.R", "knape-de-valpine.R")
source("knape-de-valpine.R")
unlink("knape-de-valpine.R")
Simulating
The FKF package doesn’t define a simulation method, so we simply define one directly from the state equations. Though a C implementation would be preferable, fitting will always dominate the runtime. (We also ignore the multivariate formulation for simplicity here.)
# Return `x`, falling back to `default` when `x` is NULL.
use <- function(x, default) {
  if (is.null(x)) default else x
}
sim_fkf <- function(fit){
  # Simulate one series of length n from the (univariate) state-space
  # model described by a fit object: state a[t+1] = Tt*a[t] + eta[t],
  # observation y[t] = Zt*a[t] + epsilon[t].
  #
  # Args:
  #   fit: list with elements n, dt, HHt, a0 and optionally Tt (default 1,
  #        i.e. a random walk) and GGt (default 0, i.e. no observation
  #        error).
  #
  # Returns: numeric vector y of length n.
  n   <- fit[["n"]]
  dt  <- fit[["dt"]]
  HHt <- fit[["HHt"]]
  Tt  <- use(fit[["Tt"]], 1)
  GGt <- use(fit[["GGt"]], 0)
  a0  <- fit[["a0"]]
  ct  <- 0  # observation intercept
  Zt  <- 1  # observation loading
  a <- numeric(n)
  # Draw all noise up front (same RNG order as before): process noise
  # eta, observation noise epsilon.
  eta     <- rnorm(n, dt, sqrt(HHt))
  epsilon <- rnorm(n, ct, sqrt(GGt))
  a[1] <- a0
  # seq_len() rather than 1:(n - 1): with n == 1 the old loop iterated
  # over c(1, 0) and errored on the zero index.
  for (t in seq_len(n - 1)) {
    a[t + 1] <- Tt * a[t] + eta[t]
  }
  # Observation equation, vectorized over all time steps (identical to
  # the previous per-step computation plus the trailing y[n] line).
  Zt * a + epsilon
}
The study also creates simulated datasets based on the real data but explicitly making the assumption of either density independence (DI) or density dependence (DD). For each dataset, a density-independent simulated dataset is created by simulating under the SSRW model that was fit. The density-dependent model is created by explicitly fixing the density dependent parameter (\(c\) in the language of the paper, Tt
in FKF notation) to 0.8 and estimating the other parameters of this modified SSG model. We can define this model analogously to the others, only this time fixing Tt = 0.8
:
# Fit the density-dependent (DD) variant of the Gompertz state-space model
# by maximum likelihood, with the density-dependence parameter fixed at
# Tt = 0.8 (following the paper's construction of DD simulated datasets).
#
# Args:
#   y:    numeric vector, the log-population time series.
#   init: starting values for the optimizer; note HHt and GGt are
#         optimized on the log scale (exp() is applied inside fn and the
#         results are back-transformed below).
#   ...:  further arguments forwarded to optim() (e.g. method).
#
# Returns: the optim() result with back-transformed HHt/GGt, plus a0
# (initial state, taken as the first observation) and n (series length).
fit_dd <- function(y,
init = c(dt = mean(y), HHt = log(var(y)/2), GGt = log(var(y)/2)),
...){
# minimize the negative log-likelihood from the FKF Kalman filter
o <- optim(init,
fn = function(par, ...)
-fkf(dt = matrix(par[1]), HHt = matrix(exp(par[2])),
GGt = matrix(exp(par[3])), ...)$logLik,
# fixed quantities: Tt = 0.8 (density dependence), diffuse-ish P0
Tt = matrix(0.8), a0 = y[1], P0 = matrix(10),
ct = matrix(0), Zt = matrix(1), yt = rbind(y),
check.input = FALSE, ...)
# back-transform the variance parameters from the log scale
o$par[["HHt"]] <- exp(o$par[["HHt"]])
o$par[["GGt"]] <- exp(o$par[["GGt"]])
c(o, list(a0 = y[1], n = length(y)))
}
The script adds a method for this to robust_fit()
as well; though given the computational cost it is not clear if a robust fit is actually used in generating the data.
# Density-independent surrogate data: simulate from the fitted SSRW model.
sim_di <- function(df) {
  data.frame(logN = sim_fkf(robust_fit("ssrw", df$logN, N = 3)))
}
# Density-dependent surrogate data: simulate from the fitted DD model.
sim_dd <- function(df) {
  data.frame(logN = sim_fkf(robust_fit("dd", df$logN, N = 3)))
}
# Build the density-independent simulated datasets, timing the run.
system.time(
  DI <- df %>% group_by(MainID) %>% do_parallel(sim_di)
)
user system elapsed
123.079 7.039 8.836
# Build the density-dependent simulated datasets, timing the run.
system.time(
  DD <- df %>% group_by(MainID) %>% do_parallel(sim_dd)
)
user system elapsed
120.578 5.209 8.023
We can then use these two collections of datasets just as before:
# Fit all models to the density-dependent simulations.
system.time(
  DD_fits <- DD %>% group_by(MainID) %>% do_parallel(kalman, method = "BFGS")
)
user system elapsed
9218.572 248.985 324.883
# Fit all models to the density-independent simulations.
system.time(
  DI_fits <- DI %>% group_by(MainID) %>% do_parallel(kalman, method = "BFGS")
)
user system elapsed
8074.330 230.513 280.989
and from before:
# Fit all models to the real data.
system.time(
  fits <- df %>% group_by(MainID) %>% do_parallel(kalman, method = "BFGS")
)
user system elapsed
7761.519 201.922 256.863
Figure 2
From these simulations and corresponding parameter estimates of the density-dependent parameter, we can create our version of Figure 2:
# Factor-level ordering for the dataset type. Named `type_levels` rather
# than `order` so as not to shadow base::order().
type_levels <- c("real", "independent", "dependent")

# Extract the density-dependence parameter (Tt, the paper's `c`) estimates
# for the two Gompertz variants and tag each with its dataset type.
tagged_tt <- function(fits_df, type) {
  fits_df %>%
    filter(model %in% c("ssg", "g"), parameter == "Tt") %>%
    select(model, value, MainID) %>%
    mutate(type = factor(type, levels = type_levels))
}

# Stack estimates from the DD simulations, DI simulations, and real data,
# and relabel the models by whether they account for observation error.
combined <- rbind(
  tagged_tt(DD_fits, "dependent"),
  tagged_tt(DI_fits, "independent"),
  tagged_tt(fits, "real")) %>%
  ungroup() %>%
  transmute(uncertainty =
              plyr::revalue(model,
                            c(ssg = "accounting for uncertainty",
                              g = "ignoring uncertainty")),
            type, value, MainID)
# Figure 2: histograms of the estimated density-dependence parameter,
# faceted by whether observation uncertainty is modeled (rows) and by
# dataset type (columns).
ggplot(combined) +
geom_histogram(aes(value), binwidth=0.2) +
facet_grid(uncertainty ~ type) +
# NOTE(review): mean(value)/median(value) inside aes() are evaluated over
# the full layer data, so the same vertical lines repeat in every facet --
# confirm that per-facet summaries are not intended here.
geom_vline(aes(xintercept = mean(value)), lwd=.5) +
geom_vline(aes(xintercept = median(value)), lwd=.5, col="grey") +
xlim(c(-1.1,1.1)) +
xlab("c value") +
theme_bw(16)
Bootstrapping
With fitting and simulating functions in place, defining the bootstrap is straightforward. We define these separately for the state-space Gompertz (ssg; i.e. the model with both density dependence and observational errors) and the Gompertz (g; density dependence, no observational error). We compare in each case to the simulations of the corresponding model without density dependence.
# Parametric-bootstrap likelihood-ratio test of `test_model` against
# `null_model` for one series.
#
# Args:
#   df:         data frame for a single series, with a logN column.
#   null_model: model without density dependence (e.g. "ssrw" or "rw").
#   test_model: model with density dependence (e.g. "ssg" or "g").
#   N:          number of bootstrap simulations.
#
# Returns: one-row data.frame with the bootstrap p value.
bootstrap <- function(df, null_model = "ssrw", test_model = "ssg", N=100){
y <- df$logN
# fit both candidate models to the observed series
ssg <- robust_fit(test_model, y)
ssrw <- robust_fit(null_model, y)
# simulate N series from the fitted null model (one per row)
sims <- as.data.frame(t(replicate(N, sim_fkf(ssrw))))
# We use a relaxed version of robust_fit, with N=3
sims %>% rowwise() %>% do(robust_fit(null_model, y = as.numeric(.), N = 3)) %>% select(mloglik) -> null
# compute p value of observed LR statistic relative to null distribution
lr <- 2 * (ssrw$mloglik - ssg$mloglik)
# NOTE(review): each null_dist term compares a null-model refit on
# simulated data against the test-model fit on the *observed* data; a
# standard parametric bootstrap would refit both models to each simulated
# series -- confirm against the paper's procedure.
null_dist <- 2 * (null$mloglik - ssg$mloglik)
# NOTE(review): `<` counts null draws *below* the observed statistic;
# verify this is the intended tail for the reported p value.
data.frame(p = sum(null_dist < lr)/N)
}
With these functions defined, we can perform the actual analysis.
# Bootstrap the state-space Gompertz test on the real data.
system.time(
  ssg_p_values <- df %>% group_by(MainID) %>% do_parallel(bootstrap, "ssrw", "ssg")
)
user system elapsed
18552.515 509.365 630.503
# Bootstrap the (observation-error-free) Gompertz test on the real data.
system.time(
  g_p_values <- df %>% group_by(MainID) %>% do_parallel(bootstrap, "rw", "g")
)
user system elapsed
7958.974 215.100 259.174
We can also do the bootstrapping for the simulated data:
# Bootstrap the state-space Gompertz test on the DD simulations.
system.time(
  dd_ssg_p_values <- DD %>% group_by(MainID) %>% do_parallel(bootstrap, "ssrw", "ssg")
)
user system elapsed
20739.616 533.335 704.335
# Bootstrap the Gompertz test on the DD simulations.
system.time(
  dd_g_p_values <- DD %>% group_by(MainID) %>% do_parallel(bootstrap, "rw", "g")
)
user system elapsed
9970.254 285.473 332.748
# Bootstrap the state-space Gompertz test on the DI simulations.
system.time(
  di_ssg_p_values <- DI %>% group_by(MainID) %>% do_parallel(bootstrap, "ssrw", "ssg")
)
user system elapsed
18631.331 479.485 622.542
# Bootstrap the Gompertz test on the DI simulations.
system.time(
  di_g_p_values <- DI %>% group_by(MainID) %>% do_parallel(bootstrap, "rw", "g")
)
user system elapsed
8021.423 227.704 264.113
# Collect the bootstrap p values from all six analyses into one table,
# tagging each with its data source and model (same row order as before).
p_tables <- list(
  ssg_p_values %>% mutate(data = "real", model = "ssg"),
  g_p_values %>% mutate(data = "real", model = "g"),
  di_ssg_p_values %>% mutate(data = "DI", model = "ssg"),
  di_g_p_values %>% mutate(data = "DI", model = "g"),
  dd_g_p_values %>% mutate(data = "DD", model = "g"),
  dd_ssg_p_values %>% mutate(data = "DD", model = "ssg"))
P <- do.call(rbind, p_tables)
# Histograms of the bootstrap p values, faceted by model (rows) and by
# data source (columns).
ggplot(P) +
geom_histogram(aes(p)) +
facet_grid(model ~ data) +
theme_bw(16)