Commit 7a14bfa1 authored by Chris Jewell's avatar Chris Jewell
Browse files

Switched to using data structures to sanitise input data.

parent e24a727d
......@@ -16,47 +16,12 @@
#' @import dplyr
#' @export
#'
#' @return Object of \code{\link{HaldDP}} with methods for creating a HaldDP model,
#' running the model, and accessing and plotting the results.
#' @format Object of \code{\link{R6Class}} with methods for creating a HaldDP model,
#' running the model, and accessing and plotting the results.
#' @section Description:
#' This function fits a non-parametric Poisson source attribution model for human cases of
#' disease. It supports multiple types, sources, times and locations. The number of
#' human cases for each type, time and location follow a Poisson likelihood.
#'
#' @section Methods:
#' \describe{
#' \item{\code{new(data, k, priors, a_q, inits = NULL)}}{Constructor takes
#'
#' \code{data} dataframe (with columns containing the number of human cases named
#' \code{Human}, columns containing the number of positive source samples
#' (one column per source), a column with the time id's named \code{Time},
#' a column with the type id's named \code{Type}, and a column with the source
#' location id's \code{Location}). The data for the human cases and source
#' counts must be integers. The data for the time, location and type columns
#' must be factors. The source counts are currently only allowed to vary over time,
#' hence they must be repeated for each location within each time.
#'
#' \code{k} prevalence dataframe (with columns named \code{Value, Time,
#' Location and Source}). Prevalences must be between 0 and 1 as they are the
#' proportion of samples that were positive for any type for a given source and time.
#'
#' \code{priors} list with elements named \code{a_r}, \code{a_alpha}, \code{a_theta} and \code{b_theta},
#' @param y a \code{\link{Y}} object containing case observations
#' @param x a \code{\link{X}} object containing source observations
#' @param k a \code{\link{Prev}} object containing source prevalences
#' @param priors \code{priors} list with elements named \code{a_r}, \code{a_alpha}, \code{a_theta} and \code{b_theta},
#' corresponding to the prior parameters for the \code{r}, \code{alpha}, and base
#' distribution for the DP parameters respectively.
## The \code{r} parameters have
## a Dirichlet prior for each type/ time combination, whilst the \code{alpha}
## parameters have a Dirichet prior for each time/ location combination. Therefore,
## the prior values \code{a_alpha} can be either a single positive number (to be
## used for all \code{alphas}) or a data frame with a single positive number for
## each \code{alpha} parameter (in a column called \code{Value}) and columns named
## \code{Time}, \code{Location} and \code{Source} containing the time, location and
## source names for each \code{alpha} prior parameter. Similarly, the prior values \code{a_r}
## can either be a single positive number or a data frame with columns named \code{Value},
## \code{Time}, \code{Type} and \code{Source}.
## The base distribution is Gamma distributed with shape and rate parameters given by \code{a_theta}
## and \code{b_theta} respectively.
#'
#' \tabular{lllll}{
#' \emph{Parameter} \tab \emph{Prior Distribution} \tab \emph{Prior Parameters}\cr
......@@ -78,9 +43,7 @@
#' distribution (\code{theta}) \tab alpha)\tab rate of the Gamma base distribution.\cr
#' }
#'
#' \code{a_q} the Dirichlet Process concentration parameter.
#'
#' \code{inits} (optional) initial values for the mcmc algorithm. This is a list
#' @param init initial values for the mcmc algorithm. This is a list
#' that may contain any of the following items: \code{alpha} (a data frame with
#' columns named \code{Value} contining the initial values, \code{Time},
#' \code{Location}, \code{Source}), \code{q} (a data frame with columns names
......@@ -90,6 +53,7 @@
#' named \code{Source}, a column with the time id's named \code{Time}, a column
#' with the type id's named \code{Type}.)
#' An optional list giving the starting values for the parameters.
#'
#' \tabular{lll}{
#' \emph{Parameter} \tab \emph{Description} \cr
#' \code{r}
......@@ -107,9 +71,22 @@
#' \tab and the type ids (named Type). DEFAULT: initialise all type effects to be in \cr
#' \tab a single group with a theta value calculated as \cr
#' \tab \eqn{\theta = sum(Human_itl) / sum_l=1^L(sum_t=1^T(sum_i=1^n(sum_j=1^m(alpha_jtl * r_ijt * k_jt))))}. \cr
#' \tab i.e. \eqn{theta = sum(Human_itl) / sum(lambda_ijtl / theta)}}
#' \tab i.e. \eqn{theta = sum(Human_itl) / sum(lambda_ijtl / theta)}
#' }
#'
#' @param a_q the Dirichlet Process concentration parameter.
#'
#' @format Object of \code{\link{R6Class}} with methods for creating a HaldDP model,
#' running the model, and accessing and plotting the results.
#' @section Description:
#' This function fits a non-parametric Poisson source attribution model for human cases of
#' disease. It supports multiple types, sources, times and locations. The number of
#' human cases for each type, time and location follow a Poisson likelihood.
#' @return Object of \code{\link{HaldDP}} with methods for creating a HaldDP model,
#' running the model, and accessing and plotting the results.
#'
#' @section HaldDP Object Methods:
#' \describe{
#' \item{\code{mcmc_params(n_iter = 1000, burn_in = 0, thin = 1,
#' n_r = ceiling(private$nTypes * 0.2), params_fix = NULL)}}{when called, sets the mcmc
#' parameters.
......@@ -417,15 +394,16 @@ HaldDP_ <- R6::R6Class(
private$namesLocations = dimnames(private$y)$location
private$namesSources = dimnames(private$X)$source
},
set_a_q = function(a_q)
checkin_a_q = function(a_q)
{
if (length(a_q) != 1 |
!isFinitePositive(a_q) |
a_q <= 0)
if (length(a_q) > 1)
warning("length(a_q) > 1 so only first element will be used")
if (!isFinitePositive(a_q[1]) |
a_q[1] <= 0)
stop("a_q should be a single positive number")
private$a_q <- a_q
private$a_q <- a_q[1]
},
set_priors = function(priors)
checkin_priors = function(priors)
{
if (!is.list(priors) |
!all(c("a_alpha", "a_r", "a_theta", "b_theta") %in% names(priors)))
......@@ -576,112 +554,44 @@ HaldDP_ <- R6::R6Class(
priors$a_r <- a_r
private$priors <- HaldDP_priors$new(priors)
},
set_inits = function(inits)
checkin_inits = function(inits)
{
## r values
if (!("r" %in% names(inits))) {
## default is the source data matrix (plus stability factor)
## default is the source data matrix MLE (plus stability factor)
inits$r = apply(private$X + 1e-6, c('source', 'time'), function(x)
x / sum(x)) %>% aperm(c('type', 'source', 'time'))
} else {
if (!is.data.frame(inits$r))
stop("inits$r must be a data frame.")
if (!all(c("type", "source", "time", "value") %in% colnames(inits$r)))
stop("inits$r must be a data frame with columns called type, source, time and value")
if (!setequal(unique(inits$r$Type), private$namesTypes) |
!setequal(unique(inits$r$Time), private$namesTimes) |
!setequal(unique(inits$r$Source), private$namesSources))
stop(
"inits$r must be a data frame with columns called type, source, time and value with one row per combination of type, source and time."
)
initr_r = tryCatch(
acast(inits$r, Type ~ Source ~ Time, value.var = 'Value'),
condition=function(c) stop("inits$r must have a non-NA row for each Type/Time/Source combination.")
)
names(dimnames(initr_r)) = c('type','source','time')
sumToUnit = apply(initr_r, c('type','time'), sum)
if(!isTRUE(all.equal(sumToUnit, 1, tol=1e-5)))
stop("inits$r must sum to 1 within each time and source")
inits$r <- inits_r
if (!identical(class(inits$r), c('X','Data','R6')))
stop("inits$r must be an object of type sourceR::X")
if (!identical(dimnames(inits$r), dimnames(private$X)))
stop("inits$r must be of the same rank as source data")
srcSums = apply(inits$r$x, c('source','time'), sum)
if(any(srcSums != 1))
warning("1 or more r[,source,time] vectors did not sum to 1. Renormalising.")
inits$r$x <- apply(init$r$x, c('source','time'), function(x) x/sum(x))
}
## alpha inits
inits_alpha <- array(
NA,
dim = c(private$nSources, private$nTimes, private$nLocations),
dimnames = list(
source = private$namesSources,
time = private$namesTimes,
location = private$namesLocations
)
)
if (!("alpha" %in% names(inits))) {
## Surely this is very slow!! TODO: find better way!
for (time in private$namesTimes) {
for (location in private$namesLocations) {
inits_alpha[, which(private$namesTimes == time),
which(private$namesLocations == location)] <-
c(gtools::rdirichlet(1, private$priors$a_alpha[, time, location]))
}
}
inits$alpha = apply(private$priors$a_alpha, c('time', 'location'), function(x) {
z = gtools::rdirichlet(1, x)
names(z) = names(x)
z
})
} else {
if (!is.data.frame(inits$alpha))
stop("inits$alpha must be a data frame.")
inits$alpha <- na.omit(inits$alpha)
if (!all(c("location", "source", "time", "value") %in% colnames(inits$alpha)))
stop("inits$alpha must be a data frame with columns called location, source, time and value")
inits$alpha$Location <- as.factor(inits$alpha$Location)
inits$alpha$Source <- as.factor(inits$alpha$Source)
inits$alpha$Time <- as.factor(inits$alpha$Time)
if (!all(paste(gtools::mixedsort(unique(
inits$alpha$Location
))) == private$namesLocations) |
!all(paste(gtools::mixedsort(unique(
inits$alpha$Time
))) == private$namesTimes) |
!all(paste(gtools::mixedsort(unique(
inits$alpha$Source
))) == private$namesSources) |
dim(inits$alpha)[1] != (private$nLocations * private$nSources * private$nTimes))
stop(
"inits$alpha must be a data frame with columns called location, source, time and value with one row per combination of location, source and time."
)
if (!all(isFinitePositive(inits$alpha$Value)))
stop("inits$alpha$Value must contain only positive numbers")
## Extract values from inits alpha data frame and put into arrays.
## Surely this is very slow!! TODO: find better way!
for (time in private$namesTimes) {
for (location in private$namesLocations) {
for (sources in private$namesSources) {
tmp_init_alpha <-
inits$alpha$Value[which((inits$alpha$Location == location) &
(inits$alpha$Time == time) &
(inits$alpha$Source == sources)
)]
if (length(tmp_init_alpha) == 0L)
stop("inits$alpha must have a non-NA row for each time, location and source.")
if (length(tmp_init_alpha) != 1 |
!isFinitePositive(tmp_init_alpha))
stop("All elements of inits$alpha must be positive numbers.")
inits_alpha[which(private$namesSources == sources),
which(private$namesTimes == time),
which(private$namesLocations == location)] <-
tmp_init_alpha
}
## check the initial values sum to 1 within each time and location
if (!isTRUE(all.equal(sum(inits_alpha[, which(private$namesTimes == time), which(private$namesLocations == location)]), 1)))
stop("inits$alpha must sum to 1 within each time and location.")
}
}
if(!identical(class(inits$alpha), c('Alpha','Data','R6')))
stop('inits$alpha must be of class sourceR::Alpha')
if(!( identical(dimnames(inits$alpha$x)$source, private$namesSources) &
identical(dimnames(inits$alpha$x)$time, private$namesTimes) &
identical(dimnames(inits$alpha$x)$location, private$namesLocations)))
stop('inits$alpha must have same number of sources and times as X, and same number of locations as y')
srcSums = apply(inits$alpha$x, c('time','location'), sum)
if(any(srcSums != 1))
warning("1 or more init$alpha vectors did not sum to 1. Renormalising.")
inits$alpha$x = apply(inits$alpha$x, c('time','location'), function(x) x/sum(x))
}
inits$alpha <- inits_alpha
## q inits
if (!("q" %in% names(inits))) {
......@@ -1008,11 +918,11 @@ HaldDP_ <- R6::R6Class(
private$checkin_data(y,x,k)
## read in priors
private$set_priors(priors)
private$set_a_q(a_q)
private$checkin_priors(priors)
private$checkin_a_q(a_q)
## read in initial values (must go after priors as uses priors to generate initial values)
private$set_inits(inits)
private$checkin_inits(inits)
private$DPModel_impl =
DPModel_impl$new(
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment