Commit 1eec5435 authored by Chris Jewell's avatar Chris Jewell
Browse files

Class-based data input system complete: data, initial values, and priors.

parent d0f9613e
......@@ -2,6 +2,7 @@
export(AdaptiveDirMRW)
export(AdaptiveLogDirMRW)
export(Alpha)
export(DataNode)
export(DirichletNode)
export(DirichletProcessNode)
......@@ -11,6 +12,7 @@ export(HaldDP)
export(PoisGammaDPUpdate)
export(PoissonNode)
export(Prev)
export(Q)
export(StochasticNode)
export(X)
export(Y)
......
HaldDP_inits = R6::R6Class(
"inits_class",
public = list(
theta = NULL,
s = NULL,
alpha = NULL,
r = NULL,
initialize = function(inits) {
self$theta <-
inits$theta ## vector of length number of groups (i.e. no of unique values in s)
self$s <- inits$s
self$alpha <-
inits$alpha ## 3D array [source, time, location]
self$r <-
inits$r ## 3D array [source, type, time] giving the relative prevalences
}
)
)
HaldDP_priors = R6::R6Class(
"priors",
public = list(
a_theta = NULL,
b_theta = NULL,
a_r = NULL,
a_alpha = NULL,
initialize = function(priors) {
self$a_theta <- priors$a_theta ## single number
self$b_theta <- priors$b_theta ## single number
self$a_r <-
priors$a_r ## DP prior for each type, source, time, location
self$a_alpha <-
priors$a_alpha ## DP prior for each source, time, location
}
)
)
......@@ -30,10 +30,10 @@ Y_ = R6::R6Class('Y',
type,
time = NULL,
location = NULL) {
if (is.null(data$time)) {
if (is.null(time)) {
data$time = rep('1', nrow(data))
}
if (is.null(data$location)) {
if (is.null(location)) {
data$location = rep('1', nrow(data))
}
data = data[, c(y, type, time, location)]
......@@ -77,9 +77,9 @@ Y = function(data, y = 'y', type = 'type', time = 'time', location = 'location')
X_ = R6::R6Class('X',
inherit = Data_,
private = list(
pack = function(data, x, type, source, time = NULL)
pack = function(data, x, type, source, time)
{
if (is.null(data$time)) {
if (is.null(time)) {
data$time = rep('1', nrow(data))
}
data = data[, c(x, type, source, time)]
......@@ -98,12 +98,12 @@ X_ = R6::R6Class('X',
},
check = function()
{
if (!all(isFiniteInteger(self$x)))
if (!all(is.finite(self$x)))
stop("Missing value in source data. Each source/type/time combination must have a value.")
if (any(self$x < 0))
stop("Negative value in source data.")
if (any(apply(self$x, c('type', 'time'), sum) == 0))
stop("Each type must have at least one source case")
stop("Each type/time must have at least one source case")
}
))
......@@ -120,7 +120,7 @@ X_ = R6::R6Class('X',
#'
#' @return A X source data structure for use in sourceR models
#' @export
X = function(data, x = 'x', type = 'type', source = 'source', time = 'time')
X = function(data, x, type, source, time = NULL)
X_$new(data, x, type, source, time)
......@@ -130,7 +130,7 @@ Prev_ = R6::R6Class('Prev',
private = list(
pack = function(data, prev, source, time)
{
if (is.null(data$time)) {
if (is.null(time)) {
data$time = rep('1', nrow(data))
}
data = data[, c(prev, source, time)]
......@@ -166,7 +166,7 @@ Prev_ = R6::R6Class('Prev',
#'
#' @return A Prev data structure for use in sourceR models
#' @export
Prev = function(data, prev = 'prev', source = 'source', time = 'time')
Prev = function(data, prev, source, time = NULL)
Prev_$new(data, prev, source, time)
#' Alpha prior hyperparameter class
......@@ -175,7 +175,7 @@ Alpha_ = R6::R6Class('Alpha',
private = list(
pack = function(data, alpha, source, time, location)
{
if (is.null(data$time)) {
if (is.null(time)) {
data$time = rep('1', nrow(data))
}
if(is.null(data$location)) {
......@@ -183,13 +183,13 @@ Alpha_ = R6::R6Class('Alpha',
}
data = data[, c(alpha, source, time, location)]
names(data) = c('alpha', 'source', 'time', 'location')
data[, c(source, time, location)] = as.character(data[,c(source, time, location)])
self$x = tryCatch(
reshape2::acast(data, source ~ time ~ location, value.var = 'prev', drop=F),
reshape2::acast(data, source ~ time ~ location, value.var = 'alpha', drop=F),
condition = function(c)
stop('Alpha must have one value per source/time/location')
)
names(dimnames(self$x)) = c('source', 'time')
names(dimnames(self$x)) = c('source', 'time','location')
},
check = function()
{
......@@ -213,6 +213,42 @@ Alpha_ = R6::R6Class('Alpha',
#'
#' @return An Alpha_ data structure for use in sourceR models
#' @export
Alpha = function(data, alpha = 'alpha', source = 'source', time = 'time', location = 'location')
Alpha = function(data, alpha, source, time = NULL, location = NULL)
Alpha_$new(data, alpha, source, time, location)
Q_ = R6::R6Class('Q',
inherit = Data_,
public = list(
s = NULL,
theta = NULL
),
private = list(
pack = function(data, q, type)
{
self$theta = unique(data[,q])
self$s = vapply(data[,q], function(qi) which(self$theta == qi), integer(1))
names(self$s) = data[,type]
},
check = function()
{
if(any(is.na(self$theta)))
stop("NA in q value")
if(any(self$theta <= 0))
stop("q value <= 0. q must be positive.")
}
))
#' Constructs initial values for q
#'
#' The Q constructor returns a R6 Q_ class which feeds sanitised
#' initial values for q into the model.
#'
#' @param data long-format data.frame containing type-name and q-value for each observation.
#' @param q name of 'q' column
#' @param type name of type column
#'
#' @return A Q_ data structure for use in sourceR models
#' @export
Q = function(data, q, type)
Q_$new(data, q, type)
......@@ -6,7 +6,7 @@
# Purpose: Source attribution model interface #
#####################################################
#' Runs the HaldDP source attribution model
#' Builds a HaldDP source attribution model
#'
#' @docType class
#' @name HaldDP
......@@ -57,21 +57,18 @@
#' \tabular{lll}{
#' \emph{Parameter} \tab \emph{Description} \cr
#' \code{r}
#' \tab A data frame with columns giving the initial values (named \code{Value}),\cr
#' \tab times (named Time) and source and type id's (named \code{Source} and Type. \cr
#' \tab DEFAULT: the default initial values are the maximum likelihood point \cr
#' \tab estimates of \code{r} from the source matrix (i.e. \eqn{r_ij = x_ij / sum_i=1^n x_ij}).\cr
#' \tab An object of type \code{\link{X}} giving the initial values for $R$ matrix,\cr
#' \tab If not specified defaults to the element-wise maximum likelihood\cr
#' \tab estimates of \code{r} from the source matrix.\cr
#' Source effects (\code{alpha})
#' \tab A data frame with columns named \code{Value} (containing the initial values), \cr
#' \tab \code{Source} (containing the source names) and columns giving the time and \cr
#' \tab location for each parameter (named Location). DEFAULT: The default initial values\cr
#' \tab for the source effects are drawn the prior distribution (Dirichlet). \cr
#' \tab An object of type \code{\link{Alpha}} specifying alpha value for each source/time/location.\cr
#' \tab If not specified, default initial values\cr
#' \tab for the source effects are drawn from the prior distribution. \cr
#' Type effects (\code{q})
#' \tab A data frame with columns giving the initial values (named \code{Value})\cr
#' \tab and the type ids (named Type). DEFAULT: initialise all type effects to be in \cr
#' \tab a single group with a theta value calculated as \cr
#' \tab \eqn{\theta = sum(Human_itl) / sum_l=1^L(sum_t=1^T(sum_i=1^n(sum_j=1^m(alpha_jtl * r_ijt * k_jt))))}. \cr
#' \tab i.e. \eqn{theta = sum(Human_itl) / sum(lambda_ijtl / theta)}
#' \tab An object of type \code{\link{Q}} giving the initial clustering and values for \eqn{q}\cr
#' \tab If not specified, defaults to a single group with a theta value calculated as \cr
#' \tab \eqn{\theta = sum(y_itl) / sum_l=1^L(sum_t=1^T(sum_i=1^n(sum_j=1^m(alpha_jtl * r_ijt * k_jt))))}. \cr
#' \tab i.e. \eqn{theta = sum(y_itl) / sum(lambda_ijtl / theta)}
#' }
#'
#' @param a_q the Dirichlet Process concentration parameter.
......@@ -405,8 +402,8 @@ HaldDP_ <- R6::R6Class(
},
checkin_priors = function(priors)
{
if (!is.list(priors) |
!all(c("a_alpha", "a_r", "a_theta", "b_theta") %in% names(priors)))
if (!(is.list(priors) &
all(c("a_alpha", "a_r", "a_theta", "b_theta") %in% names(priors))))
stop("The priors must be a list with names a_alpha, a_r, a_theta, and b_theta.")
## theta priors
......@@ -418,164 +415,77 @@ HaldDP_ <- R6::R6Class(
stop("priors$b_theta must be a single positive number.")
## r priors
r_df <- is.data.frame(priors$a_r)
if (isTRUE(r_df)) {
## check data frame has the right colnames
if (!all(c("Value", "Type", "Time", "Source") %in% colnames(priors$a_r)))
stop("priors$a_r must have columns names Value, Type, Time, and Source.")
priors$a_r$Time <- as.factor(priors$a_r$Time)
priors$a_r$Type <- as.factor(priors$a_r$Type)
priors$a_r$Source <- as.factor(priors$a_r$Source)
## check the data frame has the right values in the columns
if (!all(paste(gtools::mixedsort(unique(
priors$a_r$Type
))) == private$namesTypes))
stop("The levels of priors$a_r$Type must be the same as those in data$Type.")
if (!all(paste(gtools::mixedsort(unique(
priors$a_r$Time
))) == private$namesTimes))
stop("The levels of priors$a_r$Time must be the same as those in data$Time.")
if (!all(paste(gtools::mixedsort(unique(
priors$a_r$Source
))) == private$namesSources))
stop("The levels of priors$a_r$Source must be the same as those in data$Source.")
priors$a_r <- na.omit(priors$a_r)
## check the dimensions are correct
if (dim(unique(priors$a_r[, c("Type", "Time", "Source")]))[1] != (private$nTypes * private$nTimes * private$nSources))
stop("priors$a_r must have a single number for each time, type and source combination.")
# !all(private$test_integer(priors$a_r$Value)) |
if (!all(priors$a_r$Value >= 0))
stop("priors$a_r$Value must contain only positive numbers")
} else {
if (!isTRUE(length(priors$a_r) == 1) |
!isFinitePositive(priors$a_r))
stop("priors$a_r must be a data frame or a single number.")
}
a_r <- array(
NA,
dim = c(private$nTypes, private$nSources, private$nTimes),
dimnames = list(
type = private$namesTypes,
source = private$namesSources,
time = private$namesTimes
if(is.atomic(priors$a_r) & length(priors$a_r)==1) {
if(!isFinitePositive(priors$a_r))
stop("priors$a_r must be positive for a single number specification.")
priors$a_r <- array(
priors$a_r,
dim = c(private$nTypes, private$nSources, private$nTimes),
dimnames = list(
type = private$namesTypes,
source = private$namesSources,
time = private$namesTimes
)
)
)
}
else if(identical(class(priors$a_r), c('X','Data','R6')))
{
if(!identical(dimnames(priors$a_r$x), dimnames(private$X)))
stop("dimnames(priors$a_r$x) must match dimnames(X)")
priors$a_r = priors$a_r$x
}
else
stop("Prior for r must be either a single number or of type sourceR::X")
## alpha priors
alpha_df <- is.data.frame(priors$a_alpha)
if (isTRUE(alpha_df)) {
## check data frame has the right colnames
if (!all(c("Value", "Source", "Time", "Location") %in% colnames(priors$a_alpha)))
stop("a_alpha must have columns names Value, Source, Time, and Location.")
priors$a_alpha$Time <- as.factor(priors$a_alpha$Time)
priors$a_alpha$Location <-
as.factor(priors$a_alpha$Location)
priors$a_alpha$Source <- as.factor(priors$a_alpha$Source)
## check the data frame has the right values in the columns
if (!all(paste(gtools::mixedsort(unique(
priors$ar$Location
))) == private$namesLocations))
stop("The levels of priors$a_alpha$Location must be the same as those in data$Location.")
if (!all(paste(gtools::mixedsort(unique(priors$ar$Time))) == private$namesTimes))
stop("The levels of priors$a_alpha$Time must be the same as those in data$Time.")
if (!all(paste(gtools::mixedsort(unique(
priors$ar$Source
))) == private$namesSources))
stop("The levels of priors$a_alpha$Source must be the same as those in data$Source.")
priors$a_alpha <- na.omit(priors$a_alpha)
## check the dimensions are correct
if (dim(unique(priors$a_alpha[, c("Location", "Time", "Source")]))[1] != (private$nLocations * private$nTimes * private$nSources))
stop(
"priors$a_alpha must have a single number for each time, location and source combination."
if(is.atomic(priors$a_alpha) & length(priors$a_alpha)==1) {
if(!isFinitePositive(priors$a_alpha))
stop("priors$a_alpha <= 0. Must be positive.")
priors$a_alpha = array(
priors$a_alpha,
dim = c(private$nSources, private$nTimes, private$nLocations),
dimnames = list(
source = private$namesSources,
time = private$namesTimes,
location = private$namesLocations
)
if (!all(isFiniteInteger(priors$a_alpha$Value)) |
!all(priors$a_alpha$Value >= 0))
stop("priors$a_alpha$Value must contain only positive numbers")
} else {
if (!isTRUE(length(priors$a_alpha) == 1) |
!isFinitePositive(priors$a_alpha))
stop("priors$a_alpha must be a data frame or a single number.")
}
a_alpha <- array(
NA,
dim = c(private$nSources, private$nTimes, private$nLocations),
dimnames = list(
source = private$namesSources,
time = private$namesTimes,
location = private$namesLocations
)
)
## Extract values from priors alpha and r data frames for alpha and r and put into arrays.
## Surely this is very slow!! TODO: find better way!
for (time in private$namesTimes) {
for (sources in private$namesSources) {
for (type in private$namesTypes) {
if (isTRUE(r_df)) {
tmp_a_r <-
priors$a_r$Value[which((priors$a_r$Type == type) &
(priors$a_r$Time == time) &
(priors$a_r$Source == sources)
)]
if (length(tmp_a_r) != 1L)
stop("priors$r must have a single non-NA row for each time, type and source.")
} else {
tmp_a_r <- priors$a_r
}
a_r[which(private$namesTypes == type),
which(private$namesSources == sources),
which(private$namesTimes == time)] <- tmp_a_r
}
for (location in private$namesLocations) {
if (isTRUE(alpha_df)) {
tmp_a_alpha <-
priors$a_alpha$Value[which((priors$a_alpha$Source == sources) &
(priors$a_alpha$Time == time) &
(priors$a_alpha$Location == location)
)]
if (length(tmp_a_alpha) != 1L)
stop("priors$alpha must have a single non-NA row for each time, location and source.")
} else {
tmp_a_alpha <- priors$a_alpha
}
a_alpha[which(private$namesSources == sources),
which(private$namesTimes == time),
which(private$namesLocations == location)] <-
tmp_a_alpha
}
}
}
priors$a_alpha <- a_alpha
priors$a_r <- a_r
private$priors <- HaldDP_priors$new(priors)
else if(identical(class(priors$a_alpha), c('Alpha','Data','R6')))
{
dn = list(source=private$namesSources, time=private$namesTimes, location=private$namesLocations)
if(!identical(dimnames(priors$a_alpha$x), dn))
stop("priors$a_alpha must have source/time/location dimensions appropriate to data.")
priors$a_alpha = priors$a_alpha$x
}
else
stop("Prior parameters for alpha must be specified either as a constant or of type sourceR::Alpha")
# Stow as a member
private$priors = priors
},
checkin_inits = function(inits)
{
private$inits = list()
## r values
if (!("r" %in% names(inits))) {
## default is the source data matrix MLE (plus stability factor)
inits$r = apply(private$X + 1e-6, c('source', 'time'), function(x)
private$inits$r = apply(private$X + 1e-6, c('source', 'time'), function(x)
x / sum(x)) %>% aperm(c('type', 'source', 'time'))
} else {
if (!identical(class(inits$r), c('X','Data','R6')))
stop("inits$r must be an object of type sourceR::X")
if (!identical(dimnames(inits$r), dimnames(private$X)))
if (!identical(dimnames(inits$r$x), dimnames(private$X)))
stop("inits$r must be of the same rank as source data")
if (any(inits$r$x <=0))
stop("One or more elements of initial r values < 1.")
srcSums = apply(inits$r$x, c('source','time'), sum)
if(any(srcSums != 1))
warning("1 or more r[,source,time] vectors did not sum to 1. Renormalising.")
inits$r$x <- apply(init$r$x, c('source','time'), function(x) x/sum(x))
private$inits$r = apply(inits$r$x, c('source','time'), function(x) x/sum(x))
}
## alpha inits
if (!("alpha" %in% names(inits))) {
inits$alpha = apply(private$priors$a_alpha, c('time', 'location'), function(x) {
private$inits$alpha = apply(private$priors$a_alpha, c('time', 'location'), function(x) {
z = gtools::rdirichlet(1, x)
names(z) = names(x)
z
......@@ -590,7 +500,7 @@ HaldDP_ <- R6::R6Class(
srcSums = apply(inits$alpha$x, c('time','location'), sum)
if(any(srcSums != 1))
warning("1 or more init$alpha vectors did not sum to 1. Renormalising.")
inits$alpha$x = apply(inits$alpha$x, c('time','location'), function(x) x/sum(x))
private$inits$alpha = apply(inits$alpha$x, c('time','location'), function(x) x/sum(x))
}
## q inits
......@@ -599,36 +509,28 @@ HaldDP_ <- R6::R6Class(
## sum_{i=1}^n lambda_i = q * sum_{t=1}^T sum_{l=1}^L (sum_{i=1}^n sum_{j=1}^m p_{ijt} a_{jtl})
## sum_{i=1}^n y_i = q * sum_{t=1}^T sum_{l=1}^L (sum_{i=1}^n sum_{j=1}^m p_{ijt} a_{jtl})
## q = sum_{i=1}^n y_i / sum_{t=1}^T sum_{l=1}^L (sum_{i=1}^n sum_{j=1}^m p_{ijt} a_{jtl})
sum_alpha_jtl_r_ijt <- 0
for (time in 1:private$nTimes) {
for (location in 1:private$nLocations) {
sum_alpha_jtl_r_ijt <-
sum_alpha_jtl_r_ijt + sum(inits$r[, , time] %*% inits$alpha[, time, location])
}
}
q_val <- sum(private$y) / sum_alpha_jtl_r_ijt
inits$theta <- q_val
inits$s <- rep(1, private$nTypes)
tt = private$inits[c('alpha','r')]
tt$k = private$k
tt = lapply(tt, function(x) as.tensor.default(x, dims=dimnames(x)))
lambdaBar = mul.tensor(
tt$r,
i = 'source',
tt$alpha * tt$k,
j = 'source',
by = c('type','time','location')
) %>% sum
q_val <- sum(private$y) / lambdaBar
private$inits$theta <- q_val
private$inits$s <- rep(1, private$nTypes)
names(private$inits$s) = private$namesTypes
} else {
if (!is.data.frame(inits$q))
stop("inits$q must be a data frame.")
inits$q <- na.omit(inits$q)
if (!all(c("type", "value") %in% colnames(inits$q)))
stop("inits$q must be a data frame with columns called type and value")
inits$q$Type <- as.factor(inits$q$Type)
if (!all(paste(gtools::mixedsort(unique(inits$q$Type))) == private$namesTypes) |
dim(inits$q)[1] != private$nTypes)
stop("inits$q must be a data frame with columns called type and value with one row per type.")
if (!all(isFinitePositive(inits$q$Value)))
stop("inits$q$value must contain only positive numbers.")
inits$q <- inits$q[gtools::mixedsort(inits$q$Type),]
inits$theta <- sort(unique(inits$q$Value))
inits$s <- as.numeric(as.factor(inits$q$Value))
names(inits$s) <- private$namesTypes
if(!identical(class(inits$q), c('Q','Data','R6')))
stop("q initialiser must be of class sourceR::Q")
if(!identical(names(inits$q), self$namesTypes))
stop("q initialiser must have type names identical to y")
inits_$theta = inits$q$theta
inits_$s = inits$q$s
}
private$inits <- HaldDP_inits$new(inits)
},
set_niter = function(n_iter)
{
......@@ -908,7 +810,7 @@ HaldDP_ <- R6::R6Class(
),
public = list(
#TODO : change r init to an x/colsums(x) and remove from model!
initialize = function(y, x, k, priors, a_q, inits = NULL)
initialize = function(y, x, k, priors, a_q, inits)
{
if(!identical(class(y), c("Y","Data","R6")) )
stop("y must be of type sourceR::Y")
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\name{Alpha}
\alias{Alpha}
\title{Constructs alpha prior}
\usage{
Alpha(data, alpha = "alpha", source = "source", time = "time",
location = "location")
}
\arguments{
\item{data}{long-format data.frame containing Dirichlet prior hyperparameter, source, time, and location columns}
\item{alpha}{name of hyperparameter column}
\item{source}{name of source column}
\item{time}{name of optional time column}
\item{location}{name of optional location column}
}
\value{
An Alpha_ data structure for use in sourceR models
}
\description{
The Alpha constructure function returns an R6 Alpha_
class which feeds sanitised prior or initialisation values for alpha into the model.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{Alpha_}
\alias{Alpha_}
\title{Alpha prior hyperparameter class}
\format{An object of class \code{R6ClassGenerator} of length 24.}
\usage{
Alpha_
}
\description{
Alpha prior hyperparameter class
}
\keyword{datasets}
......@@ -3,45 +3,20 @@
\docType{class}
\name{HaldDP}
\alias{HaldDP}
\title{Runs the HaldDP source attribution model}
\title{Builds a HaldDP source attribution model}
\format{Object of \code{\link{R6Class}} with methods for creating a HaldDP model,
running the model, and accessing and plotting the results.}
\usage{
HaldDP
HaldDP(y, x, k, priors, a_q, inits = NULL)
}
\value{
Object of \code{\link{HaldDP}} with methods for creating a HaldDP model,
running the model, and accessing and plotting the results.
}
\description{
Runs the HaldDP source attribution model
}
\section{Description}{
\arguments{
\item{y}{a \code{\link{Y}} object containing case observations}