Commit 119e0dbd authored by Chris Jewell

Merge branch 'add-aws' into mod-hmc-gibbs

parents 3462054d 16bad2b7
@@ -134,7 +134,7 @@ def CovidUK(covariates, initial_state, initial_step, num_steps):
def alpha_t(alpha_0):
    return BrownianMotion(
-        tf.range(num_steps, dtype=DTYPE), x0=alpha_0, scale=0.01
+        tf.range(num_steps, dtype=DTYPE), x0=alpha_0, scale=0.005
    )
def gamma0():
...
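A note on the hunk above: scale is the standard deviation of the Brownian-motion increments driving the time-varying alpha_t, so the change from 0.01 to 0.005 halves the prior step size of the random walk. A minimal numpy sketch of that effect, assuming the conventional parameterisation with increments distributed Normal(0, scale^2 * dt); brownian_path below is illustrative, not the repository's BrownianMotion:

    # Toy random-walk paths: halving `scale` halves the typical daily step.
    import numpy as np

    def brownian_path(x0, scale, num_steps, dt=1.0, seed=42):
        rng = np.random.default_rng(seed)
        increments = rng.normal(0.0, scale * np.sqrt(dt), size=num_steps)
        return x0 + np.cumsum(increments)

    loose = brownian_path(x0=0.0, scale=0.01, num_steps=100)
    tight = brownian_path(x0=0.0, scale=0.005, num_steps=100)
    print(loose.std(), tight.std())  # the tighter prior wanders about half as far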
"""A Ruffus-ised pipeline for COVID-19 analysis"""
import os
from os.path import expandvars
import warnings
import yaml
import datetime
import s3fs
import ruffus as rf
from covid.ruffus_pipeline import run_pipeline
@@ -61,6 +64,9 @@ if __name__ == "__main__":
data_args.add_argument(
"--pillar", type=str, help="Pillar", choices=["both", "1", "2"]
)
+    data_args.add_argument(
+        "--aws", action='store_true', help="Push to AWS"
+    )
cli_options = argparser.parse_args()
global_config = _import_global_config(cli_options.config)
...
"""Represents the analytic pipeline as a ruffus chain"""
import os
import warnings
import yaml
from datetime import datetime
from uuid import uuid1
import json
import netCDF4 as nc
import s3fs
import pandas as pd
import ruffus as rf
@@ -14,7 +16,7 @@ from covid.tasks import (
assemble_data,
mcmc,
thin_posterior,
-next_generation_matrix,
+reproduction_number,
overall_rt,
predict,
summarize,
@@ -91,11 +93,11 @@ def run_pipeline(global_config, results_directory, cli_options):
rf.transform(
    input=[[process_data, thin_samples]],
    filter=rf.formatter(),
-    output=wd("ngm.nc"),
-)(next_generation_matrix)
+    output=wd("reproduction_number.nc"),
+)(reproduction_number)

rf.transform(
-    input=next_generation_matrix,
+    input=reproduction_number,
    filter=rf.formatter(),
    output=wd("national_rt.xlsx"),
)(overall_rt)
@@ -146,7 +148,7 @@ def run_pipeline(global_config, results_directory, cli_options):
# Summarisation
rf.transform(
-    input=next_generation_matrix,
+    input=reproduction_number,
    filter=rf.formatter(),
    output=wd("rt_summary.csv"),
)(summarize.rt)
@@ -184,15 +186,15 @@ def run_pipeline(global_config, results_directory, cli_options):
df.to_csv(output_file)
# Plot in-sample
-@rf.transform(
-    input=[insample7, insample14],
-    filter=rf.formatter(".+/insample(?P<LAG>\d+).nc"),
-    add_inputs=rf.add_inputs(process_data),
-    output="{path[0]}/insample_plots{LAG[0]}",
-    extras=["{LAG[0]}"],
-)
-def plot_insample_predictive_timeseries(input_files, output_dir, lag):
-    insample_predictive_timeseries(input_files, output_dir, lag)
+# @rf.transform(
+#     input=[insample7, insample14],
+#     filter=rf.formatter(".+/insample(?P<LAG>\d+).nc"),
+#     add_inputs=rf.add_inputs(process_data),
+#     output="{path[0]}/insample_plots{LAG[0]}",
+#     extras=["{LAG[0]}"],
+# )
+# def plot_insample_predictive_timeseries(input_files, output_dir, lag):
+#     insample_predictive_timeseries(input_files, output_dir, lag)
# Geopackage
rf.transform(
@@ -221,11 +223,34 @@ def run_pipeline(global_config, results_directory, cli_options):
insample7,
insample14,
medium_term,
-next_generation_matrix,
+reproduction_number,
]
],
rf.formatter(),
wd("summary_longformat.xlsx"),
)(summary_longformat)
+@rf.active_if(cli_options.aws)
+@rf.transform(
+    input=[
+        process_data,
+        run_mcmc,
+        insample7,
+        insample14,
+        medium_term,
+        reproduction_number,
+    ],
+    filter=rf.formatter(),
+    output="{subdir[0][0]}/{basename[0]}{ext[0]}",
+    extras=[global_config["AWSS3"]],
+)
+def upload_to_aws(input_file, output_file, config):
+    obj_path = f"{config['bucket']}/{output_file}"
+    s3 = s3fs.S3FileSystem(profile=config["profile"])
+    if not s3.exists(obj_path):
+        print(f"Copy {input_file} to {obj_path}", flush=True)
+        s3.put(input_file, obj_path)
+    else:
+        warnings.warn(f"Path '{obj_path}' already exists, not uploading.")
rf.cmdline.run(cli_options)
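For reference, the guarded upload that the new upload_to_aws task performs, as a self-contained sketch; the bucket and profile names below are placeholders rather than values from this repository:

    import warnings
    import s3fs

    def push_if_absent(local_path, bucket, profile):
        # Credentials are resolved from the named profile in ~/.aws
        s3 = s3fs.S3FileSystem(profile=profile)
        obj_path = f"{bucket}/{local_path}"
        if not s3.exists(obj_path):
            s3.put(local_path, obj_path)  # copy local file to s3://bucket/...
        else:
            warnings.warn(f"Path '{obj_path}' already exists, not uploading.")

    # push_if_absent("reproduction_number.nc", "my-results-bucket", "my-aws-profile")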
@@ -3,7 +3,7 @@
from covid.tasks.assemble_data import assemble_data
from covid.tasks.inference import mcmc
from covid.tasks.thin_posterior import thin_posterior
-from covid.tasks.next_generation_matrix import next_generation_matrix
+from covid.tasks.next_generation_matrix import reproduction_number
from covid.tasks.overall_rt import overall_rt
from covid.tasks.predict import predict
import covid.tasks.summarize as summarize
@@ -18,7 +18,7 @@ __all__ = [
"assemble_data",
"mcmc",
"thin_posterior",
"next_generation_matrix",
"reproduction_number",
"overall_rt",
"predict",
"summarize",
...
@@ -5,13 +5,12 @@ import numpy as np
import xarray
import tensorflow as tf
from covid import model_spec
from covid.util import copy_nc_attrs
from gemlib.util import compute_state
-def calc_posterior_ngm(samples, covar_data):
+def calc_posterior_rit(samples, initial_state, times, covar_data):
"""Calculates effective reproduction number for batches of metapopulations
:param samples: a dictionary of batched posterior parameter samples
:param initial_state: the initial state of the epidemic
:param times: the time indices at which to evaluate R_it
@@ -20,63 +19,75 @@ def calc_posterior_ngm(samples, covar_data):
:param covar_data: the covariate data
:return: a batched vector of R_it estimates
"""
+times = tf.convert_to_tensor(times)
+
def r_fn(args):
-    beta1_, beta2_, beta3_, sigma_, xi_, gamma0_, events_ = args
-    t = events_.shape[-2] - 1
+    par = tf.nest.pack_sequence_as(samples, args)
    state = compute_state(
-        samples["init_state"], events_, model_spec.STOICHIOMETRY
+        initial_state, par["seir"], model_spec.STOICHIOMETRY
    )
-    state = tf.gather(state, t, axis=-2)  # State on final inference day
-    par = dict(
-        beta1=beta1_,
-        beta2=beta2_,
-        beta3=beta3_,
-        sigma=sigma_,
-        gamma0=gamma0_,
-        xi=xi_,
-    )
-    ngm_fn = model_spec.next_generation_matrix_fn(covar_data, par)
-    ngm = ngm_fn(t, state)
-    return ngm
+    del par["seir"]
+
+    def fn(t):
+        state_ = tf.gather(state, t, axis=-2)  # state at time t
+        ngm_fn = model_spec.next_generation_matrix_fn(covar_data, par)
+        ngm = ngm_fn(t, state_)
+        return ngm
+
+    ngm = tf.vectorized_map(fn, elems=times)
+    return tf.reduce_sum(ngm, axis=-2)  # sum over destinations

return tf.vectorized_map(
    r_fn,
-    elems=(
-        samples["beta1"],
-        samples["beta2"],
-        samples["beta3"],
-        samples["sigma"],
-        samples["xi"],
-        samples["gamma0"],
-        samples["seir"],
-    ),
+    elems=tf.nest.flatten(samples),
)
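The refactor above swaps explicit tuple unpacking for tf.nest. The round trip is safe because tf.nest.flatten orders a dict's leaves by sorted key, and tf.nest.pack_sequence_as inverts exactly that ordering inside the mapped function; a toy sketch (shapes invented, not the model's):

    import tensorflow as tf

    samples = {
        "beta1": tf.ones([8]),        # 8 posterior draws of a scalar parameter
        "seir": tf.ones([8, 10, 4]),  # 8 draws of a [time, event] array
    }

    def per_draw(args):
        par = tf.nest.pack_sequence_as(samples, args)  # list -> dict again
        return tf.reduce_sum(par["seir"]) * par["beta1"]

    out = tf.vectorized_map(per_draw, elems=tf.nest.flatten(samples))
    print(out.shape)  # (8,) -- one value per posterior draw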
-def next_generation_matrix(input_files, output_file):
+CHUNKSIZE = 50
+
+
+def reproduction_number(input_files, output_file):
covar_data = xarray.open_dataset(input_files[0], group="constant_data")
with open(input_files[1], "rb") as f:
samples = pkl.load(f)
+num_samples = samples["seir"].shape[0]
+initial_state = samples["initial_state"]
+del samples["initial_state"]
+times = np.arange(covar_data.coords["time"].shape[0])
-# Compute ngm posterior
-ngm = calc_posterior_ngm(samples, covar_data).numpy()
+# Compute the posterior R_it in chunks to avoid exhausting memory
+r_its = []
+for i in range(0, num_samples, CHUNKSIZE):
+    start = i
+    end = np.minimum(i + CHUNKSIZE, num_samples)
+    print(f"Chunk {start}:{end}", flush=True)
+    subsamples = {k: v[start:end] for k, v in samples.items()}
+    r_it = calc_posterior_rit(subsamples, initial_state, times, covar_data)
+    r_its.append(r_it)

-ngm = xarray.DataArray(
-    ngm,
+r_it = xarray.DataArray(
+    tf.concat(r_its, axis=0),
    coords=[
-        np.arange(ngm.shape[0]),
-        covar_data.coords["location"],
+        np.arange(num_samples),
+        covar_data.coords["time"][times],
        covar_data.coords["location"],
    ],
-    dims=["iteration", "dest", "src"],
+    dims=["iteration", "time", "location"],
)
-ngm = xarray.Dataset({"ngm": ngm})
+weight = covar_data["N"] / covar_data["N"].sum()
+r_t = (r_it * weight).sum(dim="location")
+ds = xarray.Dataset({"R_it": r_it, "R_t": r_t})

# Output
-ngm.to_netcdf(output_file, group="posterior_predictive")
+ds.to_netcdf(output_file, group="posterior_predictive")
+copy_nc_attrs(input_files[0], output_file)
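The new output also aggregates R_it to a national R_t by weighting each location's reproduction number by its population share N_i / sum(N). A self-contained xarray sketch with invented dimensions:

    import numpy as np
    import xarray

    r_it = xarray.DataArray(
        np.random.uniform(0.5, 1.5, size=(100, 30, 5)),  # iteration x time x location
        dims=["iteration", "time", "location"],
    )
    N = xarray.DataArray(np.array([1e6, 5e5, 2e5, 8e5, 3e5]), dims=["location"])

    weight = N / N.sum()                       # population share per location
    r_t = (r_it * weight).sum(dim="location")  # national R_t per draw and day
    print(r_t.dims, r_t.shape)                 # ('iteration', 'time') (100, 30)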
@@ -86,22 +97,20 @@ if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
-    "-s",
-    "--samples",
+    "samples",
    type=str,
-    description="A pickle file with MCMC samples",
-    required=True,
+    help="A pickle file with MCMC samples",
)
parser.add_argument(
    "-d",
    "--data",
    type=str,
-    decription="A data glob pickle file",
-    require=True,
+    help="A data glob pickle file",
+    required=True,
)
parser.add_argument(
-    "-o", "--output", type=str, description="The output file", require=True
+    "-o", "--output", type=str, help="The output file", required=True
)
args = parser.parse_args()

-next_generation_matrix([args.data, args.samples], args.output)
+reproduction_number([args.data, args.samples], args.output)
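The removed keywords were not valid argparse at all: description belongs to ArgumentParser itself, per-argument text is help, the flag is spelled required, and positionals cannot take required=True (which is why samples loses both flags). A minimal correct sketch:

    from argparse import ArgumentParser

    parser = ArgumentParser(description="Compute reproduction numbers")
    parser.add_argument("samples", type=str, help="A pickle file with MCMC samples")
    parser.add_argument("-d", "--data", type=str, help="A data glob pickle file", required=True)
    parser.add_argument("-o", "--output", type=str, help="The output file", required=True)

    args = parser.parse_args(["posterior.pkl", "-d", "data.pkl", "-o", "out.nc"])
    print(args.samples, args.data, args.output)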
@@ -10,17 +10,18 @@ from covid.summary import (
)
-def overall_rt(next_generation_matrix, output_file):
+def overall_rt(inference_data, output_file):
+    r_t = xarray.open_dataset(inference_data, group="posterior_predictive")[
+        "R_t"
+    ]

-    ngms = xarray.open_dataset(
-        next_generation_matrix, group="posterior_predictive"
-    )["ngm"]
-    b, _ = power_iteration(ngms)
-    rt = rayleigh_quotient(ngms, b)
    q = np.arange(0.05, 1.0, 0.05)
-    rt_quantiles = pd.DataFrame(
-        {"Rt": np.quantile(rt, q, axis=-1)}, index=q
-    ).T.to_excel(output_file)
+    quantiles = r_t.isel(time=-1).quantile(q=q)
+    quantiles.to_dataframe().T.to_excel(output_file)
+    # pd.DataFrame({"Rt": np.quantile(r_t, q, axis=-1)}, index=q).T.to_excel(
+    #     output_file
+    # )
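What the rewritten overall_rt emits, as a standalone sketch: quantiles of the final-day national R_t, transposed into one spreadsheet row (toy data; to_excel needs an engine such as openpyxl installed):

    import numpy as np
    import xarray

    r_t = xarray.DataArray(
        np.random.normal(1.0, 0.1, size=(100, 30)),  # iteration x time
        dims=["iteration", "time"],
        name="R_t",
    )
    q = np.arange(0.05, 1.0, 0.05)
    quantiles = r_t.isel(time=-1).quantile(q=q)  # 5%, 10%, ..., 95% over draws
    quantiles.to_dataframe().T.to_excel("national_rt.xlsx")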
if __name__ == "__main__":
@@ -30,10 +31,11 @@ if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
"input_file",
description="The input .pkl file containing the next generation matrix",
type=str,
help="The input .pkl file containing the next generation matrix",
)
parser.add_argument(
"output_file", description="The name of the output .xlsx file"
"output_file", type=str, help="The name of the output .xlsx file"
)
args = parser.parse_args()
...
@@ -10,7 +10,9 @@ from covid.util import copy_nc_attrs
from gemlib.util import compute_state
-def predicted_incidence(posterior_samples, covar_data, init_step, num_steps):
+def predicted_incidence(
+    posterior_samples, init_state, covar_data, init_step, num_steps
+):
"""Runs the simulation forward in time from `init_state` at time `init_time`
for `num_steps`.
:param posterior_samples: a dictionary of posterior model parameter samples
@@ -21,48 +23,37 @@ def predicted_incidence(posterior_samples, covar_data, init_step, num_steps):
transitions
"""
-@tf.function
-def sim_fn(args):
-    beta1_, beta2_, sigma_, xi_, gamma0_, gamma1_, init_ = args
-    par = dict(
-        beta1=beta1_,
-        beta2=beta2_,
-        sigma=sigma_,
-        xi=xi_,
-        gamma0=gamma0_,
-        gamma1=gamma1_,
-    )
-    model = model_spec.CovidUK(
-        covar_data,
-        initial_state=init_,
-        initial_step=init_step,
-        num_steps=num_steps,
-    )
-    sim = model.sample(**par)
-    return sim["seir"]
-
posterior_state = compute_state(
-    posterior_samples["init_state"],
+    init_state,
    posterior_samples["seir"],
    model_spec.STOICHIOMETRY,
)
-init_state = posterior_state[..., init_step, :]
-events = tf.map_fn(
-    sim_fn,
-    elems=(
-        posterior_samples["beta1"],
-        posterior_samples["beta2"],
-        posterior_samples["sigma"],
-        posterior_samples["xi"],
-        posterior_samples["gamma0"],
-        posterior_samples["gamma1"],
-        init_state,
-    ),
-    fn_output_signature=(tf.float64),
-)
-return init_state, events
+posterior_samples["new_init_state"] = posterior_state[..., init_step, :]
+del posterior_samples["seir"]
+
+@tf.function
+def do_sim():
+    def sim_fn(args):
+        par = tf.nest.pack_sequence_as(posterior_samples, args)
+        init_ = par["new_init_state"]
+        del par["new_init_state"]
+
+        model = model_spec.CovidUK(
+            covar_data,
+            initial_state=init_,
+            initial_step=init_step,
+            num_steps=num_steps,
+        )
+        sim = model.sample(**par)
+        return sim["seir"]
+
+    return tf.map_fn(
+        sim_fn,
+        elems=tf.nest.flatten(posterior_samples),
+        fn_output_signature=(tf.float64),
+    )
+
+return posterior_samples["new_init_state"], do_sim()
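The same tf.nest idiom as in calc_posterior_rit, with one twist: the per-draw initial state is stored inside posterior_samples (as new_init_state) so that flatten and pack_sequence_as see a single consistent structure, and fn_output_signature declares the output dtype since it differs in shape from the inputs. A toy sketch:

    import tensorflow as tf

    draws = {
        "beta": tf.constant([0.2, 0.3], dtype=tf.float64),    # 2 posterior draws
        "new_init_state": tf.ones([2, 3], dtype=tf.float64),  # 2 draws x 3 classes
    }

    def sim_one(args):
        par = tf.nest.pack_sequence_as(draws, args)
        init_ = par.pop("new_init_state")  # split state from model parameters
        return init_ * par["beta"]         # stand-in for model.sample(**par)

    events = tf.map_fn(
        sim_one, elems=tf.nest.flatten(draws), fn_output_signature=tf.float64
    )
    print(events.shape)  # (2, 3)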
def read_pkl(filename):
@@ -74,7 +65,10 @@ def predict(data, posterior_samples, output_file, initial_step, num_steps):
covar_data = xarray.open_dataset(data, group="constant_data")
cases = xarray.open_dataset(data, group="observations")
samples = read_pkl(posterior_samples)
+initial_state = samples["initial_state"]
+del samples["initial_state"]
if initial_step < 0:
initial_step = samples["seir"].shape[-2] + initial_step
@@ -87,11 +81,11 @@ def predict(data, posterior_samples, output_file, initial_step, num_steps):
)
estimated_init_state, predicted_events = predicted_incidence(
-    samples, covar_data, initial_step, num_steps
+    samples, initial_state, covar_data, initial_step, num_steps
)
prediction = xarray.DataArray(
-    predicted_events,
+    predicted_events.numpy(),
coords=[
np.arange(predicted_events.shape[0]),
covar_data.coords["location"],
@@ -101,7 +95,7 @@ def predict(data, posterior_samples, output_file, initial_step, num_steps):
dims=("iteration", "location", "time", "event"),
)
estimated_init_state = xarray.DataArray(
-    estimated_init_state,
+    estimated_init_state.numpy(),
coords=[
np.arange(estimated_init_state.shape[0]),
covar_data.coords["location"],
@@ -123,23 +117,21 @@ if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
-    "-i", "--initial-step", type=int, default=0, description="Initial step"
+    "-i", "--initial-step", type=int, default=0, help="Initial step"
)
parser.add_argument(
-    "-n", "--num-steps", type=int, default=1, description="Number of steps"
+    "-n", "--num-steps", type=int, default=1, help="Number of steps"
)
-parser.add_argument(
-    "data_pkl", type=str, description="Covariate data pickle"
-)
+parser.add_argument("data_pkl", type=str, help="Covariate data pickle")
parser.add_argument(
    "posterior_samples_pkl",
    type=str,
-    description="Posterior samples pickle",
+    help="Posterior samples pickle",
)
parser.add_argument(
    "output_file",
    type=str,
-    description="Output pkl file",
+    help="Output pkl file",
)
args = parser.parse_args()
...
@@ -20,14 +20,14 @@ def rt(input_file, output_file):
:param output_file: a .csv of mean (ci) values
"""
-ngm = xarray.open_dataset(input_file, group="posterior_predictive")["ngm"]
+r_it = xarray.open_dataset(input_file, group="posterior_predictive")["R_it"]

-rt = np.sum(ngm, axis=-2)
+rt = r_it.isel(time=-1).drop("time")
rt_summary = mean_and_ci(rt, name="Rt")
exceed = np.mean(rt > 1.0, axis=0)

rt_summary = pd.DataFrame(
-    rt_summary, index=pd.Index(ngm.coords["dest"], name="location")
+    rt_summary, index=pd.Index(r_it.coords["location"], name="location")
)
rt_summary["Rt_exceed"] = exceed
rt_summary.to_csv(output_file)
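A sketch of the per-location table this task writes; mean_and_ci is a repository helper, approximated below by a posterior mean with a 95% credible interval, alongside the exceedance probability Pr(Rt > 1):

    import numpy as np
    import pandas as pd
    import xarray

    r_it = xarray.DataArray(
        np.random.normal(1.0, 0.2, size=(100, 5)),  # iteration x location
        dims=["iteration", "location"],
        coords={"location": list("ABCDE")},
    )
    rt = r_it  # the real task takes the final-day slice: r_it.isel(time=-1)
    summary = pd.DataFrame(
        {
            "Rt": rt.mean(dim="iteration").values,
            "Rt_2.5%": rt.quantile(0.025, dim="iteration").values,
            "Rt_97.5%": rt.quantile(0.975, dim="iteration").values,
            "Rt_exceed": (rt > 1.0).mean(dim="iteration").values,  # Pr(Rt > 1)
        },
        index=pd.Index(rt.coords["location"].values, name="location"),
    )
    print(summary)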
...
@@ -147,14 +147,12 @@ def summary_longformat(input_files, output_file):
df = pd.concat([df, prev_df], axis="index")
# Rt
-ngms = xarray.load_dataset(input_files[4], group="posterior_predictive")[
-    "ngm"
-]
-rt = ngms.sum(dim="dest")
-rt = rt.rename({"src": "location"})
-rt_summary = xarray2summarydf(rt)
+rt = xarray.load_dataset(input_files[4], group="posterior_predictive")[
+    "R_it"
+]
+rt_summary = xarray2summarydf(rt.isel(time=-1))
rt_summary["value_name"] = "R"
-rt_summary["time"] = cases.coords["time"].data[-1] + np.timedelta64(1, "D")
+rt_summary["time"] = rt.coords["time"].data[-1] + np.timedelta64(1, "D")
df = pd.concat([df, rt_summary], axis="index")
quantiles = df.columns[df.columns.str.startswith("0.")]
...
@@ -6,24 +6,12 @@ import pickle as pkl
def thin_posterior(input_file, output_file, config):
-thin_idx = slice(config["start"], config["end"], config["by"])
+idx = slice(config["start"], config["end"], config["by"])
f = h5py.File(input_file, "r", rdcc_nbytes=1024 ** 3, rdcc_nslots=1e6)
-output_dict = dict(
-    beta1=f["samples/beta1"][thin_idx],
-    beta2=f["samples/beta2"][thin_idx],
-    beta3=f["samples/beta3"][thin_idx],
-    sigma=f["samples/sigma"][thin_idx],
-    xi=f["samples/xi"][thin_idx],
-    gamma0=f["samples/gamma0"][thin_idx],
-    gamma1=f["samples/gamma1"][thin_idx],
-    seir=f["samples/events"][thin_idx],
-    init_state=f["initial_state"][:],
-)
+output_dict = {k: v[idx] for k, v in f["samples"].items()}
+output_dict["initial_state"] = f["initial_state"][:]
f.close()
with open(output_file, "wb") as f:
@@ -36,26 +24,20 @@ def thin_posterior(input_file, output_file, config):
if __name__ == "__main__":
import yaml
-import os
-from covid.cli_arg_parse import cli_args
-
-args = cli_args()
-
-with open(args.config, "r") as f:
-    config = yaml.load(f, Loader=yaml.FullLoader)
-
-def join_and_expand(prefix, filename):
-    return os.path.expandvars(os.path.join(prefix, filename))
-
-input_file = join_and_expand(
-    config["output"]["results_dir"], config["output"]["posterior"]
-)
-output_file = join_and_expand(
-    config["output"]["results_dir"], config["ThinPosterior"]["output_file"]
-)
-
-thin_idx = range(
-    config["ThinPosterior"]["thin_start"],
-    config["ThinPosterior"]["thin_end"],
-    config["ThinPosterior"]["thin_by"],
-)
-thin_posterior(input_file, output_file, thin_idx)
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-c", "--config", type=str, help="Configuration file", required=True
+)
+parser.add_argument(
+    "-o", "--output", type=str, help="Output pkl file", required=True
+)
+parser.add_argument("samples", type=str, help="Posterior HDF5 file")
+args = parser.parse_args()
+
+with open(args.config, "r") as f:
+    config = yaml.load(f, Loader=yaml.FullLoader)
+
+print("Config: ", config["ThinPosterior"])
+thin_posterior(args.samples, args.output, config["ThinPosterior"])
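The rewritten task applies one slice to every dataset under /samples rather than naming each variable. A self-contained sketch with a toy HDF5 file (file names are placeholders):

    import pickle as pkl
    import h5py
    import numpy as np

    with h5py.File("toy_posterior.h5", "w") as f:
        f["samples/beta1"] = np.random.rand(1000)
        f["samples/events"] = np.random.rand(1000, 10, 4)
        f["initial_state"] = np.random.rand(10, 4)

    config = {"start": 0, "end": 1000, "by": 10}  # keep every 10th draw
    idx = slice(config["start"], config["end"], config["by"])

    with h5py.File("toy_posterior.h5", "r") as f:
        thinned = {k: v[idx] for k, v in f["samples"].items()}
        thinned["initial_state"] = f["initial_state"][:]

    with open("thinned.pkl", "wb") as out:
        pkl.dump(thinned, out)
    print({k: v.shape for k, v in thinned.items()})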
@@ -68,15 +68,15 @@ def within_between(input_files, output_file):
with open(input_files[1], "rb") as f:
samples = pkl.load(f)