Commit f42a20f2 authored by Chris Jewell

Tidy up and fix old dependencies

parent c9415c4d
@@ -6,6 +6,9 @@ from covid19uk.posterior.thin import thin_posterior
 from covid19uk.posterior.reproduction_number import reproduction_number
 from covid19uk.posterior.predict import predict
 from covid19uk.posterior.within_between import within_between
+from covid19uk.version import version

+__version__ = version()
+
 __all__ = [
     "assemble_data",
@@ -14,4 +17,5 @@ __all__ = [
     "reproduction_number",
     "predict",
     "within_between",
+    "version",
 ]
"""Covid data adaptors and support code""" """Covid data adaptors and support code"""
from covid.data.data import ( from covid19uk.data.loaders import (
read_mobility, read_mobility,
read_population, read_population,
read_traffic_flow, read_traffic_flow,
) )
from covid.data.tiers import TierData from covid19uk.data.tiers import TierData
from covid.data.area_code import AreaCodeData from covid19uk.data.area_code import AreaCodeData
from covid.data.case_data import CasesData from covid19uk.data.case_data import CasesData
__all__ = [ __all__ = [
"TierData", "TierData",
......
"""Retrieves LAD19 area codes""" """Retrieves LAD19 area codes"""
import pandas as pd
import requests
from http import HTTPStatus from http import HTTPStatus
import json import json
from covid.data.util import ( import pandas as pd
import requests
from covid19uk.data.util import (
merge_lad_codes, merge_lad_codes,
check_lad19cd_format, check_lad19cd_format,
invalidInput, invalidInput,
format_output_filename,
) )
......
@@ -4,10 +4,9 @@ import time
 from warnings import warn

 import requests
 import json
-import numpy as np
 import pandas as pd

-from covid.data.util import (
+from covid19uk.data.util import (
     invalidInput,
     get_date_low_high,
     check_date_bounds,
@@ -15,7 +14,7 @@ from covid.data.util import (
     check_lad19cd_format,
     merge_lad_codes,
 )
-from covid.data import AreaCodeData
+from covid19uk.data import AreaCodeData


 class CasesData:
@@ -124,12 +123,7 @@ class CasesData:
         if settings["format"].lower() == "phe":
             df = CasesData.adapt_phe(
-                df,
-                date_low,
-                date_high,
-                pillars,
-                measure,
-                areacodes,
+                df, date_low, date_high, pillars, measure, areacodes,
             )
         elif settings["format"].lower() == "gov":
             df = CasesData.adapt_gov_api(
@@ -164,7 +158,7 @@ class CasesData:
         Adapt the line listing data to the desired dataframe format.
         """
         df = df[["pillar", "LTLA_code", "specimen_date", "lab_report_date"]]

         # Clean missing values
         df.dropna(inplace=True)
         df = df.rename(columns={"LTLA_code": "lad19cd"})
...
"""Methods to read in COVID-19 data and output
well-known formats"""
from warnings import warn
import numpy as np
import xarray
import pandas as pd
__all__ = [
"read_mobility",
"read_population",
"read_traffic_flow",
"read_phe_cases",
]
def read_mobility(path, locations=None):
"""Reads in CSV with mobility matrix.
CSV format: <To>,<id>,<id>,....
<id>,<val>,<val>,...
...
:param path: path to CSV file
:param locations: a list of locations to use
    :returns: an xarray.DataArray of flows, sorted by <id> on both rows and cols.
"""
mobility = pd.read_csv(path)
mobility = mobility.rename(columns={"From": "src", "To": "dest"})
if locations is not None:
mobility = mobility[
mobility["src"].isin(locations) & mobility["dest"].isin(locations)
]
mobility = mobility.sort_values(["src", "dest"])
mobility = (
mobility.groupby(["src", "dest"]).agg({"Flow": sum}).reset_index()
)
mob_matrix = mobility.pivot(index="dest", columns="src", values="Flow")
mob_matrix[mob_matrix.isna()] = 0.0
return xarray.DataArray(
mob_matrix, dims=["location_dest", "location_src"], name="mobility"
)
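# A minimal usage sketch (hypothetical file path and LAD codes, assuming a CSV
# with "From", "To" and "Flow" columns as described above):
#
#     mob = read_mobility("mobility.csv", locations=["E06000001", "E06000002"])
#     mob.sel(location_src="E06000001", location_dest="E06000002")  # flow E06000001 -> E06000002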
def read_population(path, locations=None):
"""Reads population CSV
:param path: CSV file
:param locations: locations to use
    :returns: an xarray.DataArray of population counts, indexed by location
"""
pop = pd.read_csv(path, index_col="lad19cd")
if locations is not None:
pop = pop[pop.index.isin(locations)]
pop = pop.sum(axis=1)
pop = pop.sort_index()
pop.name = "population"
pop.index.name = "location"
return xarray.DataArray(pop)
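# Usage sketch (hypothetical path and codes): the CSV is assumed to carry a
# "lad19cd" index column plus one or more age-band columns, which are summed
# into a single population count per location:
#
#     pop = read_population("population.csv", locations=["E06000001", "E06000002"])
#     float(pop.sum())  # total population over the selected locations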
def read_traffic_flow(
path: str, date_low: np.datetime64, date_high: np.datetime64
):
"""Read traffic flow data, returning a timeseries between dates.
    :param path: path to a traffic flow spreadsheet with <Date> and <Cars>
        columns; if None, a constant unit flow is returned
    :param date_low: start date (inclusive)
    :param date_high: end date (exclusive)
    :returns: an xarray.DataArray timeseries of relative flow
"""
if path is None:
dates = np.arange(date_low, date_high, np.timedelta64(1, "D"))
return xarray.DataArray(
np.ones(dates.shape[0], np.float64),
name="flow",
dims=["date"],
coords=[dates],
)
commute_raw = pd.read_excel(
path, index_col="Date", skiprows=5, usecols=["Date", "Cars"]
)
commute_raw.index = pd.to_datetime(commute_raw.index, format="%Y-%m-%d")
commute_raw.sort_index(axis=0, inplace=True)
commute = pd.DataFrame(
index=np.arange(date_low, date_high, np.timedelta64(1, "D"))
)
commute = commute.merge(
commute_raw, left_index=True, right_index=True, how="left"
)
commute[commute.index < commute_raw.index[0]] = commute_raw.iloc[0, 0]
commute[commute.index > commute_raw.index[-1]] = commute_raw.iloc[-1, 0]
commute["Cars"] = commute["Cars"] / 100.0
commute.columns = ["flow"]
return xarray.DataArray(commute)
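# Usage sketch (hypothetical dates). Passing path=None falls back to a
# constant unit flow for every day in [date_low, date_high):
#
#     flow = read_traffic_flow(
#         None, np.datetime64("2020-10-01"), np.datetime64("2020-10-08")
#     )
#     flow.shape  # -> (7,)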
def _merge_ltla(series):
london = ["E09000001", "E09000033"]
corn_scilly = ["E06000052", "E06000053"]
series.loc[series.isin(london)] = ",".join(london)
series.loc[series.isin(corn_scilly)] = ",".join(corn_scilly)
return series
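# Sketch of the merge: City of London (E09000001) and Westminster (E09000033)
# are pooled into one unit, as are Cornwall (E06000052) and the Isles of
# Scilly (E06000053); all other codes pass through unchanged:
#
#     _merge_ltla(pd.Series(["E09000001", "E06000001"]))
#     # -> ["E09000001,E09000033", "E06000001"]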
def read_phe_cases(
path, date_low, date_high, pillar="both", date_type="specimen", ltlas=None
):
"""Reads a PHE Anonymised Line Listing for dates in [low_date, high_date)
:param path: path to PHE Anonymised Line Listing Data
:param low_date: lower date bound
:param high_date: upper date bound
:returns: a Pandas data frame of LTLAs x dates
"""
date_type_map = {"specimen": "specimen_date", "report": "lab_report_date"}
pillar_map = {"both": None, "1": "Pillar 1", "2": "Pillar 2"}
line_listing = pd.read_csv(
path, usecols=[date_type_map[date_type], "LTLA_code", "pillar"]
)[[date_type_map[date_type], "LTLA_code", "pillar"]]
line_listing.columns = ["date", "lad19cd", "pillar"]
line_listing["lad19cd"] = _merge_ltla(line_listing["lad19cd"])
# Select dates
line_listing["date"] = pd.to_datetime(
line_listing["date"], format="%d/%m/%Y"
)
line_listing = line_listing[
(date_low <= line_listing["date"]) & (line_listing["date"] < date_high)
]
# Choose pillar
if pillar_map[pillar] is not None:
line_listing = line_listing.loc[
line_listing["pillar"] == pillar_map[pillar]
]
# Drop na rows
orig_len = line_listing.shape[0]
line_listing = line_listing.dropna(axis=0)
warn(
f"Removed {orig_len - line_listing.shape[0]} rows of {orig_len} \
due to missing values ({100. * (orig_len - line_listing.shape[0])/orig_len}%)"
)
# Aggregate by date/region
case_counts = line_listing.groupby(["date", "lad19cd"]).size()
case_counts.name = "count"
# Re-index
dates = pd.date_range(date_low, date_high, closed="left")
if ltlas is None:
ltlas = case_counts.index.levels[1]
index = pd.MultiIndex.from_product(
[dates, ltlas], names=["date", "lad19cd"]
)
case_counts = case_counts.reindex(index, fill_value=0)
return case_counts.reset_index().pivot(
index="lad19cd", columns="date", values="count"
)
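# Usage sketch (hypothetical path; the PHE line listing itself is not a
# public file), yielding an LTLA x date matrix of daily case counts:
#
#     cases = read_phe_cases(
#         "line_listing.csv",
#         date_low=np.datetime64("2020-10-01"),
#         date_high=np.datetime64("2020-11-01"),
#         pillar="both",
#     )
#     cases.shape  # -> (n_LTLAs, 31)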
def read_tier_restriction_data(
tier_restriction_csv, lad19cd_lookup, date_low, date_high
):
data = pd.read_csv(tier_restriction_csv)
data.loc[:, "date"] = pd.to_datetime(data["date"])
# Group merged ltlas
london = ["City of London", "Westminster"]
corn_scilly = ["Cornwall", "Isles of Scilly"]
data.loc[data["ltla"].isin(london), "ltla"] = ":".join(london)
data.loc[data["ltla"].isin(corn_scilly), "ltla"] = ":".join(corn_scilly)
# Fix up dodgy names
data.loc[
data["ltla"] == "Blackburn With Darwen", "ltla"
] = "Blackburn with Darwen"
# Merge
data = lad19cd_lookup.merge(
data, how="left", left_on="lad19nm", right_on="ltla"
)
# Re-index
data.index = pd.MultiIndex.from_frame(data[["date", "lad19cd"]])
data = data[["tier_2", "tier_3", "national_lockdown"]]
data = data[~data.index.duplicated()]
dates = pd.date_range(date_low, date_high - pd.Timedelta(1, "D"))
lad19cd = lad19cd_lookup["lad19cd"].sort_values().unique()
new_index = pd.MultiIndex.from_product([dates, lad19cd])
data = data.reindex(new_index, fill_value=0.0)
warn(f"Tier summary: {np.mean(data, axis=0)}")
# Pack into [T, M, V] array.
arr_data = data.to_xarray().to_array()
return np.transpose(arr_data, axes=[1, 2, 0])
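# The result is packed as a [T, M, V] array: T dates, M LTLAs, and V=3
# indicator variables ("tier_2", "tier_3", "national_lockdown"). A
# hypothetical call:
#
#     tiers = read_tier_restriction_data(
#         "tiers.csv", lad19cd_lookup, date_low, date_high
#     )
#     tiers.shape  # -> (n_days, n_LTLAs, 3)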
def read_challen_tier_restriction(tier_restriction_csv, date_low, date_high):
tiers = pd.read_csv(tier_restriction_csv)
tiers["date"] = pd.to_datetime(tiers["date"], format="%Y-%m-%d")
tiers["code"] = _merge_ltla(tiers["code"])
# Separate out December tiers
    date_mask = tiers["date"] > np.datetime64("2020-12-02")
    tiers.loc[date_mask & (tiers["tier"] == "three"), "tier"] = "dec_three"
    tiers.loc[date_mask & (tiers["tier"] == "two"), "tier"] = "dec_two"
    tiers.loc[date_mask & (tiers["tier"] == "one"), "tier"] = "dec_one"
index = pd.MultiIndex.from_frame(tiers[["date", "code", "tier"]])
index = index.sort_values()
index = index[~index.duplicated()]
ser = pd.Series(1.0, index=index, name="value")
ser = ser[date_low : (date_high - np.timedelta64(1, "D"))]
xarr = ser.to_xarray()
xarr.data[np.isnan(xarr.data)] = 0.0
return xarr.loc[..., ["two", "three", "dec_two", "dec_three"]]
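# Usage sketch (hypothetical path): tiers dated after 2020-12-02 are
# relabelled "dec_*", so pre- and post-December restrictions enter the model
# as separate indicator variables:
#
#     xarr = read_challen_tier_restriction("challen_tiers.csv", date_low, date_high)
#     list(xarr.coords["tier"].values)  # -> ["two", "three", "dec_two", "dec_three"]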
@@ -3,6 +3,7 @@ well-known formats"""
 from warnings import warn

 import numpy as np
+import xarray
 import pandas as pd

 __all__ = [
@@ -13,37 +14,47 @@ __all__ = [
 ]

-def read_mobility(path):
+def read_mobility(path, locations=None):
     """Reads in CSV with mobility matrix.

     CSV format: <To>,<id>,<id>,....
                 <id>,<val>,<val>,...
                 ...

+    :param path: path to CSV file
+    :param locations: a list of locations to use
     :returns: a numpy matrix sorted by <id> on both rows and cols.
     """
     mobility = pd.read_csv(path)
-    mobility = mobility[
-        mobility["From"].str.startswith("E")
-        & mobility["To"].str.startswith("E")
-    ]
-    mobility = mobility.sort_values(["From", "To"])
-    mobility = mobility.groupby(["From", "To"]).agg({"Flow": sum}).reset_index()
-    mob_matrix = mobility.pivot(index="To", columns="From", values="Flow")
+    mobility = mobility.rename(columns={"From": "src", "To": "dest"})
+    if locations is not None:
+        mobility = mobility[
+            mobility["src"].isin(locations) & mobility["dest"].isin(locations)
+        ]
+    mobility = mobility.sort_values(["src", "dest"])
+    mobility = (
+        mobility.groupby(["src", "dest"]).agg({"Flow": sum}).reset_index()
+    )
+    mob_matrix = mobility.pivot(index="dest", columns="src", values="Flow")
     mob_matrix[mob_matrix.isna()] = 0.0
-    return mob_matrix
+    return xarray.DataArray(
+        mob_matrix, dims=["location_dest", "location_src"], name="mobility"
+    )

-def read_population(path):
+def read_population(path, locations=None):
     """Reads population CSV
+
+    :param path: CSV file
+    :param locations: locations to use
     :returns: a pandas Series indexed by LTLAs
     """
     pop = pd.read_csv(path, index_col="lad19cd")
-    pop = pop[pop.index.str.startswith("E")]
+    if locations is not None:
+        pop = pop[pop.index.isin(locations)]
     pop = pop.sum(axis=1)
     pop = pop.sort_index()
-    pop.name = "n"
-    return pop
+    pop.name = "population"
+    pop.index.name = "location"
+    return xarray.DataArray(pop)

 def read_traffic_flow(
@@ -53,6 +64,15 @@ def read_traffic_flow(
     :param path: path to a traffic flow CSV with <date>,<Car> columns
     :returns: a Pandas timeseries
     """
+    if path is None:
+        dates = np.arange(date_low, date_high, np.timedelta64(1, "D"))
+        return xarray.DataArray(
+            np.ones(dates.shape[0], np.float64),
+            name="flow",
+            dims=["date"],
+            coords=[dates],
+        )
+
     commute_raw = pd.read_excel(
         path, index_col="Date", skiprows=5, usecols=["Date", "Cars"]
     )
@@ -67,8 +87,8 @@ def read_traffic_flow(
     commute[commute.index < commute_raw.index[0]] = commute_raw.iloc[0, 0]
     commute[commute.index > commute_raw.index[-1]] = commute_raw.iloc[-1, 0]
     commute["Cars"] = commute["Cars"] / 100.0
-    commute.columns = ["percent"]
-    return commute
+    commute.columns = ["flow"]
+    return xarray.DataArray(commute)

 def _merge_ltla(series):
...
@@ -3,8 +3,8 @@
 import numpy as np
 import pandas as pd

-from covid.data.area_code import AreaCodeData
-from covid.data.util import get_date_low_high, invalidInput, merge_lad_codes
+from covid19uk.data.area_code import AreaCodeData
+from covid19uk.data.util import get_date_low_high, invalidInput, merge_lad_codes

 class TierData:
@@ -112,18 +112,9 @@ class TierData:
         # Separate out December tiers
         date_mask = tiers["date"] > np.datetime64("2020-12-02")
-        tiers.loc[
-            date_mask & (tiers["tier"] == "three"),
-            "tier",
-        ] = "dec_three"
-        tiers.loc[
-            date_mask & (tiers["tier"] == "two"),
-            "tier",
-        ] = "dec_two"
-        tiers.loc[
-            date_mask & (tiers["tier"] == "one"),
-            "tier",
-        ] = "dec_one"
+        tiers.loc[date_mask & (tiers["tier"] == "three"), "tier",] = "dec_three"
+        tiers.loc[date_mask & (tiers["tier"] == "two"), "tier",] = "dec_two"
+        tiers.loc[date_mask & (tiers["tier"] == "one"), "tier",] = "dec_one"

         # filter down to the lads
         if len(lads) > 0:
...
@@ -18,11 +18,13 @@ from tensorflow_probability.python.experimental.stats import sample_stats
 from gemlib.util import compute_state
 from gemlib.mcmc import Posterior
 from gemlib.mcmc import GibbsKernel
+from gemlib.distributions import BrownianMotion

-from covid19uk.tasks.mcmc_kernel_factory import make_hmc_base_kernel
-from covid19uk.tasks.mcmc_kernel_factory import make_hmc_fast_adapt_kernel
-from covid19uk.tasks.mcmc_kernel_factory import make_hmc_slow_adapt_kernel
-from covid19uk.tasks.mcmc_kernel_factory import make_event_multiscan_gibbs_step
+from covid19uk.inference.mcmc_kernel_factory import make_hmc_base_kernel
+from covid19uk.inference.mcmc_kernel_factory import make_hmc_fast_adapt_kernel
+from covid19uk.inference.mcmc_kernel_factory import make_hmc_slow_adapt_kernel
+from covid19uk.inference.mcmc_kernel_factory import (
+    make_event_multiscan_gibbs_step,
+)
 import covid19uk.model_spec as model_spec
...
@@ -8,10 +8,14 @@ import tensorflow as tf
 import tensorflow_probability as tfp

 from gemlib.distributions import DiscreteTimeStateTransitionModel
+from gemlib.distributions import BrownianMotion

 from covid19uk.util import impute_previous_cases
-import covid19uk.data.loaders as data
+from covid19uk.data import AreaCodeData
+from covid19uk.data import CasesData
+from covid19uk.data import read_mobility
+from covid19uk.data import read_population
+from covid19uk.data import read_traffic_flow

 tfd = tfp.distributions
@@ -33,24 +37,22 @@ def gather_data(config):
     date_low = np.datetime64(config["date_range"][0])
     date_high = np.datetime64(config["date_range"][1])
-    locations = data.AreaCodeData.process(config)
-    mobility = data.read_mobility(
-        config["mobility_matrix"], locations["lad19cd"]
-    )
-    popsize = data.read_population(
-        config["population_size"], locations["lad19cd"]
-    )
-    commute_volume = data.read_traffic_flow(
+    locations = AreaCodeData.process(config)
+    mobility = read_mobility(config["mobility_matrix"], locations["lad19cd"])
+    popsize = read_population(config["population_size"], locations["lad19cd"])
+    commute_volume = read_traffic_flow(
         config["commute_volume"], date_low=date_low, date_high=date_high
     )

     geo = gp.read_file(config["geopackage"])
     geo = geo.sort_values("lad19cd")
     geo = geo[geo["lad19cd"].isin(locations["lad19cd"])]
     area = xarray.DataArray(
         geo.area, name="area", dims=["location"], coords=[geo["lad19cd"]],