Commit f42a20f2 authored by Chris Jewell

Tidy up and fix old dependencies

parent c9415c4d
@@ -6,6 +6,9 @@ from covid19uk.posterior.thin import thin_posterior
 from covid19uk.posterior.reproduction_number import reproduction_number
 from covid19uk.posterior.predict import predict
 from covid19uk.posterior.within_between import within_between
+from covid19uk.version import version
+
+__version__ = version()

 __all__ = [
     "assemble_data",
@@ -14,4 +17,5 @@ __all__ = [
     "reproduction_number",
     "predict",
     "within_between",
+    "version",
 ]
"""Covid data adaptors and support code"""
from covid.data.data import (
from covid19uk.data.loaders import (
read_mobility,
read_population,
read_traffic_flow,
)
from covid.data.tiers import TierData
from covid.data.area_code import AreaCodeData
from covid.data.case_data import CasesData
from covid19uk.data.tiers import TierData
from covid19uk.data.area_code import AreaCodeData
from covid19uk.data.case_data import CasesData
__all__ = [
"TierData",
......
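In practice this hunk just moves the public import surface from the old covid namespace to covid19uk. A minimal usage sketch of the new surface (assuming the package is installed and the truncated __all__ re-exports the loader functions alongside the data classes, as the imports in this hunk and in model_spec.py below suggest):

# Hypothetical downstream usage after the rename.
from covid19uk.data import TierData, AreaCodeData, CasesData
from covid19uk.data import read_mobility, read_population, read_traffic_flow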
"""Retrieves LAD19 area codes"""
import pandas as pd
import requests
from http import HTTPStatus
import json
from covid.data.util import (
import pandas as pd
import requests
from covid19uk.data.util import (
merge_lad_codes,
check_lad19cd_format,
invalidInput,
format_output_filename,
)
......
@@ -4,10 +4,9 @@ import time
 from warnings import warn

 import requests
 import json
-import numpy as np
 import pandas as pd

-from covid.data.util import (
+from covid19uk.data.util import (
     invalidInput,
     get_date_low_high,
     check_date_bounds,
@@ -15,7 +14,7 @@ from covid.data.util import (
     check_lad19cd_format,
     merge_lad_codes,
 )
-from covid.data import AreaCodeData
+from covid19uk.data import AreaCodeData


 class CasesData:
@@ -124,12 +123,7 @@ class CasesData:
         if settings["format"].lower() == "phe":
             df = CasesData.adapt_phe(
-                df,
-                date_low,
-                date_high,
-                pillars,
-                measure,
-                areacodes,
+                df, date_low, date_high, pillars, measure, areacodes,
             )
         elif settings["format"].lower() == "gov":
             df = CasesData.adapt_gov_api(
@@ -164,7 +158,7 @@ class CasesData:
         Adapt the line listing data to the desired dataframe format.
         """
         df = df[["pillar", "LTLA_code", "specimen_date", "lab_report_date"]]

         # Clean missing values
         df.dropna(inplace=True)
         df = df.rename(columns={"LTLA_code": "lad19cd"})
......
"""Methods to read in COVID-19 data and output
well-known formats"""
from warnings import warn
import numpy as np
import xarray
import pandas as pd
__all__ = [
"read_mobility",
"read_population",
"read_traffic_flow",
"read_phe_cases",
]
def read_mobility(path, locations=None):
    """Reads in CSV with mobility matrix.

    CSV format: <To>,<id>,<id>,....
                <id>,<val>,<val>,...
                ...

    :param path: path to CSV file
    :param locations: a list of locations to use
    :returns: a numpy matrix sorted by <id> on both rows and cols.
    """
    mobility = pd.read_csv(path)
    mobility = mobility.rename(columns={"From": "src", "To": "dest"})
    if locations is not None:
        mobility = mobility[
            mobility["src"].isin(locations) & mobility["dest"].isin(locations)
        ]
    mobility = mobility.sort_values(["src", "dest"])
    mobility = (
        mobility.groupby(["src", "dest"]).agg({"Flow": sum}).reset_index()
    )
    mob_matrix = mobility.pivot(index="dest", columns="src", values="Flow")
    mob_matrix[mob_matrix.isna()] = 0.0
    return xarray.DataArray(
        mob_matrix, dims=["location_dest", "location_src"], name="mobility"
    )
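A hedged usage sketch of read_mobility as defined above; the CSV path and LAD codes are hypothetical:

from covid19uk.data.loaders import read_mobility

# Hypothetical file and codes: restricts the matrix to two LADs and
# returns a dense [dest, src] DataArray with missing flows set to 0.0.
flows = read_mobility("mobility.csv", locations=["E06000001", "E06000002"])
print(flows.dims)   # ('location_dest', 'location_src')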
def read_population(path, locations=None):
    """Reads population CSV

    :param path: CSV file
    :param locations: locations to use
    :returns: a pandas Series indexed by LTLAs
    """
    pop = pd.read_csv(path, index_col="lad19cd")
    if locations is not None:
        pop = pop[pop.index.isin(locations)]
    pop = pop.sum(axis=1)
    pop = pop.sort_index()
    pop.name = "population"
    pop.index.name = "location"
    return xarray.DataArray(pop)
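A corresponding sketch for read_population, which now comes back as an xarray.DataArray named "population" over a "location" coordinate rather than a bare pandas Series (file name hypothetical):

from covid19uk.data.loaders import read_population

# Hypothetical CSV with a lad19cd index column; the remaining count
# columns are summed to a single population figure per location.
pop = read_population("population.csv", locations=["E06000001", "E06000002"])
print(pop.name, pop.dims)   # population ('location',)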
def read_traffic_flow(
    path: str, date_low: np.datetime64, date_high: np.datetime64
):
    """Read traffic flow data, returning a timeseries between dates.

    :param path: path to a traffic flow CSV with <date>,<Car> columns
    :returns: a Pandas timeseries
    """
    if path is None:
        dates = np.arange(date_low, date_high, np.timedelta64(1, "D"))
        return xarray.DataArray(
            np.ones(dates.shape[0], np.float64),
            name="flow",
            dims=["date"],
            coords=[dates],
        )

    commute_raw = pd.read_excel(
        path, index_col="Date", skiprows=5, usecols=["Date", "Cars"]
    )
    commute_raw.index = pd.to_datetime(commute_raw.index, format="%Y-%m-%d")
    commute_raw.sort_index(axis=0, inplace=True)
    commute = pd.DataFrame(
        index=np.arange(date_low, date_high, np.timedelta64(1, "D"))
    )
    commute = commute.merge(
        commute_raw, left_index=True, right_index=True, how="left"
    )
    commute[commute.index < commute_raw.index[0]] = commute_raw.iloc[0, 0]
    commute[commute.index > commute_raw.index[-1]] = commute_raw.iloc[-1, 0]
    commute["Cars"] = commute["Cars"] / 100.0
    commute.columns = ["flow"]
    return xarray.DataArray(commute)
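Worth noting: the path=None branch gives a constant unit flow over the half-open date range, so pipelines can run without a traffic-flow spreadsheet. A sketch:

import numpy as np
from covid19uk.data.loaders import read_traffic_flow

# With no file supplied, flow falls back to 1.0 for each day in
# [date_low, date_high).
flow = read_traffic_flow(
    None,
    date_low=np.datetime64("2020-10-01"),
    date_high=np.datetime64("2020-10-08"),
)
print(flow.shape)   # (7,)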
def _merge_ltla(series):
    london = ["E09000001", "E09000033"]
    corn_scilly = ["E06000052", "E06000053"]
    series.loc[series.isin(london)] = ",".join(london)
    series.loc[series.isin(corn_scilly)] = ",".join(corn_scilly)
    return series
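This private helper collapses jointly-reported LTLAs into composite codes; a quick sketch of its effect (imported here purely for illustration):

import pandas as pd
from covid19uk.data.loaders import _merge_ltla  # private; illustration only

codes = pd.Series(["E09000001", "E06000052", "E07000001"])
print(_merge_ltla(codes).tolist())
# ['E09000001,E09000033', 'E06000052,E06000053', 'E07000001']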
def read_phe_cases(
    path, date_low, date_high, pillar="both", date_type="specimen", ltlas=None
):
    """Reads a PHE Anonymised Line Listing for dates in [date_low, date_high)

    :param path: path to PHE Anonymised Line Listing Data
    :param date_low: lower date bound
    :param date_high: upper date bound
    :returns: a Pandas data frame of LTLAs x dates
    """
    date_type_map = {"specimen": "specimen_date", "report": "lab_report_date"}
    pillar_map = {"both": None, "1": "Pillar 1", "2": "Pillar 2"}

    line_listing = pd.read_csv(
        path, usecols=[date_type_map[date_type], "LTLA_code", "pillar"]
    )[[date_type_map[date_type], "LTLA_code", "pillar"]]
    line_listing.columns = ["date", "lad19cd", "pillar"]

    line_listing["lad19cd"] = _merge_ltla(line_listing["lad19cd"])

    # Select dates
    line_listing["date"] = pd.to_datetime(
        line_listing["date"], format="%d/%m/%Y"
    )
    line_listing = line_listing[
        (date_low <= line_listing["date"]) & (line_listing["date"] < date_high)
    ]

    # Choose pillar
    if pillar_map[pillar] is not None:
        line_listing = line_listing.loc[
            line_listing["pillar"] == pillar_map[pillar]
        ]

    # Drop NA rows
    orig_len = line_listing.shape[0]
    line_listing = line_listing.dropna(axis=0)
    warn(
        f"Removed {orig_len - line_listing.shape[0]} rows of {orig_len} \
due to missing values ({100. * (orig_len - line_listing.shape[0])/orig_len}%)"
    )

    # Aggregate by date/region
    case_counts = line_listing.groupby(["date", "lad19cd"]).size()
    case_counts.name = "count"

    # Re-index
    dates = pd.date_range(date_low, date_high, closed="left")
    if ltlas is None:
        ltlas = case_counts.index.levels[1]
    index = pd.MultiIndex.from_product(
        [dates, ltlas], names=["date", "lad19cd"]
    )
    case_counts = case_counts.reindex(index, fill_value=0)

    return case_counts.reset_index().pivot(
        index="lad19cd", columns="date", values="count"
    )
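A hedged usage sketch of read_phe_cases; the line-listing path is hypothetical, and the result is an LTLA-by-date matrix of case counts with missing combinations filled with zero:

import numpy as np
from covid19uk.data.loaders import read_phe_cases

# Hypothetical usage: pillar-2 counts by specimen date over October 2020.
cases = read_phe_cases(
    "anonymised_line_listing.csv",
    date_low=np.datetime64("2020-10-01"),
    date_high=np.datetime64("2020-11-01"),
    pillar="2",
    date_type="specimen",
)
print(cases.shape)   # (n_ltlas, 31)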
def read_tier_restriction_data(
    tier_restriction_csv, lad19cd_lookup, date_low, date_high
):
    data = pd.read_csv(tier_restriction_csv)
    data.loc[:, "date"] = pd.to_datetime(data["date"])

    # Group merged LTLAs
    london = ["City of London", "Westminster"]
    corn_scilly = ["Cornwall", "Isles of Scilly"]
    data.loc[data["ltla"].isin(london), "ltla"] = ":".join(london)
    data.loc[data["ltla"].isin(corn_scilly), "ltla"] = ":".join(corn_scilly)

    # Fix up dodgy names
    data.loc[
        data["ltla"] == "Blackburn With Darwen", "ltla"
    ] = "Blackburn with Darwen"

    # Merge
    data = lad19cd_lookup.merge(
        data, how="left", left_on="lad19nm", right_on="ltla"
    )

    # Re-index
    data.index = pd.MultiIndex.from_frame(data[["date", "lad19cd"]])
    data = data[["tier_2", "tier_3", "national_lockdown"]]
    data = data[~data.index.duplicated()]
    dates = pd.date_range(date_low, date_high - pd.Timedelta(1, "D"))
    lad19cd = lad19cd_lookup["lad19cd"].sort_values().unique()
    new_index = pd.MultiIndex.from_product([dates, lad19cd])
    data = data.reindex(new_index, fill_value=0.0)
    warn(f"Tier summary: {np.mean(data, axis=0)}")

    # Pack into [T, M, V] array.
    arr_data = data.to_xarray().to_array()
    return np.transpose(arr_data, axes=[1, 2, 0])
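The final transpose packs the result as [T, M, V]: T dates, M local authorities, V restriction indicators (tier_2, tier_3, national_lockdown). Shown in isolation, since Dataset.to_array() puts the variable axis first:

import numpy as np

# to_array() yields [V, T, M]; axes=[1, 2, 0] reorders to [T, M, V].
arr = np.zeros((3, 10, 5))                       # V=3, T=10 days, M=5 LTLAs
print(np.transpose(arr, axes=[1, 2, 0]).shape)   # (10, 5, 3)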
def read_challen_tier_restriction(tier_restriction_csv, date_low, date_high):

    tiers = pd.read_csv(tier_restriction_csv)
    tiers["date"] = pd.to_datetime(tiers["date"], format="%Y-%m-%d")
    tiers["code"] = _merge_ltla(tiers["code"])

    # Separate out December tiers
    tiers.loc[
        (tiers["date"] > np.datetime64("2020-12-02"))
        & (tiers["tier"] == "three"),
        "tier",
    ] = "dec_three"
    tiers.loc[
        (tiers["date"] > np.datetime64("2020-12-02"))
        & (tiers["tier"] == "two"),
        "tier",
    ] = "dec_two"
    tiers.loc[
        (tiers["date"] > np.datetime64("2020-12-02"))
        & (tiers["tier"] == "one"),
        "tier",
    ] = "dec_one"

    index = pd.MultiIndex.from_frame(tiers[["date", "code", "tier"]])
    index = index.sort_values()
    index = index[~index.duplicated()]
    ser = pd.Series(1.0, index=index, name="value")
    ser = ser[date_low : (date_high - np.timedelta64(1, "D"))]
    xarr = ser.to_xarray()
    xarr.data[np.isnan(xarr.data)] = 0.0
    return xarr.loc[..., ["two", "three", "dec_two", "dec_three"]]
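A sketch of the Challen tier reader's output: a (date, code, tier) indicator array, with the four tier levels of interest selected on the last axis (CSV path hypothetical):

import numpy as np
from covid19uk.data.loaders import read_challen_tier_restriction

tiers = read_challen_tier_restriction(
    "challen_tiers.csv",                   # hypothetical file
    date_low=np.datetime64("2020-11-01"),
    date_high=np.datetime64("2020-12-15"),
)
print(tiers.coords["tier"].values)   # ['two' 'three' 'dec_two' 'dec_three']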
@@ -3,6 +3,7 @@ well-known formats"""
 from warnings import warn

 import numpy as np
+import xarray
 import pandas as pd

 __all__ = [
@@ -13,37 +14,47 @@ __all__ = [
 ]


-def read_mobility(path):
+def read_mobility(path, locations=None):
     """Reads in CSV with mobility matrix.

     CSV format: <To>,<id>,<id>,....
                 <id>,<val>,<val>,...
                 ...

     :param path: path to CSV file
+    :param locations: a list of locations to use
     :returns: a numpy matrix sorted by <id> on both rows and cols.
     """
     mobility = pd.read_csv(path)
-    mobility = mobility[
-        mobility["From"].str.startswith("E")
-        & mobility["To"].str.startswith("E")
-    ]
-    mobility = mobility.sort_values(["From", "To"])
-    mobility = mobility.groupby(["From", "To"]).agg({"Flow": sum}).reset_index()
-    mob_matrix = mobility.pivot(index="To", columns="From", values="Flow")
+    mobility = mobility.rename(columns={"From": "src", "To": "dest"})
+    if locations is not None:
+        mobility = mobility[
+            mobility["src"].isin(locations) & mobility["dest"].isin(locations)
+        ]
+    mobility = mobility.sort_values(["src", "dest"])
+    mobility = (
+        mobility.groupby(["src", "dest"]).agg({"Flow": sum}).reset_index()
+    )
+    mob_matrix = mobility.pivot(index="dest", columns="src", values="Flow")
     mob_matrix[mob_matrix.isna()] = 0.0
-    return mob_matrix
+    return xarray.DataArray(
+        mob_matrix, dims=["location_dest", "location_src"], name="mobility"
+    )


-def read_population(path):
+def read_population(path, locations=None):
     """Reads population CSV

     :param path: CSV file
+    :param locations: locations to use
     :returns: a pandas Series indexed by LTLAs
     """
     pop = pd.read_csv(path, index_col="lad19cd")
-    pop = pop[pop.index.str.startswith("E")]
+    if locations is not None:
+        pop = pop[pop.index.isin(locations)]
     pop = pop.sum(axis=1)
     pop = pop.sort_index()
-    pop.name = "n"
-    return pop
+    pop.name = "population"
+    pop.index.name = "location"
+    return xarray.DataArray(pop)


 def read_traffic_flow(
@@ -53,6 +64,15 @@ def read_traffic_flow(
     :param path: path to a traffic flow CSV with <date>,<Car> columns
     :returns: a Pandas timeseries
     """
+    if path is None:
+        dates = np.arange(date_low, date_high, np.timedelta64(1, "D"))
+        return xarray.DataArray(
+            np.ones(dates.shape[0], np.float64),
+            name="flow",
+            dims=["date"],
+            coords=[dates],
+        )
+
     commute_raw = pd.read_excel(
         path, index_col="Date", skiprows=5, usecols=["Date", "Cars"]
     )
@@ -67,8 +87,8 @@ def read_traffic_flow(
     commute[commute.index < commute_raw.index[0]] = commute_raw.iloc[0, 0]
     commute[commute.index > commute_raw.index[-1]] = commute_raw.iloc[-1, 0]
     commute["Cars"] = commute["Cars"] / 100.0
-    commute.columns = ["percent"]
-    return commute
+    commute.columns = ["flow"]
+    return xarray.DataArray(commute)


 def _merge_ltla(series):
......
@@ -3,8 +3,8 @@
 import numpy as np
 import pandas as pd

-from covid.data.area_code import AreaCodeData
-from covid.data.util import get_date_low_high, invalidInput, merge_lad_codes
+from covid19uk.data.area_code import AreaCodeData
+from covid19uk.data.util import get_date_low_high, invalidInput, merge_lad_codes


 class TierData:
@@ -112,18 +112,9 @@ class TierData:
         # Separate out December tiers
         date_mask = tiers["date"] > np.datetime64("2020-12-02")
-        tiers.loc[
-            date_mask & (tiers["tier"] == "three"),
-            "tier",
-        ] = "dec_three"
-        tiers.loc[
-            date_mask & (tiers["tier"] == "two"),
-            "tier",
-        ] = "dec_two"
-        tiers.loc[
-            date_mask & (tiers["tier"] == "one"),
-            "tier",
-        ] = "dec_one"
+        tiers.loc[date_mask & (tiers["tier"] == "three"), "tier",] = "dec_three"
+        tiers.loc[date_mask & (tiers["tier"] == "two"), "tier",] = "dec_two"
+        tiers.loc[date_mask & (tiers["tier"] == "one"), "tier",] = "dec_one"

         # filter down to the lads
         if len(lads) > 0:
......
@@ -18,11 +18,13 @@ from tensorflow_probability.python.experimental.stats import sample_stats
 from gemlib.util import compute_state
 from gemlib.mcmc import Posterior
 from gemlib.mcmc import GibbsKernel
-from gemlib.distributions import BrownianMotion

-from covid19uk.tasks.mcmc_kernel_factory import make_hmc_base_kernel
-from covid19uk.tasks.mcmc_kernel_factory import make_hmc_fast_adapt_kernel
-from covid19uk.tasks.mcmc_kernel_factory import make_hmc_slow_adapt_kernel
-from covid19uk.tasks.mcmc_kernel_factory import make_event_multiscan_gibbs_step
+from covid19uk.inference.mcmc_kernel_factory import make_hmc_base_kernel
+from covid19uk.inference.mcmc_kernel_factory import make_hmc_fast_adapt_kernel
+from covid19uk.inference.mcmc_kernel_factory import make_hmc_slow_adapt_kernel
+from covid19uk.inference.mcmc_kernel_factory import (
+    make_event_multiscan_gibbs_step,
+)

 import covid19uk.model_spec as model_spec
......
@@ -8,10 +8,14 @@ import tensorflow as tf
 import tensorflow_probability as tfp

 from gemlib.distributions import DiscreteTimeStateTransitionModel
-from gemlib.distributions import BrownianMotion

 from covid19uk.util import impute_previous_cases
-import covid19uk.data.loaders as data
+from covid19uk.data import AreaCodeData
+from covid19uk.data import CasesData
+from covid19uk.data import read_mobility
+from covid19uk.data import read_population
+from covid19uk.data import read_traffic_flow

 tfd = tfp.distributions
@@ -33,24 +37,22 @@ def gather_data(config):
     date_low = np.datetime64(config["date_range"][0])
     date_high = np.datetime64(config["date_range"][1])
-    locations = data.AreaCodeData.process(config)
-    mobility = data.read_mobility(
-        config["mobility_matrix"], locations["lad19cd"]
-    )
-    popsize = data.read_population(
-        config["population_size"], locations["lad19cd"]
-    )
-    commute_volume = data.read_traffic_flow(
+    locations = AreaCodeData.process(config)
+    mobility = read_mobility(config["mobility_matrix"], locations["lad19cd"])
+    popsize = read_population(config["population_size"], locations["lad19cd"])
+    commute_volume = read_traffic_flow(
         config["commute_volume"], date_low=date_low, date_high=date_high
     )
     geo = gp.read_file(config["geopackage"])
     geo = geo.sort_values("lad19cd")
     geo = geo[geo["lad19cd"].isin(locations["lad19cd"])]
     area = xarray.DataArray(
-        geo.area, name="area", dims=["location"], coords=[geo["lad19cd"]],
+        geo.area,
+        name="area",
+        dims=["location"],
+        coords=[geo["lad19cd"]],
     )
-    # tier_restriction = data.TierData.process(config)[:, :, [0, 2, 3, 4]]
     dates = pd.date_range(*config["date_range"], closed="left")
     weekday = xarray.DataArray(
         dates.weekday < 5,
@@ -59,7 +61,7 @@ def gather_data(config):
         coords=[dates.to_numpy()],
     )

-    cases = data.CasesData.process(config).to_xarray()
+    cases = CasesData.process(config).to_xarray()

     return (
         xarray.Dataset(
             dict(
@@ -130,9 +132,6 @@ def CovidUK(covariates, initial_state, initial_step, num_steps):
     )

     def alpha_t():
-        # return BrownianMotion(
-        #     tf.range(num_steps, dtype=DTYPE), x0=alpha_0, scale=0.005
-        # )
         return tfd.MultivariateNormalDiag(
             loc=tf.constant(0.0, dtype=DTYPE),
             scale_diag=tf.fill(
......
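To make the refactored entry point concrete, a hedged sketch of driving gather_data with a config dictionary; the keys are inferred from the hunk above and the paths are entirely hypothetical (AreaCodeData.process and CasesData.process will need their own settings as well):

from covid19uk.model_spec import gather_data

config = {
    "date_range": ["2020-10-01", "2020-11-01"],
    "mobility_matrix": "data/mobility.csv",    # hypothetical path
    "population_size": "data/population.csv",  # hypothetical path
    "commute_volume": None,                    # constant unit-flow fallback
    "geopackage": "data/lads.gpkg",            # hypothetical path
    # ...plus whatever AreaCodeData/CasesData configuration is required
}
dataset = gather_data(config)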
"""Returns tuple of semantic version"""
import pkg_resources
def version():
ver = pkg_resources.get_distribution("covid19uk").version
return ver.split(".")
@@ -9,22 +9,14 @@ license = "MIT"
 python = "^3.7"
 pandas = "^1.1.3"
 geopandas = "^0.8.1"
-mapclassify = "^2.3.0"
 PyYAML = "^5.3.1"
-descartes = "^1.1.0"
-matplotlib = "^3.3.2"
 xlrd = "^1.2.0"
 tqdm = "^4.50.2"
 h5py = "^3.1.0"
 gemlib = {git = "http://fhm-chicas-code.lancs.ac.uk/GEM/gemlib.git", branch="develop"}
 xarray = {extras = ["netcdf4"], version = "^0.17.0"}
-seaborn = "^0.11.0"
-ruffus = "^2.8.4"
-jedi = "^0.17.2"
-XlsxWriter = "^1.3.7"
+netCDF4 = "^1.5.6"
+dask = {extras = ["array"], version = "^2021.2.0"}
+s3fs = "^2021.04.0"

 [tool.poetry.dev-dependencies]
 ipython = "^7.18.1"
......