pydata.py 4.3 KB
Newer Older
1
2
"""Python-based data munging"""

3
import re
4
5
from warnings import warn

6
7
import numpy as np
import pandas as pd
8
import pyreadr as pyr
9

Chris Jewell's avatar
Chris Jewell committed
10
11
12

def load_commute_volume(filename, date_range):
    """Loads commute data and clips or extends date range"""
13
14
15
    commute_raw = pd.read_excel(
        filename, index_col="Date", skiprows=5, usecols=["Date", "Cars"]
    )
16
    commute_raw.index = pd.to_datetime(commute_raw.index, format="%Y-%m-%d")
Chris Jewell's avatar
Chris Jewell committed
17
    commute_raw.sort_index(axis=0, inplace=True)
18
19
20
21
    commute = pd.DataFrame(
        index=np.arange(date_range[0], date_range[1], np.timedelta64(1, "D"))
    )
    commute = commute.merge(commute_raw, left_index=True, right_index=True, how="left")
Chris Jewell's avatar
Chris Jewell committed
22
23
    commute[commute.index < commute_raw.index[0]] = commute_raw.iloc[0, 0]
    commute[commute.index > commute_raw.index[-1]] = commute_raw.iloc[-1, 0]
24
25
    commute["Cars"] = commute["Cars"] / 100.0
    commute.columns = ["percent"]
Chris Jewell's avatar
Chris Jewell committed
26
27
28
    return commute


29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def load_mobility_matrix(flow_file):
    """Loads mobility matrix from rds file"""
    mobility = list(pyr.read_r(flow_file).values())[0]
    mobility = mobility[
        mobility["From"].str.startswith("E") & mobility["To"].str.startswith("E")
    ]
    mobility = mobility.sort_values(["From", "To"])
    mobility = mobility.groupby(["From", "To"]).agg({"Flow": sum}).reset_index()
    mob_matrix = mobility.pivot(index="To", columns="From", values="Flow")
    mob_matrix[mob_matrix.isna()] = 0.0
    return mob_matrix


def load_population(pop_file):
    pop = pd.read_csv(pop_file, index_col="lad19cd")
    pop = pop[pop.index.str.startswith("E")]
    pop = pop.sum(axis=1)
    pop = pop.sort_index()
    pop.name = "n"
    return pop
49
50


51
52
53
def linelist2timeseries(date, region_code, date_range=None):
    """Constructs a daily aggregated timeseries given dates and region code
       Optionally accepts a list expressing a required date range."""
54

55
    linelist = pd.DataFrame(dict(date=pd.to_datetime(date), region_code=region_code))
56
57
58

    # 1. clip dates
    if date_range is not None:
59
        linelist = linelist[
Chris Jewell's avatar
Chris Jewell committed
60
            (date_range[0] <= linelist["date"]) & (linelist["date"] < date_range[1])
61
        ]
62
63
64
65
    raw_len = linelist.shape[0]

    # 2. Remove NA rows
    linelist = linelist.dropna(axis=0)  # remove na's
66
67
68
69
    warn(
        f"Removed {raw_len - linelist.shape[0]} rows of {raw_len} due to missing data \
({100. * (raw_len - linelist.shape[0])/raw_len}%)"
    )
70

71
72
    # 3. Aggregate by date/region and sort on index
    case_counts = linelist.groupby(["date", "region_code"]).size()
73
    case_counts.sort_index(axis=0, inplace=True)
74

75
76
    # 4. Reindex by day
    one_day = np.timedelta64(1, "D")
Chris Jewell's avatar
Chris Jewell committed
77
    full_dates = pd.date_range(date_range[0], date_range[1] - one_day)
78
79
80
81
82
83
    index = pd.MultiIndex.from_product(
        [full_dates, case_counts.index.levels[1]], names=["date", "region_code"]
    )
    case_counts = case_counts.reindex(index)
    case_counts.loc[case_counts.isna()] = 0.0
    case_counts.name = "count"
84
85
86
    return case_counts


87
def _read_csv(filename, date_format="%m/%d/%Y"):
88
    data = pd.read_csv(filename, low_memory=False)
89
90
91
92
    data["specimen_date"] = pd.to_datetime(data["specimen_date"], format=date_format)
    return data


93
def phe_case_data(linelisting_file, date_range=None, date_type='specimen', pillar=None):
94

95
96
97
98
99
100
101
102
103
104
105
106
    read_file = dict(csv=_read_csv, xlsx=pd.read_excel)
    match_extension = re.match(r"(.*)\.(.*)$", linelisting_file)
    if match_extension is None:
        raise ValueError(
            f"Linelisting filename '{linelisting_file}' is not in name.extension format"
        )
    filetype = match_extension.group(2)
    try:
        ll = read_file[filetype](linelisting_file)
    except KeyError:
        raise ValueError(f"No handler implemented for file type '{filetype}'")

107
    if pillar is not None:
Chris Jewell's avatar
Chris Jewell committed
108
        ll = ll.loc[ll["pillar"] == pillar]
109
110
111
112
113

    date_type_map = {'specimen': 'specimen_type',
                     'report': 'lab_report_date'}

    date = ll[date_type_map[date_type]]
114
    ltla_region = ll["LTLA_code"]
115

116
117
118
119
120
    # Merged regions
    london = ["E09000001", "E09000033"]
    corn_scilly = ["E06000052", "E06000053"]
    ltla_region.loc[ltla_region.isin(london)] = ",".join(london)
    ltla_region.loc[ltla_region.isin(corn_scilly)] = ",".join(corn_scilly)
121

122
123
    ts = linelist2timeseries(date, ltla_region, date_range)
    return ts.reset_index().pivot(index="region_code", columns="date", values="count")