Model/etl/energy_efficiency/app.py

import inspect
import pandas as pd
from tqdm import tqdm
from pathlib import Path

src_file_path = inspect.getfile(lambda: None)

EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"


def app():
    # For EPCs lodged from 2020 onwards, this collects data on the energy efficiency categories for wall insulation
    # so that when we simulate, we know what the resulting energy efficiency category will be

    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

    date_cutoff = "2020-01-01"
    walls_data = []
    ashp_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
        data.columns = [c.replace("_", "-").lower() for c in data.columns]

        insulated_walls = data[
            data["walls-description"].isin(
                [
                    "Cavity wall, filled cavity",
                    "Solid brick, with internal insulation",
                    "Solid brick, with external insulation",
                ]
            )
        ]
        insulated_walls = insulated_walls[~pd.isnull(insulated_walls["uprn"])]
        insulated_walls = insulated_walls[
            pd.to_datetime(insulated_walls["lodgement-date"]) >= date_cutoff
            ]

        ashp = data[
            data["mainheat-description"] == "Air source heat pump, radiators, electric"
            ]
        ashp = ashp[~pd.isnull(ashp["uprn"])]
        ashp = ashp[
            pd.to_datetime(ashp["lodgement-date"]) >= date_cutoff
            ]

        walls_data.append(insulated_walls)
        ashp_data.append(ashp)

    walls_df = pd.concat(walls_data)
    ashp_df = pd.concat(ashp_data)

    ashp_agg = (
        ashp_df.
        groupby(
            ["construction-age-band", "mainheat-description", "mainheatcont-description", "mainheat-energy-eff",
             "mainheatc-energy-eff"]
        )
        .size()
        .reset_index()
    )
    ashp_agg = ashp_agg[
        ashp_agg["mainheatcont-description"].isin(
            ["Programmer, TRVs and bypass", "Time and temperature zone control"]
        )
    ]

    aggregations = {}
    for description in [
        "Cavity wall, filled cavity", "Solid brick, with internal insulation", "Solid brick, with external insulation"
    ]:
        aggregation = walls_df[
            walls_df["walls-description"] == description
            ].groupby(
            ["construction-age-band", "walls-energy-eff"]
        ).size().reset_index().rename(columns={0: "count"})

        # For each grouping of age band, we use the most populus energy efficiency category
        aggregation_deduped = aggregation.sort_values("count", ascending=False).drop_duplicates("construction-age-band")
        aggregations[description] = aggregation_deduped

    # Since these tables are small, we just convert them to python dictionaries
    # This data is just held in the wall_energy_efficiency_values script, rather than s3
    df1 = aggregations["Cavity wall, filled cavity"]
    df2 = aggregations["Solid brick, with internal insulation"]
    df3 = aggregations["Solid brick, with external insulation"]

    df1.to_dict("records")
    df2.to_dict("records")
    df3.to_dict("records")