Model/etl/energy_efficiency/app.py
2024-08-01 22:07:19 +01:00

90 lines
3.3 KiB
Python

import inspect
import pandas as pd
from tqdm import tqdm
from pathlib import Path
# inspect.getfile on a lambda defined here yields the path of this source file;
# the EPC extract is expected in a "local_data" folder alongside it.
src_file_path = inspect.getfile(lambda: None)
EPC_DIRECTORY = Path(src_file_path).parent.joinpath(
    "local_data", "all-domestic-certificates"
)
def _lodged_since_with_uprn(frame, date_cutoff):
    """Return the rows of *frame* that have a UPRN and a lodgement date on or after *date_cutoff*."""
    frame = frame[frame["uprn"].notna()]
    lodged = pd.to_datetime(frame["lodgement-date"])
    return frame[lodged >= date_cutoff]


def app():
    """Summarise energy-efficiency categories for insulated walls and air source heat pumps.

    For EPCs lodged from 2020 onwards, this collects data on the energy
    efficiency categories for wall insulation so that when we simulate, we
    know what the resulting energy efficiency category will be. The same
    certificates are also aggregated for air source heat pumps.

    Every sub-directory of ``EPC_DIRECTORY`` is expected to contain a
    ``certificates.csv`` file.

    Returns:
        A dict with two keys:

        * ``"walls"`` maps each insulated wall description to a list of
          records (``construction-age-band``, ``walls-energy-eff``,
          ``count``), holding the most common ``walls-energy-eff`` value for
          each construction age band.
        * ``"ashp"`` is a list of records counting certificates per
          combination of construction age band, main heating description,
          heating control description and their energy-efficiency ratings,
          restricted to two heating control descriptions.

    Raises:
        ValueError: If ``EPC_DIRECTORY`` contains no sub-directories.
    """
    # Single source of truth for the wall types, used for both the row filter
    # and the per-description aggregation below.
    wall_descriptions = [
        "Cavity wall, filled cavity",
        "Solid brick, with internal insulation",
        "Solid brick, with external insulation",
    ]
    ashp_description = "Air source heat pump, radiators, electric"
    date_cutoff = "2020-01-01"

    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
    if not epc_directories:
        # pd.concat would otherwise fail with an opaque "No objects to concatenate".
        raise ValueError(f"No EPC directories found in {EPC_DIRECTORY}")

    walls_data = []
    ashp_data = []
    for directory in tqdm(epc_directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
        data.columns = [c.replace("_", "-").lower() for c in data.columns]

        insulated_walls = data[data["walls-description"].isin(wall_descriptions)]
        ashp = data[data["mainheat-description"] == ashp_description]

        walls_data.append(_lodged_since_with_uprn(insulated_walls, date_cutoff))
        ashp_data.append(_lodged_since_with_uprn(ashp, date_cutoff))

    walls_df = pd.concat(walls_data)
    ashp_df = pd.concat(ashp_data)

    ashp_agg = (
        ashp_df.groupby(
            [
                "construction-age-band",
                "mainheat-description",
                "mainheatcont-description",
                "mainheat-energy-eff",
                "mainheatc-energy-eff",
            ]
        )
        .size()
        .reset_index(name="count")
    )
    ashp_agg = ashp_agg[
        ashp_agg["mainheatcont-description"].isin(
            ["Programmer, TRVs and bypass", "Time and temperature zone control"]
        )
    ]

    wall_records = {}
    for description in wall_descriptions:
        counts = (
            walls_df[walls_df["walls-description"] == description]
            .groupby(["construction-age-band", "walls-energy-eff"])
            .size()
            .reset_index(name="count")
        )
        # For each grouping of age band, we use the most populous energy efficiency category
        most_common = counts.sort_values("count", ascending=False).drop_duplicates(
            "construction-age-band"
        )
        # Since these tables are small, we just convert them to python dictionaries
        # This data is just held in the wall_energy_efficiency_values script, rather than s3
        wall_records[description] = most_common.to_dict("records")

    return {
        "walls": wall_records,
        "ashp": ashp_agg.to_dict("records"),
    }