Model/model_data/cleaner_app.py
2023-09-20 14:23:46 +01:00

328 lines
18 KiB
Python

from tqdm import tqdm
import os
import pandas as pd
import msgpack
from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from pathlib import Path
from utils.s3 import save_data_to_s3
LAND_REGISTRY_PATHS = [
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
]
EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
def app():
"""
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
and produce a dataset of cleaned fields so that when we get new properties, we can quickly
sanitise any description data
Currently, this application is just run on a local machine
"""
cleaned_data = {}
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
for directory in tqdm(epc_directories):
directory_destructured = str(directory).split("/")[-1].split("-")
gss_code = directory_destructured[1]
local_authority = directory_destructured[2]
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to the same format as the api returns
data.columns = [c.replace("_", "-").lower() for c in data.columns]
# Take just date before the date threshold
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
# Convert to list of dictioaries as returned by the api
data = data.to_dict("records")
# Incorporate input data into cleaning
cleaner = EpcClean(data)
cleaner.clean()
# Extended cleaned_data
for k, data in cleaner.cleaned.items():
if k not in cleaned_data:
cleaned_data[k] = data
else:
existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
cleaned_data[k].extend(new_data)
# TODO: Add property age band into this
# uvalue_estimates = UvalueEstimations(data=data)
# uvalue_estimates.get_estimates(cleaner=cleaner)
# # TODO: Store these to a s3
# uvalue_estimates.walls
# uvalue_estimates.floors
# uvalue_estimates.roofs
# Basic check to make sure all descriptions are unique
for _, cleaned in cleaned_data.items():
descriptions = [x["original_description"] for x in cleaned]
if len(descriptions) != len(set(descriptions)):
raise ValueError("Duplicated descriptions found, check me")
# Finally, we attach u-values to the descriptions for walls, roofs and floors
wall_types = [
"Stone: granite or whinstone as built",
"Stone: sandstone or limestone as built",
"Solid brick as built",
"Stone/solid brick with 50 mm external or internal insulation",
"Stone/solid brick with 100 mm external or internal insulation",
"Stone/solid brick with 150 mm external or internal insulation",
"Stone/solid brick with 200 mm external or internal insulation",
"Cob as built",
"Cob with 50 mm external or internal insulation",
"Cob with 100 mm external or internal insulation",
"Cob with 150 mm external or internal insulation",
"Cob with 200 mm external or internal insulation",
"Cavity as built",
"Unfilled cavity with 50 mm external or internal insulation",
"Unfilled cavity with 100 mm external or internal insulation",
"Unfilled cavity with 150 mm external or internal insulation",
"Unfilled cavity with 200 mm external or internal insulation",
"Filled cavity",
"Filled cavity with 50 mm external or internal insulation",
"Filled cavity with 100 mm external or internal insulation",
"Filled cavity with 150 mm external or internal insulation",
"Filled cavity with 200 mm external or internal insulation",
"Timber frame as built",
"Timber frame with internal insulation",
"System build as built",
"System build with 50 mm external or internal insulation",
"System build with 100 mm external or internal insulation",
"System build with 150 mm external or internal insulation",
"System build with 200 mm external or internal insulation",
]
u_values = [
["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"],
["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"],
["1.7", "1.7", "1.7", "1.7", "1.7", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
["0.55", "0.55", "0.55", "0.55", "0.55", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
["0.32", "0.32", "0.32", "0.32", "0.32", "0.28", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
["0.23", "0.23", "0.23", "0.23", "0.23", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
["0.80", "0.80", "0.80", "0.80", "0.80", "0.80", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
["0.40", "0.40", "0.40", "0.40", "0.40", "0.40", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
["0.26", "0.26", "0.26", "0.26", "0.26", "0.26", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
["0.20", "0.20", "0.20", "0.20", "0.20", "0.20", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
["0.16", "0.16", "0.16", "0.16", "0.16", "0.16", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
["1.5", "1.5", "1.5", "1.5", "1.5", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
["0.53", "0.53", "0.53", "0.53", "0.53", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
["0.32", "0.32", "0.32", "0.32", "0.32", "0.30", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
["0.23", "0.23", "0.23", "0.23", "0.23", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
["0.7", "0.7", "0.7", "0.7", "0.7", "0.40", "0.35", "0.35", "0.45", "0.35", "0.30", "0.28"],
["0.37", "0.37", "0.37", "0.37", "0.37", "0.27", "0.25", "0.25", "0.25", "0.25", "0.21", "0.21"],
["0.25", "0.25", "0.25", "0.25", "0.25", "0.20", "0.19", "0.19", "0.19", "0.19", "0.17", "0.16"],
["0.19", "0.19", "0.19", "0.19", "0.19", "0.16", "0.15", "0.15", "0.15", "0.15", "0.14", "0.14"],
["0.16", "0.16", "0.16", "0.16", "0.16", "0.13", "0.13", "0.13", "0.13", "0.13", "0.12", "0.12"],
["2.5", "1.9", "1.9", "1.0", "0.80", "0.45", "0.40", "0.40", "0.40", "0.35", "0.30", "0.28"],
["0.60", "0.55", "0.55", "0.40", "0.40", "0.40", "0.40", "0.40", "0.40", "0.35", "0.30", "0.28"],
["2.0", "2.0", "2.0", "2.0", "1.7", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
["0.60", "0.60", "0.60", "0.60", "0.55", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
["0.35", "0.35", "0.35", "0.35", "0.35", "0.32", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
["0.25", "0.25", "0.25", "0.25", "0.25", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
]
age_bands = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"]
wall_uvalues = []
for i, wall_type in enumerate(wall_types):
row = {"Wall_type": wall_type}
for j, age_band in enumerate(age_bands):
row[age_band] = u_values[i][j]
wall_uvalues.append(row)
parkhome_wall_uvalues = [
{"Wall_type": "Park home as built", "F": "1.7", "G": "1.2", "I": "0.7", "K": "0.6"},
{"Wall_type": "Park home with additional insulation", }
]
wall_uvalues.extend(parkhome_wall_uvalues)
wall_uvalues_df = pd.DataFrame(wall_uvalues)
# This maps the descriptions in the EPC data to the descriptions in the table
epc_wall_description_map = {
############################
# Cavity wall mappings
############################
"Cavity wall, as built, partial insulation": "Filled cavity",
"Cavity wall, filled cavity": "Filled cavity",
"Cavity wall, as built, no insulation": "Cavity as built",
"Cavity wall, as built, insulated": "Unfilled cavity with 100 mm external or internal insulation",
"Cavity wall, with external insulation": "Unfilled cavity with 100 mm external or internal insulation",
"Cavity wall,": "Cavity as built", # General case of cavity wall without further details
"Cavity wall, filled cavity and external insulation":
"Filled cavity with 100 mm external or internal insulation",
"Cavity wall, filled cavity and internal insulation":
"Filled cavity with 100 mm external or internal insulation",
"Cavity wall, with internal insulation": "Unfilled cavity with 100 mm external or internal insulation",
############################
# Solid brick wall mappings
############################
"Solid brick, as built, no insulation": "Solid brick as built",
"Solid brick, with internal insulation": "Stone/solid brick with 100 mm external or internal insulation",
"Solid brick, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation",
"Solid brick, with external insulation": "Stone/solid brick with 100 mm external or internal insulation",
"Solid brick, as built, partial insulation": "Stone/solid brick with 50 mm external or internal insulation",
############################
# Timber frame wall mappings
############################
# These mappings are perhaps the most dubious due to the lack of timber options in the RdSAP table
"Timber frame, as built, insulated": "Timber frame with internal insulation",
"Timber frame, with additional insulation": "Timber frame with internal insulation",
"Timber frame, as built, partial insulation": "Timber frame as built",
"Timber frame, as built, no insulation": "Timber frame as built",
"Timber frame, with external insulation": "Timber frame with internal insulation",
############################
# Sandstone/limestones wall mappings
############################
"Sandstone or limestone, as built, no insulation": "Stone: sandstone or limestone as built",
"Sandstone or limestone, with internal insulation":
"Stone/solid brick with 100 mm external or internal insulation",
"Sandstone or limestone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal "
"insulation",
"Sandstone, as built, no insulation": "Stone: sandstone or limestone as built",
"Sandstone or limestone, as built, insulated": "Stone/solid brick with 100 mm external or internal"
"insulation",
"Sandstone, as built, insulated": "Stone/solid brick with 100 mm external or internal insulation",
"Sandstone, with internal insulation": "Stone/solid brick with 100 mm external or internal insulation",
"Sandstone or limestone, with external insulation": "Stone/solid brick with 100 mm external or internal "
"insulation",
"Sandstone, with external insulation": "Stone/solid brick with 100 mm external or internal insulation",
"Sandstone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal insulation",
############################
# Granite/whinstone wall mappings
############################
"Granite or whinstone, as built, no insulation": "Stone: granite or whinstone as built",
"Granite or whinstone, with internal insulation": "Stone/solid brick with 100 mm external or internal "
"insulation",
"Granite or whinstone, as built, partial insulation": "Stone/solid brick with 50 mm external or internal "
"insulation",
"Granite or whinstone, as built, insulated": "Stone/solid brick with 100 mm external or internal "
"insulation",
"Granite or whinstone, with external insulation": "Stone/solid brick with 100 mm external or internal "
"insulation",
############################
# System built wall mappings
############################
"System built, as built, no insulation": "System build as built",
"System built, as built, partial insulation": "System build with 50 mm external or internal insulation",
"System built, with internal insulation": "System build with 100 mm external or internal insulation",
"System built, with external insulation": "System build with 100 mm external or internal insulation",
"System built, as built, insulated": "System build with 100 mm external or internal insulation",
############################
# Cob wall mappings
############################
"Cob, as built": "Cob as built",
"Cob, with external insulation": "Cob with 100 mm external or internal insulation",
"Cob, with internal insulation": "Cob with 100 mm external or internal insulation",
############################
# Park home mappings
############################
"Park home wall, as built": "Park home as built",
"Park home wall, with external insulation": "Park home with additional insulation",
"Park home wall, with internal insulation": "Park home with additional insulation",
}
from recommendations.rdsap_tables import default_wall_thickness
def apply_formula_s_5_1_1(is_granite_or_whinstone, is_sandstone_or_limestone, age_band):
"""
As the u-value table in https://bregroup.com/wp-content/uploads/2019/09/RdSAP_2012_9.94-20-09-2019.pdf
on page 19, certain u-values as indicated by an "a", should be populated using a formula as defined in section
S.5.1.1
:param wall_type:
:return:
"""
stone_wall_thickness = [x for x in default_wall_thickness if x["type"] == "stone"][0]
thickness = stone_wall_thickness["J_K_L"] if age_band in ["J", "L", "L"] else stone_wall_thickness[age_band]
if is_granite_or_whinstone:
return 3.3 - 0.002 * thickness
if is_sandstone_or_limestone:
return 3 - 0.002 * thickness
for wall in cleaned_data["walls-description"]:
if wall["thermal_transmittance"]:
continue
description = wall["clean_description"]
# Remove (assumed)
description = description.replace("(assumed)", "").rstrip()
mapped_description = epc_wall_description_map[description]
# Get the u-value
for ab in age_bands:
mapped_value = wall_uvalues_df[wall_uvalues_df["Wall_type"] == mapped_description][ab].values[0]
if mapped_value == "a":
# The rdSap documentation indicateswe should use a formula to calculate the u-value
uvalue = float(
apply_formula_s_5_1_1(
is_granite_or_whinstone=wall["is_granite_or_whinstone"],
is_sandstone_or_limestone=wall["is_sandstone_or_limestone"],
age_band=ab
)
)
elif "b" in mapped_value:
potential_uvalue = float(mapped_value.replace("b", ""))
formula_uvalue = float(apply_formula_s_5_1_1(
is_granite_or_whinstone=wall["is_granite_or_whinstone"],
is_sandstone_or_limestone=wall["is_sandstone_or_limestone"],
age_band=ab
))
uvalue = min(potential_uvalue, formula_uvalue)
else:
uvalue = float(mapped_value)
df = pd.DataFrame(cleaned_data["walls-description"])
df = df[pd.isnull(df["thermal_transmittance"])]
df["clean_description"].values
# We store a singular file however we could store the data under the following file path:
# cleaned_epc_data/{component}/{original_description}/cleaned.bson
# where component is one of the keys of cleaned_data. If we store it against the original data, this
# data being read in will be extremely small, meaning quicker load times. We'll begin by storing as a single
# file and monitor usage patterns to see if it makes sense to split the data up
save_data_to_s3(
data=msgpack.packb(cleaned_data, use_bin_type=True),
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name=f"retrofit-data-{ENVIRONMENT}"
)
if __name__ == "__main__":
print("Initialising cleaner app run")
app()