"""Build the cleaned EPC description dataset and upload it to S3.

The script reads locally stored EPC certificate CSVs, cleans the free-text
description fields with ``EpcClean``, merges the results across directories
and stores the combined dataset as a msgpack payload in S3.

Currently, this application is just run on a local machine.
"""
import os
from pathlib import Path

import msgpack
import pandas as pd
from tqdm import tqdm

from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations  # noqa: F401 (see TODO in app)
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from recommendations.rdsap_tables import default_wall_thickness
from utils.s3 import save_data_to_s3

# The paths are kept as plain strings (not Path objects) so that the values are
# unchanged for any other module that reads this constant.
_LOCAL_DATA_DIRECTORY = os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/"

LAND_REGISTRY_PATHS = [
    _LOCAL_DATA_DIRECTORY + file_name
    for file_name in (
        "pp-monthly-update-new-version.csv",
        "pp-2022 (1).csv",
        "pp-2021.csv",
        "pp-2020.csv",
        "pp-2019.csv",
        "pp-2018.csv",
        "pp-2017-part1.csv",
        "pp-2017-part2.csv",
    )
]

EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"

ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")

AGE_BANDS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"]

# Row labels of the wall u-value table; WALL_U_VALUES holds one row per entry,
# in the same order.
WALL_TYPES = [
    "Stone: granite or whinstone as built",
    "Stone: sandstone or limestone as built",
    "Solid brick as built",
    "Stone/solid brick with 50 mm external or internal insulation",
    "Stone/solid brick with 100 mm external or internal insulation",
    "Stone/solid brick with 150 mm external or internal insulation",
    "Stone/solid brick with 200 mm external or internal insulation",
    "Cob as built",
    "Cob with 50 mm external or internal insulation",
    "Cob with 100 mm external or internal insulation",
    "Cob with 150 mm external or internal insulation",
    "Cob with 200 mm external or internal insulation",
    "Cavity as built",
    "Unfilled cavity with 50 mm external or internal insulation",
    "Unfilled cavity with 100 mm external or internal insulation",
    "Unfilled cavity with 150 mm external or internal insulation",
    "Unfilled cavity with 200 mm external or internal insulation",
    "Filled cavity",
    "Filled cavity with 50 mm external or internal insulation",
    "Filled cavity with 100 mm external or internal insulation",
    "Filled cavity with 150 mm external or internal insulation",
    "Filled cavity with 200 mm external or internal insulation",
    "Timber frame as built",
    "Timber frame with internal insulation",
    "System build as built",
    "System build with 50 mm external or internal insulation",
    "System build with 100 mm external or internal insulation",
    "System build with 150 mm external or internal insulation",
    "System build with 200 mm external or internal insulation",
]

# One row per WALL_TYPES entry, one column per AGE_BANDS entry. Values are kept
# as strings because two markers are mixed in with the numbers:
#   "a"          -> the u-value comes from apply_formula_s_5_1_1
#   "<number>b"  -> the smaller of <number> and the formula result is used
WALL_U_VALUES = [
    ["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"],
    ["a", "a", "a", "a", "1.7b", "1.0", "0.6", "0.60", "0.45", "0.35", "0.30", "0.28"],
    ["1.7", "1.7", "1.7", "1.7", "1.7", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
    ["0.55", "0.55", "0.55", "0.55", "0.55", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
    ["0.32", "0.32", "0.32", "0.32", "0.32", "0.28", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
    ["0.23", "0.23", "0.23", "0.23", "0.23", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
    ["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
    ["0.80", "0.80", "0.80", "0.80", "0.80", "0.80", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
    ["0.40", "0.40", "0.40", "0.40", "0.40", "0.40", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
    ["0.26", "0.26", "0.26", "0.26", "0.26", "0.26", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
    ["0.20", "0.20", "0.20", "0.20", "0.20", "0.20", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
    ["0.16", "0.16", "0.16", "0.16", "0.16", "0.16", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
    ["1.5", "1.5", "1.5", "1.5", "1.5", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
    ["0.53", "0.53", "0.53", "0.53", "0.53", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
    ["0.32", "0.32", "0.32", "0.32", "0.32", "0.30", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
    ["0.23", "0.23", "0.23", "0.23", "0.23", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
    ["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
    ["0.7", "0.7", "0.7", "0.7", "0.7", "0.40", "0.35", "0.35", "0.45", "0.35", "0.30", "0.28"],
    ["0.37", "0.37", "0.37", "0.37", "0.37", "0.27", "0.25", "0.25", "0.25", "0.25", "0.21", "0.21"],
    ["0.25", "0.25", "0.25", "0.25", "0.25", "0.20", "0.19", "0.19", "0.19", "0.19", "0.17", "0.16"],
    ["0.19", "0.19", "0.19", "0.19", "0.19", "0.16", "0.15", "0.15", "0.15", "0.15", "0.14", "0.14"],
    ["0.16", "0.16", "0.16", "0.16", "0.16", "0.13", "0.13", "0.13", "0.13", "0.13", "0.12", "0.12"],
    ["2.5", "1.9", "1.9", "1.0", "0.80", "0.45", "0.40", "0.40", "0.40", "0.35", "0.30", "0.28"],
    ["0.60", "0.55", "0.55", "0.40", "0.40", "0.40", "0.40", "0.40", "0.40", "0.35", "0.30", "0.28"],
    ["2.0", "2.0", "2.0", "2.0", "1.7", "1.0", "0.60", "0.60", "0.45", "0.35", "0.30", "0.28"],
    ["0.60", "0.60", "0.60", "0.60", "0.55", "0.45", "0.35", "0.35", "0.30", "0.25", "0.21", "0.21"],
    ["0.35", "0.35", "0.35", "0.35", "0.35", "0.32", "0.24", "0.24", "0.21", "0.19", "0.17", "0.16"],
    ["0.25", "0.25", "0.25", "0.25", "0.25", "0.21", "0.18", "0.18", "0.17", "0.15", "0.14", "0.14"],
    ["0.18", "0.18", "0.18", "0.18", "0.18", "0.17", "0.15", "0.15", "0.14", "0.13", "0.12", "0.12"],
]

# Park home rows only carry values for some age bands (or none at all).
PARKHOME_WALL_UVALUES = [
    {"Wall_type": "Park home as built", "F": "1.7", "G": "1.2", "I": "0.7", "K": "0.6"},
    {"Wall_type": "Park home with additional insulation"},
]

# Frequently repeated mapping targets are named once, so that a typo in one
# copy cannot silently break the lookup into the u-value table.
_STONE_BRICK_50 = "Stone/solid brick with 50 mm external or internal insulation"
_STONE_BRICK_100 = "Stone/solid brick with 100 mm external or internal insulation"
_UNFILLED_CAVITY_100 = "Unfilled cavity with 100 mm external or internal insulation"
_FILLED_CAVITY_100 = "Filled cavity with 100 mm external or internal insulation"
_SYSTEM_BUILD_100 = "System build with 100 mm external or internal insulation"
_COB_100 = "Cob with 100 mm external or internal insulation"

# This maps the descriptions in the EPC data to the descriptions in the table
EPC_WALL_DESCRIPTION_MAP = {
    ############################
    # Cavity wall mappings
    ############################
    "Cavity wall, as built, partial insulation": "Filled cavity",
    "Cavity wall, filled cavity": "Filled cavity",
    "Cavity wall, as built, no insulation": "Cavity as built",
    "Cavity wall, as built, insulated": _UNFILLED_CAVITY_100,
    "Cavity wall, with external insulation": _UNFILLED_CAVITY_100,
    "Cavity wall,": "Cavity as built",  # General case of cavity wall without further details
    "Cavity wall, filled cavity and external insulation": _FILLED_CAVITY_100,
    "Cavity wall, filled cavity and internal insulation": _FILLED_CAVITY_100,
    "Cavity wall, with internal insulation": _UNFILLED_CAVITY_100,
    ############################
    # Solid brick wall mappings
    ############################
    "Solid brick, as built, no insulation": "Solid brick as built",
    "Solid brick, with internal insulation": _STONE_BRICK_100,
    "Solid brick, as built, insulated": _STONE_BRICK_100,
    "Solid brick, with external insulation": _STONE_BRICK_100,
    "Solid brick, as built, partial insulation": _STONE_BRICK_50,
    ############################
    # Timber frame wall mappings
    ############################
    # These mappings are perhaps the most dubious due to the lack of timber options in the RdSAP table
    "Timber frame, as built, insulated": "Timber frame with internal insulation",
    "Timber frame, with additional insulation": "Timber frame with internal insulation",
    "Timber frame, as built, partial insulation": "Timber frame as built",
    "Timber frame, as built, no insulation": "Timber frame as built",
    "Timber frame, with external insulation": "Timber frame with internal insulation",
    ############################
    # Sandstone/limestones wall mappings
    ############################
    "Sandstone or limestone, as built, no insulation": "Stone: sandstone or limestone as built",
    "Sandstone or limestone, with internal insulation": _STONE_BRICK_100,
    "Sandstone or limestone, as built, partial insulation": _STONE_BRICK_50,
    "Sandstone, as built, no insulation": "Stone: sandstone or limestone as built",
    # Previously two adjacent literals were joined without a space
    # ("...internalinsulation"), which matched no row of the u-value table.
    "Sandstone or limestone, as built, insulated": _STONE_BRICK_100,
    "Sandstone, as built, insulated": _STONE_BRICK_100,
    "Sandstone, with internal insulation": _STONE_BRICK_100,
    "Sandstone or limestone, with external insulation": _STONE_BRICK_100,
    "Sandstone, with external insulation": _STONE_BRICK_100,
    "Sandstone, as built, partial insulation": _STONE_BRICK_50,
    ############################
    # Granite/whinstone wall mappings
    ############################
    "Granite or whinstone, as built, no insulation": "Stone: granite or whinstone as built",
    "Granite or whinstone, with internal insulation": _STONE_BRICK_100,
    "Granite or whinstone, as built, partial insulation": _STONE_BRICK_50,
    "Granite or whinstone, as built, insulated": _STONE_BRICK_100,
    "Granite or whinstone, with external insulation": _STONE_BRICK_100,
    ############################
    # System built wall mappings
    ############################
    "System built, as built, no insulation": "System build as built",
    "System built, as built, partial insulation": "System build with 50 mm external or internal insulation",
    "System built, with internal insulation": _SYSTEM_BUILD_100,
    "System built, with external insulation": _SYSTEM_BUILD_100,
    "System built, as built, insulated": _SYSTEM_BUILD_100,
    ############################
    # Cob wall mappings
    ############################
    "Cob, as built": "Cob as built",
    "Cob, with external insulation": _COB_100,
    "Cob, with internal insulation": _COB_100,
    ############################
    # Park home mappings
    ############################
    "Park home wall, as built": "Park home as built",
    "Park home wall, with external insulation": "Park home with additional insulation",
    "Park home wall, with internal insulation": "Park home with additional insulation",
}


def _build_wall_uvalue_table():
    """
    Assemble the wall u-value lookup: wall type -> {age band -> raw table value}.

    Park home rows only contain the age bands they define, so callers should
    read values with ``.get`` and treat ``None`` as "no value available".
    :return: Dictionary keyed by wall type
    :raises ValueError: If the table rows and labels are out of step, or if a
        target of EPC_WALL_DESCRIPTION_MAP has no row in the table
    """
    if len(WALL_TYPES) != len(WALL_U_VALUES):
        raise ValueError("WALL_TYPES and WALL_U_VALUES must contain the same number of rows")

    table = {wall_type: dict(zip(AGE_BANDS, row)) for wall_type, row in zip(WALL_TYPES, WALL_U_VALUES)}
    for parkhome_row in PARKHOME_WALL_UVALUES:
        table[parkhome_row["Wall_type"]] = {k: v for k, v in parkhome_row.items() if k != "Wall_type"}

    # Fail fast on mapping typos instead of part-way through processing walls
    unknown_targets = sorted(set(EPC_WALL_DESCRIPTION_MAP.values()) - set(table))
    if unknown_targets:
        raise ValueError(f"Mapped wall descriptions missing from the u-value table: {unknown_targets}")

    return table


def apply_formula_s_5_1_1(is_granite_or_whinstone, is_sandstone_or_limestone, age_band):
    """
    As the u-value table in https://bregroup.com/wp-content/uploads/2019/09/RdSAP_2012_9.94-20-09-2019.pdf
    on page 19, certain u-values as indicated by an "a", should be populated using a formula
    as defined in section S.5.1.1

    :param is_granite_or_whinstone: Truthy if the wall is granite or whinstone
    :param is_sandstone_or_limestone: Truthy if the wall is sandstone or limestone
    :param age_band: Age band letter, one of AGE_BANDS
    :return: 3.3 - 0.002 * thickness for granite/whinstone, 3 - 0.002 * thickness for
        sandstone/limestone
    :raises LookupError: If default_wall_thickness contains no entry of type "stone"
    :raises ValueError: If neither stone type is flagged
    """
    stone_wall_thickness = next((x for x in default_wall_thickness if x["type"] == "stone"), None)
    if stone_wall_thickness is None:
        raise LookupError('default_wall_thickness has no entry with type "stone"')

    # Bands J, K and L share a single "J_K_L" entry in the thickness table
    thickness_key = "J_K_L" if age_band in ("J", "K", "L") else age_band
    thickness = stone_wall_thickness[thickness_key]

    if is_granite_or_whinstone:
        return 3.3 - 0.002 * thickness

    if is_sandstone_or_limestone:
        return 3 - 0.002 * thickness

    raise ValueError("Formula S.5.1.1 requires a granite/whinstone or sandstone/limestone wall")


def _estimate_wall_uvalues(wall, wall_uvalue_table):
    """
    Estimate the u-value of a cleaned wall record for every age band.

    :param wall: Cleaned wall record; reads "clean_description" and, where the formula
        is needed, "is_granite_or_whinstone" and "is_sandstone_or_limestone"
    :param wall_uvalue_table: Lookup as produced by _build_wall_uvalue_table
    :return: Dictionary of age band -> u-value as a float, or None where the table
        holds no value for that band
    :raises KeyError: If the description has no entry in EPC_WALL_DESCRIPTION_MAP
    """
    # Remove (assumed)
    description = wall["clean_description"].replace("(assumed)", "").rstrip()
    band_values = wall_uvalue_table[EPC_WALL_DESCRIPTION_MAP[description]]

    def formula_uvalue(age_band):
        return float(
            apply_formula_s_5_1_1(
                is_granite_or_whinstone=wall["is_granite_or_whinstone"],
                is_sandstone_or_limestone=wall["is_sandstone_or_limestone"],
                age_band=age_band,
            )
        )

    estimates = {}
    for age_band in AGE_BANDS:
        table_value = band_values.get(age_band)
        if table_value is None:
            # e.g. park homes, which only define a subset of the age bands
            estimates[age_band] = None
        elif table_value == "a":
            # The RdSAP documentation indicates we should use a formula to calculate the u-value
            estimates[age_band] = formula_uvalue(age_band)
        elif "b" in table_value:
            estimates[age_band] = min(float(table_value.replace("b", "")), formula_uvalue(age_band))
        else:
            estimates[age_band] = float(table_value)

    return estimates


def _merge_cleaned(cleaned_data, new_cleaned):
    """
    Merge one cleaner's output into the accumulated dataset, in place.

    Records whose "original_description" is already present for a component are skipped.
    :param cleaned_data: Accumulated dictionary of component -> list of records
    :param new_cleaned: Dictionary of component -> list of records to merge in
    """
    for component, records in new_cleaned.items():
        if component not in cleaned_data:
            cleaned_data[component] = records
            continue

        # A set makes each membership test O(1) rather than a scan of the whole list
        existing_descriptions = {x["original_description"] for x in cleaned_data[component]}
        cleaned_data[component].extend(
            x for x in records if x["original_description"] not in existing_descriptions
        )


def app():
    """
    For every EPC directory found under EPC_DIRECTORY, we read its certificates.csv,
    clean the description fields and merge them into a single dataset of cleaned fields
    so that when we get new properties, we can quickly sanitise any description data.
    The dataset is stored in S3 as a single msgpack file.

    Currently, this application is just run on a local machine

    :raises ValueError: If a component ends up with duplicated original descriptions
    """
    cleaned_data = {}

    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

    for directory in tqdm(epc_directories):
        certificates = pd.read_csv(directory / "certificates.csv", low_memory=False)

        # Rename the columns to the same format as the api returns
        certificates.columns = [c.replace("_", "-").lower() for c in certificates.columns]

        # Keep only certificates lodged on or after the date threshold
        certificates = certificates[certificates["lodgement-date"] >= EARLIEST_EPC_DATE]

        # Convert to list of dictionaries as returned by the api
        records = certificates.to_dict("records")

        # Incorporate input data into cleaning
        cleaner = EpcClean(records)
        cleaner.clean()

        # Extend cleaned_data
        _merge_cleaned(cleaned_data, cleaner.cleaned)

        # TODO: Add property age band into this, e.g.
        #   uvalue_estimates = UvalueEstimations(data=records)
        #   uvalue_estimates.get_estimates(cleaner=cleaner)
        # TODO: Store uvalue_estimates.walls / .floors / .roofs to s3

    # Basic check to make sure all descriptions are unique
    for cleaned in cleaned_data.values():
        descriptions = [x["original_description"] for x in cleaned]
        if len(descriptions) != len(set(descriptions)):
            raise ValueError("Duplicated descriptions found, check me")

    # Finally, we attach u-values to the descriptions for walls, roofs and floors
    wall_uvalue_table = _build_wall_uvalue_table()

    for wall in cleaned_data["walls-description"]:
        if wall["thermal_transmittance"]:
            # The description already states a u-value, nothing to estimate
            continue

        # NOTE(review): the estimates are computed but not yet written back to the wall
        # record, so the stored payload is unchanged. For now this acts as a check that
        # every remaining wall description can be mapped - decide where they should live.
        _estimate_wall_uvalues(wall, wall_uvalue_table)

    # We store a singular file however we could store the data under the following file path:
    # cleaned_epc_data/{component}/{original_description}/cleaned.bson
    # where component is one of the keys of cleaned_data. If we store it against the original data, this
    # data being read in will be extremely small, meaning quicker load times. We'll begin by storing as a single
    # file and monitor usage patterns to see if it makes sense to split the data up
    save_data_to_s3(
        data=msgpack.packb(cleaned_data, use_bin_type=True),
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name=f"retrofit-data-{ENVIRONMENT}",
    )


if __name__ == "__main__":
    print("Initialising cleaner app run")
    app()