From 44099f8d8362847a56ea418d6a4b41ca8739876c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 3 Aug 2023 12:09:37 +0100 Subject: [PATCH] Working on simulation system data --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- model_data/simulation_system/app.py | 137 ++++++++++++++++++++++++++-- 3 files changed, 130 insertions(+), 11 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 05b9012b..b03b31b1 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..ca0e1cd9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index 15902d19..d846dcf7 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -49,10 +49,56 @@ COMPONENT_FEATURES = [ 'EXTENSION_COUNT' ] +# For these fields, we take an average if we have multiple values AVERAGE_FIXED_FEATURES = [ - "TOTAL_FLOOR_AREA" + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT" ] +# For these fields, we take the latest value if we have multiple values +# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is +# the most accurate +LATEST_FIELD = [ + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "CONSTRUCTION_AGE_BAND" +] + +# If we see these features changing, we don't use the EPC, since we deem it not to be reliable +MANDATORY_FIXED_FEATURES = [ + "PROPERTY_TYPE", +] + +# For particularly old EPC data, we have inconsistent records, so we only include EPCs that were +# lodged on or after 1 August 2014: SAP09 was introduced in 2009 and later SAP12 was introduced in +# England and Wales from 31 July 2014 +EARLIEST_EPC_DATE = "2014-08-01" + +RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY" +HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT" + + +def make_cleaning_averages(df): + cleaning_averages = 
df.groupby( + ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"] + )[AVERAGE_FIXED_FEATURES].mean().reset_index() + + return cleaning_averages + + +FLOOR_LEVEL_MAP = { + "00": 0, + "01": 1, + "02": 2, + "03": 3, + "Basement": -1, + "Ground": 0, + "1st": 1, + "2nd": 2, + "3rd": 3, +} + def app(): # Get all the files in the directory @@ -63,6 +109,15 @@ def app(): filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv") df = pd.read_csv(filepath, low_memory=False) df = df[~pd.isnull(df["UPRN"])] + + df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] + + cleaning_averages = make_cleaning_averages(df) + + # We remove EPCS that were conducted for a new build, since these are performed with + # full SAP, which produces different results to the RdSAP methodology + df = df[df["TRANSACTION_TYPE"] != "new dwelling"] + df["UPRN"] = df["UPRN"].astype(int).astype(str) counts = df.groupby("UPRN").size().reset_index() counts.columns = ["UPRN", "count"] @@ -73,32 +128,70 @@ def app(): df = df[df["UPRN"].isin(counts["UPRN"])] df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) - for uprn, property_data in df.groupby("UPRN"): + results = [] + for uprn, property_data in df.groupby("UPRN", observed=True): # Fixed features - these are property attributes that shouldn't change over time + ignore_epc = False fixed_data = {} for field in FIXED_FEATURES: vals = property_data[field].dropna().unique() # Remove invalid values vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES] - if len(vals) > 1: - raise ValueError("Fixed feature {} has more than one value - fix me".format(field)) + if field == "FLOOR_LEVEL": + vals = list({FLOOR_LEVEL_MAP[v] for v in vals}) if field in AVERAGE_FIXED_FEATURES: - # Check the values are too far apart - if abs(vals[0] - vals[1]) / vals[0] > 0.1: - raise ValueError("Large deviation in fixed feature {} - fix me".format(field)) - field_value = np.mean(vals) + if 
len(vals) > 1: + # Check the values are too far apart + if abs(vals[0] - vals[1]) / vals[0] > 0.1: + # Take the more recent value since it's likely to be more accurate + vals = [vals[-1]] + + if vals: + field_value = np.mean(vals) + else: + # Clean using averages + avgs = cleaning_averages[ + (cleaning_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) & + (cleaning_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) & + (cleaning_averages["CONSTRUCTION_AGE_BAND"] == property_data["CONSTRUCTION_AGE_BAND"].iloc[ + 0]) & + (cleaning_averages["NUMBER_HABITABLE_ROOMS"] == + property_data["NUMBER_HABITABLE_ROOMS"].iloc[0]) & + (cleaning_averages["NUMBER_HEATED_ROOMS"] == property_data["NUMBER_HEATED_ROOMS"].iloc[0]) + ] + + field_value = avgs[field].iloc[0] + + elif field in LATEST_FIELD: + field_value = vals[-1] if vals else None else: + if len(vals) > 1: + if field in MANDATORY_FIXED_FEATURES: + ignore_epc = True + else: + raise ValueError("Fixed feature {} has more than one value - fix me".format(field)) + field_value = vals[0] if vals else None fixed_data[field] = field_value - variable_data = property_data[COMPONENT_FEATURES] + if ignore_epc: + continue + # We include the lodgement date here as we probably need to factor time into the + # model, since EPC standards and rigour have changed over time + variable_data = property_data[ + COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE] + ] + + # Note: we look at changes between subsequent EPCS, however we could look at other permutations + # e.g. 
first vs second, second vs third and also first vs third + property_model_data = [] for idx in range(0, property_data.shape[0] - 1): if idx >= property_data.shape[0] - 1: @@ -106,3 +199,29 @@ def app(): starting_record = variable_data.iloc[idx] ending_record = variable_data.iloc[idx + 1] + rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE] + heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE] + + if rdsap_change == 0: + # Assumption: We aren't interested in records that exhibit no change + continue + + # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and + # floors, we may want to use the U-value. We may also want to handle the (assumed) tags + # within descriptions + + starting_record = starting_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING") + ending_record = ending_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING") + + features = pd.concat([starting_record, ending_record]) + + property_model_data.append( + { + "UPRN": uprn, + "RDSAP_CHANGE": rdsap_change, + "HEAT_DEMAND_CHANGE": heat_demand_change, + **features.to_dict() + } + ) + + results.extend(property_model_data)