diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index 2d691d62..53107df0 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd from tqdm import tqdm @@ -31,6 +30,7 @@ def app(): # TODO [x] : Does energy tariff make a difference # - leave for now but it may not + # TODO: [x] : Add starting SAP and heat demand as a feature # TODO [x] : If SAP hasn't changed, we don't include the record # TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the building looks # worse in the newer epc, so we can switch the orders @@ -53,6 +53,7 @@ def app(): df = data_processor.pre_process() cleaning_averages = data_processor.make_cleaning_averages() + data_by_urpn = [] for uprn, property_data in df.groupby("UPRN", observed=True): # Fixed features - these are property attributes that shouldn't change over time @@ -85,8 +86,7 @@ def app(): # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time variable_data = modified_property_data[ - COMPONENT_FEATURES - + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE] + COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE] ] # Note: we look at changes between subsequent EPCS, however we could look at other permutations @@ -104,11 +104,15 @@ def app(): gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE] if gets_better: - rdsap_change = latest_record[RDSAP_RESPONSE] - earliest_record[RDSAP_RESPONSE] - heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - earliest_record[HEAT_DEMAND_RESPONSE] + starting_sap = earliest_record[RDSAP_RESPONSE] + starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE] + rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap + 
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand else: - rdsap_change = earliest_record[RDSAP_RESPONSE] - latest_record[RDSAP_RESPONSE] - heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - latest_record[HEAT_DEMAND_RESPONSE] + starting_sap = latest_record[RDSAP_RESPONSE] + starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE] + rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap + heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand if rdsap_change == 0: continue @@ -127,24 +131,28 @@ def app(): "UPRN": uprn, "RDSAP_CHANGE": rdsap_change, "HEAT_DEMAND_CHANGE": heat_demand_change, + "STARTING_SAP": starting_sap, + "STARTING_HEAT_DEMAND": starting_heat_demand, **fixed_data, **features.to_dict(), } ) - property_model_df = pd.DataFrame(property_model_data) - # Add some temporal features - we look at the days from the standard starting point in time - # for the starting and ending date so all records are from a fixed point - # TODO: implement me - property_model_df["DAYS_TO_STARTING"] = None - property_model_df["DAYS_TO_ENDING"] = None + data_by_urpn.extend(property_model_data) - dataset.append(property_model_df) + data_by_urpn_df = pd.DataFrame(data_by_urpn) + # Add some temporal features - we look at the days from the standard starting point in time + # for the starting and ending date so all records are from a fixed point + # TODO: implement me + data_by_urpn_df["DAYS_TO_STARTING"] = None + data_by_urpn_df["DAYS_TO_ENDING"] = None # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and # floors, we may want to use the U-value. We may also want to handle the (assumed) tags # within descriptions + dataset.append(data_by_urpn_df) + cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0] cleaning_dataset.append(cleaning_averages)