From e516a6ac4151a24f77432473a7310442762a0309 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Sep 2023 17:08:18 +0100 Subject: [PATCH] make sure all responses are positive, began adding temporal features --- model_data/simulation_system/core/Settings.py | 11 +- .../generate_rdsap_change.py | 156 ++++++------------ 2 files changed, 51 insertions(+), 116 deletions(-) diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py index 259acddd..030747ee 100644 --- a/model_data/simulation_system/core/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -53,12 +53,6 @@ DEPLOYMENT_FOLDER = "deployment" TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 -# If we have multiple records for a numerical field, such as floor area, -# we check the margine for error between the biggest and lowest values. If we see large -# swings in measured values, we take the most recent value for this field as we interpret this -# as inaccurate measurements in the past and use the most recent value -MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1 - COLUMNS_TO_MERGE_ON = [ "PROPERTY_TYPE", "BUILT_FORM", @@ -109,12 +103,11 @@ COMPONENT_FEATURES = [ "NUMBER_OPEN_FIREPLACES", "MAINHEATCONT_DESCRIPTION", "EXTENSION_COUNT", + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION ] -# For these fields, we take an average if we have multiple values -AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] - # For these fields, we take the latest value if we have multiple values # Since more recent EPCs have been conducted with more rigour, we assume that the latest value is # the most accurate diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index 80991e82..2d691d62 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ 
b/model_data/simulation_system/generate_rdsap_change.py @@ -5,13 +5,11 @@ from tqdm import tqdm from pathlib import Path from simulation_system.core.Settings import ( MANDATORY_FIXED_FEATURES, - AVERAGE_FIXED_FEATURES, LATEST_FIELD, COMPONENT_FEATURES, RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, COLUMNS_TO_MERGE_ON, - MULTIPLE_VALUES_MARGIN_FOR_ERROR, ) from simulation_system.core.DataProcessor import DataProcessor from utils import save_dataframe_to_s3_parquet @@ -19,9 +17,6 @@ from utils import save_dataframe_to_s3_parquet DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates" -# TODO: Have a look at temporal features - - def app(): # Get all the files in the directory @@ -34,58 +29,20 @@ def app(): dataset = [] cleaning_dataset = [] - # TODO: Does energy tariff make a difference - # TODO: If SAP hasn't changed, we don't include the record - # TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value. - # TODO: Same as floor area for floor height - # TODO: If fundamental building fabric changes, we should proabably discard the record - # TODO: Should we prune records that have an exceptionally large amount of time between them? - # TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections? + # TODO [x] : Does energy tariff make a difference + # - leave for now but it may not + # TODO [x] : If SAP hasn't changed, we don't include the record + # TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the building looks + # worse in the newer epc, so we can switch the orders + # TODO [] : Have a look at temporal features + # TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value. 
+ # TODO [x]: Same as floor area for floor height + # TODO []: If fundamental building fabric changes, we should probably discard the record + # TODO [x]: Should we prune records that have an exceptionally large amount of time between them? + # - leave for now and check performance after temporal features + # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections? + # - Leave for now # - # TODO: REMOVE ME - dodgy_uprns = [] - observed_uprns = [ - "10002082244", # Doesn't really make sense, house no longer has lel and not has more insulation but lower score - "10002082259", - # Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the - # floor assessment is now assumed whereas before it wasnt - "10002082418", # Walls went from insulated to not... - "10002082640", # Property identical besides different energy taffiff - "10002082830", # Lots of records going from not insulated to insulated but some parts of - # the property has gotten better - "10002083244", # latest epc indicates the property is worse - "10002083592", # lastest epc doesn't have a fuel system present, but has slightly more insulation. Also the - # floor type has changed from solid to syspended. lel has decreased - "100030533576", # property slightly worse, has less lels and the floor description has changed type - "100030533668", # has slightly less lels. Glazed type is now missing - "100030533803", # Not super clea why this is lower, newer epc has more lel but is using second heating - "100030534016", # Property has less lel but more roof insulation. 
Floor type has changed - "100030534040", # property has less lel and the floor type has changed - "100030534041", # property has less insulation and less lel - "100030534243", # Cavity wall has gone from filled to unfilled - "100030534294", # less roof insulation but now has an air source heat pump - "100030534322", # identical between records but now with higher lel but no change recorded - "100030534413", # identical between records but different energy tariff, no sap change - "100030534437", # property has less lel and the mainheating no longer has a programmer and trvs - "100030534569", # Cavity wall no longer filled, 30mm more roof insulation in newest epc - "100030534676", # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but - # the wall cavity is no longer filled - "100030534732", # property has higher lel %. Not clear why this is worse, glazing type has changed. - # This looks dodgy has the UPRN_SOURCE is address matched also the floor area has increased from the first to - # the later epc - "100030534791", # Property has started using secondary heating - the EPCs are taken on the same day so maybe we - # should discard - "100030534795", # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66 - # The newer epc indicates the property now has 40% photo supply so this doesn't make much sense - "100030534897", # Roof has gone from thatched with additional insulation to pitched with insulation, - # sap score hasn't changed - "100030534986", # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and - # slightly better main heating setup - "100030535043", # Property lel increased by 12%, not clear why sap worse. 
Maybe due to different floor area and - # wall height - "100030535173", # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation - "100030535244", # lel gone from 100% to 0%, sap is the same - ] for directory in tqdm(directories): @@ -121,18 +78,6 @@ def app(): cols_to_merge_on=COLUMNS_TO_MERGE_ON ) - for field in AVERAGE_FIXED_FEATURES: - - vals = list(modified_property_data[field].dropna().unique()) - if len(vals) > 1: - lowest_value = min(vals) - largest_value = max(vals) - if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR: - # Take the more recent value since it's likely to be more accurate - vals = [vals[-1]] - - fixed_data[field] = np.mean(vals) - # Combine all fields together fixed_data.update(mandatory_field_data) fixed_data.update(latest_field_data) @@ -152,46 +97,28 @@ def app(): if idx >= modified_property_data.shape[0] - 1: break - starting_record = variable_data.iloc[idx] - ending_record = variable_data.iloc[idx + 1] - rdsap_change = ( - ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE] - ) - heat_demand_change = ( - ending_record[HEAT_DEMAND_RESPONSE] - - starting_record[HEAT_DEMAND_RESPONSE] - ) + earliest_record = variable_data.iloc[idx] + latest_record = variable_data.iloc[idx + 1] - # Check for a change in the starting and ending record - check_cols = [ - col for col in starting_record.index if col not in [ - "LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF" - ] - ] - all_same = True - for col in check_cols: - if starting_record[col] != ending_record[col]: - all_same = False - break + # Check if the sap gets better or worse + gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE] - if rdsap_change <= 0: - if all_same | (uprn in observed_uprns): - if uprn not in observed_uprns: - dodgy_uprns.append(uprn) - else: - compare = pd.concat([starting_record, ending_record], axis=1) - bljd + if gets_better: 
+ rdsap_change = latest_record[RDSAP_RESPONSE] - earliest_record[RDSAP_RESPONSE] + heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - earliest_record[HEAT_DEMAND_RESPONSE] + else: + rdsap_change = earliest_record[RDSAP_RESPONSE] - latest_record[RDSAP_RESPONSE] + heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - latest_record[HEAT_DEMAND_RESPONSE] - # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and - # floors, we may want to use the U-value. We may also want to handle the (assumed) tags - # within descriptions + if rdsap_change == 0: + continue - starting_record = starting_record[ - COMPONENT_FEATURES + ["LODGEMENT_DATE"] - ].add_suffix("_STARTING") - ending_record = ending_record[ - COMPONENT_FEATURES + ["LODGEMENT_DATE"] - ].add_suffix("_ENDING") + if gets_better: + starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING") + ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING") + else: + starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING") + ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING") features = pd.concat([starting_record, ending_record]) @@ -205,7 +132,18 @@ def app(): } ) - dataset.append(property_model_data) + property_model_df = pd.DataFrame(property_model_data) + # Add some temporal features - we look at the days from the standard starting point in time + # for the starting and ending date so all records are from a fixed point + # TODO: implement me + property_model_df["DAYS_TO_STARTING"] = None + property_model_df["DAYS_TO_ENDING"] = None + + dataset.append(property_model_df) + + # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and + # floors, we may want to use the U-value. 
We may also want to handle the (assumed) tags + # within descriptions cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0] cleaning_dataset.append(cleaning_averages) @@ -218,8 +156,12 @@ def app(): file_key="sap_change_model/cleaning_dataset.parquet", ) - output = pd.DataFrame(dataset) - output.to_parquet("./dataset.parquet") + output = pd.concat(dataset) + save_dataframe_to_s3_parquet( + df=output, + bucket_name="retrofit-data-dev", + file_key="sap_change_model/dataset.parquet", + ) if __name__ == "__main__":