@staticmethod
def clean_efficiency_variables(df):
    """
    Replace missing values in the efficiency columns with "NO_RATING".

    Only columns whose name contains "ENERGY_EFF" are allowed to hold missing
    values. A missing value in any other column is treated as a data error.

    :param df: pd.DataFrame to clean. It is modified in place.
    :return: the same DataFrame object, with missing efficiency values filled
    :raises ValueError: if a column without "ENERGY_EFF" in its name contains
        missing values; the message lists the offending columns
    """
    missing_counts = df.isnull().sum()
    columns_with_missings = missing_counts[missing_counts >= 1].index

    # Nothing to clean, hand the frame back untouched
    if len(columns_with_missings) == 0:
        return df

    # Make sure they are all efficiency columns. str() keeps the check safe for
    # non-string column labels, which would break the .str accessor.
    unexpected = [col for col in columns_with_missings if "ENERGY_EFF" not in str(col)]
    if unexpected:
        raise ValueError(
            "Non efficiency columns are missing values: %s" % ", ".join(map(str, unexpected))
        )

    for col in columns_with_missings:
        df[col] = df[col].fillna("NO_RATING")

    return df
def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list) -> bool:
    """
    For a list of columns, check if the earliest and latest record are the same.
    If they are the same, we indicate this, because we have examples of SAP scores
    changing without any feature changes.

    A value that is missing in both records counts as unchanged. A plain ``!=``
    would report NaN as different from NaN and flag such a pair as a change.

    :param earliest_record: pd.Series
    :param latest_record: pd.Series
    :param columns: list of columns to compare
    :return: True if every listed column is the same in both records (also for an
        empty column list), False as soon as one column differs
    """
    for col in columns:
        before = earliest_record[col]
        after = latest_record[col]

        before_missing = pd.isna(before)
        after_missing = pd.isna(after)

        # Missing on exactly one side is a real difference
        if before_missing != after_missing:
            return False

        # Missing on both sides: nothing to compare, treat as equal
        if before_missing:
            continue

        if before != after:
            return False

    return True
starting_heat_demand carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon - starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING") - ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING") + starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING") + ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING") if rdsap_change == 0: continue + all_equal = compare_records( + earliest_record=earliest_record, + latest_record=latest_record, + columns=CORE_COMPONENT_FEATURES + ) + + if all_equal: + # Keep track of this for the moment + all_equal_count += 1 + continue + features = pd.concat([starting_record, ending_record]) property_model_data.append( @@ -487,6 +527,10 @@ def app(): "HEAT_DEMAND_ENDING": ending_heat_demand, "CARBON_STARTING": starting_carbon, "CARBON_ENDING": ending_carbon, + "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"], + "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"], + "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"], + "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"], **fixed_data, **features.to_dict(), } @@ -496,8 +540,6 @@ def app(): data_by_urpn_df = pd.DataFrame(data_by_urpn) - # Add some temporal features - we look at the days from the standard starting point in time - # for the starting and ending date so all records are from a fixed point data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to( data_by_urpn_df["LODGEMENT_DATE_STARTING"] ) @@ -508,6 +550,8 @@ def app(): data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"]) + data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df) + # We look for key building fabric features that have changed from one EPC to the next. 
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we # remove this record, as it indicates that the quality of the EPC conducted in the first instance @@ -541,6 +585,8 @@ def app(): cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0] cleaning_dataset.append(cleaning_averages) + print("Final all equal count: %s" % str(all_equal_count)) + # Store cleaning dataset in s3 as a parquet file cleaning_dataset = pd.concat(cleaning_dataset) save_dataframe_to_s3_parquet( diff --git a/etl/epc/settings.py b/etl/epc/settings.py index fb8e464d..93b8929b 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -85,8 +85,7 @@ FIXED_FEATURES = [ "FIXED_LIGHTING_OUTLETS_COUNT", ] -COMPONENT_FEATURES = [ - "TRANSACTION_TYPE", +CORE_COMPONENT_FEATURES = [ "WALLS_DESCRIPTION", "FLOOR_DESCRIPTION", "LIGHTING_DESCRIPTION", @@ -96,21 +95,49 @@ COMPONENT_FEATURES = [ "MAIN_FUEL", "MECHANICAL_VENTILATION", "SECONDHEAT_DESCRIPTION", - "ENERGY_TARIFF", # Not sure if this is relevant - "SOLAR_WATER_HEATING_FLAG", - "PHOTO_SUPPLY", "WINDOWS_DESCRIPTION", "GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES", "MAINHEATCONT_DESCRIPTION", + "SOLAR_WATER_HEATING_FLAG", + "PHOTO_SUPPLY", +] + +EFFICIENCY_FEATURES = [ + 'HOT_WATER_ENERGY_EFF', + 'FLOOR_ENERGY_EFF', + 'WINDOWS_ENERGY_EFF', + 'WALLS_ENERGY_EFF', + 'SHEATING_ENERGY_EFF', + 'ROOF_ENERGY_EFF', + 'MAINHEAT_ENERGY_EFF', + 'MAINHEATC_ENERGY_EFF', + 'LIGHTING_ENERGY_EFF' +] + +COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [ + "TRANSACTION_TYPE", + "ENERGY_TARIFF", # Not sure if this is relevant "EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION ] +POTENTIAL_COLUMNS = [ + 'POTENTIAL_ENERGY_RATING', + 'POTENTIAL_ENERGY_EFFICIENCY', + 'ENVIRONMENT_IMPACT_POTENTIAL', + 'ENERGY_CONSUMPTION_POTENTIAL', + 'CO2_EMISSIONS_POTENTIAL', + # We don't include cost features for the 
moment + # 'LIGHTING_COST_POTENTIAL', + # 'HEATING_COST_POTENTIAL', + # 'HOT_WATER_COST_POTENTIAL' +] + # For these fields, we take the latest value if we have multiple values # Since more recent EPCs have been conducted with more rigour, we assume that the latest value is # the most accurate @@ -253,3 +280,7 @@ ENDING_SUFFIX_COMPONENT_COLS = [ 'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community', 'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter' ] + +# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore +# filter out any homes with a floor height below this +MINIMUM_FLOOR_HEIGHT = 1.65