From 7e36dc6d4982f491bb82406553afcd664fa5710f Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 15 Feb 2024 16:36:29 +0000 Subject: [PATCH] add starting and ending rooms --- etl/epc/DataProcessor.py | 3 + etl/epc/Dataset.py | 144 +++++++++++---- etl/epc/Pipeline.py | 21 ++- etl/epc/Record.py | 135 +++++++++----- etl/epc/settings.py | 380 ++++++++++++++++++++++++++++----------- 5 files changed, 501 insertions(+), 182 deletions(-) diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index af55535c..a77bcaa3 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -704,6 +704,9 @@ class EPCDataProcessor: # We remove EPCs where the tenure is unknown, but is usually an indicator of a new build self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)] + # We remap zero values to None + self.data.loc[self.data['FLOOR_HEIGHT'] == 0, 'FLOOR_HEIGHT'] = None + def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None: """ If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100 diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 322f3238..dac829e2 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -17,33 +17,111 @@ from recommendations.recommendation_utils import ( # TODO: Can probably produce this in the property change app and store in S3 BOOLEAN_VARIABLES = [ - 'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame', - 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home', - 'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending', - 'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid', - 'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', - 'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', - 'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', - 'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating', - 'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present', - 'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration', - 'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', - 'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', - 'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', - 'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending', - 'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending', - 'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending', - 'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending', - 'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending', - 'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending', - 'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending', - 'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending', - 'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending', - 'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending', - 'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending', - 'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats', - 'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network', - 'is_community_ending', 'no_individual_heating_or_community_network_ending' + "is_cavity_wall", + "is_filled_cavity", + "is_solid_brick", + "is_system_built", + "is_timber_frame", + "is_granite_or_whinstone", + "is_as_built", + "is_cob", + "is_sandstone_or_limestone", + "is_park_home", + "external_insulation", + "internal_insulation", + "is_park_home_ending", + "external_insulation_ending", + "internal_insulation_ending", + "is_to_unheated_space", + "is_to_external_air", + "is_suspended", + "is_solid", + "another_property_below", + "is_pitched", + "is_roof_room", + "is_loft", + "is_flat", + "is_thatched", + "is_at_rafters", + "has_dwelling_above", + "has_radiators", + "has_fan_coil_units", + "has_pipes_in_screed_above_insulation", + "has_pipes_in_insulated_timber_floor", + "has_pipes_in_concrete_slab", + "has_boiler", + "has_air_source_heat_pump", + "has_room_heaters", + "has_electric_storage_heaters", + "has_warm_air", + "has_electric_underfloor_heating", + "has_electric_ceiling_heating", + "has_community_scheme", + "has_ground_source_heat_pump", + "has_no_system_present", + "has_portable_electric_heaters", + "has_water_source_heat_pump", + "has_electric_heat_pump", + "has_micro-cogeneration", + "has_solar_assisted_heat_pump", + "has_exhaust_source_heat_pump", + "has_community_heat_pump", + "has_electric", + "has_mains_gas", + "has_wood_logs", + "has_coal", + "has_oil", + "has_wood_pellets", + "has_anthracite", + "has_dual_fuel_mineral_and_wood", + "has_smokeless_fuel", + "has_lpg", + "has_b30k", + "has_electricaire", + "has_assumed_for_most_rooms", + "has_underfloor_heating", + "has_radiators_ending", + "has_fan_coil_units_ending", + "has_pipes_in_screed_above_insulation_ending", + "has_pipes_in_insulated_timber_floor_ending", + "has_pipes_in_concrete_slab_ending", + "has_boiler_ending", + "has_air_source_heat_pump_ending", + "has_room_heaters_ending", + "has_electric_storage_heaters_ending", + "has_warm_air_ending", + "has_electric_underfloor_heating_ending", + "has_electric_ceiling_heating_ending", + "has_community_scheme_ending", + "has_ground_source_heat_pump_ending", + "has_no_system_present_ending", + "has_portable_electric_heaters_ending", + "has_water_source_heat_pump_ending", + "has_electric_heat_pump_ending", + "has_micro-cogeneration_ending", + "has_solar_assisted_heat_pump_ending", + "has_exhaust_source_heat_pump_ending", + "has_community_heat_pump_ending", + "has_electric_ending", + "has_mains_gas_ending", + "has_wood_logs_ending", + "has_coal_ending", + "has_oil_ending", + "has_wood_pellets_ending", + "has_anthracite_ending", + "has_dual_fuel_mineral_and_wood_ending", + "has_smokeless_fuel_ending", + "has_lpg_ending", + "has_b30k_ending", + "has_electricaire_ending", + "has_assumed_for_most_rooms_ending", + "has_underfloor_heating_ending", + "multiple_room_thermostats", + "multiple_room_thermostats_ending", + "is_community", + "no_individual_heating_or_community_network", + "is_community_ending", + "no_individual_heating_or_community_network_ending", ] @@ -330,14 +408,16 @@ class TrainingDataset(BaseDataset): self.df["estimated_perimeter_starting"] = self.df.apply( lambda row: estimate_perimeter( row["ground_floor_area_starting"], - row["number_habitable_rooms"] / row["estimated_number_of_floors"], + row["number_habitable_rooms_starting"] + / row["estimated_number_of_floors"], ), axis=1, ) self.df["estimated_perimeter_ending"] = self.df.apply( lambda row: estimate_perimeter( row["ground_floor_area_starting"], - row["number_habitable_rooms"] / row["estimated_number_of_floors"], + row["number_habitable_rooms_ending"] + / row["estimated_number_of_floors"], ), axis=1, ) @@ -647,7 +727,11 @@ class TrainingDataset(BaseDataset): for col in missings.index: unique_values = self.df[col].unique() - if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES): + if ( + (True in unique_values) + or (False in unique_values) + or (col in BOOLEAN_VARIABLES) + ): self.df[col] = self.df[col].fillna(False) if "none" in unique_values: self.df[col] = self.df[col].fillna("none") diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 99bbb22f..0943b206 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -19,11 +19,12 @@ from etl.epc.settings import ( CORE_COMPONENT_FEATURES, EFFICIENCY_FEATURES, POTENTIAL_COLUMNS, + ROOM_FEATURES, ) # TODO: change in setting file MANDATORY_FIXED_FEATURES = [x.lower() for x in MANDATORY_FIXED_FEATURES] -LATEST_FIELD = [x.lower() for x in LATEST_FIELD] +LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES] COMPONENT_FEATURES = [x.lower() for x in COMPONENT_FEATURES] RDSAP_RESPONSE = RDSAP_RESPONSE.lower() HEAT_DEMAND_RESPONSE = HEAT_DEMAND_RESPONSE.lower() @@ -33,6 +34,7 @@ EFFICIENCY_FEATURES = [x.lower() for x in EFFICIENCY_FEATURES] POTENTIAL_COLUMNS = [x.lower() for x in POTENTIAL_COLUMNS] VARIABLE_DATA_FEATURES = ( COMPONENT_FEATURES + + ROOM_FEATURES + EFFICIENCY_FEATURES + POTENTIAL_COLUMNS + ["lodgement_date", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE] @@ -78,9 +80,9 @@ class EPCPipeline: run_mode="training", epc_local_file="certificates.csv", epc_bucket_name="retrofit-data-dev", - epc_cleaning_dataset_key="sap_change_model/cleaning_dataset.parquet", - epc_all_equal_rows_key="sap_change_model/all_equal_rows.parquet", - epc_compiled_dataset_key="sap_change_model/dataset.parquet", + epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet", + epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet", + epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet", ): """ :param directories: List of directories to process @@ -283,9 +285,14 @@ class EPCPipeline: latest_record: EPCRecord = epc_records[idx2] # Auto sort the records so that the record with highest RDSAP score is always record1 - difference_record: EPCDifferenceRecord = latest_record - earliest_record - # TODO: Use method above instead of overloading operator - difference_record.append_fixed_data(fixed_data) + difference_record: EPCDifferenceRecord = ( + latest_record.create_EPCDifferenceRecord( + other=earliest_record, fixed_data=fixed_data + ) + ) + # difference_record: EPCDifferenceRecord = latest_record - earliest_record + # # TODO: Use method above instead of overloading operator + # difference_record.append_fixed_data(fixed_data) # TODO: Pull out RDSAP_CHANGE to a variable if difference_record.get("rdsap_change") == 0: diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 79e36d5b..fc670e5e 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -19,6 +19,7 @@ from etl.epc.settings import ( CARBON_RESPONSE, COMPONENT_FEATURES, EFFICIENCY_FEATURES, + ROOM_FEATURES, ) from recommendations.recommendation_utils import estimate_number_of_floors from utils.s3 import read_dataframe_from_s3_parquet @@ -83,6 +84,8 @@ class EPCRecord: current_energy_efficiency: int = None energy_consumption_current: int = None co2_emissions_current: float = None + number_habitable_rooms: float = None + number_heated_rooms: float = None # u_values_walls = None # u_values_roof = None @@ -268,6 +271,12 @@ class EPCRecord: self.co2_emissions_current: float = float( self.prepared_epc["co2_emissions_current"] ) + self.number_habitable_rooms: float = float( + self.prepared_epc["number_habitable_rooms"] + ) + self.number_heated_rooms: float = float( + self.prepared_epc["number_heated_rooms"] + ) def _identify_delta_between_prepared_and_original_records(self): """ @@ -380,8 +389,9 @@ class EPCRecord: raise ValueError("EPC Recrod doesn not contain epc data") self.prepared_epc["floor-level"] = ( - FLOOR_LEVEL_MAP[self.prepared_epc["floor-level"]] if - self.prepared_epc["floor-level"] not in DATA_ANOMALY_MATCHES else None + FLOOR_LEVEL_MAP[self.prepared_epc["floor-level"]] + if self.prepared_epc["floor-level"] not in DATA_ANOMALY_MATCHES + else None ) def _clean_number_lighting_outlets(self): @@ -426,9 +436,16 @@ class EPCRecord: cleaning_data.columns = [x.upper() for x in cleaning_data.columns] cleaned_property_data = EPCDataProcessor.apply_averages_cleaning( - data_to_clean=self.epc_record_as_dataframe("prepared_epc", replace_empty_string=True), + data_to_clean=self.epc_record_as_dataframe( + "prepared_epc", replace_empty_string=True + ), cleaning_data=cleaning_data, - cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + cols_to_merge_on=[ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTRUCTION_AGE_BAND", + "LOCAL_AUTHORITY", + ], ) self.prepared_epc["fixed-lighting-outlets-count"] = round( cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0] @@ -542,9 +559,14 @@ class EPCRecord: "N": False, } - self.prepared_epc["mains-gas-flag"] = None if ( - self.prepared_epc["mains-gas-flag"] == "" or self.prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES - ) else mains_gas_map[self.prepared_epc["mains-gas-flag"]] + self.prepared_epc["mains-gas-flag"] = ( + None + if ( + self.prepared_epc["mains-gas-flag"] == "" + or self.prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES + ) + else mains_gas_map[self.prepared_epc["mains-gas-flag"]] + ) def _clean_heat_loss_corridor(self): """ @@ -553,11 +575,7 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - valid_values = [ - "no corridor", - "unheated corridor", - "heated corridor" - ] + valid_values = ["no corridor", "unheated corridor", "heated corridor"] boolean_map = { "no corridor": False, @@ -566,19 +584,23 @@ class EPCRecord: } self.prepared_epc["heat-loss-corridor"] = ( - "no corridor" if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES else - self.prepared_epc["heat-loss-corridor"] + "no corridor" + if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES + else self.prepared_epc["heat-loss-corridor"] ) if self.prepared_epc["heat-loss-corridor"] not in valid_values: self.prepared_epc["heat-loss-corridor"] = "no corridor" self.prepared_epc["unheated-corridor-length"] = ( - float(self.prepared_epc["unheated-corridor-length"]) if - self.prepared_epc["unheated-corridor-length"] not in ["", None] else None + float(self.prepared_epc["unheated-corridor-length"]) + if self.prepared_epc["unheated-corridor-length"] not in ["", None] + else None ) # We create boolean versions of heat-loss-corridor - self.heat_loss_corridor_bool = boolean_map[self.prepared_epc["heat-loss-corridor"]] + self.heat_loss_corridor_bool = boolean_map[ + self.prepared_epc["heat-loss-corridor"] + ] def _clean_count_variables(self): """ @@ -591,7 +613,7 @@ class EPCRecord: "number-open-fireplaces", "extension-count", "flat-storey-count", - "number-habitable-rooms" + "number-habitable-rooms", ] null_attributes = ["flat-storey-count", "number-habitable-rooms"] @@ -615,9 +637,11 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc['wind-turbine-count'] = int( - self.prepared_epc['wind-turbine-count'] - ) if self.prepared_epc['wind-turbine-count'] not in DATA_ANOMALY_MATCHES else None + self.prepared_epc["wind-turbine-count"] = ( + int(self.prepared_epc["wind-turbine-count"]) + if self.prepared_epc["wind-turbine-count"] not in DATA_ANOMALY_MATCHES + else None + ) def _clean_solar_hot_water(self): """ @@ -626,12 +650,7 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - value_map = { - "Y": "Y", - "N": "N", - "": "N", - None: "N" - } + value_map = {"Y": "Y", "N": "N", "": "N", None: "N"} boolean_map = { "Y": True, @@ -643,7 +662,9 @@ class EPCRecord: ] # Create a boolean version for storage in the database - self.solar_water_heating_flag_bool = boolean_map[self.prepared_epc['solar-water-heating-flag']] + self.solar_water_heating_flag_bool = boolean_map[ + self.prepared_epc["solar-water-heating-flag"] + ] def _clean_solar_pv(self): """ @@ -652,8 +673,11 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc['photo-supply'] = float(self.prepared_epc['photo-supply']) if ( - self.prepared_epc['photo-supply'] not in DATA_ANOMALY_MATCHES) else None + self.prepared_epc["photo-supply"] = ( + float(self.prepared_epc["photo-supply"]) + if (self.prepared_epc["photo-supply"] not in DATA_ANOMALY_MATCHES) + else None + ) def _clean_energy(self): """ @@ -676,7 +700,7 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc['built-form'] = BUILT_FORM_REMAP.get( + self.prepared_epc["built-form"] = BUILT_FORM_REMAP.get( self.prepared_epc["built-form"], self.prepared_epc["built-form"] ) @@ -691,8 +715,10 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band( - self.prepared_epc["construction-age-band"] + self.prepared_epc["construction-age-band"] = ( + EPCDataProcessor.clean_construction_age_band( + self.prepared_epc["construction-age-band"] + ) ) if self.prepared_epc["construction-age-band"] in DATA_ANOMALY_MATCHES: @@ -703,15 +729,20 @@ class EPCRecord: old_record["lodgement-datetime"] for old_record in self.old_data if old_record["construction-age-band"] - not in DATA_ANOMALY_MATCHES + not in DATA_ANOMALY_MATCHES ] ) - most_recent = [old_record for old_record in self.old_data if - old_record["lodgement-datetime"] == max_datetime] + most_recent = [ + old_record + for old_record in self.old_data + if old_record["lodgement-datetime"] == max_datetime + ] - self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band( - most_recent[0]["construction-age-band"] + self.prepared_epc["construction-age-band"] = ( + EPCDataProcessor.clean_construction_age_band( + most_recent[0]["construction-age-band"] + ) ) self.construction_age_band = self.prepared_epc["construction-age-band"] @@ -721,7 +752,7 @@ class EPCRecord: self.age_band is None ): self.age_band = "L" - self.construction_age_band = 'England and Wales: 2012 onwards' + self.construction_age_band = "England and Wales: 2012 onwards" self.prepared_epc["construction-age-band"] = self.construction_age_band if self.age_band is None: @@ -760,10 +791,10 @@ class EPCRecord: """ This method will clean the ventilation, if empty or invalid """ - self.prepared_epc['mechanical-ventilation'] = None if ( - self.prepared_epc['mechanical-ventilation'] in DATA_ANOMALY_MATCHES - ) else ( - self.prepared_epc['mechanical-ventilation'] + self.prepared_epc["mechanical-ventilation"] = ( + None + if (self.prepared_epc["mechanical-ventilation"] in DATA_ANOMALY_MATCHES) + else (self.prepared_epc["mechanical-ventilation"]) ) def _field_validation(self): @@ -841,6 +872,20 @@ class EPCRecord: f"{validation_config['range']}" ) + def create_EPCDifferenceRecord(self, other, fixed_data, auto_sort: bool = True): + """ + This method will create the difference record between the two records + """ + if not isinstance(other, EPCRecord): + raise ValueError("Can only subtract EPCRecord from EPCRecord") + + difference_record = EPCDifferenceRecord( + record1=self, record2=other, auto_sort=auto_sort + ) + difference_record.append_fixed_data(fixed_data) + + return difference_record + def __sub__(self, other): """ This method will return the difference between two EPC records @@ -848,6 +893,8 @@ class EPCRecord: if not isinstance(other, EPCRecord): raise ValueError("Can only subtract EPCRecord from EPCRecord") + print("Deprecated method, use create_EPCDifferenceRecord instead") + difference_record = EPCDifferenceRecord( record1=self, record2=other, auto_sort=True ) @@ -962,7 +1009,7 @@ class EPCDifferenceRecord: CARBON_RESPONSE ) - component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES ending_record = self.record2.get( component_variables + ["lodgement_date"], return_asdict=True, diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 7100b0e9..18dbaa7c 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -28,18 +28,18 @@ DATA_ANOMALY_MATCHES = { # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER # was only lodged on the register from 7 March 2010. "Blank" - # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to - # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB) - # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested - # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, - # etc). These records are being published for completeness. An ongoing process to manage these manually added + # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to + # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB) + # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested + # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds, + # etc). These records are being published for completeness. An ongoing process to manage these manually added # addresses will take time to develop to deal with these and future anomalies. # - # There are several fields within the lodged data where it is possible to enter multiple entries to cater for - # different data_types of build within a single property, i.e. extensions. This results in multiple entries for - # the description fields for floor, roof and wall. For the purposes of this data release only the information - # contained within the first of these multiple entries is being provided. As there are no restrictions on the - # value in this first field it means that sometimes the first field in a multiple entry description field may + # There are several fields within the lodged data where it is possible to enter multiple entries to cater for + # different data_types of build within a single property, i.e. extensions. This results in multiple entries for + # the description fields for floor, roof and wall. For the purposes of this data release only the information + # contained within the first of these multiple entries is being provided. As there are no restrictions on the + # value in this first field it means that sometimes the first field in a multiple entry description field may # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. "NULL", # We sometimes see fields populated with just an empty string. @@ -163,17 +163,20 @@ CORE_COMPONENT_FEATURES = [ ] EFFICIENCY_FEATURES = [ - 'HOT_WATER_ENERGY_EFF', - 'FLOOR_ENERGY_EFF', - 'WINDOWS_ENERGY_EFF', - 'WALLS_ENERGY_EFF', - 'SHEATING_ENERGY_EFF', - 'ROOF_ENERGY_EFF', - 'MAINHEAT_ENERGY_EFF', - 'MAINHEATC_ENERGY_EFF', - 'LIGHTING_ENERGY_EFF' + "HOT_WATER_ENERGY_EFF", + "FLOOR_ENERGY_EFF", + "WINDOWS_ENERGY_EFF", + "WALLS_ENERGY_EFF", + "SHEATING_ENERGY_EFF", + "ROOF_ENERGY_EFF", + "MAINHEAT_ENERGY_EFF", + "MAINHEATC_ENERGY_EFF", + "LIGHTING_ENERGY_EFF", ] +ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"] + + COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [ "TRANSACTION_TYPE", "ENERGY_TARIFF", # Not sure if this is relevant @@ -184,10 +187,10 @@ COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [ ] POTENTIAL_COLUMNS = [ - 'POTENTIAL_ENERGY_EFFICIENCY', - 'ENVIRONMENT_IMPACT_POTENTIAL', - 'ENERGY_CONSUMPTION_POTENTIAL', - 'CO2_EMISSIONS_POTENTIAL', + "POTENTIAL_ENERGY_EFFICIENCY", + "ENVIRONMENT_IMPACT_POTENTIAL", + "ENERGY_CONSUMPTION_POTENTIAL", + "CO2_EMISSIONS_POTENTIAL", # We don't include cost features for the moment # 'LIGHTING_COST_POTENTIAL', # 'HEATING_COST_POTENTIAL', @@ -237,30 +240,55 @@ DATA_PROCESSOR_SETTINGS = { # This has a manual mapping of the column types required COLUMNTYPES = { - 'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object', - 'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64', - 'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', - 'CONSTRUCTION_AGE_BAND': 'object', - 'TRANSACTION_TYPE': 'object', - 'WALLS_DESCRIPTION': 'object', - 'FLOOR_DESCRIPTION': 'object', - 'LIGHTING_DESCRIPTION': 'object', - 'ROOF_DESCRIPTION': 'object', - 'MAINHEAT_DESCRIPTION': 'object', - 'HOTWATER_DESCRIPTION': 'object', 'MAIN_FUEL': 'object', - 'MECHANICAL_VENTILATION': 'object', - 'SECONDHEAT_DESCRIPTION': 'object', 'ENERGY_TARIFF': 'object', - 'SOLAR_WATER_HEATING_FLAG': 'object', 'PHOTO_SUPPLY': 'float64', - 'WINDOWS_DESCRIPTION': 'object', - 'GLAZED_TYPE': 'object', - 'MULTI_GLAZE_PROPORTION': 'float64', - 'LOW_ENERGY_LIGHTING': 'float64', - 'NUMBER_OPEN_FIREPLACES': 'float64', - 'MAINHEATCONT_DESCRIPTION': 'object', - 'EXTENSION_COUNT': 'float64', - 'LODGEMENT_DATE': 'object', - **dict(zip(EFFICIENCY_FEATURES, ['object', ] * len(EFFICIENCY_FEATURES))), - **dict(zip(POTENTIAL_COLUMNS, ['float64', ] * len(POTENTIAL_COLUMNS))) + "UPRN": "object", + "TOTAL_FLOOR_AREA": "float64", + "FLOOR_HEIGHT": "float64", + "PROPERTY_TYPE": "object", + "BUILT_FORM": "object", + "CONSTITUENCY": "object", + "NUMBER_HABITABLE_ROOMS": "float64", + "NUMBER_HEATED_ROOMS": "float64", + "FIXED_LIGHTING_OUTLETS_COUNT": "float64", + "CONSTRUCTION_AGE_BAND": "object", + "TRANSACTION_TYPE": "object", + "WALLS_DESCRIPTION": "object", + "FLOOR_DESCRIPTION": "object", + "LIGHTING_DESCRIPTION": "object", + "ROOF_DESCRIPTION": "object", + "MAINHEAT_DESCRIPTION": "object", + "HOTWATER_DESCRIPTION": "object", + "MAIN_FUEL": "object", + "MECHANICAL_VENTILATION": "object", + "SECONDHEAT_DESCRIPTION": "object", + "ENERGY_TARIFF": "object", + "SOLAR_WATER_HEATING_FLAG": "object", + "PHOTO_SUPPLY": "float64", + "WINDOWS_DESCRIPTION": "object", + "GLAZED_TYPE": "object", + "MULTI_GLAZE_PROPORTION": "float64", + "LOW_ENERGY_LIGHTING": "float64", + "NUMBER_OPEN_FIREPLACES": "float64", + "MAINHEATCONT_DESCRIPTION": "object", + "EXTENSION_COUNT": "float64", + "LODGEMENT_DATE": "object", + **dict( + zip( + EFFICIENCY_FEATURES, + [ + "object", + ] + * len(EFFICIENCY_FEATURES), + ) + ), + **dict( + zip( + POTENTIAL_COLUMNS, + [ + "float64", + ] + * len(POTENTIAL_COLUMNS), + ) + ), } # For modelling, we don't allow records with more than 100 SAP points @@ -280,7 +308,7 @@ fill_na_map = { "LOW_ENERGY_LIGHTING": 0, "MAINHEATCONT_DESCRIPTION": "Unknown", "EXTENSION_COUNT": 0, - "NUMBER_OPEN_FIREPLACES": 0 + "NUMBER_OPEN_FIREPLACES": 0, } ################################################################################################ @@ -289,62 +317,212 @@ fill_na_map = { ################################################################################################ STARTING_SUFFIX_COMPONENT_COLS = [ - "SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION", - "SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY", - "GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES", - "EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter" + "SAP", + "HEAT_DEMAND", + "CARBON", + "TRANSACTION_TYPE", + "MECHANICAL_VENTILATION", + "SECONDHEAT_DESCRIPTION", + "ENERGY_TARIFF", + "SOLAR_WATER_HEATING_FLAG", + "PHOTO_SUPPLY", + "GLAZED_TYPE", + "MULTI_GLAZE_PROPORTION", + "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", + "EXTENSION_COUNT", + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", + "DAYS_TO", + "estimated_perimeter", +] +NO_SUFFIX_COMPONENT_COLS = [ + "walls_thermal_transmittance", + "is_cavity_wall", + "is_filled_cavity", + "is_solid_brick", + "is_system_built", + "is_timber_frame", + "is_granite_or_whinstone", + "is_as_built", + "is_cob", + "is_sandstone_or_limestone", + "is_park_home", + "walls_insulation_thickness", + "external_insulation", + "internal_insulation", + "floor_thermal_transmittance", + "is_to_unheated_space", + "is_to_external_air", + "is_suspended", + "is_solid", + "another_property_below", + "floor_insulation_thickness", + "roof_thermal_transmittance", + "is_pitched", + "is_roof_room", + "is_loft", + "is_flat", + "is_thatched", + "is_at_rafters", + "has_dwelling_above", + "roof_insulation_thickness", + "heater_type", + "system_type", + "thermostat_characteristics", + "heating_scope", + "energy_recovery", + "hotwater_tariff_type", + "extra_features", + "chp_systems", + "distribution_system", + "no_system_present", + "appliance", + "has_radiators", + "has_fan_coil_units", + "has_pipes_in_screed_above_insulation", + "has_pipes_in_insulated_timber_floor", + "has_pipes_in_concrete_slab", + "has_boiler", + "has_air_source_heat_pump", + "has_room_heaters", + "has_electric_storage_heaters", + "has_warm_air", + "has_electric_underfloor_heating", + "has_electric_ceiling_heating", + "has_community_scheme", + "has_ground_source_heat_pump", + "has_no_system_present", + "has_portable_electric_heaters", + "has_water_source_heat_pump", + "has_electric_heat_pump", + "has_micro-cogeneration", + "has_solar_assisted_heat_pump", + "has_exhaust_source_heat_pump", + "has_community_heat_pump", + "has_electric", + "has_mains_gas", + "has_wood_logs", + "has_coal", + "has_oil", + "has_wood_pellets", + "has_anthracite", + "has_dual_fuel_mineral_and_wood", + "has_smokeless_fuel", + "has_lpg", + "has_b30k", + "has_electricaire", + "has_assumed_for_most_rooms", + "has_underfloor_heating", + "thermostatic_control", + "charging_system", + "switch_system", + "no_control", + "dhw_control", + "community_heating", + "multiple_room_thermostats", + "auxiliary_systems", + "trvs", + "rate_control", + "glazing_type", + "fuel_type", + "main-fuel_tariff_type", + "is_community", + "no_individual_heating_or_community_network", + "complex_fuel_type", ] -NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall', - 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame', - 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', - 'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation', - 'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', - 'is_solid', 'another_property_below', 'floor_insulation_thickness', - 'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', - 'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness', - 'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope', - 'energy_recovery', - 'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system', - 'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units', - 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor', - 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters', - 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating', - 'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', - 'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump', - 'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump', - 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', - 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', - 'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', - 'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating', - 'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control', - 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs', - 'rate_control', - 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community', - 'no_individual_heating_or_community_network', 'complex_fuel_type', - ] ENDING_SUFFIX_COMPONENT_COLS = [ - 'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION', - 'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION', - 'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT', - 'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness', - 'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness', - 'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type', - 'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features', - 'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators', - 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor', - 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters', - 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating', - 'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', - 'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump', - 'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump', - 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs', - 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood', - 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms', - 'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control', - 'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs', - 'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community', - 'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter' + "SAP", + "HEAT_DEMAND", + "CARBON", + "TRANSACTION_TYPE", + "MECHANICAL_VENTILATION", + "SECONDHEAT_DESCRIPTION", + "ENERGY_TARIFF", + "SOLAR_WATER_HEATING_FLAG", + "PHOTO_SUPPLY", + "GLAZED_TYPE", + "MULTI_GLAZE_PROPORTION", + "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", + "EXTENSION_COUNT", + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", + "DAYS_TO", + "walls_thermal_transmittance", + "is_park_home", + "walls_insulation_thickness", + "external_insulation", + "internal_insulation", + "floor_thermal_transmittance", + "floor_insulation_thickness", + "roof_thermal_transmittance", + "roof_insulation_thickness", + "heater_type", + "system_type", + "thermostat_characteristics", + "heating_scope", + "energy_recovery", + "hotwater_tariff_type", + "extra_features", + "chp_systems", + "distribution_system", + "no_system_present", + "appliance", + "has_radiators", + "has_fan_coil_units", + "has_pipes_in_screed_above_insulation", + "has_pipes_in_insulated_timber_floor", + "has_pipes_in_concrete_slab", + "has_boiler", + "has_air_source_heat_pump", + "has_room_heaters", + "has_electric_storage_heaters", + "has_warm_air", + "has_electric_underfloor_heating", + "has_electric_ceiling_heating", + "has_community_scheme", + "has_ground_source_heat_pump", + "has_no_system_present", + "has_portable_electric_heaters", + "has_water_source_heat_pump", + "has_electric_heat_pump", + "has_micro-cogeneration", + "has_solar_assisted_heat_pump", + "has_exhaust_source_heat_pump", + "has_community_heat_pump", + "has_electric", + "has_mains_gas", + "has_wood_logs", + "has_coal", + "has_oil", + "has_wood_pellets", + "has_anthracite", + "has_dual_fuel_mineral_and_wood", + "has_smokeless_fuel", + "has_lpg", + "has_b30k", + "has_electricaire", + "has_assumed_for_most_rooms", + "has_underfloor_heating", + "thermostatic_control", + "charging_system", + "switch_system", + "no_control", + "dhw_control", + "community_heating", + "multiple_room_thermostats", + "auxiliary_systems", + "trvs", + "rate_control", + "glazing_type", + "fuel_type", + "main-fuel_tariff_type", + "is_community", + "no_individual_heating_or_community_network", + "complex_fuel_type", + "estimated_perimeter", ] # We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore