add starting and ending rooms

This commit is contained in:
Michael Duong 2024-02-15 16:36:29 +00:00
parent 02005254f5
commit 7e36dc6d49
5 changed files with 501 additions and 182 deletions

View file

@ -704,6 +704,9 @@ class EPCDataProcessor:
# We remove EPCs where the tenure is unknown, which is usually an indicator of a new build
self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)]
# We remap zero values to None
self.data.loc[self.data['FLOOR_HEIGHT'] == 0, 'FLOOR_HEIGHT'] = None
def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None:
"""
If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100

View file

@ -17,33 +17,111 @@ from recommendations.recommendation_utils import (
# TODO: Can probably produce this in the property change app and store in S3
BOOLEAN_VARIABLES = [
'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home',
'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending',
'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid',
'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters',
'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation',
'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump',
'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present',
'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration',
'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric',
'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire',
'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending',
'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending',
'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending',
'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending',
'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending',
'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending',
'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending',
'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending',
'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending',
'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending',
'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending',
'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats',
'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network',
'is_community_ending', 'no_individual_heating_or_community_network_ending'
"is_cavity_wall",
"is_filled_cavity",
"is_solid_brick",
"is_system_built",
"is_timber_frame",
"is_granite_or_whinstone",
"is_as_built",
"is_cob",
"is_sandstone_or_limestone",
"is_park_home",
"external_insulation",
"internal_insulation",
"is_park_home_ending",
"external_insulation_ending",
"internal_insulation_ending",
"is_to_unheated_space",
"is_to_external_air",
"is_suspended",
"is_solid",
"another_property_below",
"is_pitched",
"is_roof_room",
"is_loft",
"is_flat",
"is_thatched",
"is_at_rafters",
"has_dwelling_above",
"has_radiators",
"has_fan_coil_units",
"has_pipes_in_screed_above_insulation",
"has_pipes_in_insulated_timber_floor",
"has_pipes_in_concrete_slab",
"has_boiler",
"has_air_source_heat_pump",
"has_room_heaters",
"has_electric_storage_heaters",
"has_warm_air",
"has_electric_underfloor_heating",
"has_electric_ceiling_heating",
"has_community_scheme",
"has_ground_source_heat_pump",
"has_no_system_present",
"has_portable_electric_heaters",
"has_water_source_heat_pump",
"has_electric_heat_pump",
"has_micro-cogeneration",
"has_solar_assisted_heat_pump",
"has_exhaust_source_heat_pump",
"has_community_heat_pump",
"has_electric",
"has_mains_gas",
"has_wood_logs",
"has_coal",
"has_oil",
"has_wood_pellets",
"has_anthracite",
"has_dual_fuel_mineral_and_wood",
"has_smokeless_fuel",
"has_lpg",
"has_b30k",
"has_electricaire",
"has_assumed_for_most_rooms",
"has_underfloor_heating",
"has_radiators_ending",
"has_fan_coil_units_ending",
"has_pipes_in_screed_above_insulation_ending",
"has_pipes_in_insulated_timber_floor_ending",
"has_pipes_in_concrete_slab_ending",
"has_boiler_ending",
"has_air_source_heat_pump_ending",
"has_room_heaters_ending",
"has_electric_storage_heaters_ending",
"has_warm_air_ending",
"has_electric_underfloor_heating_ending",
"has_electric_ceiling_heating_ending",
"has_community_scheme_ending",
"has_ground_source_heat_pump_ending",
"has_no_system_present_ending",
"has_portable_electric_heaters_ending",
"has_water_source_heat_pump_ending",
"has_electric_heat_pump_ending",
"has_micro-cogeneration_ending",
"has_solar_assisted_heat_pump_ending",
"has_exhaust_source_heat_pump_ending",
"has_community_heat_pump_ending",
"has_electric_ending",
"has_mains_gas_ending",
"has_wood_logs_ending",
"has_coal_ending",
"has_oil_ending",
"has_wood_pellets_ending",
"has_anthracite_ending",
"has_dual_fuel_mineral_and_wood_ending",
"has_smokeless_fuel_ending",
"has_lpg_ending",
"has_b30k_ending",
"has_electricaire_ending",
"has_assumed_for_most_rooms_ending",
"has_underfloor_heating_ending",
"multiple_room_thermostats",
"multiple_room_thermostats_ending",
"is_community",
"no_individual_heating_or_community_network",
"is_community_ending",
"no_individual_heating_or_community_network_ending",
]
@ -330,14 +408,16 @@ class TrainingDataset(BaseDataset):
self.df["estimated_perimeter_starting"] = self.df.apply(
lambda row: estimate_perimeter(
row["ground_floor_area_starting"],
row["number_habitable_rooms"] / row["estimated_number_of_floors"],
row["number_habitable_rooms_starting"]
/ row["estimated_number_of_floors"],
),
axis=1,
)
self.df["estimated_perimeter_ending"] = self.df.apply(
lambda row: estimate_perimeter(
row["ground_floor_area_starting"],
row["number_habitable_rooms"] / row["estimated_number_of_floors"],
row["number_habitable_rooms_ending"]
/ row["estimated_number_of_floors"],
),
axis=1,
)
@ -647,7 +727,11 @@ class TrainingDataset(BaseDataset):
for col in missings.index:
unique_values = self.df[col].unique()
if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES):
if (
(True in unique_values)
or (False in unique_values)
or (col in BOOLEAN_VARIABLES)
):
self.df[col] = self.df[col].fillna(False)
if "none" in unique_values:
self.df[col] = self.df[col].fillna("none")

View file

@ -19,11 +19,12 @@ from etl.epc.settings import (
CORE_COMPONENT_FEATURES,
EFFICIENCY_FEATURES,
POTENTIAL_COLUMNS,
ROOM_FEATURES,
)
# TODO: change in setting file
MANDATORY_FIXED_FEATURES = [x.lower() for x in MANDATORY_FIXED_FEATURES]
LATEST_FIELD = [x.lower() for x in LATEST_FIELD]
LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES]
COMPONENT_FEATURES = [x.lower() for x in COMPONENT_FEATURES]
RDSAP_RESPONSE = RDSAP_RESPONSE.lower()
HEAT_DEMAND_RESPONSE = HEAT_DEMAND_RESPONSE.lower()
@ -33,6 +34,7 @@ EFFICIENCY_FEATURES = [x.lower() for x in EFFICIENCY_FEATURES]
POTENTIAL_COLUMNS = [x.lower() for x in POTENTIAL_COLUMNS]
VARIABLE_DATA_FEATURES = (
COMPONENT_FEATURES
+ ROOM_FEATURES
+ EFFICIENCY_FEATURES
+ POTENTIAL_COLUMNS
+ ["lodgement_date", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
@ -78,9 +80,9 @@ class EPCPipeline:
run_mode="training",
epc_local_file="certificates.csv",
epc_bucket_name="retrofit-data-dev",
epc_cleaning_dataset_key="sap_change_model/cleaning_dataset.parquet",
epc_all_equal_rows_key="sap_change_model/all_equal_rows.parquet",
epc_compiled_dataset_key="sap_change_model/dataset.parquet",
epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet",
epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet",
epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet",
):
"""
:param directories: List of directories to process
@ -283,9 +285,14 @@ class EPCPipeline:
latest_record: EPCRecord = epc_records[idx2]
# Auto sort the records so that the record with highest RDSAP score is always record1
difference_record: EPCDifferenceRecord = latest_record - earliest_record
# TODO: Use method above instead of overloading operator
difference_record.append_fixed_data(fixed_data)
difference_record: EPCDifferenceRecord = (
latest_record.create_EPCDifferenceRecord(
other=earliest_record, fixed_data=fixed_data
)
)
# difference_record: EPCDifferenceRecord = latest_record - earliest_record
# # TODO: Use method above instead of overloading operator
# difference_record.append_fixed_data(fixed_data)
# TODO: Pull out RDSAP_CHANGE to a variable
if difference_record.get("rdsap_change") == 0:

View file

@ -19,6 +19,7 @@ from etl.epc.settings import (
CARBON_RESPONSE,
COMPONENT_FEATURES,
EFFICIENCY_FEATURES,
ROOM_FEATURES,
)
from recommendations.recommendation_utils import estimate_number_of_floors
from utils.s3 import read_dataframe_from_s3_parquet
@ -83,6 +84,8 @@ class EPCRecord:
current_energy_efficiency: int = None
energy_consumption_current: int = None
co2_emissions_current: float = None
number_habitable_rooms: float = None
number_heated_rooms: float = None
# u_values_walls = None
# u_values_roof = None
@ -268,6 +271,12 @@ class EPCRecord:
self.co2_emissions_current: float = float(
self.prepared_epc["co2_emissions_current"]
)
self.number_habitable_rooms: float = float(
self.prepared_epc["number_habitable_rooms"]
)
self.number_heated_rooms: float = float(
self.prepared_epc["number_heated_rooms"]
)
def _identify_delta_between_prepared_and_original_records(self):
"""
@ -380,8 +389,9 @@ class EPCRecord:
raise ValueError("EPC Recrod doesn not contain epc data")
self.prepared_epc["floor-level"] = (
FLOOR_LEVEL_MAP[self.prepared_epc["floor-level"]] if
self.prepared_epc["floor-level"] not in DATA_ANOMALY_MATCHES else None
FLOOR_LEVEL_MAP[self.prepared_epc["floor-level"]]
if self.prepared_epc["floor-level"] not in DATA_ANOMALY_MATCHES
else None
)
def _clean_number_lighting_outlets(self):
@ -426,9 +436,16 @@ class EPCRecord:
cleaning_data.columns = [x.upper() for x in cleaning_data.columns]
cleaned_property_data = EPCDataProcessor.apply_averages_cleaning(
data_to_clean=self.epc_record_as_dataframe("prepared_epc", replace_empty_string=True),
data_to_clean=self.epc_record_as_dataframe(
"prepared_epc", replace_empty_string=True
),
cleaning_data=cleaning_data,
cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
cols_to_merge_on=[
"PROPERTY_TYPE",
"BUILT_FORM",
"CONSTRUCTION_AGE_BAND",
"LOCAL_AUTHORITY",
],
)
self.prepared_epc["fixed-lighting-outlets-count"] = round(
cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0]
@ -542,9 +559,14 @@ class EPCRecord:
"N": False,
}
self.prepared_epc["mains-gas-flag"] = None if (
self.prepared_epc["mains-gas-flag"] == "" or self.prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES
) else mains_gas_map[self.prepared_epc["mains-gas-flag"]]
self.prepared_epc["mains-gas-flag"] = (
None
if (
self.prepared_epc["mains-gas-flag"] == ""
or self.prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES
)
else mains_gas_map[self.prepared_epc["mains-gas-flag"]]
)
def _clean_heat_loss_corridor(self):
"""
@ -553,11 +575,7 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
valid_values = [
"no corridor",
"unheated corridor",
"heated corridor"
]
valid_values = ["no corridor", "unheated corridor", "heated corridor"]
boolean_map = {
"no corridor": False,
@ -566,19 +584,23 @@ class EPCRecord:
}
self.prepared_epc["heat-loss-corridor"] = (
"no corridor" if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES else
self.prepared_epc["heat-loss-corridor"]
"no corridor"
if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES
else self.prepared_epc["heat-loss-corridor"]
)
if self.prepared_epc["heat-loss-corridor"] not in valid_values:
self.prepared_epc["heat-loss-corridor"] = "no corridor"
self.prepared_epc["unheated-corridor-length"] = (
float(self.prepared_epc["unheated-corridor-length"]) if
self.prepared_epc["unheated-corridor-length"] not in ["", None] else None
float(self.prepared_epc["unheated-corridor-length"])
if self.prepared_epc["unheated-corridor-length"] not in ["", None]
else None
)
# We create boolean versions of heat-loss-corridor
self.heat_loss_corridor_bool = boolean_map[self.prepared_epc["heat-loss-corridor"]]
self.heat_loss_corridor_bool = boolean_map[
self.prepared_epc["heat-loss-corridor"]
]
def _clean_count_variables(self):
"""
@ -591,7 +613,7 @@ class EPCRecord:
"number-open-fireplaces",
"extension-count",
"flat-storey-count",
"number-habitable-rooms"
"number-habitable-rooms",
]
null_attributes = ["flat-storey-count", "number-habitable-rooms"]
@ -615,9 +637,11 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self.prepared_epc['wind-turbine-count'] = int(
self.prepared_epc['wind-turbine-count']
) if self.prepared_epc['wind-turbine-count'] not in DATA_ANOMALY_MATCHES else None
self.prepared_epc["wind-turbine-count"] = (
int(self.prepared_epc["wind-turbine-count"])
if self.prepared_epc["wind-turbine-count"] not in DATA_ANOMALY_MATCHES
else None
)
def _clean_solar_hot_water(self):
"""
@ -626,12 +650,7 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
value_map = {
"Y": "Y",
"N": "N",
"": "N",
None: "N"
}
value_map = {"Y": "Y", "N": "N", "": "N", None: "N"}
boolean_map = {
"Y": True,
@ -643,7 +662,9 @@ class EPCRecord:
]
# Create a boolean version for storage in the database
self.solar_water_heating_flag_bool = boolean_map[self.prepared_epc['solar-water-heating-flag']]
self.solar_water_heating_flag_bool = boolean_map[
self.prepared_epc["solar-water-heating-flag"]
]
def _clean_solar_pv(self):
"""
@ -652,8 +673,11 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self.prepared_epc['photo-supply'] = float(self.prepared_epc['photo-supply']) if (
self.prepared_epc['photo-supply'] not in DATA_ANOMALY_MATCHES) else None
self.prepared_epc["photo-supply"] = (
float(self.prepared_epc["photo-supply"])
if (self.prepared_epc["photo-supply"] not in DATA_ANOMALY_MATCHES)
else None
)
def _clean_energy(self):
"""
@ -676,7 +700,7 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self.prepared_epc['built-form'] = BUILT_FORM_REMAP.get(
self.prepared_epc["built-form"] = BUILT_FORM_REMAP.get(
self.prepared_epc["built-form"], self.prepared_epc["built-form"]
)
@ -691,8 +715,10 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band(
self.prepared_epc["construction-age-band"]
self.prepared_epc["construction-age-band"] = (
EPCDataProcessor.clean_construction_age_band(
self.prepared_epc["construction-age-band"]
)
)
if self.prepared_epc["construction-age-band"] in DATA_ANOMALY_MATCHES:
@ -703,15 +729,20 @@ class EPCRecord:
old_record["lodgement-datetime"]
for old_record in self.old_data
if old_record["construction-age-band"]
not in DATA_ANOMALY_MATCHES
not in DATA_ANOMALY_MATCHES
]
)
most_recent = [old_record for old_record in self.old_data if
old_record["lodgement-datetime"] == max_datetime]
most_recent = [
old_record
for old_record in self.old_data
if old_record["lodgement-datetime"] == max_datetime
]
self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band(
most_recent[0]["construction-age-band"]
self.prepared_epc["construction-age-band"] = (
EPCDataProcessor.clean_construction_age_band(
most_recent[0]["construction-age-band"]
)
)
self.construction_age_band = self.prepared_epc["construction-age-band"]
@ -721,7 +752,7 @@ class EPCRecord:
self.age_band is None
):
self.age_band = "L"
self.construction_age_band = 'England and Wales: 2012 onwards'
self.construction_age_band = "England and Wales: 2012 onwards"
self.prepared_epc["construction-age-band"] = self.construction_age_band
if self.age_band is None:
@ -760,10 +791,10 @@ class EPCRecord:
"""
This method will clean the ventilation, if empty or invalid
"""
self.prepared_epc['mechanical-ventilation'] = None if (
self.prepared_epc['mechanical-ventilation'] in DATA_ANOMALY_MATCHES
) else (
self.prepared_epc['mechanical-ventilation']
self.prepared_epc["mechanical-ventilation"] = (
None
if (self.prepared_epc["mechanical-ventilation"] in DATA_ANOMALY_MATCHES)
else (self.prepared_epc["mechanical-ventilation"])
)
def _field_validation(self):
@ -841,6 +872,20 @@ class EPCRecord:
f"{validation_config['range']}"
)
def create_EPCDifferenceRecord(self, other, fixed_data, auto_sort: bool = True):
    """
    Build the EPCDifferenceRecord between this record and ``other`` and attach
    the fixed data to it.

    :param other: The EPCRecord to compare against. It is passed to
        EPCDifferenceRecord as ``record2``; this record is passed as ``record1``.
    :param fixed_data: Passed unchanged to
        ``EPCDifferenceRecord.append_fixed_data``.
    :param auto_sort: Forwarded to EPCDifferenceRecord. NOTE(review): presumably
        controls whether the two records are re-ordered before differencing -
        confirm against EPCDifferenceRecord.
    :return: The EPCDifferenceRecord, after ``append_fixed_data`` has been called.
    :raises ValueError: If ``other`` is not an EPCRecord.
    """
    if not isinstance(other, EPCRecord):
        # The previous message was copied from __sub__ and talked about
        # subtraction; report the real operation and the offending type instead.
        raise ValueError(
            "create_EPCDifferenceRecord expects an EPCRecord, "
            f"got {type(other).__name__}"
        )

    difference_record = EPCDifferenceRecord(
        record1=self, record2=other, auto_sort=auto_sort
    )
    difference_record.append_fixed_data(fixed_data)
    return difference_record
def __sub__(self, other):
"""
This method will return the difference between two EPC records
@ -848,6 +893,8 @@ class EPCRecord:
if not isinstance(other, EPCRecord):
raise ValueError("Can only subtract EPCRecord from EPCRecord")
print("Deprecated method, use create_EPCDifferenceRecord instead")
difference_record = EPCDifferenceRecord(
record1=self, record2=other, auto_sort=True
)
@ -962,7 +1009,7 @@ class EPCDifferenceRecord:
CARBON_RESPONSE
)
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES
ending_record = self.record2.get(
component_variables + ["lodgement_date"],
return_asdict=True,

View file

@ -28,18 +28,18 @@ DATA_ANOMALY_MATCHES = {
# The Building Emission Rate (BER) data field for non-domestic buildings may contain a blank value. The BER
# was only lodged on the register from 7 March 2010.
# NOTE: the trailing comma is required. Without it Python joins this literal with the next
# string literal in the collection ("NULL") into the single entry "BlankNULL", so neither
# "Blank" nor "NULL" would be matched.
"Blank",
# There are currently just over 8,600 records where the local authority identifier is null. This is due to
# the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
# lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
# manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
# etc). These records are being published for completeness. An ongoing process to manage these manually added
# There are currently just over 8,600 records where the local authority identifier is null. This is due to
# the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
# lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
# manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
# etc). These records are being published for completeness. An ongoing process to manage these manually added
# addresses will take time to develop to deal with these and future anomalies.
#
# There are several fields within the lodged data where it is possible to enter multiple entries to cater for
# different data_types of build within a single property, i.e. extensions. This results in multiple entries for
# the description fields for floor, roof and wall. For the purposes of this data release only the information
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
# value in this first field it means that sometimes the first field in a multiple entry description field may
# There are several fields within the lodged data where it is possible to enter multiple entries to cater for
# different data_types of build within a single property, i.e. extensions. This results in multiple entries for
# the description fields for floor, roof and wall. For the purposes of this data release only the information
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
# value in this first field it means that sometimes the first field in a multiple entry description field may
# contain a null value. A resolution to correct these anomalies will be considered for future data releases.
"NULL",
# We sometimes see fields populated with just an empty string.
@ -163,17 +163,20 @@ CORE_COMPONENT_FEATURES = [
]
EFFICIENCY_FEATURES = [
'HOT_WATER_ENERGY_EFF',
'FLOOR_ENERGY_EFF',
'WINDOWS_ENERGY_EFF',
'WALLS_ENERGY_EFF',
'SHEATING_ENERGY_EFF',
'ROOF_ENERGY_EFF',
'MAINHEAT_ENERGY_EFF',
'MAINHEATC_ENERGY_EFF',
'LIGHTING_ENERGY_EFF'
"HOT_WATER_ENERGY_EFF",
"FLOOR_ENERGY_EFF",
"WINDOWS_ENERGY_EFF",
"WALLS_ENERGY_EFF",
"SHEATING_ENERGY_EFF",
"ROOF_ENERGY_EFF",
"MAINHEAT_ENERGY_EFF",
"MAINHEATC_ENERGY_EFF",
"LIGHTING_ENERGY_EFF",
]
# Room-count columns. Unlike the upper-case feature lists in this module,
# these names are already lower-case.
ROOM_FEATURES = [
    "number_habitable_rooms",
    "number_heated_rooms",
]
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
"TRANSACTION_TYPE",
"ENERGY_TARIFF", # Not sure if this is relevant
@ -184,10 +187,10 @@ COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
]
POTENTIAL_COLUMNS = [
'POTENTIAL_ENERGY_EFFICIENCY',
'ENVIRONMENT_IMPACT_POTENTIAL',
'ENERGY_CONSUMPTION_POTENTIAL',
'CO2_EMISSIONS_POTENTIAL',
"POTENTIAL_ENERGY_EFFICIENCY",
"ENVIRONMENT_IMPACT_POTENTIAL",
"ENERGY_CONSUMPTION_POTENTIAL",
"CO2_EMISSIONS_POTENTIAL",
# We don't include cost features for the moment
# 'LIGHTING_COST_POTENTIAL',
# 'HEATING_COST_POTENTIAL',
@ -237,30 +240,55 @@ DATA_PROCESSOR_SETTINGS = {
# This has a manual mapping of the column types required
COLUMNTYPES = {
'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64',
'CONSTRUCTION_AGE_BAND': 'object',
'TRANSACTION_TYPE': 'object',
'WALLS_DESCRIPTION': 'object',
'FLOOR_DESCRIPTION': 'object',
'LIGHTING_DESCRIPTION': 'object',
'ROOF_DESCRIPTION': 'object',
'MAINHEAT_DESCRIPTION': 'object',
'HOTWATER_DESCRIPTION': 'object', 'MAIN_FUEL': 'object',
'MECHANICAL_VENTILATION': 'object',
'SECONDHEAT_DESCRIPTION': 'object', 'ENERGY_TARIFF': 'object',
'SOLAR_WATER_HEATING_FLAG': 'object', 'PHOTO_SUPPLY': 'float64',
'WINDOWS_DESCRIPTION': 'object',
'GLAZED_TYPE': 'object',
'MULTI_GLAZE_PROPORTION': 'float64',
'LOW_ENERGY_LIGHTING': 'float64',
'NUMBER_OPEN_FIREPLACES': 'float64',
'MAINHEATCONT_DESCRIPTION': 'object',
'EXTENSION_COUNT': 'float64',
'LODGEMENT_DATE': 'object',
**dict(zip(EFFICIENCY_FEATURES, ['object', ] * len(EFFICIENCY_FEATURES))),
**dict(zip(POTENTIAL_COLUMNS, ['float64', ] * len(POTENTIAL_COLUMNS)))
"UPRN": "object",
"TOTAL_FLOOR_AREA": "float64",
"FLOOR_HEIGHT": "float64",
"PROPERTY_TYPE": "object",
"BUILT_FORM": "object",
"CONSTITUENCY": "object",
"NUMBER_HABITABLE_ROOMS": "float64",
"NUMBER_HEATED_ROOMS": "float64",
"FIXED_LIGHTING_OUTLETS_COUNT": "float64",
"CONSTRUCTION_AGE_BAND": "object",
"TRANSACTION_TYPE": "object",
"WALLS_DESCRIPTION": "object",
"FLOOR_DESCRIPTION": "object",
"LIGHTING_DESCRIPTION": "object",
"ROOF_DESCRIPTION": "object",
"MAINHEAT_DESCRIPTION": "object",
"HOTWATER_DESCRIPTION": "object",
"MAIN_FUEL": "object",
"MECHANICAL_VENTILATION": "object",
"SECONDHEAT_DESCRIPTION": "object",
"ENERGY_TARIFF": "object",
"SOLAR_WATER_HEATING_FLAG": "object",
"PHOTO_SUPPLY": "float64",
"WINDOWS_DESCRIPTION": "object",
"GLAZED_TYPE": "object",
"MULTI_GLAZE_PROPORTION": "float64",
"LOW_ENERGY_LIGHTING": "float64",
"NUMBER_OPEN_FIREPLACES": "float64",
"MAINHEATCONT_DESCRIPTION": "object",
"EXTENSION_COUNT": "float64",
"LODGEMENT_DATE": "object",
**dict(
zip(
EFFICIENCY_FEATURES,
[
"object",
]
* len(EFFICIENCY_FEATURES),
)
),
**dict(
zip(
POTENTIAL_COLUMNS,
[
"float64",
]
* len(POTENTIAL_COLUMNS),
)
),
}
# For modelling, we don't allow records with more than 100 SAP points
@ -280,7 +308,7 @@ fill_na_map = {
"LOW_ENERGY_LIGHTING": 0,
"MAINHEATCONT_DESCRIPTION": "Unknown",
"EXTENSION_COUNT": 0,
"NUMBER_OPEN_FIREPLACES": 0
"NUMBER_OPEN_FIREPLACES": 0,
}
################################################################################################
@ -289,62 +317,212 @@ fill_na_map = {
################################################################################################
STARTING_SUFFIX_COMPONENT_COLS = [
"SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
"SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
"GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
"EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter"
"SAP",
"HEAT_DEMAND",
"CARBON",
"TRANSACTION_TYPE",
"MECHANICAL_VENTILATION",
"SECONDHEAT_DESCRIPTION",
"ENERGY_TARIFF",
"SOLAR_WATER_HEATING_FLAG",
"PHOTO_SUPPLY",
"GLAZED_TYPE",
"MULTI_GLAZE_PROPORTION",
"LOW_ENERGY_LIGHTING",
"NUMBER_OPEN_FIREPLACES",
"EXTENSION_COUNT",
"TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT",
"DAYS_TO",
"estimated_perimeter",
]
# Component columns, written as consecutive sub-lists that are concatenated in
# order. The section labels are inferred from the column names only (TODO
# confirm); the resulting list content and ordering are unchanged.
NO_SUFFIX_COMPONENT_COLS = (
    # Walls
    [
        "walls_thermal_transmittance",
        "is_cavity_wall",
        "is_filled_cavity",
        "is_solid_brick",
        "is_system_built",
        "is_timber_frame",
        "is_granite_or_whinstone",
        "is_as_built",
        "is_cob",
        "is_sandstone_or_limestone",
        "is_park_home",
        "walls_insulation_thickness",
        "external_insulation",
        "internal_insulation",
    ]
    # Floor
    + [
        "floor_thermal_transmittance",
        "is_to_unheated_space",
        "is_to_external_air",
        "is_suspended",
        "is_solid",
        "another_property_below",
        "floor_insulation_thickness",
    ]
    # Roof
    + [
        "roof_thermal_transmittance",
        "is_pitched",
        "is_roof_room",
        "is_loft",
        "is_flat",
        "is_thatched",
        "is_at_rafters",
        "has_dwelling_above",
        "roof_insulation_thickness",
    ]
    # Categorical system descriptors
    + [
        "heater_type",
        "system_type",
        "thermostat_characteristics",
        "heating_scope",
        "energy_recovery",
        "hotwater_tariff_type",
        "extra_features",
        "chp_systems",
        "distribution_system",
        "no_system_present",
        "appliance",
    ]
    # has_* flags
    + [
        "has_radiators",
        "has_fan_coil_units",
        "has_pipes_in_screed_above_insulation",
        "has_pipes_in_insulated_timber_floor",
        "has_pipes_in_concrete_slab",
        "has_boiler",
        "has_air_source_heat_pump",
        "has_room_heaters",
        "has_electric_storage_heaters",
        "has_warm_air",
        "has_electric_underfloor_heating",
        "has_electric_ceiling_heating",
        "has_community_scheme",
        "has_ground_source_heat_pump",
        "has_no_system_present",
        "has_portable_electric_heaters",
        "has_water_source_heat_pump",
        "has_electric_heat_pump",
        "has_micro-cogeneration",
        "has_solar_assisted_heat_pump",
        "has_exhaust_source_heat_pump",
        "has_community_heat_pump",
        "has_electric",
        "has_mains_gas",
        "has_wood_logs",
        "has_coal",
        "has_oil",
        "has_wood_pellets",
        "has_anthracite",
        "has_dual_fuel_mineral_and_wood",
        "has_smokeless_fuel",
        "has_lpg",
        "has_b30k",
        "has_electricaire",
        "has_assumed_for_most_rooms",
        "has_underfloor_heating",
    ]
    # Controls
    + [
        "thermostatic_control",
        "charging_system",
        "switch_system",
        "no_control",
        "dhw_control",
        "community_heating",
        "multiple_room_thermostats",
        "auxiliary_systems",
        "trvs",
        "rate_control",
    ]
    # Glazing
    + [
        "glazing_type",
    ]
    # Fuel
    + [
        "fuel_type",
        "main-fuel_tariff_type",
        "is_community",
        "no_individual_heating_or_community_network",
        "complex_fuel_type",
    ]
)
NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall',
'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone',
'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation',
'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended',
'is_solid', 'another_property_below', 'floor_insulation_thickness',
'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat',
'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness',
'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope',
'energy_recovery',
'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system',
'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units',
'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas',
'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control',
'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
'rate_control',
'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
'no_individual_heating_or_community_network', 'complex_fuel_type',
]
ENDING_SUFFIX_COMPONENT_COLS = [
'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION',
'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION',
'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT',
'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness',
'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness',
'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type',
'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features',
'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators',
'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs',
'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood',
'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms',
'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control',
'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
"SAP",
"HEAT_DEMAND",
"CARBON",
"TRANSACTION_TYPE",
"MECHANICAL_VENTILATION",
"SECONDHEAT_DESCRIPTION",
"ENERGY_TARIFF",
"SOLAR_WATER_HEATING_FLAG",
"PHOTO_SUPPLY",
"GLAZED_TYPE",
"MULTI_GLAZE_PROPORTION",
"LOW_ENERGY_LIGHTING",
"NUMBER_OPEN_FIREPLACES",
"EXTENSION_COUNT",
"TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT",
"DAYS_TO",
"walls_thermal_transmittance",
"is_park_home",
"walls_insulation_thickness",
"external_insulation",
"internal_insulation",
"floor_thermal_transmittance",
"floor_insulation_thickness",
"roof_thermal_transmittance",
"roof_insulation_thickness",
"heater_type",
"system_type",
"thermostat_characteristics",
"heating_scope",
"energy_recovery",
"hotwater_tariff_type",
"extra_features",
"chp_systems",
"distribution_system",
"no_system_present",
"appliance",
"has_radiators",
"has_fan_coil_units",
"has_pipes_in_screed_above_insulation",
"has_pipes_in_insulated_timber_floor",
"has_pipes_in_concrete_slab",
"has_boiler",
"has_air_source_heat_pump",
"has_room_heaters",
"has_electric_storage_heaters",
"has_warm_air",
"has_electric_underfloor_heating",
"has_electric_ceiling_heating",
"has_community_scheme",
"has_ground_source_heat_pump",
"has_no_system_present",
"has_portable_electric_heaters",
"has_water_source_heat_pump",
"has_electric_heat_pump",
"has_micro-cogeneration",
"has_solar_assisted_heat_pump",
"has_exhaust_source_heat_pump",
"has_community_heat_pump",
"has_electric",
"has_mains_gas",
"has_wood_logs",
"has_coal",
"has_oil",
"has_wood_pellets",
"has_anthracite",
"has_dual_fuel_mineral_and_wood",
"has_smokeless_fuel",
"has_lpg",
"has_b30k",
"has_electricaire",
"has_assumed_for_most_rooms",
"has_underfloor_heating",
"thermostatic_control",
"charging_system",
"switch_system",
"no_control",
"dhw_control",
"community_heating",
"multiple_room_thermostats",
"auxiliary_systems",
"trvs",
"rate_control",
"glazing_type",
"fuel_type",
"main-fuel_tariff_type",
"is_community",
"no_individual_heating_or_community_network",
"complex_fuel_type",
"estimated_perimeter",
]
# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore