From efdef5eb46bf8803761ccbf93f1b31dfa09f9e08 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 10 Oct 2023 05:41:25 +0800 Subject: [PATCH] added recommendation scoring setup process --- backend/Property.py | 86 +++++++++++++++++++ .../app/db/functions/materials_functions.py | 5 ++ backend/app/plan/router.py | 13 +-- backend/app/plan/utils.py | 24 +++++- backend/tests/test_sap_model_prep.py | 0 etl/epc/DataProcessor.py | 26 +++++- etl/epc/property_change_app.py | 11 ++- etl/epc/settings.py | 64 ++++++++++++++ recommendations/config.py | 7 +- 9 files changed, 211 insertions(+), 25 deletions(-) create mode 100644 backend/tests/test_sap_model_prep.py diff --git a/backend/Property.py b/backend/Property.py index 1e8bbaf8..a9bbc69e 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -569,3 +569,89 @@ class Property(Definitions): :return: """ self.wall_type = get_wall_type(**self.walls) + + @staticmethod + def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None): + for k in component_rename_cols: + component_data[f"{rename_prefix}_{k}"] = component_data[k] + + component_data = { + k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols + } + + return component_data + + def get_model_data(self): + """ + This method extracts cleaned data from the property object, which is used in our machine learning models + + This will use many of the cleaned properties, extracted from the epc data, or methods in DataProcessor. 
+ + For future iterations of this, we probably want to implement a singular method in DataProcessor, which can + be used in the etl code and in here + + :return: dictionary of model data to be scored in the model + """ + + drop_cols = ["original_description", "clean_description"] + insulation_drop_cols = ["thermal_transmittance_unit", "is_assumed", "is_valid"] + insulation_rename_cols = ["thermal_transmittance", "insulation_thickness"] + + walls = self._extract_component(self.walls, insulation_rename_cols, insulation_drop_cols + drop_cols, "walls") + roof = self._extract_component(self.roof, insulation_rename_cols, insulation_drop_cols + drop_cols, "roof") + floor = self._extract_component(self.floor, insulation_rename_cols, insulation_drop_cols + drop_cols, "floor") + + windows = self._extract_component(self.windows, [], drop_cols + ["no_data"]) + fuel = self._extract_component(self.main_fuel, ["tariff_type"], drop_cols + ["tariff_type"], "main-fuel") + main_heating = self._extract_component(self.main_heating, [], drop_cols + ["has_assumed"]) + main_heating_controls = self._extract_component(self.main_heating_controls, [], drop_cols) + hotwater = self._extract_component(self.hotwater, ["tariff_type"], drop_cols + ['assumed'], "hotwater") + + # We'll need to clean second heating + second_heating = self.data["secondheat-description"] + + epc_raw_columns = [ + 'TRANSACTION_TYPE', + 'ENERGY_TARIFF', + 'PROPERTY_TYPE', + 'UPRN', + 'NUMBER_OPEN_FIREPLACES', + 'FIXED_LIGHTING_OUTLETS_COUNT', + 'MULTI_GLAZE_PROPORTION', + 'MECHANICAL_VENTILATION', + 'PHOTO_SUPPLY', + 'LOW_ENERGY_LIGHTING', + 'SOLAR_WATER_HEATING_FLAG', + 'BUILT_FORM', + 'GLAZED_TYPE', + 'CONSTITUENCY', + 'NUMBER_HEATED_ROOMS', + 'EXTENSION_COUNT', + ] + epc_raw_data = { + k: self.data[k.lower().replace("_", "-")] for k in epc_raw_columns + } + + property_data = { + **walls, + **roof, + **floor, + **fuel, + **main_heating, + **main_heating_controls, + **hotwater, + **windows, + "SECONDHEAT_DESCRIPTION": 
second_heating, + "DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]), + "SAP": self.data["current-energy-efficiency"], + "CARBON": self.data["co2-emissions-current"], + "HEAT_DEMAND": self.data["energy-consumption-current"], + "estimated_perimeter": self.perimeter, + "CONSTRUCTION_AGE_BAND": self.age_band, + "FLOOR_HEIGHT": self.floor_height, + "NUMBER_HABITABLE_ROOMS": self.number_of_rooms, + "TOTAL_FLOOR_AREA": self.floor_area, + **epc_raw_data + } + + return property_data diff --git a/backend/app/db/functions/materials_functions.py b/backend/app/db/functions/materials_functions.py index f4c38aed..f3c2f316 100644 --- a/backend/app/db/functions/materials_functions.py +++ b/backend/app/db/functions/materials_functions.py @@ -7,6 +7,11 @@ def get_materials(session): """ This function will retrieve all materials from the database. :return: A list of Material objects if successful, an empty list otherwise. + + + TODO: It might not be the best choice to store the materials data in a database table since this + table probably won't be very large and won't be updated that often. It might be better to + store this data in s3 and load it into memory when the app starts up. We will test this """ materials = session.query(Material).filter(Material.is_active).all() diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 83b758e3..9b801348 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -106,10 +106,6 @@ async def trigger_plan(body: PlanTriggerRequest): # The materials data could be cached or local so we don't need to make # consistent requests to the backend for # the same data - # TODO: It might not be the best choice to store the materials data in a database table since thi - # table probably won't be very large and won't be updated that often. It might be better to - # store this data in s3 load it into memory when the app starts up. 
We will test this - logger.info("Reading in materials and cleaned datasets") materials = get_materials(session) materials_by_type = filter_materials(materials) @@ -158,20 +154,15 @@ async def trigger_plan(body: PlanTriggerRequest): recommendations[p.id] = property_recommendations # Finally, we'll prepare data for predicting the impact on SAP - # TODO: We should use the cleaned data from get_components in the data rather than the raw - # values. We should create a method in Property which takes the EPC data and inserts the cleaned - # data - data_processor = DataProcessor(None, newdata=True) - data_processor.insert_data(pd.DataFrame([p.data.copy()])) - data_processor.pre_process() + data_processor.insert_data(pd.DataFrame([p.get_model_data()])) starting_epc_data = data_processor.get_component_features(suffix="_STARTING") ending_epc_data = data_processor.get_component_features(suffix="_ENDING") fixed_data = data_processor.get_fixed_features() # We update the ending record with the recommended updates and we set lodgement date to today - ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at + ending_epc_data["LODGEMENT_DATE_ENDING"] = data_processor.calculate_days_to(created_at) for recommendations_by_type in property_recommendations: for rec in recommendations_by_type: diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index c257fd4e..fae84ae4 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -2,7 +2,10 @@ import pandas as pd from backend.Property import Property from collections import defaultdict from utils.s3 import read_from_s3 + from recommendations.config import UPGRADES_MAP +from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value + from backend.app.db.utils import row2dict from backend.app.config import get_settings import msgpack @@ -85,8 +88,27 @@ def create_recommendation_scoring_data( # We update the description to indicate it's insulated if recommendation["type"] == "wall_insulation": - 
scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]] + # The upgrade made here is to the u-value of the walls and the description of the + # insulation thickness + # We may not have the u-value initially, so we calculate it + if not scoring_dict["walls_thermal_transmittance"]: + scoring_dict["walls_thermal_transmittance"] = get_wall_u_value( + clean_description=property.walls["clean_description"], + age_band=property.age_band, + is_granite_or_whinstone=property.walls["is_granite_or_whinstone"], + is_sandstone_or_limestone=property.walls["is_sandstone_or_limestone"] + ) + + scoring_dict["walls_thermal_transmittance_ENDING"] = get_wall_u_value( + clean_description=UPGRADES_MAP[property.walls["clean_description"]], + age_band=property.age_band, + is_granite_or_whinstone=property.walls["is_granite_or_whinstone"], + is_sandstone_or_limestone=property.walls["is_sandstone_or_limestone"] + ) + scoring_dict["walls_insulation_thickness_ENDING"] = "above average" + elif recommendation["type"] == "floor_insulation": + blah scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]] else: raise NotImplementedError("Implement me") diff --git a/backend/tests/test_sap_model_prep.py b/backend/tests/test_sap_model_prep.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 27bac020..47b1b367 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -9,12 +9,14 @@ from etl.epc.settings import ( AVERAGE_FIXED_FEATURES, BUILT_FORM_REMAP, COLUMNS_TO_MERGE_ON, - COMPONENT_FEATURES, FIXED_FEATURES, COLUMNTYPES, RDSAP_RESPONSE, MAX_SAP_SCORE, fill_na_map, + STARTING_SUFFIX_COMPONENT_COLS, + NO_SUFFIX_COMPONENT_COLS, + ENDING_SUFFIX_COMPONENT_COLS ) from recommendations.rdsap_tables import FLOOR_LEVEL_MAP @@ -500,9 +502,15 @@ class DataProcessor: """ if suffix not in ["_STARTING", "_ENDING"]: - raise Exception("Suffix should be one of 
_STARTING or _ENFING") + raise Exception("Suffix should be one of _STARTING or _ENDING") - return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix) + if suffix == "_STARTING": + starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix) + fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy() + + return pd.concat([starting_cols, fixed_cols], axis=1) + + return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix) def get_fixed_features(self) -> pd.DataFrame: """ @@ -531,3 +539,15 @@ class DataProcessor: df[column] = df[column].astype(bool) return df + + @staticmethod + def calculate_days_to(lodgement_date): + + if isinstance(lodgement_date, str): + return ( + pd.to_datetime(lodgement_date).tz_localize(None) - pd.to_datetime(EARLIEST_EPC_DATE) + ).days + + return ( + pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) + ).dt.days diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index 8b5a5088..a0f5a21c 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -514,12 +514,11 @@ def app(): # Add some temporal features - we look at the days from the standard starting point in time # for the starting and ending date so all records are from a fixed point - data_by_urpn_df["DAYS_TO_STARTING"] = ( - pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE) - ).dt.days - data_by_urpn_df["DAYS_TO_ENDING"] = ( - pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE) - ).dt.days + data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to( + data_by_urpn_df["LODGEMENT_DATE_STARTING"]) + + data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to( + data_by_urpn_df["LODGEMENT_DATE_ENDING"]) data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"]) diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 
9ebb0806..fb8e464d 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -189,3 +189,67 @@ fill_na_map = { "EXTENSION_COUNT": 0, "NUMBER_OPEN_FIREPLACES": 0 } + +################################################################################################ +# These are the features we need for scoring +# We'll likely change how we do this in the future +################################################################################################ + +STARTING_SUFFIX_COMPONENT_COLS = [ + "SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION", + "SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY", + "GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES", + "EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter" +] +NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall', + 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame', + 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', + 'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation', + 'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', + 'is_solid', 'another_property_below', 'floor_insulation_thickness', + 'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', + 'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness', + 'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope', + 'energy_recovery', + 'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system', + 'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units', + 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor', + 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters', + 'has_electric_storage_heaters', 
'has_warm_air', 'has_electric_underfloor_heating', + 'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', + 'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump', + 'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump', + 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', + 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', + 'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', + 'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating', + 'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control', + 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs', + 'rate_control', + 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community', + 'no_individual_heating_or_community_network', 'complex_fuel_type', + ] + +ENDING_SUFFIX_COMPONENT_COLS = [ + 'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION', + 'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION', + 'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT', + 'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness', + 'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness', + 'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type', + 'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features', + 'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators', + 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor', + 'has_pipes_in_concrete_slab', 'has_boiler', 
'has_air_source_heat_pump', 'has_room_heaters', + 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating', + 'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', + 'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump', + 'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump', + 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs', + 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood', + 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms', + 'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control', + 'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs', + 'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community', + 'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter' +] diff --git a/recommendations/config.py b/recommendations/config.py index 20169ffd..242e54c5 100644 --- a/recommendations/config.py +++ b/recommendations/config.py @@ -1,10 +1,9 @@ # This map defines the upgrades that are possible to be recommended by the recommendation engine # For example, -# TODO: once we use cleaned descriptions, this should be updated using the cleaned descriptions UPGRADES_MAP = { - 'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)', - 'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)', - 'Solid, no insulation (assumed)': 'Solid, insulated (assumed)', + 'Solid brick, as built, no insulation': 'Solid brick, as built, insulated', + 'Suspended, no insulation': 'Suspended, insulated', + 'Solid, no insulation': 'Solid, insulated', } PARTIAL_CAVITY_DESCRIPTIONS = [