diff --git a/backend/Property.py b/backend/Property.py index 82108bbb..19f15b02 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -11,11 +11,17 @@ from utils.s3 import read_dataframe_from_s3_parquet from etl.epc.settings import DATA_ANOMALY_MATCHES from recommendations.rdsap_tables import FLOOR_LEVEL_MAP from recommendations.recommendation_utils import ( - estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows + estimate_perimeter, + get_wall_type, + estimate_external_wall_area, + esimtate_pitched_roof_area, + estimate_windows, ) -ENVIRONMENT = os.environ.get('ENVIRONMENT', 'dev') -DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT == 'dev' else None) +ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev") +DATA_BUCKET = os.environ.get( + "DATA_BUCKET", "retrofit-data-dev" if ENVIRONMENT == "dev" else None +) logger = setup_logger() @@ -30,7 +36,7 @@ class Property: "roof-description": "roof", "walls-description": "walls", "windows-description": "windows", - "lighting-description": "lighting" + "lighting-description": "lighting", } floor = None @@ -57,7 +63,9 @@ class Property: self.address = address self.postcode = postcode - self.data = {k.replace("_", "-"): v for k, v in epc_record.get("prepared_epc").items()} + self.data = { + k.replace("_", "-"): v for k, v in epc_record.get("prepared_epc").items() + } self.old_data = epc_record.get("old_data") self.property_dimensions = None @@ -92,7 +100,9 @@ class Property: "wind_turbine": epc_record.prepared_epc.get("wind_turbine_count"), } self.number_of_open_fireplaces = { - "number_of_open_fireplaces": epc_record.prepared_epc.get("number_open_fireplaces"), + "number_of_open_fireplaces": epc_record.prepared_epc.get( + "number_open_fireplaces" + ), } self.number_of_extensions = { "number_of_extensions": epc_record.prepared_epc.get("extension_count"), @@ -105,13 +115,15 @@ class Property: "length": epc_record.prepared_epc.get("unheated_corridor_length"), "heat_loss_corridor_boolean": epc_record.get("heat_loss_corridor_bool"), } - self.mains_gas = epc_record.prepared_epc.get('mains_gas_flag') - self.floor_height = epc_record.prepared_epc.get('floor_height') + self.mains_gas = epc_record.prepared_epc.get("mains_gas_flag") + self.floor_height = epc_record.prepared_epc.get("floor_height") self.insulation_wall_area = None - self.floor_area = epc_record.prepared_epc.get('total_floor_area') + self.floor_area = epc_record.prepared_epc.get("total_floor_area") self.pitched_roof_area = None self.insulation_floor_area = None - self.number_lighting_outlets = epc_record.prepared_epc.get("fixed_lighting_outlets_count") + self.number_lighting_outlets = epc_record.prepared_epc.get( + "fixed_lighting_outlets_count" + ) self.floor_level = None self.number_of_windows = None self.solar_pv_percentage = None @@ -128,18 +140,33 @@ class Property: It will be the same starting and ending EPC, as we don't have the expected EPC yet """ - difference_record = self.epc_record - self.epc_record + # difference_record = self.epc_record - self.epc_record # TODO: change these lower and replace in the settings file + print( + "CHANGE THE LATEST FIELD TO REMOVE NUMBER HABITABLE ROOMS IF WE WANT TO USE STARTING/ENDING" + ) fixed_data_col_names = MANDATORY_FIXED_FEATURES + LATEST_FIELD print("NEED TO CHANGE THE DASH TO LOWER CASE") - fixed_data_col_names = [x.lower().replace("_", "-") for x in fixed_data_col_names] + fixed_data_col_names = [ + x.lower().replace("_", "-") for x in fixed_data_col_names + ] - fixed_data = {k.replace("-", "_"): v for k, v in self.data.items() if k in fixed_data_col_names} + fixed_data = { + k.replace("-", "_"): v + for k, v in self.data.items() + if k in fixed_data_col_names + } - difference_record.append_fixed_data(fixed_data) + # difference_record.append_fixed_data(fixed_data) - self.base_difference_record = TrainingDataset(datasets=[difference_record], cleaned_lookup=cleaned_lookup) + difference_record = self.epc_record.create_EPCDifferenceRecord( + self.epc_record, fixed_data + ) + + self.base_difference_record = TrainingDataset( + datasets=[difference_record], cleaned_lookup=cleaned_lookup + ) # TODO: adjust the base difference record with the previously calculated u values + features # estimated_perimeter is different to the perimeter in the epc record @@ -147,9 +174,7 @@ class Property: # self.base_difference_record.df def adjust_difference_record_with_recommendations( - self, - property_recommendations, - property_representative_recommendations + self, property_recommendations, property_representative_recommendations ): """ This method will adjust the difference record, based on the recommendations made for the property @@ -161,13 +186,23 @@ class Property: """ self.recommendations_scoring_data = [] - phases = sorted([r[0]["phase"] for r in property_recommendations if r[0]["phase"] is not None]) + phases = sorted( + [ + r[0]["phase"] + for r in property_recommendations + if r[0]["phase"] is not None + ] + ) for phase in phases: - property_recommendations_by_phase = [r for r in property_recommendations if r[0]["phase"] == phase][0] + property_recommendations_by_phase = [ + r for r in property_recommendations if r[0]["phase"] == phase + ][0] previous_phases = [p for p in phases if p < phase] previous_phase_representatives = [ - r for r in property_representative_recommendations if r["phase"] in previous_phases + r + for r in property_representative_recommendations + if r["phase"] in previous_phases ] # For solid wall insulation, we will actually have 2 representative recommendations, since we consider # both internal and external wall insulation as possible measures. We will use the representative that @@ -175,15 +210,20 @@ class Property: # Take the representative with the lowest efficiency, by phase # To be safe, we sort by phase - previous_phase_representatives = sorted(previous_phase_representatives, key=lambda x: x['phase']) + previous_phase_representatives = sorted( + previous_phase_representatives, key=lambda x: x["phase"] + ) previous_phase_representatives = [ - min(group, key=lambda x: x['efficiency']) for _, group in groupby( - previous_phase_representatives, key=lambda x: x['phase'] + min(group, key=lambda x: x["efficiency"]) + for _, group in groupby( + previous_phase_representatives, key=lambda x: x["phase"] ) ] - recommendation_record = self.base_difference_record.df.to_dict("records")[0].copy() + recommendation_record = self.base_difference_record.df.to_dict("records")[ + 0 + ].copy() for rec in property_recommendations_by_phase: # We simulate the impact of the recommendation at this current phase, and all of the prior phases @@ -195,13 +235,16 @@ class Property: property_id=self.id, recommendation_record=recommendation_record, recommendations=previous_phase_representatives + [rec], - primary_recommendation_id=rec["recommendation_id"] + primary_recommendation_id=rec["recommendation_id"], ) self.recommendations_scoring_data.append(scoring_dict) @staticmethod def create_recommendation_scoring_data( - property_id, recommendation_record, recommendations: list, primary_recommendation_id: int + property_id, + recommendation_record, + recommendations: list, + primary_recommendation_id: int, ): """ This function will iterate through a list of recommendations and apply a simulation for each recommendation @@ -216,7 +259,9 @@ class Property: output = recommendation_record.copy() for col in [ - "walls_insulation_thickness", "floor_insulation_thickness", "roof_insulation_thickness" + "walls_insulation_thickness", + "floor_insulation_thickness", + "roof_insulation_thickness", ]: if output[col] is None: output[col] = "none" @@ -226,11 +271,15 @@ class Property: # We update the description to indicate it's insulated if recommendation["type"] in [ - "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" + "internal_wall_insulation", + "external_wall_insulation", + "cavity_wall_insulation", ]: # The upgrade made here is to the u-value of the walls and the description of the # insulation thickness - output["walls_thermal_transmittance_ending"] = recommendation["new_u_value"] + output["walls_thermal_transmittance_ending"] = recommendation[ + "new_u_value" + ] # Setting the insulation thickness here to above average should be tested further because we # don't see a high volume of instances for this output["walls_insulation_thickness_ending"] = "average" @@ -263,10 +312,14 @@ class Property: # Update description to indicate it's insulate if recommendation["type"] in [ - "solid_floor_insulation", "suspended_floor_insulation", "exposed_floor_insulation" + "solid_floor_insulation", + "suspended_floor_insulation", + "exposed_floor_insulation", ]: if len(recommendation["parts"]) > 1: - raise NotImplementedError("Have more than 1 floor insulation part - handle this case") + raise NotImplementedError( + "Have more than 1 floor insulation part - handle this case" + ) # output["floor_thermal_transmittance_ending"] = recommendation["new_u_value"] # We don't really see above average for this in the training data @@ -280,22 +333,43 @@ class Property: if output["floor_insulation_thickness_ending"] is None: output["floor_insulation_thickness_ending"] = "none" - if recommendation["type"] in ["loft_insulation", "room_roof_insulation", "flat_roof_insulation"]: - output["roof_thermal_transmittance_ending"] = recommendation["new_u_value"] + if recommendation["type"] in [ + "loft_insulation", + "room_roof_insulation", + "flat_roof_insulation", + ]: + output["roof_thermal_transmittance_ending"] = recommendation[ + "new_u_value" + ] parts = recommendation["parts"] if len(parts) != 1: - raise ValueError("More than one part for roof insulation - investiage me") + raise ValueError( + "More than one part for roof insulation - investiage me" + ) # This is based on the values we have in the training data valid_numeric_values = [ - 12, 25, 50, 75, 100, 150, 200, 250, 270, 300, 350, 400 + 12, + 25, + 50, + 75, + 100, + 150, + 200, + 250, + 270, + 300, + 350, + 400, ] proposed_depth = int(parts[0]["depth"]) if proposed_depth not in valid_numeric_values: # Take the nearest value for scoring - proposed_depth = min(valid_numeric_values, key=lambda x: abs(x - proposed_depth)) + proposed_depth = min( + valid_numeric_values, key=lambda x: abs(x - proposed_depth) + ) output["roof_insulation_thickness_ending"] = str(proposed_depth) if recommendation["type"] == "loft_insulation": @@ -329,11 +403,17 @@ class Property: if output["glazing_type_ending"] == "multiple": pass elif output["glazing_type_ending"] == "single": - output["glazing_type_ending"] = "secondary" if is_secondary_glazing else "double" + output["glazing_type_ending"] = ( + "secondary" if is_secondary_glazing else "double" + ) elif output["glazing_type_ending"] == "double": - output["glazing_type_ending"] = "multiple" if is_secondary_glazing else "double" + output["glazing_type_ending"] = ( + "multiple" if is_secondary_glazing else "double" + ) elif output["glazing_type_ending"] == "secondary": - output["glazing_type_ending"] = "secondary" if is_secondary_glazing else "multiple" + output["glazing_type_ending"] = ( + "secondary" if is_secondary_glazing else "multiple" + ) elif output["glazing_type_ending"] in ["triple", "high performance"]: output["glazing_type_ending"] = "multiple" else: @@ -342,7 +422,9 @@ class Property: if is_secondary_glazing: output["glazed_type_ending"] = "secondary glazing" else: - output["glazed_type_ending"] = "double glazing installed during or after 2002" + output["glazed_type_ending"] = ( + "double glazing installed during or after 2002" + ) if recommendation["type"] in ["heating", "hot_water_tank_insulation", "heating_control"]: # We update the data, as defined in the recommendaton @@ -367,13 +449,17 @@ class Property: "windows_glazing", "solar_pv", "heating", "hot_water_tank_insulation", "heating_control", ]: - raise NotImplementedError("Implement me, given type %s" % recommendation["type"]) + raise NotImplementedError( + "Implement me, given type %s" % recommendation["type"] + ) - output['id'] = "+".join([str(property_id), str(primary_recommendation_id)]) + output["id"] = "+".join([str(property_id), str(primary_recommendation_id)]) return output - def get_components(self, cleaned, photo_supply_lookup, floor_area_decile_thresholds): + def get_components( + self, cleaned, photo_supply_lookup, floor_area_decile_thresholds + ): """ Given the cleaning that has been performed, we'll use this to identify the property components, from roof to walls to windows, heating and hot water @@ -398,10 +484,12 @@ class Property: if self.data[description] in self.DATA_ANOMALY_MATCHES: template = cleaned[description][0] fill_dict = dict(zip(template.keys(), [None] * len(template))) - fill_dict.update({ - "original_description": self.data[description], - "clean_description": self.data[description], - }) + fill_dict.update( + { + "original_description": self.data[description], + "clean_description": self.data[description], + } + ) setattr( self, self.ATTRIBUTE_MAP[description], @@ -410,11 +498,15 @@ class Property: continue attributes = [ - x for x in cleaned[description] if x["original_description"] == self.data[description] + x + for x in cleaned[description] + if x["original_description"] == self.data[description] ] if len(attributes) > 1: - raise ValueError("Either No attributes or multiple found for %s" % description) + raise ValueError( + "Either No attributes or multiple found for %s" % description + ) if len(attributes) == 0: # We attempt to perform the clean on the fly @@ -422,8 +514,12 @@ class Property: cleaner_cls = cleaner_cls(self.data[description]) processed = { "original_description": self.data[description], - "clean_description": cleaner_cls.description.replace("(assumed)", "").rstrip().capitalize(), - **cleaner_cls.process() + "clean_description": cleaner_cls.description.replace( + "(assumed)", "" + ) + .rstrip() + .capitalize(), + **cleaner_cls.process(), } attributes = [processed] @@ -435,7 +531,8 @@ class Property: self.set_floor_level() self.set_windows_count() self.set_solar_panel_area( - photo_supply_lookup=photo_supply_lookup, floor_area_decile_thresholds=floor_area_decile_thresholds + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds, ) self.set_energy_source() @@ -452,7 +549,11 @@ class Property: self.is_heritage = spatial["is_heritage_building"].values[0] # We do an equals True, in the case of one of these variables being True - if (self.in_conservation_area == True) | (self.is_listed == True) | (self.is_heritage == True): + if ( + (self.in_conservation_area == True) + | (self.is_listed == True) + | (self.is_heritage == True) + ): self.restricted_measures = True spatial_dict = spatial.to_dict("records")[0] @@ -494,7 +595,7 @@ class Property: "tenure": self.data["tenure"], "current_epc_rating": self.data["current-energy-rating"], "current_sap_points": self.data["current-energy-efficiency"], - "current_valuation": current_valuation + "current_valuation": current_valuation, } property_data = self._clean_upload_data(property_data) @@ -506,7 +607,11 @@ class Property: """ Utility function for usage in the lambda, for preparing the _rating fields """ - return rating_lookup[field].value if (field not in cls.DATA_ANOMALY_MATCHES) and (field is not None) else None + return ( + rating_lookup[field].value + if (field not in cls.DATA_ANOMALY_MATCHES) and (field is not None) + else None + ) def get_property_details_epc(self, portfolio_id: int, rating_lookup): @@ -516,21 +621,37 @@ class Property: "full_address": self.data["address"], "total_floor_area": float(self.data["total-floor-area"]), "walls": self.walls["clean_description"], - "walls_rating": self._prepare_rating_field(self.data["walls-energy-eff"], rating_lookup), + "walls_rating": self._prepare_rating_field( + self.data["walls-energy-eff"], rating_lookup + ), "roof": self.roof["clean_description"], - "roof_rating": self._prepare_rating_field(self.data["roof-energy-eff"], rating_lookup), + "roof_rating": self._prepare_rating_field( + self.data["roof-energy-eff"], rating_lookup + ), "floor": self.floor["clean_description"], - "floor_rating": self._prepare_rating_field(self.data["floor-energy-eff"], rating_lookup), + "floor_rating": self._prepare_rating_field( + self.data["floor-energy-eff"], rating_lookup + ), "windows": self.windows["clean_description"], - "windows_rating": self._prepare_rating_field(self.data["windows-energy-eff"], rating_lookup), + "windows_rating": self._prepare_rating_field( + self.data["windows-energy-eff"], rating_lookup + ), "heating": self.main_heating["clean_description"], - "heating_rating": self._prepare_rating_field(self.data["mainheat-energy-eff"], rating_lookup), + "heating_rating": self._prepare_rating_field( + self.data["mainheat-energy-eff"], rating_lookup + ), "heating_controls": self.main_heating_controls["clean_description"], - "heating_controls_rating": self._prepare_rating_field(self.data["mainheatc-energy-eff"], rating_lookup), + "heating_controls_rating": self._prepare_rating_field( + self.data["mainheatc-energy-eff"], rating_lookup + ), "hot_water": self.hotwater["clean_description"], - "hot_water_rating": self._prepare_rating_field(self.data["hot-water-energy-eff"], rating_lookup), + "hot_water_rating": self._prepare_rating_field( + self.data["hot-water-energy-eff"], rating_lookup + ), "lighting": self.lighting["clean_description"], - "lighting_rating": self._prepare_rating_field(self.data["lighting-energy-eff"], rating_lookup), + "lighting_rating": self._prepare_rating_field( + self.data["lighting-energy-eff"], rating_lookup + ), "mainfuel": self.main_fuel["clean_description"], "ventilation": self.ventilation["ventilation"], "solar_pv": self.solar_pv["solar_pv"], @@ -539,7 +660,9 @@ class Property: "floor_height": self.floor_height, "heat_loss_corridor": self.heat_loss_corridor["heat_loss_corridor_boolean"], "unheated_corridor_length": self.heat_loss_corridor["length"], - "number_of_open_fireplaces": self.number_of_open_fireplaces["number_of_open_fireplaces"], + "number_of_open_fireplaces": self.number_of_open_fireplaces[ + "number_of_open_fireplaces" + ], "number_of_extensions": self.number_of_extensions["number_of_extensions"], "number_of_storeys": self.number_of_storeys["number_of_storeys"], "mains_gas": self.mains_gas, @@ -547,20 +670,21 @@ class Property: "primary_energy_consumption": self.energy["primary_energy_consumption"], "co2_emissions": self.energy["co2_emissions"], "adjusted_energy_consumption": self.current_adjusted_energy, - "estimated": self.data.get("estimated", False) + "estimated": self.data.get("estimated", False), } return property_details_epc def get_spatial_data(self, uprn_filenames): - """ Given a property's UPRN, this method will pull the associated spatial data from s3 :return: """ if self.uprn is None: - logger.warning("We do not have a UPRN for this property - this needs to be implemented") + logger.warning( + "We do not have a UPRN for this property - this needs to be implemented" + ) self.in_conservation_area = False self.is_listed = False self.is_heritage = False @@ -568,12 +692,15 @@ class Property: return # We get the file name for the uprn - filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)] + filtered_df = uprn_filenames[ + (uprn_filenames["lower"] <= self.uprn) + & (uprn_filenames["upper"] >= self.uprn) + ] if filtered_df.empty: logger.warning("Could not find file containing UPRNS") return None - filename = filtered_df.iloc[0]['filenames'] + filename = filtered_df.iloc[0]["filenames"] spatial_data = read_dataframe_from_s3_parquet( bucket_name=DATA_BUCKET, file_key=f"spatial/{filename}" @@ -591,15 +718,27 @@ class Property: :return: filtered property dimensions dataframe """ - result = property_dimensions[(property_dimensions["PROPERTY_TYPE"] == self.data["property-type"])] + result = property_dimensions[ + (property_dimensions["PROPERTY_TYPE"] == self.data["property-type"]) + ] - if self.construction_age_band is not None and self.construction_age_band not in self.DATA_ANOMALY_MATCHES: - result = result[(result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)] + if ( + self.construction_age_band is not None + and self.construction_age_band not in self.DATA_ANOMALY_MATCHES + ): + result = result[ + (result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band) + ] - if self.data["built-form"] not in self.DATA_ANOMALY_MATCHES and self.data["built-form"] in result["BUILT_FORM"]: + if ( + self.data["built-form"] not in self.DATA_ANOMALY_MATCHES + and self.data["built-form"] in result["BUILT_FORM"] + ): result = result[(result["BUILT_FORM"] == self.data["built-form"])] - return result[["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]].mean() + return result[ + ["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] + ].mean() def set_basic_property_dimensions(self): """ @@ -618,7 +757,8 @@ class Property: # They could also be added as attributes to the EPC Record self.perimeter = estimate_perimeter( - self.floor_area / self.number_of_floors, self.number_of_rooms / self.number_of_floors + self.floor_area / self.number_of_floors, + self.number_of_rooms / self.number_of_floors, ) self.insulation_wall_area = estimate_external_wall_area( @@ -636,8 +776,9 @@ class Property: def set_floor_level(self): self.floor_level = ( - FLOOR_LEVEL_MAP[self.data["floor-level"]] if - self.data["floor-level"] not in self.DATA_ANOMALY_MATCHES and self.data['floor-level'] is not None + FLOOR_LEVEL_MAP[self.data["floor-level"]] + if self.data["floor-level"] not in self.DATA_ANOMALY_MATCHES + and self.data["floor-level"] is not None else None ) @@ -699,12 +840,16 @@ class Property: raise NotImplementedError("Implement this floor type") @staticmethod - def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None): + def _extract_component( + component_data, component_rename_cols, component_drop_cols, rename_prefix=None + ): for k in component_rename_cols: component_data[f"{rename_prefix}_{k}"] = component_data.get(k) component_data = { - k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols + k: v + for k, v in component_data.items() + if k not in component_drop_cols + component_rename_cols } return component_data @@ -752,7 +897,7 @@ class Property: is_flat=self.roof["is_flat"], is_pitched=self.roof["is_pitched"], is_roof_room=self.roof["is_roof_room"], - floor_area=self.floor_area + floor_area=self.floor_area, ) percentage_of_roof = photo_supply_matched["photo_supply_median"].mean() @@ -768,8 +913,9 @@ class Property: """ return ( - self.insulation_floor_area * percentage_of_roof if self.roof["is_flat"] else - self.pitched_roof_area * percentage_of_roof + self.insulation_floor_area * percentage_of_roof + if self.roof["is_flat"] + else self.pitched_roof_area * percentage_of_roof ) def set_energy_source(self): @@ -782,7 +928,12 @@ class Property: # If the tariff explicitly indicates electricity use without a dual indication and mains_gas_flag is not True # We check for the common electricity tariffs if not self.data["mains-gas-flag"] and self.data["energy-tariff"] in [ - "Single", "off-peak 7 hour", "off-peak 10 hour", "off-peak 18 hour", "standard tariff", "24 hour" + "Single", + "off-peak 7 hour", + "off-peak 10 hour", + "off-peak 18 hour", + "standard tariff", + "24 hour", ]: energy_source = "electricity" diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 7040d66c..e897da78 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -563,7 +563,6 @@ class TrainingDataset(BaseDataset): "original_description_ending", "thermal_transmittance_unit_ending", "is_cavity_wall_ending", - "is_filled_cavity_ending", "is_solid_brick_ending", "is_system_built_ending", "is_timber_frame_ending", @@ -607,7 +606,6 @@ class TrainingDataset(BaseDataset): "is_loft_ending", "is_flat_ending", "is_thatched_ending", - "is_at_rafters_ending", "has_dwelling_above_ending", "is_assumed_ending", "is_valid_ending", diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 36c381ce..6abf05bd 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -1,5 +1,6 @@ import msgpack import pandas as pd +from datetime import datetime from typing import List from pathlib import Path @@ -25,7 +26,8 @@ from etl.epc.settings import ( # TODO: change in setting file MANDATORY_FIXED_FEATURES = [x.lower() for x in MANDATORY_FIXED_FEATURES] -LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES] +# LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES] +LATEST_FIELD = [x.lower() for x in LATEST_FIELD] COMPONENT_FEATURES = [x.lower() for x in COMPONENT_FEATURES] RDSAP_RESPONSE = RDSAP_RESPONSE.lower() HEAT_DEMAND_RESPONSE = HEAT_DEMAND_RESPONSE.lower() @@ -81,9 +83,9 @@ class EPCPipeline: run_mode="training", epc_local_file="certificates.csv", epc_bucket_name="retrofit-data-dev", - epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet", - epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet", - epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet", + epc_cleaning_dataset_key="sap_change_model/{}/cleaning_dataset_rooms.parquet", + epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_rooms.parquet", + epc_compiled_dataset_key="sap_change_model/{}/dataset_rooms.parquet", use_parallel=False, ): """ @@ -106,10 +108,13 @@ class EPCPipeline: self.run_mode = run_mode self.epc_local_file = epc_local_file self.epc_bucket_name = epc_bucket_name - self.epc_cleaning_dataset_key = epc_cleaning_dataset_key - self.epc_all_equal_rows_key = epc_all_equal_rows_key - self.epc_compiled_dataset_key = epc_compiled_dataset_key + self.use_parallel = use_parallel + self.timeprefix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + + self.epc_cleaning_dataset_key = epc_cleaning_dataset_key.format(self.timeprefix) + self.epc_all_equal_rows_key = epc_all_equal_rows_key.format(self.timeprefix) + self.epc_compiled_dataset_key = epc_compiled_dataset_key.format(self.timeprefix) def run(self): """ diff --git a/etl/epc/generate_scenarios_data.py b/etl/epc/generate_scenarios_data.py index afe9ab98..f9f66034 100644 --- a/etl/epc/generate_scenarios_data.py +++ b/etl/epc/generate_scenarios_data.py @@ -20,6 +20,10 @@ from recommendations.Recommendations import Recommendations from utils.logger import setup_logger from utils.s3 import read_dataframe_from_s3_parquet, save_dataframe_to_s3_parquet +from datetime import datetime + +now = datetime.now().strftime("%d-%m-%Y-%H-%M-%S") + logger = setup_logger() logger.info("Connecting to db") @@ -50,9 +54,19 @@ scenario_properties = [ "postcode": "NN1 5JY", "lmk-key": "1459796789102016070507274146560098", "measures": [ - [["internal_wall_insulation"], "11", None, [0]], - [["external_wall_insulation"], "10", None, [0]], - [["solar", "windows"], "12-15", {"photo_supply_ending": 50}, [0, 1]], + [ + ["internal_wall_insulation"], + "11", + {"walls_insulation_thickness_ending": "average"}, + [0], + ], + [ + ["external_wall_insulation"], + "10", + {"walls_insulation_thickness_ending": "average"}, + [0], + ], + [["solar", "windows"], "15", {"photo_supply_ending": 50}, [0, 1]], ], }, { @@ -60,7 +74,12 @@ scenario_properties = [ "postcode": "HP1 2HA", "lmk-key": "c14029235739827d5f627dc8aa9bb567d026b267e851e0db0001db24638667b1", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, { @@ -68,7 +87,12 @@ scenario_properties = [ "postcode": "HP1 2HE", "lmk-key": "99296a6dda21314fef3a61cda59e441e9a2aacf115eb96f4a0fa85696bf7b117", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, { @@ -76,7 +100,12 @@ scenario_properties = [ "postcode": "HP1 2AN", "lmk-key": "d1e0534be3a44c33003323b21d0e322e3daddc65b5ee71936f89c59ddab96b50", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, { @@ -84,11 +113,17 @@ scenario_properties = [ "postcode": "HP1 2HX", "lmk-key": "1eae354db522a95188018d9cd0502ed8c609910b6c88f8797d3a25f59b11770a", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, ] + recommendations_scoring_data = [] for scenario_property in scenario_properties: @@ -132,7 +167,7 @@ for scenario_property in scenario_properties: p.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) recommender = Recommendations(property_instance=p, materials=materials) - property_recommendations = recommender.recommend() + property_recommendations = recommender.recommend("0") wall_recommendations = recommender.wall_recomender.recommendations loft_recommendations = recommender.roof_recommender.recommendations @@ -183,7 +218,7 @@ for scenario_property in scenario_properties: if "windows" in measure: for rec in windows_recommendations: - if rec["type"] == "windows": + if rec["type"] == "windows_glazing": windows_recs.append(rec) combi_list = [wall_recs, loft_recs, solar_recs, windows_recs] @@ -213,6 +248,9 @@ for scenario_property in scenario_properties: recommendations_scoring_data.extend(scoring_list) recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) +recommendations_scoring_data["impact"] = recommendations_scoring_data["impact"].astype( + int +) recommendations_scoring_data = recommendations_scoring_data.drop( columns=[ "rdsap_change", @@ -240,12 +278,12 @@ all_predictions = model_api.predict_all( prediction_buckets={ "sap_change_predictions": get_settings().SAP_PREDICTIONS_BUCKET, "heat_demand_predictions": get_settings().HEAT_PREDICTIONS_BUCKET, - "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET - } + "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET, + }, ) save_dataframe_to_s3_parquet( recommendations_scoring_data, "retrofit-data-dev", - "scenario_data/recommendations_scoring_data.parquet", + f"scenario_data/{now}/recommendations_scoring_data.parquet", ) diff --git a/etl/epc/requirements.txt b/etl/epc/requirements.txt index 9f972bde..87148180 100644 --- a/etl/epc/requirements.txt +++ b/etl/epc/requirements.txt @@ -1,4 +1,5 @@ pandas==2.1.3 tqdm==4.66.1 msgpack==1.0.7 -boto3==1.29.6 \ No newline at end of file +boto3==1.29.6 +pyarrow==15.0.2 \ No newline at end of file