From ff954eeeda8f121cc5d3af711c9b71147097a11f Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 11 Jun 2024 21:31:42 +0100 Subject: [PATCH 1/2] remove potential columns --- etl/epc/Pipeline.py | 14 +++----- etl/epc/Record.py | 87 +++++++++++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 47 deletions(-) diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 3a078703..47cddeb0 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -39,7 +39,7 @@ VARIABLE_DATA_FEATURES = ( COMPONENT_FEATURES + ROOM_FEATURES + EFFICIENCY_FEATURES - + POTENTIAL_COLUMNS + # + POTENTIAL_COLUMNS + ["lodgement_date", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE] ) @@ -66,14 +66,10 @@ clean_lookup = get_cleaned_description_mapping() # TODO: THIS IS A TEMPORARY FIX new_walls_description_mapping = pd.DataFrame(clean_lookup["walls-description"]) - -import numpy as np - -new_walls_description_mapping["thermal_transmittance_unit"] = np.where( - ~pd.isnull(new_walls_description_mapping["thermal_transmittance_unit"]), - "w/m-¦k", - new_walls_description_mapping["thermal_transmittance_unit"], -) +new_walls_description_mapping.loc[ + ~new_walls_description_mapping["thermal_transmittance_unit"].isnull(), + "thermal_transmittance_unit", +] = "w/m-¦k" clean_lookup["walls-description"] = new_walls_description_mapping.to_dict( orient="records" diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 9a965c6a..9b69c33a 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -76,10 +76,10 @@ class EPCRecord: mainheat_energy_eff: str = None mainheatc_energy_eff: str = None lighting_energy_eff: str = None - potential_energy_efficiency: float = None - environment_impact_potential: float = None - energy_consumption_potential: float = None - co2_emissions_potential: float = None + # potential_energy_efficiency: float = None + # environment_impact_potential: float = None + # energy_consumption_potential: float = None + # co2_emissions_potential: float = None lodgement_date: str = None current_energy_efficiency: int = None energy_consumption_current: int = None @@ -249,18 +249,18 @@ class EPCRecord: self.mainheat_energy_eff: str = self.prepared_epc["mainheat_energy_eff"] self.mainheatc_energy_eff: str = self.prepared_epc["mainheatc_energy_eff"] self.lighting_energy_eff: str = self.prepared_epc["lighting_energy_eff"] - self.potential_energy_efficiency: float = float( - self.prepared_epc["potential_energy_efficiency"] - ) - self.environment_impact_potential: float = float( - self.prepared_epc["environment_impact_potential"] - ) - self.energy_consumption_potential: float = float( - self.prepared_epc["energy_consumption_potential"] - ) - self.co2_emissions_potential: float = float( - self.prepared_epc["co2_emissions_potential"] - ) + # self.potential_energy_efficiency: float = float( + # self.prepared_epc["potential_energy_efficiency"] + # ) + # self.environment_impact_potential: float = float( + # self.prepared_epc["environment_impact_potential"] + # ) + # self.energy_consumption_potential: float = float( + # self.prepared_epc["energy_consumption_potential"] + # ) + # self.co2_emissions_potential: float = float( + # self.prepared_epc["co2_emissions_potential"] + # ) self.lodgement_date: str = self.prepared_epc["lodgement_date"] self.current_energy_efficiency: int = int( self.prepared_epc["current_energy_efficiency"] @@ -466,9 +466,7 @@ class EPCRecord: (property_dimensions["PROPERTY_TYPE"] == self.prepared_epc["property-type"]) ] - if ( - self.construction_age_band not in DATA_ANOMALY_MATCHES - ): + if self.construction_age_band not in DATA_ANOMALY_MATCHES: result = result[ (result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band) ] @@ -480,7 +478,12 @@ class EPCRecord: result = result[(result["BUILT_FORM"] == self.prepared_epc["built-form"])] return result[ - ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] + [ + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", + ] ].mean() def _clean_property_dimensions(self): @@ -491,9 +494,11 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Record doesn not contain epc data") - if (self.prepared_epc["number-habitable-rooms"] in DATA_ANOMALY_MATCHES) or ( - self.prepared_epc["floor-height"] in DATA_ANOMALY_MATCHES - ) or (self.prepared_epc["number-heated-rooms"] in DATA_ANOMALY_MATCHES): + if ( + (self.prepared_epc["number-habitable-rooms"] in DATA_ANOMALY_MATCHES) + or (self.prepared_epc["floor-height"] in DATA_ANOMALY_MATCHES) + or (self.prepared_epc["number-heated-rooms"] in DATA_ANOMALY_MATCHES) + ): property_dimensions = read_dataframe_from_s3_parquet( bucket_name=DATA_BUCKET, file_key=f"property_dimensions/{self.prepared_epc['local-authority']}.parquet", @@ -507,12 +512,18 @@ class EPCRecord: self.property_dimensions["NUMBER_HABITABLE_ROOMS"].round() ) else: - self.prepared_epc["number-habitable-rooms"] = float(self.prepared_epc["number-habitable-rooms"]) + self.prepared_epc["number-habitable-rooms"] = float( + self.prepared_epc["number-habitable-rooms"] + ) if self.prepared_epc["number-heated-rooms"] in DATA_ANOMALY_MATCHES: - self.prepared_epc["number-heated-rooms"] = float(self.property_dimensions["NUMBER_HEATED_ROOMS"].round()) + self.prepared_epc["number-heated-rooms"] = float( + self.property_dimensions["NUMBER_HEATED_ROOMS"].round() + ) else: - self.prepared_epc["number-heated-rooms"] = float(self.prepared_epc["number-heated-rooms"]) + self.prepared_epc["number-heated-rooms"] = float( + self.prepared_epc["number-heated-rooms"] + ) self.number_of_floors = estimate_number_of_floors( self.prepared_epc["property-type"] @@ -1033,18 +1044,18 @@ class EPCDifferenceRecord: "heat_demand_ending": self.record2.get(HEAT_DEMAND_RESPONSE), "carbon_starting": self.record1.get(CARBON_RESPONSE), "carbon_ending": self.record2.get(CARBON_RESPONSE), - "potential_energy_efficiency": self.earliest_record.get( - "potential_energy_efficiency" - ), - "environment_impact_potential": self.earliest_record.get( - "environment_impact_potential" - ), - "energy_consumption_potential": self.earliest_record.get( - "energy_consumption_potential" - ), - "co2_emissions_potential": self.earliest_record.get( - "co2_emissions_potential" - ), + # "potential_energy_efficiency": self.earliest_record.get( + # "potential_energy_efficiency" + # ), + # "environment_impact_potential": self.earliest_record.get( + # "environment_impact_potential" + # ), + # "energy_consumption_potential": self.earliest_record.get( + # "energy_consumption_potential" + # ), + # "co2_emissions_potential": self.earliest_record.get( + # "co2_emissions_potential" + # ), **ending_record, **starting_record, } From b63de79043b2b7a1e9498754621944315aaa76f7 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 3 Jul 2024 23:35:02 +0100 Subject: [PATCH 2/2] add cost to EPCRecord, Difference record and pipeline --- etl/epc/Pipeline.py | 4 +++- etl/epc/Record.py | 12 ++++++++++++ etl/epc/settings.py | 6 ++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 47cddeb0..bc3bfd91 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -22,6 +22,7 @@ from etl.epc.settings import ( EFFICIENCY_FEATURES, POTENTIAL_COLUMNS, ROOM_FEATURES, + COST_FEATURES, ) # TODO: change in setting file @@ -42,6 +43,7 @@ VARIABLE_DATA_FEATURES = ( # + POTENTIAL_COLUMNS + ["lodgement_date", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE] ) +COST_FEATURES = [x.lower() for x in COST_FEATURES] def get_cleaned_description_mapping(): @@ -278,7 +280,7 @@ class EPCPipeline: # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = property_data[VARIABLE_DATA_FEATURES] + variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES] uprn = str(uprn) epc_records = [ diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 9b69c33a..b8471ccf 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -76,6 +76,9 @@ class EPCRecord: mainheat_energy_eff: str = None mainheatc_energy_eff: str = None lighting_energy_eff: str = None + lighting_cost_current: float = None + heating_cost_current: float = None + hot_water_cost_current: float = None # potential_energy_efficiency: float = None # environment_impact_potential: float = None # energy_consumption_potential: float = None @@ -249,6 +252,9 @@ class EPCRecord: self.mainheat_energy_eff: str = self.prepared_epc["mainheat_energy_eff"] self.mainheatc_energy_eff: str = self.prepared_epc["mainheatc_energy_eff"] self.lighting_energy_eff: str = self.prepared_epc["lighting_energy_eff"] + self.lighting_cost_current: float = self.prepared_epc["lighting_cost_current"] + self.heating_cost_current: float = self.prepared_epc["heating_cost_current"] + self.hot_water_cost_current: float = self.prepared_epc["hot_water_cost_current"] # self.potential_energy_efficiency: float = float( # self.prepared_epc["potential_energy_efficiency"] # ) @@ -1044,6 +1050,12 @@ class EPCDifferenceRecord: "heat_demand_ending": self.record2.get(HEAT_DEMAND_RESPONSE), "carbon_starting": self.record1.get(CARBON_RESPONSE), "carbon_ending": self.record2.get(CARBON_RESPONSE), + "lighting_cost_starting": self.record1.get("lighting_cost_current"), + "lighting_cost_ending": self.record2.get("lighting_cost_current"), + "heating_cost_starting": self.record1.get("heating_cost_current"), + "heating_cost_ending": self.record2.get("heating_cost_current"), + "hot_water_cost_starting": self.record1.get("hot_water_cost_current"), + "hot_water_cost_ending": self.record2.get("hot_water_cost_current"), # "potential_energy_efficiency": self.earliest_record.get( # "potential_energy_efficiency" # ), diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 18dbaa7c..a814750f 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -110,6 +110,12 @@ DEPLOYMENT_FOLDER = "deployment" TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 +COST_FEATURES = [ + "LIGHTING_COST_CURRENT", + "HEATING_COST_CURRENT", + "HOT_WATER_COST_CURRENT", +] + AVERAGE_FIXED_FEATURES = [ "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT",