From e25a8d2d5e3194bc59071aae49b5a81731f85262 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 10 Jul 2024 23:20:15 +0100 Subject: [PATCH] add some temp code to add gas prices and electric prices, both current month and the prior month --- etl/epc/Pipeline.py | 91 ++++++++++++++++++++++++++++++++++++++++++++- etl/epc/Record.py | 22 +++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index bc3bfd91..b6e3fc49 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -1,6 +1,6 @@ import msgpack import pandas as pd -from datetime import datetime +from datetime import datetime, timedelta from typing import List from pathlib import Path @@ -77,6 +77,21 @@ clean_lookup["walls-description"] = new_walls_description_mapping.to_dict( orient="records" ) +# TODO: Move this to s3 if needed +ENERGY_DIRECTORY = Path(__file__).parent / "local_data" / "energy_data" + +electricity_data = pd.read_csv(ENERGY_DIRECTORY / "electricity-prices.csv") +electricity_data.columns = ["lodgement_date", "electricity_price"] + +gas_data = pd.read_csv(ENERGY_DIRECTORY / "gas-prices.csv") +gas_data.columns = ["lodgement_date", "gas_price"] + +for df in [electricity_data, gas_data]: + df["lodgement_date"] = pd.to_datetime(df["lodgement_date"]) + df["lodgement_year"] = df["lodgement_date"].dt.year + df["lodgement_month"] = df["lodgement_date"].dt.month + df.drop(columns=["lodgement_date"], inplace=True) + class EPCPipeline: """ @@ -243,6 +258,69 @@ class EPCPipeline: constituency_difference_records = [] + constituency_data["lodgement_date"] = pd.to_datetime( + constituency_data["lodgement_date"] + ) + constituency_data["previous_date"] = constituency_data[ + "lodgement_date" + ].dt.to_period("M").dt.to_timestamp() - timedelta(days=1) + constituency_data["lodgement_year"] = constituency_data[ + "lodgement_date" + ].dt.year + constituency_data["lodgement_month"] = constituency_data[ + "lodgement_date" + ].dt.month + constituency_data["previous_year"] = constituency_data["previous_date"].dt.year + constituency_data["previous_month"] = constituency_data[ + "previous_date" + ].dt.month + + constituency_data = pd.merge( + constituency_data, + electricity_data[ + ["electricity_price", "lodgement_year", "lodgement_month"] + ], + how="left", + on=["lodgement_year", "lodgement_month"], + ) + constituency_data = pd.merge( + constituency_data, + gas_data[["gas_price", "lodgement_year", "lodgement_month"]], + how="left", + on=["lodgement_year", "lodgement_month"], + ) + + constituency_data = pd.merge( + constituency_data, + electricity_data[ + ["electricity_price", "lodgement_year", "lodgement_month"] + ], + how="left", + left_on=["previous_year", "previous_month"], + right_on=["lodgement_year", "lodgement_month"], + suffixes=("", "_previous"), + ) + + constituency_data = pd.merge( + constituency_data, + gas_data[["gas_price", "lodgement_year", "lodgement_month"]], + how="left", + left_on=["previous_year", "previous_month"], + right_on=["lodgement_year", "lodgement_month"], + suffixes=("", "_previous"), + ) + + constituency_data = constituency_data.drop( + columns=[ + "lodgement_year", + "lodgement_month", + "previous_year", + "previous_month", + "lodgement_month_previous", + "lodgement_year_previous", + ] + ) + for uprn, property_data in constituency_data.groupby("uprn", observed=True): difference_records = self.process_uprn( uprn=str(uprn), property_data=property_data, directory=directory @@ -280,7 +358,16 @@ class EPCPipeline: # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES] + variable_data = property_data[ + VARIABLE_DATA_FEATURES + + COST_FEATURES + + [ + "electricity_price", + "gas_price", + "electricity_price_previous", + "gas_price_previous", + ] + ] uprn = str(uprn) epc_records = [ diff --git a/etl/epc/Record.py b/etl/epc/Record.py index b8471ccf..98a3812b 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -79,6 +79,10 @@ class EPCRecord: lighting_cost_current: float = None heating_cost_current: float = None hot_water_cost_current: float = None + electricity_price: float = None + gas_price: float = None + electricity_price_previous: float = None + gas_price_previous: float = None # potential_energy_efficiency: float = None # environment_impact_potential: float = None # energy_consumption_potential: float = None @@ -255,6 +259,12 @@ class EPCRecord: self.lighting_cost_current: float = self.prepared_epc["lighting_cost_current"] self.heating_cost_current: float = self.prepared_epc["heating_cost_current"] self.hot_water_cost_current: float = self.prepared_epc["hot_water_cost_current"] + self.electricity_price: float = self.prepared_epc["electricity_price"] + self.gas_price: float = self.prepared_epc["gas_price"] + self.electricity_price_previous: float = self.prepared_epc[ + "electricity_price_previous" + ] + self.gas_price_previous: float = self.prepared_epc["gas_price_previous"] # self.potential_energy_efficiency: float = float( # self.prepared_epc["potential_energy_efficiency"] # ) @@ -1056,6 +1066,18 @@ class EPCDifferenceRecord: "heating_cost_ending": self.record2.get("heating_cost_current"), "hot_water_cost_starting": self.record1.get("hot_water_cost_current"), "hot_water_cost_ending": self.record2.get("hot_water_cost_current"), + "electricity_price_starting": self.record1.get("electricity_price"), + "electricity_price_ending": self.record2.get("electricity_price"), + "gas_price_starting": self.record1.get("gas_price"), + "gas_price_ending": self.record2.get("gas_price"), + "electricity_price_previous_starting": self.record1.get( + "electricity_price_previous" + ), + "electricity_price_previous_ending": self.record2.get( + "electricity_price_previous" + ), + "gas_price_previous_starting": self.record1.get("gas_price_previous"), + "gas_price_previous_ending": self.record2.get("gas_price_previous"), # "potential_energy_efficiency": self.earliest_record.get( # "potential_energy_efficiency" # ),