Model/etl/air_source_heat_pump/AirSourceHeatPumpEfficiency.py
Khalim Conn-Kowlessar bf45a5f4fa minor
2024-10-03 16:10:32 +01:00

124 lines
5 KiB
Python

import pandas as pd
from tqdm import tqdm
from utils.s3 import save_dataframe_to_s3_parquet, read_dataframe_from_s3_parquet
from utils.logger import setup_logger
from etl.epc.settings import EARLIEST_EPC_DATE
logger = setup_logger()
class AirSourceHeatPumpEfficiency:
    """Summarise EPC certificates for dwellings heated by air source heat pumps.

    Certificates are read from ``certificates.csv`` inside each supplied
    directory, narrowed to the heating systems of interest and aggregated into
    count tables by :meth:`create_dataset`.
    """

    # Exact EPC description strings used to pick out the systems of interest.
    _ASHP_MAINHEAT = "Air source heat pump, radiators, electric"
    _ASHP_CONTROLS = "Time and temperature zone control"
    _HHR_STORAGE_CONTROLS = "Controls for high heat retention storage heaters"

    # Rows missing any of these columns are discarded.
    _REQUIRED_COLUMNS = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "TOTAL_FLOOR_AREA"]

    # Placeholder age-band values that are excluded from the ASHP subset.
    _INVALID_AGE_BANDS = ["NO DATA!", "INVALID!"]

    # Only certificates lodged on/after this date feed the ASHP aggregates.
    _ASHP_EARLIEST_LODGEMENT = "2019-01-01"

    # Grouping keys for the efficiency table. WALLS_DESCRIPTION and
    # ROOF_DESCRIPTION were considered as extra keys but are currently disabled.
    _EFFICIENCY_KEYS = ["CONSTRUCTION_AGE_BAND_CLEAN", "MAINHEAT_ENERGY_EFF"]

    # Grouping keys for the property/heating profile table.
    _PROFILE_KEYS = [
        "PROPERTY_TYPE",
        "BUILT_FORM",
        "MAINHEAT_DESCRIPTION",
        "MAINHEAT_ENERGY_EFF",
        "MAINHEATCONT_DESCRIPTION",
        "MAINHEATC_ENERGY_EFF",
        "MAIN_FUEL",
        "HOTWATER_DESCRIPTION",
        "HOT_WATER_ENERGY_EFF",
        "MAINS_GAS_FLAG",
    ]

    def __init__(self, file_directories, cleaned_lookup):
        """
        :param file_directories: A list of directories where files are stored.
            Each entry must support ``entry / "certificates.csv"``.
        :param cleaned_lookup: A dictionary containing cleaned lookup data.
        """
        self.file_directories = file_directories
        self.cleaned_lookup = cleaned_lookup
        self.results = []

    def create_dataset(self) -> dict:
        """Load every certificates file and build the ASHP count tables.

        :return: A dict holding two DataFrames:

            * ``"ashp_efficiencies"`` - number of non-null ``LMK_KEY`` values
              per cleaned construction age band and main-heating efficiency
              rating (the count is in the ``LMK_KEY`` column).
            * ``"all_counts_agg"`` - number of ASHP certificates per
              property/heating profile (the count is in the ``count`` column).

            Both frames are empty when no directories were supplied.
        """
        # Imported locally, as in the original code, but up front so that an
        # import failure surfaces before the expensive CSV loading.
        # NOTE(review): presumably local to avoid a circular import - confirm.
        from etl.epc.DataProcessor import EPCDataProcessor

        logger.info("Creating air source heat pump efficiency dataset")

        heating_frames = []
        for directory in tqdm(self.file_directories):
            certificates = pd.read_csv(directory / "certificates.csv", low_memory=False)
            heating_frames.append(self._select_heating_rows(certificates, EARLIEST_EPC_DATE))

        if not heating_frames:
            # pd.concat raises on an empty list; return well-formed empty tables instead.
            logger.warning("No certificate directories supplied; returning empty ASHP tables")
            return {
                "ashp_efficiencies": pd.DataFrame(columns=[*self._EFFICIENCY_KEYS, "LMK_KEY"]),
                "all_counts_agg": pd.DataFrame(columns=[*self._PROFILE_KEYS, "count"]),
            }

        heating_df = pd.concat(heating_frames)

        # Clean construction age band
        heating_df["CONSTRUCTION_AGE_BAND_CLEAN"] = heating_df["CONSTRUCTION_AGE_BAND"].apply(
            EPCDataProcessor.clean_construction_age_band
        )

        ashp_df = self._select_ashp_rows(heating_df)

        ashp_efficiencies = ashp_df.groupby(self._EFFICIENCY_KEYS)["LMK_KEY"].count().reset_index()

        # NOTE(review): the original aggregated an undefined ``all_counts`` frame
        # (summing its "count" column over these keys). It is reconstructed here
        # as per-profile row counts of the ASHP subset - confirm this matches
        # the intended source data.
        all_counts_agg = self._count_rows(ashp_df, self._PROFILE_KEYS)

        # Findings recorded from earlier exploration of ``all_counts_agg``:
        # - In houses, 68% of the cases where we see air source heat pumps are
        #   in detached and semi-detached houses.
        # - In bungalows, 74% of cases where we see air source heat pumps are
        #   in detached and semi-detached houses.
        # TODO: Research options for mid and end-terrace houses
        # TODO: Research the options for flats - we see them appear in flats,
        #  but practically speaking, how does the install process work?

        return {
            "ashp_efficiencies": ashp_efficiencies,
            "all_counts_agg": all_counts_agg,
        }

    @classmethod
    def _select_heating_rows(cls, certificates: pd.DataFrame, earliest_date) -> pd.DataFrame:
        """Return the certificate rows relevant to the heating analysis.

        Keeps rows lodged strictly after ``earliest_date`` that describe either
        an air source heat pump with time and temperature zone control, or
        controls for high heat retention storage heaters, and that have all of
        the required descriptive columns populated. The input frame is not
        modified.
        """
        # Parse dates on a copy so the caller's frame keeps its raw values.
        dated = certificates.assign(LODGEMENT_DATE=pd.to_datetime(certificates["LODGEMENT_DATE"]))

        # Take entries after SAP12 (cut-off supplied by the caller).
        dated = dated[dated["LODGEMENT_DATE"] > earliest_date]

        controls = dated["MAINHEATCONT_DESCRIPTION"]
        is_ashp = (dated["MAINHEAT_DESCRIPTION"] == cls._ASHP_MAINHEAT) & (controls == cls._ASHP_CONTROLS)
        is_hhr_storage = controls == cls._HHR_STORAGE_CONTROLS

        return dated[is_ashp | is_hhr_storage].dropna(subset=cls._REQUIRED_COLUMNS)

    @classmethod
    def _select_ashp_rows(cls, heating_df: pd.DataFrame) -> pd.DataFrame:
        """Return ASHP rows with a usable age band lodged on/after the ASHP cut-off date."""
        is_ashp = heating_df["MAINHEAT_DESCRIPTION"] == cls._ASHP_MAINHEAT
        has_age_band = ~heating_df["CONSTRUCTION_AGE_BAND"].isin(cls._INVALID_AGE_BANDS)
        is_recent = heating_df["LODGEMENT_DATE"] >= pd.to_datetime(cls._ASHP_EARLIEST_LODGEMENT)
        return heating_df[is_ashp & has_age_band & is_recent]

    @staticmethod
    def _count_rows(frame: pd.DataFrame, keys) -> pd.DataFrame:
        """Return one row per distinct ``keys`` combination with its row count in ``count``."""
        return frame.groupby(keys).size().reset_index(name="count")