import pandas as pd
from tqdm import tqdm

from utils.s3 import save_dataframe_to_s3_parquet, read_dataframe_from_s3_parquet
from utils.logger import setup_logger
from etl.epc.settings import EARLIEST_EPC_DATE

logger = setup_logger()

# EPC description strings used to select the heating systems of interest.
_ASHP_MAINHEAT = "Air source heat pump, radiators, electric"
_ASHP_CONTROLS = "Time and temperature zone control"
_HHR_STORAGE_CONTROLS = "Controls for high heat retention storage heaters"

# Rows missing any of these columns are dropped when loading certificates.
_REQUIRED_COLUMNS = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "TOTAL_FLOOR_AREA",
]

# Raw CONSTRUCTION_AGE_BAND values that carry no usable information.
_INVALID_AGE_BANDS = ["NO DATA!", "INVALID!"]

# Only certificates lodged on or after this date feed the efficiency table.
_ASHP_EARLIEST_LODGEMENT = "2019-01-01"

# Columns describing a property's heating / hot water system combination.
_SYSTEM_COLUMNS = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "MAINHEAT_DESCRIPTION",
    "MAINHEAT_ENERGY_EFF",
    "MAINHEATCONT_DESCRIPTION",
    "MAINHEATC_ENERGY_EFF",
    "MAIN_FUEL",
    "HOTWATER_DESCRIPTION",
    "HOT_WATER_ENERGY_EFF",
    "MAINS_GAS_FLAG",
]


class AirSourceHeatPumpEfficiency:
    """Summarise air source heat pump efficiency ratings found in EPC certificates."""

    def __init__(self, file_directories, cleaned_lookup):
        """
        :param file_directories: A list of directories where files are stored.
            Each directory is expected to contain a ``certificates.csv`` file.
        :param cleaned_lookup: A dictionary containing cleaned lookup data.
        """
        self.file_directories = file_directories
        self.cleaned_lookup = cleaned_lookup
        self.results = []

    @staticmethod
    def _load_heating_records(directory):
        """Read one directory's ``certificates.csv`` and keep the relevant heating rows.

        :param directory: Path-like object supporting ``/`` (e.g. ``pathlib.Path``).
        :return: DataFrame of certificates lodged after ``EARLIEST_EPC_DATE`` that either
            have an air source heat pump with time and temperature zone control, or use
            controls for high heat retention storage heaters, and that have no missing
            value in any of ``_REQUIRED_COLUMNS``.
        """
        records = pd.read_csv(directory / "certificates.csv", low_memory=False)

        # Take entries after SAP12
        records["LODGEMENT_DATE"] = pd.to_datetime(records["LODGEMENT_DATE"])
        records = records[records["LODGEMENT_DATE"] > EARLIEST_EPC_DATE]

        # NOTE: filtering on UPRN presence and on unknown TENURE values was
        # previously experimented with here and is intentionally not applied.

        has_ashp = (
            (records["MAINHEAT_DESCRIPTION"] == _ASHP_MAINHEAT)
            & (records["MAINHEATCONT_DESCRIPTION"] == _ASHP_CONTROLS)
        )
        has_hhr_storage = records["MAINHEATCONT_DESCRIPTION"] == _HHR_STORAGE_CONTROLS
        records = records[has_ashp | has_hhr_storage]

        # Drop rows that have a missing PROPERTY_TYPE, BUILT_FORM,
        # CONSTRUCTION_AGE_BAND or TOTAL_FLOOR_AREA
        return records.dropna(subset=_REQUIRED_COLUMNS)

    def create_dataset(self):
        """Build the air source heat pump efficiency counts from all directories.

        Loads and filters every ``certificates.csv``, adds a cleaned construction age
        band column, and counts air source heat pump certificates per cleaned age band
        and ``MAINHEAT_ENERGY_EFF`` rating. Exploratory breakdowns by property type and
        built form are written to the log.

        :return: DataFrame with columns ``CONSTRUCTION_AGE_BAND_CLEAN``,
            ``MAINHEAT_ENERGY_EFF`` and ``LMK_KEY`` (the number of non-null ``LMK_KEY``
            values in each group).
        :raises ValueError: if ``file_directories`` is empty.
        """
        logger.info("Creating air source heat pump efficiency dataset")

        heating_frames = [
            self._load_heating_records(directory)
            for directory in tqdm(self.file_directories)
        ]
        if not heating_frames:
            # pd.concat would raise an opaque "No objects to concatenate" here.
            raise ValueError("No EPC certificate directories were provided")

        heating_df = pd.concat(heating_frames)

        # Clean construction age band.
        # Imported locally as in the original code - presumably to avoid a
        # circular import; confirm before hoisting to module level.
        from etl.epc.DataProcessor import EPCDataProcessor

        heating_df["CONSTRUCTION_AGE_BAND_CLEAN"] = heating_df["CONSTRUCTION_AGE_BAND"].apply(
            EPCDataProcessor.clean_construction_age_band
        )

        ashp_df = heating_df[
            (heating_df["MAINHEAT_DESCRIPTION"] == _ASHP_MAINHEAT)
            & ~heating_df["CONSTRUCTION_AGE_BAND"].isin(_INVALID_AGE_BANDS)
            & (heating_df["LODGEMENT_DATE"] >= pd.to_datetime(_ASHP_EARLIEST_LODGEMENT))
        ]

        ashp_efficiencies = (
            ashp_df.groupby(["CONSTRUCTION_AGE_BAND_CLEAN", "MAINHEAT_ENERGY_EFF"])["LMK_KEY"]
            .count()
            .reset_index()
        )

        efficiency_counts = ashp_df["MAINHEAT_ENERGY_EFF"].value_counts()
        logger.info(f"ASHP certificates per efficiency rating:\n{efficiency_counts.to_string()}")
        age_band_counts = ashp_efficiencies["CONSTRUCTION_AGE_BAND_CLEAN"].value_counts()
        logger.info(f"Efficiency ratings observed per age band:\n{age_band_counts.to_string()}")

        # NOTE(review): the original code aggregated an undefined `all_counts`
        # frame (and referenced an undefined `ashp_efficiency_agg`), so it always
        # raised NameError. The counts are rebuilt here from the ASHP rows;
        # confirm this is the intended source data.
        all_counts_agg = ashp_df.groupby(_SYSTEM_COLUMNS).size().reset_index(name="count")

        per_property_type = all_counts_agg.groupby("PROPERTY_TYPE")["count"].sum()
        logger.info(f"ASHP certificates per property type:\n{per_property_type.to_string()}")

        # Earlier exploration found that, in houses, 68% of the cases where we see
        # air source heat pumps are in detached and semi-detached houses, and in
        # bungalows the figure is 74%.
        for property_type, normalize in (("House", True), ("Flat", False), ("Bungalow", True)):
            built_forms = all_counts_agg.loc[
                all_counts_agg["PROPERTY_TYPE"] == property_type, "BUILT_FORM"
            ].value_counts(normalize=normalize)
            logger.info(f"Built forms for {property_type}:\n{built_forms.to_string()}")

        # TODO: Research options for mid and end-terrace houses
        # TODO: Research the options for flats - we see them appear in flats, but
        #  practically speaking, how does the install process work?

        return ashp_efficiencies