# Model/model_data/simulation_system/generate_rdsap_change.py

import pandas as pd
from tqdm import tqdm

from pathlib import Path
from simulation_system.core.Settings import (
    MANDATORY_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    EARLIEST_EPC_DATE
)
from simulation_system.core.DataProcessor import DataProcessor
from utils import save_dataframe_to_s3_parquet

# Root folder that holds one sub-directory per download, each expected to
# contain a `certificates.csv` file; resolved relative to this script.
# NOTE(review): if this script itself lives inside a `simulation_system/`
# directory, this resolves to a nested `simulation_system/simulation_system/data/...`
# path - confirm the layout.
DATA_DIRECTORY = Path(__file__).parent.joinpath(
    "simulation_system", "data", "all-domestic-certificates"
)


def _pair_consecutive_epcs(uprn, variable_data, fixed_data):
    """Build one model record per pair of consecutive EPC rows for a property.

    Each row of ``variable_data`` is compared with the row that follows it.
    The pair is oriented so that the record with the lower SAP score is the
    "starting" record, which means ``RDSAP_CHANGE`` is never negative. Pairs
    whose SAP score did not change are dropped.

    Note: only subsequent EPCs are compared; other permutations (e.g. first
    vs third) are not generated.

    :param uprn: Identifier of the property the rows belong to.
    :param variable_data: Rows for one property holding ``COMPONENT_FEATURES``,
        ``LODGEMENT_DATE``, ``RDSAP_RESPONSE`` and ``HEAT_DEMAND_RESPONSE``.
        Presumably ordered oldest-to-newest - TODO confirm against
        ``DataProcessor.pre_process``.
    :param fixed_data: Property-level attributes copied onto every record.
    :return: List of dicts, one per retained pair (possibly empty).
    """
    feature_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    records = []

    # range() already stops one short of the last row, so no extra bounds
    # check is needed before reading position + 1.
    for position in range(len(variable_data) - 1):
        earlier_epc = variable_data.iloc[position]
        later_epc = variable_data.iloc[position + 1]

        # If SAP gets worse, the newer EPC is treated as the starting point
        # and the older one as the end point, i.e. the order is switched.
        if earlier_epc[RDSAP_RESPONSE] <= later_epc[RDSAP_RESPONSE]:
            start, end = earlier_epc, later_epc
        else:
            start, end = later_epc, earlier_epc

        starting_sap = start[RDSAP_RESPONSE]
        rdsap_change = end[RDSAP_RESPONSE] - starting_sap

        # If SAP hasn't changed, we don't include the record
        if rdsap_change == 0:
            continue

        starting_heat_demand = start[HEAT_DEMAND_RESPONSE]
        heat_demand_change = end[HEAT_DEMAND_RESPONSE] - starting_heat_demand

        # Keep both a starting and an ending value for every component
        # feature (and the lodgement date) rather than averaging them.
        features = pd.concat(
            [
                start[feature_columns].add_suffix("_STARTING"),
                end[feature_columns].add_suffix("_ENDING"),
            ]
        )

        records.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": rdsap_change,
                "HEAT_DEMAND_CHANGE": heat_demand_change,
                "STARTING_SAP": starting_sap,
                "STARTING_HEAT_DEMAND": starting_heat_demand,
                **fixed_data,
                **features.to_dict(),
            }
        )

    return records


def _add_temporal_features(frame):
    """Add DAYS_TO_STARTING / DAYS_TO_ENDING columns to ``frame`` in place.

    Both columns are the whole number of days between ``EARLIEST_EPC_DATE``
    and the respective lodgement date, so all records are measured from the
    same fixed point in time.

    :param frame: DataFrame holding ``LODGEMENT_DATE_STARTING`` and
        ``LODGEMENT_DATE_ENDING`` columns.
    :return: The same DataFrame, with the two columns added.
    """
    reference_date = pd.to_datetime(EARLIEST_EPC_DATE)
    for label in ("STARTING", "ENDING"):
        lodgement_dates = pd.to_datetime(frame[f"LODGEMENT_DATE_{label}"])
        frame[f"DAYS_TO_{label}"] = (lodgement_dates - reference_date).dt.days
    return frame


def app(bucket_name: str = "retrofit-data-dev") -> None:
    """Build the SAP-change modelling dataset and upload it to S3.

    For every sub-directory of ``DATA_DIRECTORY`` the ``certificates.csv``
    file is pre-processed, each property's consecutive EPCs are paired into
    change records, and the per-directory cleaning averages are collected.
    Two parquet files are then written to ``bucket_name``:
    ``sap_change_model/cleaning_dataset.parquet`` and
    ``sap_change_model/dataset.parquet``.

    Data glossary:
    https://epc.opendatacommunities.org/docs/guidance#glossary

    :param bucket_name: Destination S3 bucket for both parquet files.
    :raises ValueError: If no directory yields any certificate data, or if no
        SAP change records could be built from the data that was found.
    """
    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
    cleaning_dataset = []

    # TODO [x] : Does energy tariff make a difference
    #           - leave for now but it may not
    # TODO: [x] : Add starting SAP and heat demand as a feature
    # TODO [x] : If SAP hasn't changed, we don't include the record
    # TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the building looks
    #           worse in the newer epc, so we can switch the orders
    # TODO [] : Have a look at temporal features
    # TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
    # TODO [x]: Same as floor area for floor height
    # TODO []: If fundamental building fabric changes, we should probably discard the record
    # TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
    #           - leave for now and check performance after temporal features
    # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
    #           - Leave for now
    #

    for directory in tqdm(directories):

        data_processor = DataProcessor(filepath=directory / "certificates.csv")

        df = data_processor.pre_process()
        if df.empty:
            # Nothing usable in this directory; skipping avoids indexing into
            # an empty LOCAL_AUTHORITY column further down.
            continue

        cleaning_averages = data_processor.make_cleaning_averages()

        records_by_uprn = []
        for uprn, property_data in df.groupby("UPRN", observed=True):

            # If a property has changed building type, we can ignore the epc rating, i.e. every mandatory
            # fixed feature should have exactly one unique value across the property's rows
            if (property_data[MANDATORY_FIXED_FEATURES].nunique() > 1).any():
                continue

            # Fixed features - these are property attributes that shouldn't change over time.
            # Take the last row for both the MANDATORY_FIXED_FEATURES and the LATEST_FIELD columns;
            # LATEST_FIELD values win if a column appears in both.
            # NOTE(review): "last row" is only the latest EPC if pre_process sorts by date - confirm.
            fixed_data = {
                **property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict(),
                **property_data[LATEST_FIELD].iloc[-1].to_dict(),
            }

            # Clean this property's rows using the cleaning averages computed for the directory
            cleaned_property_data = DataProcessor.apply_averages_cleaning(
                data_to_clean=property_data,
                cleaning_data=cleaning_averages,
                cols_to_merge_on=COLUMNS_TO_MERGE_ON
            )

            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
            variable_data = cleaned_property_data[
                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
            ]

            records_by_uprn.extend(
                _pair_consecutive_epcs(uprn, variable_data, fixed_data)
            )

        # A directory can legitimately yield no change records (e.g. every property has a single EPC).
        # An empty frame has no LODGEMENT_DATE_* columns, so only build features when there is data.
        if records_by_uprn:
            # Add some temporal features - we look at the days from the standard starting point in time
            # for the starting and ending date so all records are from a fixed point
            dataset.append(_add_temporal_features(pd.DataFrame(records_by_uprn)))

        # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
        #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
        #       within descriptions

        # Tag the averages with the first LOCAL_AUTHORITY value found in this directory's data
        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
        cleaning_dataset.append(cleaning_averages)

    if not cleaning_dataset:
        raise ValueError(f"No certificate data found under {DATA_DIRECTORY}")

    # Store cleaning dataset in s3 as a parquet file
    save_dataframe_to_s3_parquet(
        df=pd.concat(cleaning_dataset),
        bucket_name=bucket_name,
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    if not dataset:
        raise ValueError(f"No SAP change records could be built from {DATA_DIRECTORY}")

    save_dataframe_to_s3_parquet(
        df=pd.concat(dataset),
        bucket_name=bucket_name,
        file_key="sap_change_model/dataset.parquet",
    )


# Run the dataset build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app()