# Model/model_data/simulation_system/generate_rdsap_change.py

import numpy as np
import pandas as pd
from tqdm import tqdm

from pathlib import Path
from simulation_system.core.Settings import (
    MANDATORY_FIXED_FEATURES,
    AVERAGE_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    MULTIPLE_VALUES_MARGIN_FOR_ERROR,
)
from simulation_system.core.DataProcessor import DataProcessor
from utils import save_dataframe_to_s3_parquet

# Folder (relative to this file) that app() scans: every sub-directory is expected to hold a "certificates.csv"
DATA_DIRECTORY = Path(__file__).parent.joinpath("simulation_system", "data", "all-domestic-certificates")


# TODO: Have a look at temporal features


# UPRNs that were inspected by hand while investigating certificate pairs whose SAP score did not improve.
# They are excluded from the "dodgy" diagnostic collected in app(). The notes record what was seen.
# NOTE(review): these are strings - confirm the UPRN column yields strings, otherwise membership never matches.
# TODO: REMOVE ME
_OBSERVED_UPRNS = frozenset({
    "10002082244",  # Doesn't really make sense, house no longer has lel and now has more insulation but lower score
    "10002082259",
    # Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the
    # floor assessment is now assumed whereas before it wasn't
    "10002082418",  # Walls went from insulated to not...
    "10002082640",  # Property identical besides different energy tariff
    "10002082830",  # Lots of records going from not insulated to insulated but some parts of
    # the property have gotten better
    "10002083244",  # Latest epc indicates the property is worse
    "10002083592",  # Latest epc doesn't have a fuel system present, but has slightly more insulation. Also the
    # floor type has changed from solid to suspended. lel has decreased
    "100030533576",  # Property slightly worse, has less lel and the floor description has changed type
    "100030533668",  # Has slightly less lel. Glazed type is now missing
    "100030533803",  # Not super clear why this is lower, newer epc has more lel but is using second heating
    "100030534016",  # Property has less lel but more roof insulation. Floor type has changed
    "100030534040",  # Property has less lel and the floor type has changed
    "100030534041",  # Property has less insulation and less lel
    "100030534243",  # Cavity wall has gone from filled to unfilled
    "100030534294",  # Less roof insulation but now has an air source heat pump
    "100030534322",  # Identical between records but now with higher lel but no change recorded
    "100030534413",  # Identical between records but different energy tariff, no sap change
    "100030534437",  # Property has less lel and the main heating no longer has a programmer and trvs
    "100030534569",  # Cavity wall no longer filled, 30mm more roof insulation in newest epc
    "100030534676",  # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but
    # the wall cavity is no longer filled
    "100030534732",  # Property has higher lel %. Not clear why this is worse, glazing type has changed.
    # This looks dodgy as the UPRN_SOURCE is address matched; also the floor area has increased from the first to
    # the later epc
    "100030534791",  # Property has started using secondary heating - the EPCs are taken on the same day so maybe we
    # should discard
    "100030534795",  # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66
    # The newer epc indicates the property now has 40% photo supply so this doesn't make much sense
    "100030534897",  # Roof has gone from thatched with additional insulation to pitched with insulation,
    # sap score hasn't changed
    "100030534986",  # Property has gone from 300mm loft insulation to none. Has 2% higher lel (negligible) and
    # slightly better main heating setup
    "100030535043",  # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and
    # wall height
    "100030535173",  # lel increased from 20% to 80% but roof gone from 100mm insulation to "limited" insulation
    "100030535244",  # lel gone from 100% to 0%, sap is the same
})

# Columns skipped when checking whether two consecutive certificates describe the same components
_UNCHANGED_CHECK_IGNORED_COLUMNS = frozenset({
    "LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF"
})


def _average_fixed_features(property_records):
    """Reduce every AVERAGE_FIXED_FEATURES column of one property to a single value.

    The distinct non-null values of a column are averaged, unless their relative spread
    ``(max - min) / min`` exceeds MULTIPLE_VALUES_MARGIN_FOR_ERROR, in which case only the
    last distinct value is kept. A column with no values at all yields NaN.
    """
    averaged = {}
    for field in AVERAGE_FIXED_FEATURES:
        vals = list(property_records[field].dropna().unique())
        if len(vals) > 1:
            lowest_value = min(vals)
            largest_value = max(vals)
            # A zero minimum makes the relative spread unbounded, so treat it as "exceeds the margin"
            # instead of dividing by zero
            if (
                lowest_value == 0
                or abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR
            ):
                # Take the more recent value since it's likely to be more accurate
                # (assumes the rows are in chronological order - TODO confirm)
                vals = [vals[-1]]

        averaged[field] = np.mean(vals) if vals else np.nan
    return averaged


def _components_unchanged(starting_record, ending_record, check_cols):
    """Return True when none of *check_cols* differs between the two certificate rows.

    The comparison uses ``!=``, so a NaN on either side counts as a difference.
    """
    return not any(starting_record[col] != ending_record[col] for col in check_cols)


def _build_transition_rows(uprn, property_records, fixed_data, dodgy_uprns):
    """Build one model row per pair of consecutive certificates of a single property.

    :param uprn: identifier of the property, stored in every row under "UPRN"
    :param property_records: cleaned certificate rows of the property
    :param fixed_data: dict of property-level features copied into every row
    :param dodgy_uprns: list that receives *uprn* for each pair whose SAP score did not improve although
        no compared component differs (and the UPRN was not already inspected by hand)
    :return: list of dicts, one per consecutive certificate pair (empty for fewer than two certificates)
    """
    # We include the lodgement date here as we probably need to factor time into the
    # model, since EPC standards and rigour have changed over time
    variable_data = property_records[
        COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
    ]
    check_cols = [col for col in variable_data.columns if col not in _UNCHANGED_CHECK_IGNORED_COLUMNS]
    endpoint_cols = COMPONENT_FEATURES + ["LODGEMENT_DATE"]

    # Note: we look at changes between subsequent EPCS, however we could look at other permutations
    # e.g. first vs second, second vs third and also first vs third
    rows = []
    for idx in range(len(variable_data) - 1):
        starting_record = variable_data.iloc[idx]
        ending_record = variable_data.iloc[idx + 1]
        rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
        heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]

        # Diagnostic only: note properties whose score did not improve without any visible component change.
        # This must never interrupt the run.
        if (
            rdsap_change <= 0
            and uprn not in _OBSERVED_UPRNS
            and _components_unchanged(starting_record, ending_record, check_cols)
        ):
            dodgy_uprns.append(uprn)

        # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
        #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
        #       within descriptions
        features = pd.concat(
            [
                starting_record[endpoint_cols].add_suffix("_STARTING"),
                ending_record[endpoint_cols].add_suffix("_ENDING"),
            ]
        )

        rows.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": rdsap_change,
                "HEAT_DEMAND_CHANGE": heat_demand_change,
                **fixed_data,
                **features.to_dict(),
            }
        )
    return rows


def app():
    """Build the SAP-change dataset from every "certificates.csv" found under DATA_DIRECTORY.

    For each sub-directory the certificates are pre-processed with DataProcessor and grouped by UPRN.
    Properties whose MANDATORY_FIXED_FEATURES differ between certificates are skipped. For every other
    property, one row is produced per pair of consecutive certificates, holding the change in
    RDSAP_RESPONSE and HEAT_DEMAND_RESPONSE, the property-level features and the component features of
    both certificates (suffixed "_STARTING" / "_ENDING").

    Side effects:
        * uploads the concatenated cleaning averages (tagged with "LOCAL_AUTHORITY") to the
          "retrofit-data-dev" bucket under "sap_change_model/cleaning_dataset.parquet";
        * writes the model rows to "./dataset.parquet", one row per certificate pair.
    """
    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
    cleaning_dataset = []

    # TODO: Does energy tariff make a difference
    # TODO: If SAP hasn't changed, we don't include the record
    # TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
    # TODO: Same as floor area for floor height
    # TODO: If fundamental building fabric changes, we should probably discard the record
    # TODO: Should we prune records that have an exceptionally large amount of time between them?
    # TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
    #
    # TODO: REMOVE ME
    # Collected for manual inspection only; it is not written to any output
    dodgy_uprns = []

    for directory in tqdm(directories):

        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)

        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

        for uprn, property_data in df.groupby("UPRN", observed=True):

            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
            if (property_data[MANDATORY_FIXED_FEATURES].nunique() > 1).any():
                continue

            # Take the last row for both the LATEST_FIELD and MANDATORY_FIXED_FEATURES columns
            # (assumes pre_process returns each property's rows oldest-first - TODO confirm)
            latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
            mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

            modified_property_data = DataProcessor.apply_averages_cleaning(
                data_to_clean=property_data,
                cleaning_data=cleaning_averages,
                cols_to_merge_on=COLUMNS_TO_MERGE_ON,
            )

            # Fixed features - these are property attributes that shouldn't change over time.
            # Later entries win on key clashes: averaged < mandatory < latest.
            fixed_data = {
                **_average_fixed_features(modified_property_data),
                **mandatory_field_data,
                **latest_field_data,
            }

            # extend (not append) so that the final frame holds one row per certificate pair
            dataset.extend(
                _build_transition_rows(
                    uprn=uprn,
                    property_records=modified_property_data,
                    fixed_data=fixed_data,
                    dodgy_uprns=dodgy_uprns,
                )
            )

        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
        cleaning_dataset.append(cleaning_averages)

    # Store cleaning dataset in s3 as a parquet file
    cleaning_dataset = pd.concat(cleaning_dataset)
    save_dataframe_to_s3_parquet(
        df=cleaning_dataset,
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    output = pd.DataFrame(dataset)
    output.to_parquet("./dataset.parquet")


# Build the dataset only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    app()