# Model/model_data/simulation_system/app.py

import numpy as np
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
from pathlib import Path
from model_data.simulation_system.Settings import (
    MANDATORY_FIXED_FEATURES,
    AVERAGE_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP
)
from DataProcessor import DataProcessor

# Folder, next to this file, that holds one sub-directory of certificates per area.
DATA_DIRECTORY = Path(__file__).parent.joinpath('data', 'all-domestic-certificates')

# Columns whose missing values are back-filled from the cleaning averages.
_DIMENSION_COLUMNS = ('TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT')


def _fill_missing_dimensions(property_data, cleaning_averages, averages_cache):
    """Fill missing floor area / floor height values from group averages.

    Args:
        property_data: All certificate rows for a single UPRN.
        cleaning_averages: Frame returned by
            ``DataProcessor.make_cleaning_averages()``; it is grouped on the
            usable ``COLUMNS_TO_MERGE_ON`` columns and averaged.
        averages_cache: Dict mapping a tuple of merge columns to the grouped
            averages. It is mutated in place so that each grouping is computed
            once per file rather than once per property.

    Returns:
        A new DataFrame in which NaNs in ``TOTAL_FLOOR_AREA`` and
        ``FLOOR_HEIGHT`` are replaced by the matching group average.
    """
    # Only merge on columns that have at least one non-null value for this property
    all_missing = property_data[COLUMNS_TO_MERGE_ON].isna().all()
    merge_columns = all_missing.index[~all_missing].to_list()

    if not merge_columns:
        # Nothing to join on (groupby with no keys raises), so there is no
        # average available to fill from; leave the rows untouched.
        return property_data.copy()

    cache_key = tuple(merge_columns)
    averages = averages_cache.get(cache_key)
    if averages is None:
        averages = cleaning_averages.groupby(merge_columns)[list(_DIMENSION_COLUMNS)].mean()
        averages_cache[cache_key] = averages

    # NOTE(review): this is an inner join, so certificate rows whose key
    # combination has no entry in the averages are dropped, not just left
    # unfilled. Kept as-is to preserve the existing dataset; confirm intent.
    merged = pd.merge(property_data, averages, on=merge_columns, suffixes=('', '_AVERAGE'))

    average_columns = []
    for column in _DIMENSION_COLUMNS:
        average_column = column + '_AVERAGE'
        merged[column] = merged[column].fillna(merged[average_column])
        average_columns.append(average_column)

    return merged.drop(columns=average_columns)


def _average_fixed_features(property_data, uprn):
    """Collapse each ``AVERAGE_FIXED_FEATURES`` column to one value per property.

    Args:
        property_data: Certificate rows for one UPRN, after missing dimensions
            have been filled.
        uprn: Property identifier, used only in the error message.

    Returns:
        Dict mapping each field in ``AVERAGE_FIXED_FEATURES`` to the mean of
        its distinct non-null values, or to the last distinct value when the
        first two differ by more than 10%.

    Raises:
        ValueError: If a field has no non-null value for this property.
    """
    averaged = {}
    for field in AVERAGE_FIXED_FEATURES:
        vals = list(property_data[field].dropna().unique())

        if not vals:
            raise ValueError(
                "No non-null values for fixed feature {!r} on UPRN {!r}".format(field, uprn)
            )

        # Check whether the values are too far apart
        # TODO: we could have multiple values here, why only use the first two?
        if len(vals) > 1 and abs(vals[0] - vals[1]) / vals[0] > 0.1:
            # Take the more recent value since it's likely to be more accurate
            vals = [vals[-1]]

        averaged[field] = np.mean(vals)

    return averaged


def _build_change_records(uprn, property_data, fixed_data):
    """Build one model record per pair of consecutive certificates.

    Args:
        uprn: Property identifier stored on every record.
        property_data: Certificate rows for one UPRN, in the order they should
            be paired.
        fixed_data: Per-property features copied onto every record.

    Returns:
        List of dicts, each holding the UPRN, the change in the RdSAP and heat
        demand responses between two consecutive rows, the fixed features and
        the component features of both rows (suffixed ``_STARTING`` /
        ``_ENDING``).
    """
    # We include the lodgement date here as we probably need to factor time into the
    # model, since EPC standards and rigour have changed over time
    feature_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    variable_data = property_data[feature_columns + [RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]]

    # Note: we look at changes between subsequent EPCs, however we could look at other permutations
    # e.g. first vs second, second vs third and also first vs third
    records = []
    for idx in range(len(variable_data) - 1):
        starting_record = variable_data.iloc[idx]
        ending_record = variable_data.iloc[idx + 1]

        # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
        #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
        #       within descriptions
        features = pd.concat(
            [
                starting_record[feature_columns].add_suffix("_STARTING"),
                ending_record[feature_columns].add_suffix("_ENDING"),
            ]
        )

        records.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE],
                "HEAT_DEMAND_CHANGE": (
                    ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
                ),
                **fixed_data,
                **features.to_dict(),
            }
        )

    return records


def app():
    """Build the EPC change dataset and write it to ``./dataset.parquet``.

    For every sub-directory of ``DATA_DIRECTORY`` the ``certificates.csv`` file
    is pre-processed with ``DataProcessor``, grouped by UPRN, and each pair of
    consecutive certificates of a property becomes one output row.

    Data glossary: https://epc.opendatacommunities.org/docs/guidance#glossary

    Raises:
        ValueError: If a property has no value at all for one of the
            ``AVERAGE_FIXED_FEATURES`` after filling from the averages.
    """
    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
    for directory in tqdm(directories):
        data_processor = DataProcessor(filepath=directory / "certificates.csv")

        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

        # Grouped averages depend only on the merge columns, so reuse them
        # across every property in this file.
        averages_cache = {}

        for uprn, property_data in df.groupby("UPRN", observed=True):
            # If a property has changed building type, we can ignore the epc rating
            # i.e. the mandatory fixed features should have a single unique row
            if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
                continue

            # Take the last row for both the LATEST_FIELD and MANDATORY_FIXED_FEATURES columns
            latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
            mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

            modified_property_data = _fill_missing_dimensions(
                property_data, cleaning_averages, averages_cache
            )

            # Fixed features - these are property attributes that shouldn't change over time.
            # Later dicts win on key clashes: averages < mandatory < latest.
            fixed_data = {
                **_average_fixed_features(modified_property_data, uprn),
                **mandatory_field_data,
                **latest_field_data,
            }

            dataset.extend(_build_change_records(uprn, modified_property_data, fixed_data))

    output = pd.DataFrame(dataset)
    output.to_parquet('./dataset.parquet')


# Build the dataset only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    app()