from pathlib import Path

import pandas as pd
from tqdm import tqdm

from simulation_system.core.Settings import (
    MANDATORY_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    EARLIEST_EPC_DATE,
)
from simulation_system.core.DataProcessor import DataProcessor
from utils import save_dataframe_to_s3_parquet

DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"


def _build_transition(uprn, earlier_record, later_record, fixed_data):
    """Build one model row describing the change between two consecutive EPCs.

    The record with the lower RdSAP score is always treated as the starting
    point, so when the score drops between the two records their roles are
    swapped.

    Args:
        uprn: Group key of the property the two records belong to.
        earlier_record: Row that appears first in the property's data.
        later_record: Row that appears immediately after ``earlier_record``.
        fixed_data: Property-level values copied unchanged into the row.

    Returns:
        A dict for one row of the model dataset, or ``None`` when the RdSAP
        score did not change between the two records.
    """
    # If SAP gets worse, treat the later record as the starting point instead.
    improved = earlier_record[RDSAP_RESPONSE] <= later_record[RDSAP_RESPONSE]
    if improved:
        start, end = earlier_record, later_record
    else:
        start, end = later_record, earlier_record

    starting_sap = start[RDSAP_RESPONSE]
    starting_heat_demand = start[HEAT_DEMAND_RESPONSE]
    rdsap_change = end[RDSAP_RESPONSE] - starting_sap
    heat_demand_change = end[HEAT_DEMAND_RESPONSE] - starting_heat_demand

    # If SAP hasn't changed, we don't include the record.
    if rdsap_change == 0:
        return None

    # We include the lodgement date here as we probably need to factor time into
    # the model, since EPC standards and rigour have changed over time.
    feature_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    features = pd.concat(
        [
            start[feature_columns].add_suffix("_STARTING"),
            end[feature_columns].add_suffix("_ENDING"),
        ]
    )

    return {
        "UPRN": uprn,
        "RDSAP_CHANGE": rdsap_change,
        "HEAT_DEMAND_CHANGE": heat_demand_change,
        "STARTING_SAP": starting_sap,
        "STARTING_HEAT_DEMAND": starting_heat_demand,
        **fixed_data,
        **features.to_dict(),
    }


def _property_transitions(uprn, property_data, cleaning_averages):
    """Return the model rows for every pair of consecutive EPCs of one property.

    Args:
        uprn: Group key of the property.
        property_data: All pre-processed EPC rows for that property.
        cleaning_averages: Averages passed to
            ``DataProcessor.apply_averages_cleaning`` as ``cleaning_data``.

    Returns:
        A list of row dicts; empty when the property is skipped or no pair of
        consecutive records shows a change in RdSAP score.
    """
    # Fixed features are property attributes that shouldn't change over time.
    # If any of them takes more than one value across the records, we skip the
    # property entirely.
    if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1):
        return []

    # Take the last row for both the LATEST_FIELD and MANDATORY_FIXED_FEATURES
    # columns; on a key clash the LATEST_FIELD value wins.
    latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
    mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
    fixed_data = {**mandatory_field_data, **latest_field_data}

    # Apply the cleaning averages to this property's records.
    cleaned_property_data = DataProcessor.apply_averages_cleaning(
        data_to_clean=property_data,
        cleaning_data=cleaning_averages,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON,
    )

    variable_data = cleaned_property_data[
        COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
    ]

    # Note: we look at changes between subsequent EPCs, however we could look at
    # other permutations, e.g. first vs second, second vs third and also first
    # vs third.
    transitions = []
    for idx in range(len(variable_data) - 1):
        transition = _build_transition(
            uprn,
            variable_data.iloc[idx],
            variable_data.iloc[idx + 1],
            fixed_data,
        )
        if transition is not None:
            transitions.append(transition)
    return transitions


def _add_temporal_features(records_df):
    """Add ``DAYS_TO_STARTING`` / ``DAYS_TO_ENDING`` columns to ``records_df``.

    Both columns hold the number of days between ``EARLIEST_EPC_DATE`` and the
    respective lodgement date, so all records are measured from one fixed point
    in time. The frame is modified in place and also returned.
    """
    reference_date = pd.to_datetime(EARLIEST_EPC_DATE)
    records_df["DAYS_TO_STARTING"] = (
        pd.to_datetime(records_df["LODGEMENT_DATE_STARTING"]) - reference_date
    ).dt.days
    records_df["DAYS_TO_ENDING"] = (
        pd.to_datetime(records_df["LODGEMENT_DATE_ENDING"]) - reference_date
    ).dt.days
    return records_df


def app():
    """Build the SAP-change model dataset and upload it to S3.

    For every subdirectory of ``DATA_DIRECTORY`` the ``certificates.csv`` file is
    pre-processed, each property's consecutive EPC records are turned into
    before/after rows, and the per-directory cleaning averages are collected.
    The combined cleaning averages and the combined model dataset are then
    stored as parquet files in the ``retrofit-data-dev`` bucket.

    Directories that produce no rows are reported and skipped instead of
    aborting the whole run.
    """
    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    model_frames = []
    cleaning_frames = []

    # TODO [x]: Does energy tariff make a difference
    #   - leave for now but it may not
    # TODO [x]: Add starting SAP and heat demand as a feature
    # TODO [x]: If SAP hasn't changed, we don't include the record
    # TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the
    #   building looks worse in the newer EPC, so we can switch the orders
    # TODO []: Have a look at temporal features
    # TODO [x]: Floor area will impact the EPC so instead of averaging, we should have a starting
    #   and ending value.
    # TODO [x]: Same as floor area for floor height
    # TODO []: If fundamental building fabric changes, we should probably discard the record
    # TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
    #   - leave for now and check performance after temporal features
    # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be
    #   corrections?
    #   - Leave for now

    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)
        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

        # An empty frame has no LOCAL_AUTHORITY value to read below and cannot
        # contribute any rows, so skip it rather than fail with an IndexError.
        if df.empty:
            tqdm.write(f"Skipping {directory.name}: no records after pre-processing")
            continue

        records = []
        for uprn, property_data in df.groupby("UPRN", observed=True):
            records.extend(_property_transitions(uprn, property_data, cleaning_averages))

        # TODO: We need to pre-process the data. For instance, rather than using static for roofs,
        #   walls and floors, we may want to use the U-value. We may also want to handle the
        #   (assumed) tags within descriptions
        if records:
            # Add some temporal features - we look at the days from the standard
            # starting point in time for the starting and ending date.
            model_frames.append(_add_temporal_features(pd.DataFrame(records)))
        else:
            # A frame built from no rows has no lodgement-date columns, so the
            # temporal features cannot be computed for it.
            tqdm.write(f"No usable EPC pairs found in {directory.name}")

        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
        cleaning_frames.append(cleaning_averages)

    # pd.concat raises ValueError on an empty list, so only upload what exists.
    if cleaning_frames:
        # Store cleaning dataset in s3 as a parquet file
        save_dataframe_to_s3_parquet(
            df=pd.concat(cleaning_frames),
            bucket_name="retrofit-data-dev",
            file_key="sap_change_model/cleaning_dataset.parquet",
        )
    else:
        tqdm.write("No cleaning averages were produced; nothing uploaded")

    if model_frames:
        save_dataframe_to_s3_parquet(
            df=pd.concat(model_frames),
            bucket_name="retrofit-data-dev",
            file_key="sap_change_model/dataset.parquet",
        )
    else:
        tqdm.write("No model records were produced; nothing uploaded")


if __name__ == "__main__":
    app()