mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
177 lines
7.5 KiB
Python
177 lines
7.5 KiB
Python
import pandas as pd
|
|
from tqdm import tqdm
|
|
|
|
from pathlib import Path
|
|
from simulation_system.core.Settings import (
|
|
MANDATORY_FIXED_FEATURES,
|
|
LATEST_FIELD,
|
|
COMPONENT_FEATURES,
|
|
RDSAP_RESPONSE,
|
|
HEAT_DEMAND_RESPONSE,
|
|
COLUMNS_TO_MERGE_ON,
|
|
EARLIEST_EPC_DATE
|
|
)
|
|
from simulation_system.core.DataProcessor import DataProcessor
|
|
from utils import save_dataframe_to_s3_parquet
|
|
|
|
# Directory holding one sub-directory per certificates download; app() scans
# each sub-directory for a "certificates.csv" file. Resolved relative to this
# script so it does not depend on the current working directory.
DATA_DIRECTORY = Path(__file__).parent.joinpath(
    "simulation_system", "data", "all-domestic-certificates"
)
|
|
|
|
|
|
def _build_property_records(uprn, fixed_data, variable_data):
    """Build one model row per pair of consecutive EPC records of a property.

    Args:
        uprn: Identifier of the property; stored under the "UPRN" key.
        fixed_data: Dict of property-level attributes copied into every row.
        variable_data: DataFrame with one row per EPC record, containing
            COMPONENT_FEATURES, "LODGEMENT_DATE", RDSAP_RESPONSE and
            HEAT_DEMAND_RESPONSE. Rows are paired in their existing order.
            NOTE(review): presumably already sorted by lodgement date by
            DataProcessor.pre_process - confirm.

    Returns:
        List of dicts, one per consecutive pair whose SAP change is not zero.
    """
    feature_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    records = []

    # Note: we look at changes between subsequent EPCs, however we could look at other permutations
    # e.g. first vs second, second vs third and also first vs third
    for idx in range(len(variable_data) - 1):
        earliest_record = variable_data.iloc[idx]
        latest_record = variable_data.iloc[idx + 1]

        # If the SAP gets worse we switch the order of the pair, so that the
        # record with the lower SAP is always treated as the starting point.
        gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
        if gets_better:
            start, end = earliest_record, latest_record
        else:
            start, end = latest_record, earliest_record

        starting_sap = start[RDSAP_RESPONSE]
        starting_heat_demand = start[HEAT_DEMAND_RESPONSE]
        rdsap_change = end[RDSAP_RESPONSE] - starting_sap
        heat_demand_change = end[HEAT_DEMAND_RESPONSE] - starting_heat_demand

        # If SAP hasn't changed, we don't include the record
        if rdsap_change == 0:
            continue

        features = pd.concat(
            [
                start[feature_columns].add_suffix("_STARTING"),
                end[feature_columns].add_suffix("_ENDING"),
            ]
        )

        records.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": rdsap_change,
                "HEAT_DEMAND_CHANGE": heat_demand_change,
                "STARTING_SAP": starting_sap,
                "STARTING_HEAT_DEMAND": starting_heat_demand,
                **fixed_data,
                **features.to_dict(),
            }
        )

    return records


def _add_temporal_features(records_df):
    """Add DAYS_TO_STARTING / DAYS_TO_ENDING columns to ``records_df`` in place.

    Each column is the number of whole days between EARLIEST_EPC_DATE and the
    corresponding LODGEMENT_DATE_* column, so all records are measured from
    the same fixed point in time. Returns the same DataFrame for convenience.
    """
    reference_date = pd.to_datetime(EARLIEST_EPC_DATE)
    for label in ("STARTING", "ENDING"):
        lodgement_dates = pd.to_datetime(records_df[f"LODGEMENT_DATE_{label}"])
        records_df[f"DAYS_TO_{label}"] = (lodgement_dates - reference_date).dt.days
    return records_df


def app():
    """Build the SAP-change training dataset from the EPC certificate files.

    For every sub-directory of DATA_DIRECTORY, "certificates.csv" is run
    through DataProcessor, the records are grouped by UPRN and each pair of
    consecutive records of a property becomes one row describing the change
    in SAP and heat demand. The per-directory cleaning averages and the
    combined rows are written as parquet files to the "retrofit-data-dev"
    S3 bucket.

    Raises:
        ValueError: raised by ``pd.concat`` when there is nothing to
            concatenate (no sub-directories, or no rows produced at all).
    """
    # Data glossary:
    # https://epc.opendatacommunities.org/docs/guidance#glossary

    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
    cleaning_dataset = []

    # TODO [x] : Does energy tariff make a difference
    #   - leave for now but it may not
    # TODO: [x] : Add starting SAP and heat demand as a feature
    # TODO [x] : If SAP hasn't changed, we don't include the record
    # TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the building looks
    #   worse in the newer epc, so we can switch the orders
    # TODO [] : Have a look at temporal features
    # TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
    # TODO [x]: Same as floor area for floor height
    # TODO []: If fundamental building fabric changes, we should probably discard the record
    # TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
    #   - leave for now and check performance after temporal features
    # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
    #   - Leave for now

    variable_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]

    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)
        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

        data_by_urpn = []
        for uprn, property_data in df.groupby("UPRN", observed=True):
            # If a property has changed building type, we can ignore the epc rating i.e. the
            # mandatory fixed features should have exactly one unique value each
            if (property_data[MANDATORY_FIXED_FEATURES].nunique() > 1).any():
                continue

            # Fixed features - these are property attributes that shouldn't change over time.
            # Take the last row for both the LATEST_FIELD and MANDATORY_FIXED_FEATURES columns;
            # LATEST_FIELD values win if a key appears in both.
            fixed_data = {
                **property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict(),
                **property_data[LATEST_FIELD].iloc[-1].to_dict(),
            }

            # Apply the averages-based cleaning to this property's records
            modified_property_data = DataProcessor.apply_averages_cleaning(
                data_to_clean=property_data,
                cleaning_data=cleaning_averages,
                cols_to_merge_on=COLUMNS_TO_MERGE_ON,
            )

            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
            variable_data = modified_property_data[variable_columns]

            data_by_urpn.extend(
                _build_property_records(
                    uprn=uprn,
                    fixed_data=fixed_data,
                    variable_data=variable_data,
                )
            )

        # A file can legitimately produce no rows (e.g. every property has a single
        # certificate). An empty DataFrame has no LODGEMENT_DATE_* columns, so the
        # temporal features can only be added when there is at least one row.
        if data_by_urpn:
            # Add some temporal features - we look at the days from the standard starting point in time
            # for the starting and ending date so all records are from a fixed point
            data_by_urpn_df = _add_temporal_features(pd.DataFrame(data_by_urpn))

            # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
            #   floors, we may want to use the U-value. We may also want to handle the (assumed) tags
            #   within descriptions

            dataset.append(data_by_urpn_df)

        # Tag the averages with the first LOCAL_AUTHORITY value found in this file
        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
        cleaning_dataset.append(cleaning_averages)

    # Store cleaning dataset in s3 as a parquet file
    cleaning_dataset = pd.concat(cleaning_dataset)
    save_dataframe_to_s3_parquet(
        df=cleaning_dataset,
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    output = pd.concat(dataset)
    save_dataframe_to_s3_parquet(
        df=output,
        bucket_name="retrofit-data-dev",
        file_key="sap_change_model/dataset.parquet",
    )
|
|
|
|
|
|
# Build and upload the datasets only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    app()
|