Model/model_data/simulation_system/app.py
2023-08-03 12:09:37 +01:00

227 lines
8.1 KiB
Python

import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
def list_subdirectories(directory_path):
    """Return the names of the immediate sub-directories of ``directory_path``.

    Only the entry names are returned (not full paths), in the order produced
    by ``os.listdir``. Entries that are not directories are skipped.
    """
    subdirectory_names = []
    for entry_name in os.listdir(directory_path):
        entry_path = os.path.join(directory_path, entry_name)
        if os.path.isdir(entry_path):
            subdirectory_names.append(entry_name)
    return subdirectory_names
# Folder whose sub-directories are each read for a "certificates.csv" file by app().
# The path is built from the current working directory, so the script must be launched
# from the directory that contains "model_data".
DATA_DIRECTORY = f"{os.getcwd()}/model_data/simulation_system/data/all-domestic-certificates"
# Property attributes that are not expected to change between EPCs of the same property.
# app() collapses the values seen across a property's EPCs into a single value per field.
FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "GLAZED_AREA",
    "FLOOR_HEIGHT",
    "FLOOR_LEVEL",
    "TOTAL_FLOOR_AREA",
]
# EPC columns describing the components of a property that can change between certificates.
# app() selects these columns for each pair of consecutive EPCs and emits them with
# "_STARTING" / "_ENDING" suffixes.
# Each column must appear exactly once: a repeated name produces duplicate column labels
# when the list is used to select from a DataFrame.
COMPONENT_FEATURES = [
    'TRANSACTION_TYPE',
    'WALLS_DESCRIPTION',
    'FLOOR_DESCRIPTION',
    'LIGHTING_DESCRIPTION',
    'ROOF_DESCRIPTION',
    'MAINHEAT_DESCRIPTION',
    'HOTWATER_DESCRIPTION',
    'MAIN_FUEL',
    'MECHANICAL_VENTILATION',
    'SECONDHEAT_DESCRIPTION',
    'ENERGY_TARIFF',  # Not sure if this is relevant
    'SOLAR_WATER_HEATING_FLAG',
    'PHOTO_SUPPLY',
    'WINDOWS_DESCRIPTION',
    'GLAZED_TYPE',
    'MULTI_GLAZE_PROPORTION',
    'LOW_ENERGY_LIGHTING',
    'NUMBER_OPEN_FIREPLACES',
    'MAINHEATCONT_DESCRIPTION',
    'EXTENSION_COUNT',
]
# Fixed features that are averaged when a property has several differing values.
AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]

# Fixed features for which the latest value is taken when a property has several values.
# More recent EPCs have been conducted with more rigour, so the latest value is assumed to
# be the most accurate.
LATEST_FIELD = [
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",
]

# If one of these features changes between the EPCs of a property, that property is not
# used, since its records are deemed unreliable.
MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE"]

# Particularly old EPC data has inconsistent records, so only EPCs lodged on or after this
# date are included. SAP09 was introduced in 2009 and SAP12 was later introduced in England
# and Wales from 31 July 2014; the cut-off keeps only EPCs lodged after that date.
EARLIEST_EPC_DATE = "2014-08-01"

# Response columns: app() computes the change in each between consecutive EPCs.
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
def make_cleaning_averages(df):
    """Build a table of per-group means used to fill in missing averaged features.

    Rows of ``df`` are grouped by property type, built form, construction age band and the
    habitable / heated room counts, and the mean of every column listed in
    ``AVERAGE_FIXED_FEATURES`` is taken within each group.

    Returns a DataFrame with one row per group, holding the grouping columns followed by
    the averaged columns.
    """
    grouping_columns = [
        "PROPERTY_TYPE",
        "BUILT_FORM",
        "CONSTRUCTION_AGE_BAND",
        "NUMBER_HABITABLE_ROOMS",
        "NUMBER_HEATED_ROOMS",
    ]
    grouped = df.groupby(grouping_columns)
    group_means = grouped[AVERAGE_FIXED_FEATURES].mean()
    # Turn the group keys back into ordinary columns so callers can filter on them.
    return group_means.reset_index()
# Translates raw FLOOR_LEVEL values into integer floor numbers.
# NOTE(review): app() looks values up with FLOOR_LEVEL_MAP[v], so any raw value not listed
# here raises KeyError - confirm whether further levels need to be added.
FLOOR_LEVEL_MAP = {
    # Zero-padded numeric codes
    "00": 0,
    "01": 1,
    "02": 2,
    "03": 3,
    # Text labels
    "Basement": -1,
    "Ground": 0,
    "1st": 1,
    "2nd": 2,
    "3rd": 3,
}
def app():
# Get all the files in the directory
directories = list_subdirectories(DATA_DIRECTORY)
for directory in tqdm(directories):
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
df = pd.read_csv(filepath, low_memory=False)
df = df[~pd.isnull(df["UPRN"])]
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
cleaning_averages = make_cleaning_averages(df)
# We remove EPCS that were conducted for a new build, since these are performed with
# full SAP, which produces different results to the RdSAP methodology
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
df["UPRN"] = df["UPRN"].astype(int).astype(str)
counts = df.groupby("UPRN").size().reset_index()
counts.columns = ["UPRN", "count"]
counts = counts.sort_values("count", ascending=False)
# take UPRNS with multiple EPCs
counts = counts[counts["count"] > 1]
df = df[df["UPRN"].isin(counts["UPRN"])]
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
results = []
for uprn, property_data in df.groupby("UPRN", observed=True):
# Fixed features - these are property attributes that shouldn't change over time
ignore_epc = False
fixed_data = {}
for field in FIXED_FEATURES:
vals = property_data[field].dropna().unique()
# Remove invalid values
vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
if field == "FLOOR_LEVEL":
vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
if field in AVERAGE_FIXED_FEATURES:
if len(vals) > 1:
# Check the values are too far apart
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
if vals:
field_value = np.mean(vals)
else:
# Clean using averages
avgs = cleaning_averages[
(cleaning_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
(cleaning_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) &
(cleaning_averages["CONSTRUCTION_AGE_BAND"] == property_data["CONSTRUCTION_AGE_BAND"].iloc[
0]) &
(cleaning_averages["NUMBER_HABITABLE_ROOMS"] ==
property_data["NUMBER_HABITABLE_ROOMS"].iloc[0]) &
(cleaning_averages["NUMBER_HEATED_ROOMS"] == property_data["NUMBER_HEATED_ROOMS"].iloc[0])
]
field_value = avgs[field].iloc[0]
elif field in LATEST_FIELD:
field_value = vals[-1] if vals else None
else:
if len(vals) > 1:
if field in MANDATORY_FIXED_FEATURES:
ignore_epc = True
else:
raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
field_value = vals[0] if vals else None
fixed_data[field] = field_value
if ignore_epc:
continue
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = property_data[
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
# e.g. first vs second, second vs third and also first vs third
property_model_data = []
for idx in range(0, property_data.shape[0] - 1):
if idx >= property_data.shape[0] - 1:
break
starting_record = variable_data.iloc[idx]
ending_record = variable_data.iloc[idx + 1]
rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
if rdsap_change == 0:
# Assumption: We aren't interested in records that exhibit no change
continue
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
# within descriptions
starting_record = starting_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = ending_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
features = pd.concat([starting_record, ending_record])
property_model_data.append(
{
"UPRN": uprn,
"RDSAP_CHANGE": rdsap_change,
"HEAT_DEMAND_CHANGE": heat_demand_change,
**features.to_dict()
}
)
results.extend(property_model_data)