mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
157 lines
No EOL
7.4 KiB
Python
157 lines
No EOL
7.4 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
from model_data.BaseUtility import BaseUtility
|
|
from pathlib import Path
|
|
from settings import (
|
|
MANDATORY_FIXED_FEATURES,
|
|
AVERAGE_FIXED_FEATURES,
|
|
LATEST_FIELD,
|
|
COMPONENT_FEATURES,
|
|
RDSAP_RESPONSE,
|
|
HEAT_DEMAND_RESPONSE,
|
|
FLOOR_LEVEL_MAP,
|
|
BUILT_FORM_REMAP
|
|
)
|
|
from DataProcessor import DataProcessor
|
|
|
|
# Folder scanned by app(): <this module's directory>/data/all-domestic-certificates.
DATA_DIRECTORY = Path(__file__).parent.joinpath('data', 'all-domestic-certificates')
|
|
|
|
def _strip_anomalies(property_data, data_anomaly_map):
    """Return a copy of *property_data* with anomaly markers and NaN mapped to None."""
    cleaned = property_data.replace(data_anomaly_map)
    # np.nan, not np.NAN: the upper-case alias was removed in NumPy 2.0.
    return cleaned.replace(np.nan, None)


def _fill_dimensions_from_averages(property_rows, cleaning_averages, averages_cache):
    """Fill missing TOTAL_FLOOR_AREA / FLOOR_HEIGHT from averages of similar properties.

    Args:
        property_rows: The cleaned records of a single property. The merge-key
            columns are filled in place.
        cleaning_averages: Frame returned by ``make_cleaning_averages()``; it is
            grouped by the usable merge keys and averaged.
        averages_cache: Dict keyed by the tuple of merge columns, used to avoid
            recomputing the same group-by for every property of one extract.

    Returns:
        A frame with the two dimension columns filled where an average exists.
    """
    merge_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
                     "NUMBER_HEATED_ROOMS"]

    # The original `any(df.isna())` iterated the column *names* and was therefore
    # always truthy; test the actual cells instead.
    if property_rows[merge_columns].isna().to_numpy().any():
        # Back fill first, then forward fill whatever is still missing.
        property_rows[merge_columns] = property_rows[merge_columns].bfill().ffill()

    # Some key columns can be entirely missing for a property; only merge on the rest.
    all_missing = property_rows[merge_columns].isna().all()
    merge_columns = all_missing.index[~all_missing].to_list()
    if not merge_columns:
        # Nothing to match on: groupby([]) would raise, so leave the rows unfilled.
        return property_rows

    cache_key = tuple(merge_columns)
    if cache_key not in averages_cache:
        averages_cache[cache_key] = (
            cleaning_averages.groupby(merge_columns)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
        )

    # how='left' keeps every record of the property even when no matching average
    # group exists; the default inner join silently dropped those rows.
    merged = pd.merge(
        property_rows,
        averages_cache[cache_key],
        how='left',
        on=merge_columns,
        suffixes=('', '_AVERAGE'),
    )
    merged['TOTAL_FLOOR_AREA'] = merged['TOTAL_FLOOR_AREA'].fillna(merged['TOTAL_FLOOR_AREA_AVERAGE'])
    merged['FLOOR_HEIGHT'] = merged['FLOOR_HEIGHT'].fillna(merged['FLOOR_HEIGHT_AVERAGE'])
    return merged.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])


def _resolve_fixed_value(values):
    """Collapse the distinct observed values of one averaged feature into one value.

    Args:
        values: Distinct non-null values in row order.

    Returns:
        ``None`` when there are no values; the last value when the first two
        differ by more than 10% relative to the first; otherwise the mean.
    """
    if not values:
        # Previously the value from the preceding field/property leaked through here.
        return None

    if len(values) > 1:
        # TODO: we could have multiple values here, why only use the first two?
        first, second = values[0], values[1]
        # A zero base makes the relative difference unbounded (and used to divide
        # by zero); abs() keeps the check meaningful for a negative base.
        if first == 0 or abs(first - second) / abs(first) > 0.1:
            # Take the last value, presumably the most recent record
            # (assumes rows are ordered by lodgement date - TODO confirm).
            values = [values[-1]]

    return np.mean(values)


def _build_change_records(uprn, property_rows, fixed_data):
    """Build one model row per pair of consecutive records of a property.

    Pairs whose RDSAP response did not change are skipped.
    """
    feature_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]

    # We include the lodgement date here as we probably need to factor time into the
    # model, since EPC standards and rigour have changed over time
    variable_data = property_rows[feature_columns + [RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]]

    # Note: we look at changes between subsequent EPCs, however we could look at other
    # permutations e.g. first vs second, second vs third and also first vs third
    records = []
    for idx in range(variable_data.shape[0] - 1):
        starting_record = variable_data.iloc[idx]
        ending_record = variable_data.iloc[idx + 1]
        rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
        heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]

        # TODO: Should this be <= 0?
        if rdsap_change == 0:
            # Assumption: We aren't interested in records that exhibit no change
            continue

        # TODO: We need to pre-process the data. For instance, rather than using static for roofs,
        # walls and floors, we may want to use the U-value. We may also want to handle the
        # (assumed) tags within descriptions
        features = pd.concat([
            starting_record[feature_columns].add_suffix("_STARTING"),
            ending_record[feature_columns].add_suffix("_ENDING"),
        ])

        records.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": rdsap_change,
                "HEAT_DEMAND_CHANGE": heat_demand_change,
                **fixed_data,
                **features.to_dict(),
            }
        )

    return records


def app():
    """Build the change dataset from every certificates.csv and write it to parquet.

    For each sub-directory of DATA_DIRECTORY the ``certificates.csv`` file is
    pre-processed, grouped by UPRN, and every pair of consecutive records of a
    property becomes one output row. The combined rows are written to
    ``./dataset.parquet`` (relative to the current working directory).

    Data glossary: https://epc.opendatacommunities.org/docs/guidance#glossary
    """
    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    # Map all anomaly values to None. Loop-invariant, so built once.
    data_anomaly_map = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)

    dataset = []

    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)

        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()
        # Group-by results depend only on the merge columns, so share them
        # between all properties of this extract.
        averages_cache = {}

        for uprn, property_data in df.groupby("UPRN", observed=True):
            # Remove invalid values
            modified_property_data = _strip_anomalies(property_data, data_anomaly_map)

            # If a property has changed building type, we can ignore the epc rating
            # i.e. each mandatory fixed feature should have 1 unique value
            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
                continue

            # Remap certain columns
            modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
            # Was assigned to a misspelled 'BUILT_FROM' column, leaving BUILT_FORM unmapped.
            # NOTE(review): confirm cleaning_averages uses the same (remapped) BUILT_FORM
            # values, otherwise the averages merge will not match on this key.
            modified_property_data['BUILT_FORM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)

            # Take the latest row for both the LATEST_FIELD and MANDATORY_FIXED_FEATURES columns
            latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
            mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

            modified_property_data = _fill_dimensions_from_averages(
                modified_property_data, cleaning_averages, averages_cache
            )

            # Fixed features - these are property attributes that shouldn't change over time
            fixed_data = {
                field: _resolve_fixed_value(list(modified_property_data[field].dropna().unique()))
                for field in AVERAGE_FIXED_FEATURES
            }

            # Combine all fields together
            fixed_data.update(mandatory_field_data)
            fixed_data.update(latest_field_data)

            dataset.extend(_build_change_records(uprn, modified_property_data, fixed_data))

    output = pd.DataFrame(dataset)
    output.to_parquet('./dataset.parquet')
|
|
|
|
|
|
# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    app()