# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 118 lines, 3.9 KiB, Python
from pathlib import Path
|
|
from core.Settings import (
|
|
RDSAP_RESPONSE,
|
|
FLOOR_LEVEL_MAP,
|
|
BUILT_FORM_REMAP,
|
|
EARLIEST_EPC_DATE,
|
|
FULLY_GLAZED_DESCRIPTIONS,
|
|
FIXED_FEATURES,
|
|
LATEST_FIELD,
|
|
COMPONENT_FEATURES
|
|
)
|
|
from model_data.BaseUtility import Definitions
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from autogluon.tabular import TabularDataset, TabularPredictor
|
|
|
|
# Seed passed as ``random_state`` when sub-sampling the training data in main().
RANDOM_SEED = 0

# Folder scanned by create_raw_data(); each sub-directory is read for a
# ``certificates.csv`` file.
DATA_DIRECTORY = Path(__file__).parent.joinpath('data', 'all-domestic-certificates')

# Columns that create_raw_data() converts with ``pd.to_numeric`` and downcasts
# to the smallest float dtype to reduce memory usage.
FLOAT_COLUMNS = [
    'NUMBER_OPEN_FIREPLACES',
    'EXTENSION_COUNT',
    'TOTAL_FLOOR_AREA',
    'PHOTO_SUPPLY',
    'FIXED_LIGHTING_OUTLETS_COUNT',
    'FLOOR_HEIGHT',
    'NUMBER_HABITABLE_ROOMS',
    'LOW_ENERGY_LIGHTING',
    'MULTI_GLAZE_PROPORTION',
    'NUMBER_HEATED_ROOMS',
]
|
|
|
|
|
|
def create_raw_data():
    """
    Extract all information needed to build a simple predictor for RDSAP.

    Reads ``certificates.csv`` from every sub-directory of ``DATA_DIRECTORY``,
    filters and cleans each file, keeps the response column plus the modelling
    columns, and concatenates the results. Two files are written to the
    current working directory:

    * ``energy_predictor_data.parquet`` - every retained row.
    * ``energy_predictor_cleaned_data.parquet`` - only rows without any
      missing value.

    Raises:
        ValueError: if ``DATA_DIRECTORY`` contains no sub-directories, so
            there is nothing to concatenate.
    """
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
    if not directories:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No certificate directories found in {DATA_DIRECTORY}")

    # Loop-invariant lookups, built once instead of once per file.
    # Every anomaly marker is mapped to None so that ``replace`` blanks it out.
    data_anomaly_map = dict.fromkeys(Definitions.DATA_ANOMALY_MATCHES)
    modelling_columns = [RDSAP_RESPONSE] + list(
        set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES)
    )

    dfs = []
    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"
        df = pd.read_csv(filepath, low_memory=False)

        # Remove any bad uprns and ignore old/bad data
        df = df[~pd.isnull(df["UPRN"])]
        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]

        # Change multi glaze proportion: where it is missing but the windows
        # description is one of the fully glazed descriptions, set it to 100.
        no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (
            df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
        )
        df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100

        # Recast
        df["UPRN"] = df["UPRN"].astype(int).astype(str)
        df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float)

        # Sort Data
        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

        # Map all anomaly values to None, i.e. remove invalid values.
        df = df.replace(data_anomaly_map)
        # ``np.nan`` rather than ``np.NAN``: the upper-case alias was removed
        # in NumPy 2.0.
        df = df.replace(np.nan, None)

        # Remap certain columns in place. The built-form remap previously
        # wrote to a misspelt 'BUILT_FROM' column, so 'BUILT_FORM' itself was
        # never remapped.
        df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
        df['BUILT_FORM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)

        # Keep only possible modelling columns
        df = df[modelling_columns]

        # Reduce memory usage by downcasting the numeric columns.
        df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
        df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')

        dfs.append(df)

    data = pd.concat(dfs)
    data.to_parquet('./energy_predictor_data.parquet')

    # Original author's note: dropping incomplete rows gives you primarily flats.
    cleaned_data = data.dropna()
    cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')
|
|
|
|
|
|
def main():
    """
    Train and evaluate an AutoGluon regressor for the RDSAP response.

    Loads the train/validation parquet file, keeps a random 1% sample of its
    rows (seeded with ``RANDOM_SEED``), fits a ``TabularPredictor`` on that
    sample, and then runs evaluation, prediction and feature importance
    against the test parquet file. The results of those three calls are not
    used any further here.
    """
    train_validation = TabularDataset(
        data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet'
    )

    # Fit on one hundredth of the available rows.
    sample_rows = round(len(train_validation) / 100)
    train_sample = train_validation.sample(sample_rows, random_state=RANDOM_SEED)

    rdsap_predictor = TabularPredictor(
        label=RDSAP_RESPONSE,
        path="agModels-predictENERGY",
        problem_type="regression",
        eval_metric='mean_absolute_error',
    )
    rdsap_predictor = rdsap_predictor.fit(
        train_sample,
        time_limit=800,
        presets='high_quality',
        excluded_model_types=['KNN', 'CAT'],
    )

    holdout = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')
    holdout_performance = rdsap_predictor.evaluate(holdout)
    holdout_predictions = rdsap_predictor.predict(holdout)
    rdsap_predictor.feature_importance(holdout)


if __name__ == "__main__":
    main()
|