mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Merge branch 'main' of https://github.com/Hestia-Homes/Model
This commit is contained in:
commit
20aa23efa0
6 changed files with 83 additions and 35 deletions
|
|
@ -1,13 +1,19 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from model_data.BaseUtility import BaseUtility
|
||||||
from simulation_system.Settings import (
|
from simulation_system.Settings import (
|
||||||
DATA_PROCESSOR_SETTINGS,
|
DATA_PROCESSOR_SETTINGS,
|
||||||
EARLIEST_EPC_DATE,
|
EARLIEST_EPC_DATE,
|
||||||
FULLY_GLAZED_DESCRIPTIONS,
|
FULLY_GLAZED_DESCRIPTIONS,
|
||||||
AVERAGE_FIXED_FEATURES,
|
AVERAGE_FIXED_FEATURES,
|
||||||
FLOOR_HEIGHT_NATIONAL_AVERAGE,
|
FLOOR_HEIGHT_NATIONAL_AVERAGE,
|
||||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
|
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE,
|
||||||
|
FLOOR_LEVEL_MAP,
|
||||||
|
BUILT_FORM_REMAP,
|
||||||
|
COLUMNS_TO_MERGE_ON
|
||||||
)
|
)
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
class DataProcessor:
|
class DataProcessor:
|
||||||
|
|
@ -32,11 +38,48 @@ class DataProcessor:
|
||||||
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
|
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
|
||||||
self.clean_multi_glaze_proportion()
|
self.clean_multi_glaze_proportion()
|
||||||
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
|
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
|
||||||
|
self.remap_columns()
|
||||||
|
|
||||||
|
if DATA_PROCESSOR_SETTINGS['epc_minimum_count'] >= 1:
|
||||||
|
# If we have multiple EPC records, we can try and do filling
|
||||||
|
self.fill_na_fields()
|
||||||
|
|
||||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||||
|
|
||||||
return self.data
|
return self.data
|
||||||
|
|
||||||
|
def fill_na_fields(self, columns_to_fill: List = None):
    """
    Fill missing values in the given columns from the other EPC rows of the same UPRN.

    Rows are grouped by the "UPRN" column and processed in the current row order of
    ``self.data``. Within each group a missing value is first back-filled from the
    next row that has a value; anything still missing is then forward-filled from the
    previous row that has a value. A column that is missing on every row of a group
    stays missing. Only ``columns_to_fill`` are written back to ``self.data``; the
    row order and index of ``self.data`` are left as they were.

    :param columns_to_fill: names of the columns to fill. When None (the default) the
        module-level COLUMNS_TO_MERGE_ON list is used.
    """
    # Resolve the default at call time rather than binding the shared list object
    # into the signature when the function is defined.
    if columns_to_fill is None:
        columns_to_fill = COLUMNS_TO_MERGE_ON
    columns_to_fill = list(columns_to_fill)

    if not columns_to_fill:
        # Nothing to fill
        return

    uprn = self.data["UPRN"]

    # GroupBy.bfill / GroupBy.ffill return rows on the original index, so there is
    # no need to rebuild the index from reset_index() output (which relied on the
    # index being unnamed so that it showed up as 'level_1').
    back_filled = self.data.groupby("UPRN", sort=False)[columns_to_fill].bfill()
    filled = back_filled[columns_to_fill].groupby(uprn, sort=False).ffill()

    # Assignment aligns on the index, so each row receives its own filled values
    self.data[columns_to_fill] = filled[columns_to_fill]
|
||||||
|
|
||||||
|
|
||||||
|
def remap_columns(self):
    """
    Blank out anomalous values and remap the FLOOR_LEVEL and BUILT_FORM columns.

    Steps, applied to a copy of ``self.data`` which then replaces it:
      1. every value listed in BaseUtility.DATA_ANOMALY_MATCHES is replaced with None;
      2. NaN values are replaced with None;
      3. FLOOR_LEVEL values are replaced according to FLOOR_LEVEL_MAP;
      4. BUILT_FORM values are replaced according to BUILT_FORM_REMAP.

    Raises KeyError if ``self.data`` has no FLOOR_LEVEL or BUILT_FORM column.
    """

    # Map all anomaly values to None (dict.fromkeys gives every key the value None)
    data_anomaly_map = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)

    # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
    data = self.data.replace(data_anomaly_map)
    # np.nan rather than the np.NAN alias, which no longer exists in NumPy 2.0
    data = data.replace(np.nan, None)

    # Remap certain columns
    data['FLOOR_LEVEL'] = data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
    # Write the remapped values back to BUILT_FORM itself. The previous code assigned
    # them to a misspelled 'BUILT_FROM' column, leaving BUILT_FORM un-remapped.
    # NOTE(review): confirm nothing downstream reads the old 'BUILT_FROM' column.
    data['BUILT_FORM'] = data['BUILT_FORM'].replace(BUILT_FORM_REMAP)

    self.data = data
|
||||||
|
|
||||||
|
|
||||||
def make_cleaning_averages(self) -> pd.DataFrame:
|
def make_cleaning_averages(self) -> pd.DataFrame:
|
||||||
# Define a custom function to calculate the median, excluding missing values
|
# Define a custom function to calculate the median, excluding missing values
|
||||||
def median_without_missing(group):
|
def median_without_missing(group):
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,14 @@
|
||||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||||
|
|
||||||
|
COLUMNS_TO_MERGE_ON = [
|
||||||
|
"PROPERTY_TYPE",
|
||||||
|
"BUILT_FORM",
|
||||||
|
"CONSTRUCTION_AGE_BAND",
|
||||||
|
"NUMBER_HABITABLE_ROOMS",
|
||||||
|
"NUMBER_HEATED_ROOMS"
|
||||||
|
]
|
||||||
|
|
||||||
FULLY_GLAZED_DESCRIPTIONS = [
|
FULLY_GLAZED_DESCRIPTIONS = [
|
||||||
"Fully double glazed",
|
"Fully double glazed",
|
||||||
"High performance glazing",
|
"High performance glazing",
|
||||||
|
|
@ -111,4 +119,5 @@ DATA_PROCESSOR_SETTINGS = {
|
||||||
'low_memory': False,
|
'low_memory': False,
|
||||||
'epc_minimum_count': 1,
|
'epc_minimum_count': 1,
|
||||||
'column_mappings': {'UPRN': [int, str]}
|
'column_mappings': {'UPRN': [int, str]}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from model_data.simulation_system.Settings import (
|
||||||
COMPONENT_FEATURES,
|
COMPONENT_FEATURES,
|
||||||
RDSAP_RESPONSE,
|
RDSAP_RESPONSE,
|
||||||
HEAT_DEMAND_RESPONSE,
|
HEAT_DEMAND_RESPONSE,
|
||||||
|
COLUMNS_TO_MERGE_ON,
|
||||||
FLOOR_LEVEL_MAP,
|
FLOOR_LEVEL_MAP,
|
||||||
BUILT_FORM_REMAP
|
BUILT_FORM_REMAP
|
||||||
)
|
)
|
||||||
|
|
@ -27,7 +28,9 @@ def app():
|
||||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||||
|
|
||||||
dataset = []
|
dataset = []
|
||||||
|
# 116
|
||||||
|
# 128048706
|
||||||
|
# PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic-certificates/domestic-E09000021-Kingston-upon-Thames')
|
||||||
for directory in tqdm(directories):
|
for directory in tqdm(directories):
|
||||||
|
|
||||||
filepath = directory / "certificates.csv"
|
filepath = directory / "certificates.csv"
|
||||||
|
|
@ -46,42 +49,21 @@ def app():
|
||||||
if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
|
if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Map all anomaly values to None
|
|
||||||
data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
|
|
||||||
|
|
||||||
# Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
|
|
||||||
modified_property_data = property_data.replace(data_anomaly_map)
|
|
||||||
modified_property_data = modified_property_data.replace(np.NAN, None)
|
|
||||||
|
|
||||||
# Remap certain columns
|
|
||||||
modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
|
|
||||||
modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
|
|
||||||
|
|
||||||
# Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS
|
# Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS
|
||||||
latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
|
latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
|
||||||
mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
|
mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
|
||||||
|
|
||||||
# Taking just the last row, which is the percentage change from the latest to previous one only
|
# Taking just the last row, which is the percentage change from the latest to previous one only
|
||||||
# modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
|
# property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
|
||||||
|
|
||||||
# We can replace any NA values for Average fixed features
|
|
||||||
# We have columns that we want to merge on, but some of these columns are all NA values
|
|
||||||
# So we determine which columns to merge on, and get the equivalent grouping in the averages
|
|
||||||
columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
|
|
||||||
"NUMBER_HEATED_ROOMS"]
|
|
||||||
|
|
||||||
if modified_property_data[columns_to_merge_on].isna().values.any():
|
|
||||||
# If there are any NA value, back fill first (i.e most recent), then forward fill if needed
|
|
||||||
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
|
|
||||||
|
|
||||||
# Extract the columns that are not all None
|
# Extract the columns that are not all None
|
||||||
na_columns = modified_property_data[columns_to_merge_on].isna().all()
|
na_columns = property_data[COLUMNS_TO_MERGE_ON].isna().all()
|
||||||
columns_to_merge_on = na_columns.index[~na_columns].to_list()
|
cleaned_columns_to_merge_on = na_columns.index[~na_columns].to_list()
|
||||||
|
|
||||||
# Get the corresponding groupby and merge, and fill in NA values
|
# Get the corresponding groupby and merge, and fill in NA values
|
||||||
cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
|
cleaning_averages_to_merge = cleaning_averages.groupby(cleaned_columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
|
||||||
|
|
||||||
modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
|
modified_property_data = pd.merge(property_data, cleaning_averages_to_merge, on=cleaned_columns_to_merge_on, suffixes=['', '_AVERAGE'])
|
||||||
modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
|
modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
|
||||||
modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
|
modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
|
||||||
modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
|
modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
|
||||||
|
|
@ -95,8 +77,10 @@ def app():
|
||||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||||
# Take the more recent value since it's likely to be more accurate
|
# Take the more recent value since it's likely to be more accurate
|
||||||
vals = [vals[-1]]
|
vals = [vals[-1]]
|
||||||
|
|
||||||
|
|
||||||
|
if len(vals) == 0:
|
||||||
|
wrong_var
|
||||||
|
|
||||||
fixed_data[field] = np.mean(vals)
|
fixed_data[field] = np.mean(vals)
|
||||||
|
|
||||||
#Combine all fields together
|
#Combine all fields together
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -99,18 +99,30 @@ def training(train_filepath: str, test_filepath: str) -> None:
|
||||||
# logger.info('Split data into train and validation')
|
# logger.info('Split data into train and validation')
|
||||||
|
|
||||||
logger.info('Build Model')
|
logger.info('Build Model')
|
||||||
data = TabularDataset(data=train_df)
|
|
||||||
|
data = TabularDataset(data=train_filepath)
|
||||||
|
data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
|
||||||
|
TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
|
||||||
|
# top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
|
||||||
|
|
||||||
|
data = data[['RDSAP_CHANGE'] + top_features.to_list()]
|
||||||
|
# data = TabularDataset(data=train_df)
|
||||||
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
|
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
|
||||||
subsample_size = round(len(data)/4)
|
subsample_size = round(len(data)/20)
|
||||||
data = data.sample(subsample_size, random_state=RANDOM_SEED)
|
data = data.sample(subsample_size, random_state=RANDOM_SEED)
|
||||||
|
|
||||||
|
# Add custom metric class MAPE
|
||||||
|
# Have a look at temporal features
|
||||||
|
|
||||||
target_column = 'RDSAP_CHANGE'
|
target_column = 'RDSAP_CHANGE'
|
||||||
predictor_RDSAP = TabularPredictor(
|
predictor_RDSAP = TabularPredictor(
|
||||||
label=target_column,
|
label=target_column,
|
||||||
path="agModels-predictRDSAP",
|
path="agModels-predictRDSAP",
|
||||||
problem_type="regression",
|
problem_type="regression",
|
||||||
eval_metric='mean_absolute_error'
|
eval_metric='mean_absolute_error'
|
||||||
).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])
|
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger.info('Evaluate matrics')
|
logger.info('Evaluate matrics')
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue