mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Merge pull request #126 from Hestia-Homes/michael-initial
Michael initial
This commit is contained in:
commit
7df05e4691
8 changed files with 684 additions and 234 deletions
142
model_data/simulation_system/DataProcessor.py
Normal file
142
model_data/simulation_system/DataProcessor.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from simulation_system.Settings import (
|
||||
DATA_PROCESSOR_SETTINGS,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE,
|
||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
|
||||
)
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
"""
|
||||
Handle data loading and data preprocessing
|
||||
"""
|
||||
|
||||
def __init__(self, filepath: Path) -> None:
|
||||
self.filepath = filepath
|
||||
|
||||
def load_data(self, low_memory=False) -> None:
|
||||
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
|
||||
|
||||
def pre_process(self) -> pd.DataFrame:
|
||||
"""
|
||||
Load data and begin initial cleaning
|
||||
"""
|
||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
|
||||
self.confine_data()
|
||||
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
|
||||
self.clean_multi_glaze_proportion()
|
||||
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
|
||||
|
||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
|
||||
return self.data
|
||||
|
||||
def make_cleaning_averages(self) -> pd.DataFrame:
|
||||
# Define a custom function to calculate the median, excluding missing values
|
||||
def median_without_missing(group):
|
||||
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
|
||||
|
||||
cleaning_averages = self.data.groupby(
|
||||
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
|
||||
observed=True
|
||||
).apply(median_without_missing).reset_index()
|
||||
|
||||
general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
|
||||
median_without_missing).reset_index()
|
||||
|
||||
property_averages = self.data.groupby(["PROPERTY_TYPE"], observed=True).apply(
|
||||
median_without_missing).reset_index()
|
||||
|
||||
built_form_averages = self.data.groupby(["BUILT_FORM"], observed=True).apply(
|
||||
median_without_missing).reset_index()
|
||||
|
||||
# We can clean up any NA's in the cleaning averages with the general averages here
|
||||
cleaning_averages_filled = pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], suffixes=['', '_AVERAGE'])
|
||||
cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], suffixes=['', '_PROPERTY_AVERAGE'])
|
||||
cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], suffixes=['', '_BUILT_FORM_AVERAGE'])
|
||||
|
||||
# Replace any missing NAN values with averages for the same Property type and built form
|
||||
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE'])
|
||||
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE'])
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
|
||||
|
||||
# If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope and built form
|
||||
# We can use just the property type average and replace
|
||||
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE'])
|
||||
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE'])
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE'])
|
||||
|
||||
# If there are still NA values, use BUILT FORM averages
|
||||
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE'])
|
||||
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
|
||||
|
||||
# If there still is na values, use average across all properties in consituecy
|
||||
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean())
|
||||
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT'].mean())
|
||||
|
||||
# If the consituency is all NA values, then take UK AVERAGE VALUES
|
||||
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
|
||||
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)
|
||||
|
||||
return cleaning_averages_filled
|
||||
|
||||
def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
|
||||
'''
|
||||
Reduce the data futher by keeping only datasets with multiple epcs
|
||||
'''
|
||||
|
||||
counts = self.data.groupby("UPRN").size().reset_index()
|
||||
counts.columns = ["UPRN", "count"]
|
||||
|
||||
# take UPRNS with multiple EPCs
|
||||
counts = counts[counts["count"] > epc_minimum_count]
|
||||
self.data = pd.merge(self.data, counts, on='UPRN')
|
||||
|
||||
|
||||
def recast_df_columns(self, column_mappings: dict) -> None:
|
||||
"""
|
||||
Recast columns from the dataframe to ensure the behaviour we want
|
||||
"""
|
||||
|
||||
for key, values in column_mappings.items():
|
||||
if key not in self.data.columns:
|
||||
print('Column mapping incorrectly specified')
|
||||
exit(1)
|
||||
for value in values:
|
||||
self.data[key] = self.data[key].astype(value)
|
||||
|
||||
|
||||
def confine_data(self) -> None:
|
||||
"""
|
||||
Include all step to reduce down the data based on assumptions
|
||||
"""
|
||||
|
||||
# Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
|
||||
|
||||
# Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
|
||||
# before the introduction of SAP09
|
||||
|
||||
# Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
|
||||
# full SAP, which produces different results to the RdSAP methodology
|
||||
|
||||
# Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
|
||||
|
||||
self.data = self.data[~pd.isnull(self.data["UPRN"])]
|
||||
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
|
||||
self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
||||
|
||||
|
||||
def clean_multi_glaze_proportion(self) -> None:
|
||||
"""
|
||||
If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
|
||||
"""
|
||||
|
||||
no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
|
||||
self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
|
||||
|
||||
22
model_data/simulation_system/Logger.py
Normal file
22
model_data/simulation_system/Logger.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import logging
|
||||
|
||||
def setup_logger():
|
||||
# Create a logger
|
||||
logger = logging.getLogger()
|
||||
|
||||
# Set the log level
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
# Create a formatter
|
||||
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
# Create a stream handler to direct logs to stdout
|
||||
stream_handler = logging.StreamHandler()
|
||||
stream_handler.setFormatter(formatter)
|
||||
|
||||
# Add the stream handler to the logger
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
return logger
|
||||
|
||||
logger = setup_logger()
|
||||
114
model_data/simulation_system/Settings.py
Normal file
114
model_data/simulation_system/Settings.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
# Using a simply python file as settings for now
|
||||
# TODO: migrate to dynaconf
|
||||
|
||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
|
||||
FULLY_GLAZED_DESCRIPTIONS = [
|
||||
"Fully double glazed",
|
||||
"High performance glazing",
|
||||
"Fully triple glazed",
|
||||
"Full secondary glazing",
|
||||
"Multiple glazing throughout",
|
||||
]
|
||||
|
||||
FIXED_FEATURES = [
|
||||
'PROPERTY_TYPE',
|
||||
'BUILT_FORM',
|
||||
'CONSTRUCTION_AGE_BAND',
|
||||
'NUMBER_HABITABLE_ROOMS',
|
||||
'CONSTITUENCY',
|
||||
'NUMBER_HEATED_ROOMS',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'FLOOR_HEIGHT',
|
||||
'FLOOR_LEVEL',
|
||||
'TOTAL_FLOOR_AREA',
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = [
|
||||
'TRANSACTION_TYPE',
|
||||
'WALLS_DESCRIPTION',
|
||||
'FLOOR_DESCRIPTION',
|
||||
'LIGHTING_DESCRIPTION',
|
||||
'ROOF_DESCRIPTION',
|
||||
'MAINHEAT_DESCRIPTION',
|
||||
'HOTWATER_DESCRIPTION',
|
||||
'MAIN_FUEL',
|
||||
'MECHANICAL_VENTILATION',
|
||||
'SECONDHEAT_DESCRIPTION',
|
||||
'ENERGY_TARIFF', # Not sure if this is relevant
|
||||
'SOLAR_WATER_HEATING_FLAG',
|
||||
'PHOTO_SUPPLY',
|
||||
'WINDOWS_DESCRIPTION',
|
||||
'GLAZED_TYPE',
|
||||
'MULTI_GLAZE_PROPORTION',
|
||||
'LIGHTING_DESCRIPTION',
|
||||
'LOW_ENERGY_LIGHTING',
|
||||
'NUMBER_OPEN_FIREPLACES',
|
||||
'MAINHEATCONT_DESCRIPTION',
|
||||
'EXTENSION_COUNT',
|
||||
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
|
||||
]
|
||||
|
||||
# For these fields, we take an average if we have multiple values
|
||||
AVERAGE_FIXED_FEATURES = [
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT"
|
||||
]
|
||||
|
||||
# For these fields, we take the latest value if we have multiple values
|
||||
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
|
||||
# the most accurate
|
||||
LATEST_FIELD = [
|
||||
"NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
"FLOOR_LEVEL",
|
||||
"CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for
|
||||
]
|
||||
|
||||
# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
|
||||
MANDATORY_FIXED_FEATURES = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTITUENCY"
|
||||
]
|
||||
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
|
||||
# and Wales from 31 July 2014
|
||||
EARLIEST_EPC_DATE = "2014-08-01"
|
||||
|
||||
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
|
||||
def ordinal(n):
|
||||
if 10 <= n % 100 <= 20:
|
||||
suffix = 'th'
|
||||
else:
|
||||
suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
|
||||
|
||||
return str(n) + suffix
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
"Basement": -1,
|
||||
"Ground": 0,
|
||||
"ground floor": 0,
|
||||
"20+": 20,
|
||||
"21st or above": 21,
|
||||
**{str(i).zfill(2): i for i in range(0, 21)},
|
||||
**{ordinal(i): i for i in range(-1, 21)},
|
||||
**{str(i): i for i in range(-1, 21)},
|
||||
**{i: i for i in range(-1, 21)},
|
||||
}
|
||||
|
||||
BUILT_FORM_REMAP = {
|
||||
"Enclosed End-Terrace": "End-Terrace",
|
||||
"Enclosed Mid-Terrace": "Mid-Terrace",
|
||||
}
|
||||
|
||||
DATA_PROCESSOR_SETTINGS = {
|
||||
'low_memory': False,
|
||||
'epc_minimum_count': 1,
|
||||
'column_mappings': {'UPRN': [int, str]}
|
||||
}
|
||||
|
|
@ -1,171 +1,21 @@
|
|||
import numpy as np
|
||||
import os
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
from pathlib import Path
|
||||
from model_data.simulation_system.Settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES,
|
||||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP
|
||||
)
|
||||
from DataProcessor import DataProcessor
|
||||
|
||||
|
||||
def list_subdirectories(directory_path):
|
||||
return [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]
|
||||
|
||||
|
||||
DATA_DIRECTORY = os.getcwd() + '/model_data/simulation_system/data/all-domestic-certificates'
|
||||
|
||||
FIXED_FEATURES = [
|
||||
'PROPERTY_TYPE',
|
||||
'BUILT_FORM',
|
||||
'CONSTRUCTION_AGE_BAND',
|
||||
'NUMBER_HABITABLE_ROOMS',
|
||||
'CONSTITUENCY',
|
||||
'NUMBER_HEATED_ROOMS',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'FLOOR_HEIGHT',
|
||||
'FLOOR_LEVEL',
|
||||
'TOTAL_FLOOR_AREA',
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = [
|
||||
'TRANSACTION_TYPE',
|
||||
'WALLS_DESCRIPTION',
|
||||
'FLOOR_DESCRIPTION',
|
||||
'LIGHTING_DESCRIPTION',
|
||||
'ROOF_DESCRIPTION',
|
||||
'MAINHEAT_DESCRIPTION',
|
||||
'HOTWATER_DESCRIPTION',
|
||||
'MAIN_FUEL',
|
||||
'MECHANICAL_VENTILATION',
|
||||
'SECONDHEAT_DESCRIPTION',
|
||||
'ENERGY_TARIFF', # Not sure if this is relevant
|
||||
'SOLAR_WATER_HEATING_FLAG',
|
||||
'PHOTO_SUPPLY',
|
||||
'WINDOWS_DESCRIPTION',
|
||||
'GLAZED_TYPE',
|
||||
'MULTI_GLAZE_PROPORTION',
|
||||
'LIGHTING_DESCRIPTION',
|
||||
'LOW_ENERGY_LIGHTING',
|
||||
'NUMBER_OPEN_FIREPLACES',
|
||||
'MAINHEATCONT_DESCRIPTION',
|
||||
'EXTENSION_COUNT',
|
||||
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
|
||||
]
|
||||
|
||||
# For these fields, we take an average if we have multiple values
|
||||
AVERAGE_FIXED_FEATURES = [
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT"
|
||||
]
|
||||
|
||||
# For these fields, we take the latest value if we have multiple values
|
||||
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
|
||||
# the most accurate
|
||||
LATEST_FIELD = [
|
||||
"NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"FLOOR_LEVEL",
|
||||
"CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for
|
||||
]
|
||||
|
||||
# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
|
||||
MANDATORY_FIXED_FEATURES = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTITUENCY"
|
||||
]
|
||||
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
|
||||
# and Wales from 31 July 2014
|
||||
EARLIEST_EPC_DATE = "2014-08-01"
|
||||
|
||||
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
|
||||
|
||||
def make_cleaning_averages(df):
|
||||
# Define a custom function to calculate the median, excluding missing values
|
||||
def median_without_missing(group):
|
||||
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
|
||||
|
||||
cleaning_averages = df.groupby(
|
||||
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
|
||||
observed=True
|
||||
).apply(median_without_missing).reset_index()
|
||||
|
||||
general_averages = df.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
|
||||
median_without_missing).reset_index()
|
||||
|
||||
return cleaning_averages, general_averages
|
||||
|
||||
|
||||
def iterative_filtering(cleaning_averages, property_data):
|
||||
# Define the columns to filter on
|
||||
columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS"]
|
||||
|
||||
# Start with the entire cleaning_averages DataFrame
|
||||
filtered_data = cleaning_averages.copy()
|
||||
|
||||
# Iterate through the columns and apply filters one by one
|
||||
for column in columns_to_filter:
|
||||
# Apply the filter using the value from property_data
|
||||
new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]]
|
||||
|
||||
# If the filter results in no data, return the previous result
|
||||
if new_filtered_data.empty:
|
||||
continue
|
||||
|
||||
# If the filter is successful, update the filtered data
|
||||
filtered_data = new_filtered_data
|
||||
|
||||
return filtered_data
|
||||
|
||||
|
||||
def clean_multi_glaze_proportion(df):
|
||||
fully_glazed_descriptions = [
|
||||
"Fully double glazed",
|
||||
"High performance glazing",
|
||||
"Fully triple glazed",
|
||||
"Full secondary glazing",
|
||||
"Multiple glazing throughout",
|
||||
]
|
||||
|
||||
df["MULTI_GLAZE_PROPORTION"] = np.where(
|
||||
pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)),
|
||||
100,
|
||||
df["MULTI_GLAZE_PROPORTION"],
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def ordinal(n):
|
||||
if 10 <= n % 100 <= 20:
|
||||
suffix = 'th'
|
||||
else:
|
||||
suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
|
||||
|
||||
return str(n) + suffix
|
||||
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
"Basement": -1,
|
||||
"Ground": 0,
|
||||
"ground floor": 0,
|
||||
"20+": 20,
|
||||
"21st or above": 21,
|
||||
**{str(i).zfill(2): i for i in range(0, 21)},
|
||||
**{ordinal(i): i for i in range(-1, 21)},
|
||||
**{str(i): i for i in range(-1, 21)},
|
||||
**{i: i for i in range(-1, 21)},
|
||||
}
|
||||
|
||||
BUILT_FORM_REMAP = {
|
||||
"Enclosed End-Terrace": "End-Terrace",
|
||||
"Enclosed Mid-Terrace": "Mid-Terrace",
|
||||
}
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
|
@ -173,108 +23,98 @@ def app():
|
|||
# Data glossary:
|
||||
# https://epc.opendatacommunities.org/docs/guidance#glossary
|
||||
|
||||
directories = list_subdirectories(DATA_DIRECTORY)
|
||||
# List all subdirectories
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
|
||||
dataset = []
|
||||
for directory in tqdm(directories):
|
||||
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
# UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
# Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
|
||||
# before the introduction of SAP09
|
||||
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
cleaning_averages, general_averages = make_cleaning_averages(df)
|
||||
for directory in tqdm(directories):
|
||||
|
||||
# We remove EPCS that were conducted for a new build, since these are performed with
|
||||
# full SAP, which produces different results to the RdSAP methodology
|
||||
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
|
||||
filepath = directory / "certificates.csv"
|
||||
|
||||
df = clean_multi_glaze_proportion(df)
|
||||
data_processor = DataProcessor(filepath=filepath)
|
||||
|
||||
# We remove floor level in top floor or mid floor since this is ambiguous
|
||||
df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
||||
|
||||
df["UPRN"] = df["UPRN"].astype(int).astype(str)
|
||||
counts = df.groupby("UPRN").size().reset_index()
|
||||
counts.columns = ["UPRN", "count"]
|
||||
counts = counts.sort_values("count", ascending=False)
|
||||
|
||||
# take UPRNS with multiple EPCs
|
||||
counts = counts[counts["count"] > 1]
|
||||
df = df[df["UPRN"].isin(counts["UPRN"])]
|
||||
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
df = data_processor.pre_process()
|
||||
cleaning_averages = data_processor.make_cleaning_averages()
|
||||
|
||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||
|
||||
# Fixed features - these are property attributes that shouldn't change over time
|
||||
|
||||
ignore_epc = False
|
||||
fixed_data = {}
|
||||
for field in FIXED_FEATURES:
|
||||
vals = property_data[field].dropna().unique()
|
||||
# Remove invalid values
|
||||
vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
|
||||
|
||||
if field == "FLOOR_LEVEL":
|
||||
vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
|
||||
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
|
||||
if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
|
||||
continue
|
||||
|
||||
if field == "BUILT_FORM":
|
||||
vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals})
|
||||
# Map all anomaly values to None
|
||||
data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
|
||||
|
||||
# Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
|
||||
modified_property_data = property_data.replace(data_anomaly_map)
|
||||
modified_property_data = modified_property_data.replace(np.NAN, None)
|
||||
|
||||
# Remap certain columns
|
||||
modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
|
||||
modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
|
||||
|
||||
if field in AVERAGE_FIXED_FEATURES:
|
||||
# Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS
|
||||
latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
|
||||
mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
|
||||
|
||||
if len(vals) > 1:
|
||||
# Check the values are too far apart
|
||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
# Taking just the last row, which is the percentage change from the latest to previous one only
|
||||
# modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
|
||||
|
||||
if vals:
|
||||
field_value = np.mean(vals)
|
||||
else:
|
||||
# Clean using averages
|
||||
# We can replace any NA values for Average fixed features
|
||||
# We have columns that we want to merge on, but some of these columns are all NA values
|
||||
# So we determine which columns to merge on, and get the equivalent grouping in the averages
|
||||
columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS"]
|
||||
|
||||
if any(modified_property_data[columns_to_merge_on].isna()):
|
||||
# If there are any NA value, back fill first (i.e most recent), then forward fill if needed
|
||||
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
|
||||
|
||||
# Extract the columns that are not all None
|
||||
na_columns = modified_property_data[columns_to_merge_on].isna().all()
|
||||
columns_to_merge_on = na_columns.index[~na_columns].to_list()
|
||||
|
||||
avgs = iterative_filtering(cleaning_averages, property_data)
|
||||
# TODO: Should probably do a mean/median?
|
||||
field_value = avgs[field].iloc[0]
|
||||
# Get the corresponding groupby and merge, and fill in NA values
|
||||
cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
|
||||
modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
|
||||
modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
|
||||
modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
|
||||
modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
|
||||
|
||||
if pd.isnull(field_value):
|
||||
# Just the use the general averages
|
||||
field_value = general_averages[
|
||||
(general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
|
||||
(general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0])
|
||||
][field].iloc[0]
|
||||
|
||||
elif field in LATEST_FIELD:
|
||||
field_value = vals[-1] if vals else None
|
||||
else:
|
||||
if len(vals) > 1:
|
||||
if field in MANDATORY_FIXED_FEATURES:
|
||||
ignore_epc = True
|
||||
else:
|
||||
raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
|
||||
|
||||
field_value = vals[0] if vals else None
|
||||
for field in AVERAGE_FIXED_FEATURES:
|
||||
vals = list(modified_property_data[field].dropna().unique())
|
||||
if len(vals) > 1:
|
||||
# Check the values are too far apart
|
||||
# TODO: we could have multiple values here, why only use the first two?
|
||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
|
||||
if vals:
|
||||
field_value = np.mean(vals)
|
||||
|
||||
fixed_data[field] = field_value
|
||||
|
||||
if ignore_epc:
|
||||
continue
|
||||
#Combine all fields together
|
||||
fixed_data.update(mandatory_field_data)
|
||||
fixed_data.update(latest_field_data)
|
||||
|
||||
# We include the lodgement date here as we probably need to factor time into the
|
||||
# model, since EPC standards and rigour have changed over time
|
||||
variable_data = property_data[
|
||||
variable_data = modified_property_data[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
||||
]
|
||||
|
||||
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
||||
# e.g. first vs second, second vs third and also first vs third
|
||||
property_model_data = []
|
||||
for idx in range(0, property_data.shape[0] - 1):
|
||||
for idx in range(0, modified_property_data.shape[0] - 1):
|
||||
|
||||
if idx >= property_data.shape[0] - 1:
|
||||
if idx >= modified_property_data.shape[0] - 1:
|
||||
break
|
||||
|
||||
starting_record = variable_data.iloc[idx]
|
||||
|
|
@ -307,3 +147,10 @@ def app():
|
|||
)
|
||||
|
||||
dataset.extend(property_model_data)
|
||||
|
||||
output = pd.DataFrame(dataset)
|
||||
output.to_parquet('./dataset.parquet')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
117
model_data/simulation_system/energy_predictor.py
Normal file
117
model_data/simulation_system/energy_predictor.py
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
from pathlib import Path
|
||||
from Settings import (
|
||||
RDSAP_RESPONSE,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES
|
||||
)
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
|
||||
RANDOM_SEED = 0
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
|
||||
|
||||
FLOAT_COLUMNS = [
|
||||
'NUMBER_OPEN_FIREPLACES',
|
||||
'EXTENSION_COUNT',
|
||||
'TOTAL_FLOOR_AREA',
|
||||
'PHOTO_SUPPLY',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'FLOOR_HEIGHT',
|
||||
'NUMBER_HABITABLE_ROOMS',
|
||||
'LOW_ENERGY_LIGHTING',
|
||||
'MULTI_GLAZE_PROPORTION',
|
||||
'NUMBER_HEATED_ROOMS'
|
||||
]
|
||||
|
||||
def create_raw_data():
|
||||
"""
|
||||
Extract all information to do a simple predictor for RDSAP
|
||||
"""
|
||||
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
# directories = directories[0:10]
|
||||
dfs = []
|
||||
for directory in tqdm(directories):
|
||||
filepath = directory / "certificates.csv"
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
|
||||
# Remove any bad uprns and ignore old/bad data
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
|
||||
df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
||||
|
||||
# Change multi glaze proportion
|
||||
no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
|
||||
df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
|
||||
|
||||
# Recast
|
||||
df["UPRN"] = df["UPRN"].astype(int).astype(str)
|
||||
df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float)
|
||||
|
||||
# Sort Data
|
||||
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
|
||||
# Map all anomaly values to None
|
||||
data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
|
||||
|
||||
# Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
|
||||
df = df.replace(data_anomaly_map)
|
||||
df = df.replace(np.NAN, None)
|
||||
|
||||
# Remap certain columns
|
||||
df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
|
||||
df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
|
||||
|
||||
# Keep only possible modelling columns
|
||||
df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))]
|
||||
|
||||
# Reduce memory usage
|
||||
|
||||
# df.memory_usage()
|
||||
# df.dtypes
|
||||
df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
|
||||
df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')
|
||||
|
||||
|
||||
dfs.append(df)
|
||||
|
||||
data = pd.concat(dfs)
|
||||
data.to_parquet('./energy_predictor_data.parquet')
|
||||
|
||||
cleaned_data = data.dropna()
|
||||
# GIves you primarily flats
|
||||
cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet')
|
||||
|
||||
subsample_size = round(len(data)/100)
|
||||
data = data.sample(subsample_size, random_state=RANDOM_SEED)
|
||||
|
||||
predictor_RDSAP = TabularPredictor(
|
||||
label=RDSAP_RESPONSE,
|
||||
path="agModels-predictENERGY",
|
||||
problem_type="regression",
|
||||
eval_metric='mean_absolute_error'
|
||||
).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT'])
|
||||
|
||||
test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')
|
||||
performance = predictor_RDSAP.evaluate(test_data)
|
||||
predictions = predictor_RDSAP.predict(test_data)
|
||||
predictor_RDSAP.feature_importance(test_data)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
BIN
model_data/simulation_system/preprocessed_data/dataset.parquet
Normal file
BIN
model_data/simulation_system/preprocessed_data/dataset.parquet
Normal file
Binary file not shown.
77
model_data/simulation_system/test_data_generation.py
Normal file
77
model_data/simulation_system/test_data_generation.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
from Logger import logger
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
RANDOM_SEED = 0
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
Helper function to take in arguments from script start
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
|
||||
parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)
|
||||
parser.add_argument('--output-folder', type=str, help='Location of Parquet dataset to save', required=True)
|
||||
parser.add_argument('--percentage', type=float, help='Percentage of data to use as test data', default=None)
|
||||
parser.add_argument('--volume', type=int, help='Volume of data to use as test data', default=None)
|
||||
parser.add_argument('--sampling', type=str, help='Type of sampling to do for test data', choices=['random', 'stratified'], default='random')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
def main(filepath: str, output_folder: str, percentage: float, volume: int, sampling: str):
|
||||
"""
|
||||
Load a dataset in and split out the training+validation data and the test data.
|
||||
"""
|
||||
|
||||
logger.info('---Loading Data---')
|
||||
data = pd.read_parquet(filepath).reset_index(drop=True)
|
||||
|
||||
if percentage and volume is None:
|
||||
test_amount = round(len(data)*percentage)
|
||||
elif percentage is None and volume:
|
||||
test_amount = volume
|
||||
elif percentage is None and volume is None:
|
||||
logger.error('No amount specified - please specify either a percentage or volume')
|
||||
exit(1)
|
||||
else:
|
||||
logger.info('Both percentage and volume specified - taking largest of the two')
|
||||
test_amount = max(round(len(data)*percentage), volume)
|
||||
|
||||
logger.info(f'---Extracting {test_amount} from dataset to be test data')
|
||||
|
||||
if sampling == 'random':
|
||||
logger.info('--- Using random sample method ---')
|
||||
sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index
|
||||
|
||||
train_validation_data = data.drop(sample_index)
|
||||
test_data = data.iloc[sample_index]
|
||||
|
||||
elif sampling =='stratified':
|
||||
# Not yet implemented
|
||||
pass
|
||||
|
||||
logger.info('--- Saving data ---')
|
||||
|
||||
train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
|
||||
test_data.to_parquet(Path(output_folder)/'test_data.parquet')
|
||||
|
||||
logger.info(' ---Pipeline complete---')
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info('--- Generate test data pipeline ---')
|
||||
|
||||
args = ingest_arguments()
|
||||
|
||||
main(
|
||||
filepath=args.filepath,
|
||||
output_folder=args.output_folder,
|
||||
percentage=args.percentage,
|
||||
volume=args.volume,
|
||||
sampling=args.sampling
|
||||
)
|
||||
|
||||
131
model_data/simulation_system/training.py
Normal file
131
model_data/simulation_system/training.py
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
import argparse
|
||||
from typing import List
|
||||
from Logger import logger
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
|
||||
|
||||
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
|
||||
FEATURE_COLUMNS = None
|
||||
RANDOM_SEED = 0
|
||||
|
||||
# FOR TESTING
|
||||
train_filepath = "./model_build_data/train_validation_data.parquet"
|
||||
test_filepath = "./model_build_data/test_data.parquet"
|
||||
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
Helper function to take in arguments from script start
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
class DataLoader():
|
||||
|
||||
@staticmethod
|
||||
def load(filepath: str) -> pd.DataFrame:
|
||||
"""
|
||||
Load different datasets
|
||||
"""
|
||||
if filepath.endswith('.parquet'):
|
||||
df = pd.read_parquet(filepath)
|
||||
elif filepath.endswith('.csv.'):
|
||||
df = pd.read_csv(filepath)
|
||||
else:
|
||||
logger.error('Not implemented!')
|
||||
exit(1)
|
||||
|
||||
return df
|
||||
|
||||
class FeatureProcessor:
|
||||
"""
|
||||
Handle all feature manipulation before modelling
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
|
||||
df = df.drop(columns=[drop_columns])
|
||||
return df
|
||||
|
||||
def retain_features(df: pd.DataFrame, features: List[str] = None):
|
||||
"""
|
||||
Determine which columns to keep ofr modelling
|
||||
"""
|
||||
if features is None:
|
||||
features = df.columns
|
||||
else:
|
||||
if not set(features).issubset(df.columns):
|
||||
logger.error('Features defined is not contained in data')
|
||||
exit(1)
|
||||
|
||||
df = df[features]
|
||||
|
||||
return df
|
||||
|
||||
def process(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
|
||||
df = self.retain_features(df, features=FEATURE_COLUMNS)
|
||||
return df
|
||||
|
||||
|
||||
|
||||
def training(train_filepath: str, test_filepath: str) -> None:
|
||||
"""
|
||||
Pipeline to run training on the dataset
|
||||
"""
|
||||
|
||||
logger.info('Loading data')
|
||||
dataloader = DataLoader()
|
||||
train_df = dataloader.load(filepath=train_filepath)
|
||||
test_df = dataloader.load(filepath=test_filepath)
|
||||
|
||||
# df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
|
||||
|
||||
logger.info('Feature processing')
|
||||
feature_processor = FeatureProcessor()
|
||||
train_df = feature_processor.process(train_df)
|
||||
test_df = feature_processor.process(test_df)
|
||||
|
||||
# logger.info('Split data into train and validation')
|
||||
|
||||
logger.info('Build Model')
|
||||
data = TabularDataset(data=train_df)
|
||||
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
|
||||
subsample_size = round(len(data)/4)
|
||||
data = data.sample(subsample_size, random_state=RANDOM_SEED)
|
||||
|
||||
target_column = 'RDSAP_CHANGE'
|
||||
predictor_RDSAP = TabularPredictor(
|
||||
label=target_column,
|
||||
path="agModels-predictRDSAP",
|
||||
problem_type="regression",
|
||||
eval_metric='mean_absolute_error'
|
||||
).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])
|
||||
|
||||
logger.info('Evaluate matrics')
|
||||
|
||||
test_data = TabularDataset('./model_build_data/test_data.parquet')
|
||||
performance = predictor_RDSAP.evaluate(test_data)
|
||||
predictions = predictor_RDSAP.predict(test_data)
|
||||
|
||||
test_data['predictions'] = predictions
|
||||
test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info('---Begin Pipeline---')
|
||||
|
||||
logger.info('---Ingest Arguments---')
|
||||
args = ingest_arguments()
|
||||
|
||||
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
|
||||
Loading…
Add table
Reference in a new issue