mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
126 lines
No EOL
3.9 KiB
Python
126 lines
No EOL
3.9 KiB
Python
import os
|
|
import pandas as pd
|
|
import argparse
|
|
from typing import List
|
|
from Logger import logger
|
|
from DataLoader import DataLoader
|
|
from autogluon.tabular import TabularDataset, TabularPredictor
|
|
|
|
|
|
# Columns removed from every dataset before modelling.
DROP_COLUMNS = [
    'UPRN',
    'HEAT_DEMAND_CHANGE',
]

# Explicit list of columns to keep for modelling; None keeps every column.
FEATURE_COLUMNS = None

# Seed passed to random sampling so runs are repeatable.
RANDOM_SEED = 0

# FOR TESTING
# Local dataset locations (the command-line entry point passes its own paths).
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"
|
|
|
|
|
|
def ingest_arguments(argv: List[str] = None) -> argparse.Namespace:
    """
    Helper function to take in arguments from script start.

    Args:
        argv: Optional list of argument strings to parse. When omitted
            (``None``), argparse reads the process command line, which is
            the behaviour of the script entry point. Passing a list makes
            the parser usable from other code without touching ``sys.argv``.

    Returns:
        Namespace with ``train_filepath`` and ``test_filepath`` attributes.
        Either attribute is ``None`` when its flag is not supplied.
    """
    parser = argparse.ArgumentParser(description='Inputs for training script')

    parser.add_argument(
        '--train-filepath',
        type=str,
        help='Location of Parquet dataset to load for training',
    )
    parser.add_argument(
        '--test-filepath',
        type=str,
        help='Location of Parquet dataset to load for testing',
    )

    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    return parser.parse_args(argv)
|
|
|
|
class FeatureProcessor:
    """
    Handle all feature manipulation before modelling.

    The column helpers are stateless static methods; ``process`` chains them
    using the module-level ``DROP_COLUMNS`` and ``FEATURE_COLUMNS`` settings.
    """

    @staticmethod
    def drop_columns(df: pd.DataFrame, drop_columns: List[str] = None) -> pd.DataFrame:
        """
        Return ``df`` without the given columns.

        Args:
            df: Frame to remove columns from; it is not modified in place.
            drop_columns: Column name, or list of column names, to remove.
                When omitted, the module-level ``DROP_COLUMNS`` is used.

        Returns:
            A new frame without the requested columns.

        Raises:
            KeyError: Raised by pandas when a requested column is absent.
        """
        if drop_columns is None:
            drop_columns = DROP_COLUMNS
        # A bare string is a single column name, not an iterable of characters.
        if isinstance(drop_columns, str):
            drop_columns = [drop_columns]
        # Pass a flat list of labels: nesting the list inside another list makes
        # pandas search for one tuple-like label and fail.
        return df.drop(columns=list(drop_columns))

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Determine which columns to keep for modelling.

        Args:
            df: Frame to select columns from.
            features: Column names to keep. ``None`` keeps every column.

        Returns:
            A frame holding only the selected columns.

        Raises:
            SystemExit: With exit code 1, after logging an error, when any
                requested feature is not a column of ``df``.
        """
        if features is None:
            # No explicit selection: keep everything.
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            # Same exception the interactive ``exit`` helper raises, without
            # depending on the ``site`` module having injected that helper.
            raise SystemExit(1)

        return df[list(features)]

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the configured column drop followed by the feature selection.

        Args:
            df: Raw frame to prepare.

        Returns:
            Frame without ``DROP_COLUMNS``, restricted to ``FEATURE_COLUMNS``
            when that setting is not ``None``.
        """
        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
        df = self.retain_features(df, features=FEATURE_COLUMNS)
        return df
|
|
|
|
|
|
|
|
def training(train_filepath: str, test_filepath: str) -> None:
    """
    Pipeline to run training on the dataset.

    Loads and feature-processes both datasets, fits an AutoGluon regression
    predictor for ``RDSAP_CHANGE`` on a 1/20 subsample of the training file,
    then evaluates it on the test file and logs the results.

    Args:
        train_filepath: Location of the Parquet dataset used for training.
        test_filepath: Location of the Parquet dataset used for evaluation.
    """
    logger.info('Loading data')
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)

    logger.info('Feature processing')
    feature_processor = FeatureProcessor()
    # NOTE(review): the processed frames are not fed to the model yet; the
    # model below re-reads the training file directly. Confirm whether
    # TabularDataset(data=train_df) is the intended input.
    train_df = feature_processor.process(train_df)
    test_df = feature_processor.process(test_df)

    # TODO: split data into train and validation

    logger.info('Build Model')

    target_column = 'RDSAP_CHANGE'

    data = TabularDataset(data=train_filepath)
    data = data.drop(columns=DROP_COLUMNS)

    # Keep only the target plus the columns whose name starts with one of
    # these prefixes.
    top_feature_prefixes = (
        'MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT',
    )
    top_features = [
        column for column in data.columns
        if str(column).startswith(top_feature_prefixes)
    ]
    data = data[[target_column] + top_features]

    # Fit on a 1/20 sample; the fixed seed keeps the sample repeatable.
    subsample_size = round(len(data) / 20)
    data = data.sample(subsample_size, random_state=RANDOM_SEED)

    # TODO: Add custom metric class MAPE
    # TODO: Have a look at temporal features

    predictor_RDSAP = TabularPredictor(
        label=target_column,
        path="agModels-predictRDSAP",
        problem_type="regression",
        eval_metric='mean_absolute_error',
    ).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])

    logger.info('Evaluate matrics')

    # Evaluate on the dataset the caller asked for, not a fixed path.
    test_data = TabularDataset(test_filepath)
    performance = predictor_RDSAP.evaluate(test_data)
    logger.info(f'Test performance: {performance}')

    predictions = predictor_RDSAP.predict(test_data)
    test_data['predictions'] = predictions
    test_data['diff'] = abs(test_data[target_column] - test_data['predictions'])
    logger.info(f"Mean absolute difference on test data: {test_data['diff'].mean()}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: announce the run, read the command-line paths,
    # then hand them to the training pipeline.
    logger.info('---Begin Pipeline---')

    logger.info('---Ingest Arguments---')
    args = ingest_arguments()

    training(
        train_filepath=args.train_filepath,
        test_filepath=args.test_filepath,
    )