mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added mlmodel, prediction and training files
This commit is contained in:
parent
a90a1278a8
commit
2a18180c53
15 changed files with 529 additions and 237 deletions
|
|
@ -1,113 +0,0 @@
|
|||
"""
|
||||
MLModel class
|
||||
Key tasks:
|
||||
- Template Model class for different model types
|
||||
- Save model
|
||||
- Load Model
|
||||
- Generate Inference
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Protocol, NamedTuple
|
||||
import pandas as pd
|
||||
from autogluon import TabularPredictor
|
||||
|
||||
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
|
||||
|
||||
class MLModel(Protocol):
|
||||
'''
|
||||
Base ML Model protocol
|
||||
'''
|
||||
|
||||
def load_model(self, filepath: Path) -> None:
|
||||
"""
|
||||
Providing a path, this function will load the model to be used. Will load to internal variable
|
||||
"""
|
||||
|
||||
def save_model(self, output_filepath: Path) -> None:
|
||||
"""
|
||||
Providing a path, this function will save the model to be used.
|
||||
"""
|
||||
|
||||
def train_model(
|
||||
self,
|
||||
data: pd.DataFrame,
|
||||
target: str,
|
||||
hyperparameter: dict) -> None:
|
||||
"""
|
||||
For the given data and hyperparameters (specified to the model), a model is trained
|
||||
"""
|
||||
|
||||
def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
For the given dataframe, model is loaded and predictions are generated
|
||||
"""
|
||||
|
||||
def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
|
||||
"""
|
||||
For any validation data, a set of predictions and metrics are return
|
||||
"""
|
||||
|
||||
class AutogluonModel(MLModel):
|
||||
"""
|
||||
Autogluon model that implements the MLModel Protocol
|
||||
"""
|
||||
def __init__(self) -> None:
|
||||
self.model = None
|
||||
|
||||
def load_model(self, filepath: Path) -> None:
|
||||
"""
|
||||
Providing a path, this function will load the model to be used. Will load to internal variable
|
||||
"""
|
||||
self.model = TabularPredictor.load(path=filepath)
|
||||
|
||||
|
||||
def save_model(self, output_filepath: Path) -> None:
|
||||
"""
|
||||
Providing a path, this function will save the model to be used.
|
||||
"""
|
||||
|
||||
def train_model(
|
||||
self,
|
||||
data: pd.DataFrame,
|
||||
target_column: str,
|
||||
hyperparameters: dict = None) -> None:
|
||||
"""
|
||||
For the given data and hyperparameters, a model is trained
|
||||
"""
|
||||
|
||||
if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()):
|
||||
print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
|
||||
exit(1)
|
||||
|
||||
self.model = TabularPredictor(
|
||||
label=target_column,
|
||||
path=hyperparameters['output_path'],
|
||||
problem_type=hyperparameters['problem_type'],
|
||||
eval_metric=hyperparameters['eval_metric']
|
||||
).fit(
|
||||
data,
|
||||
time_limit=hyperparameters['time_limit'],
|
||||
presets=hyperparameters['presets'],
|
||||
excluded_model_types=hyperparameters['excluded_model_types']
|
||||
)
|
||||
|
||||
|
||||
def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
For the given dataframe, model is loaded and predictions are generated
|
||||
"""
|
||||
|
||||
if self.model is None:
|
||||
print("No model loaded/ trained")
|
||||
exit(1)
|
||||
|
||||
predictions = self.model.predict(data)
|
||||
|
||||
return predictions
|
||||
|
||||
def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
|
||||
"""
|
||||
For any validation data, a set of predictions and metrics are return
|
||||
"""
|
||||
|
||||
56
model_data/simulation_system/MLModel/BaseMLModel.py
Normal file
56
model_data/simulation_system/MLModel/BaseMLModel.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
"""
|
||||
BaseMLModel class
|
||||
This is the base protocol:
|
||||
- Any implementation will be in its own separate file
|
||||
Key tasks:
|
||||
- Template Model class for different model types
|
||||
- Save model
|
||||
- Load Model
|
||||
- Generate Inference
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Protocol, NamedTuple
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class MLModel(Protocol):
    """
    Base ML model protocol.

    Structural interface that each concrete model implementation is expected
    to satisfy. The method signatures mirror the ones used by the concrete
    implementation and by the training script (keyword ``hyperparameters``,
    a DataFrame of metrics from ``model_evaluation``, and a
    ``deployment_path`` for deployment optimisation) so that an
    implementation called by keyword matches this protocol.
    """

    def load_model(self, filepath: Path) -> None:
        """
        Load the model stored at ``filepath`` into an internal variable.
        """
        ...

    def save_model(self, output_filepath: Path) -> None:
        """
        Save the current model to ``output_filepath``.
        """
        ...

    def train_model(
        self,
        data: pd.DataFrame,
        target_column: str,
        hyperparameters: dict
    ) -> None:
        """
        Train a model on ``data`` to predict ``target_column``.

        ``hyperparameters`` is specific to the model implementation.
        """
        ...

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Generate predictions for ``data`` using the loaded/trained model.
        """
        ...

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Path = None
    ) -> pd.DataFrame:
        """
        Evaluate the model on ``validation_data`` and return the metrics.

        ``metrics_location`` is the folder the metrics are written to.
        """
        ...

    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
        """
        Performance post-processing on the model to ensure it is ready for
        deployment. Returns the location of the deployment-ready model.
        """
        ...
|
||||
|
||||
140
model_data/simulation_system/MLModel/Models.py
Normal file
140
model_data/simulation_system/MLModel/Models.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
"""
|
||||
Different implementations of the MLModel Protocol
|
||||
Uses the BaseMLModel protocol
|
||||
Key tasks:
|
||||
- Template Model class for different model types
|
||||
- Save model
|
||||
- Load Model
|
||||
- Generate Inference
|
||||
"""
|
||||
|
||||
from typing import NamedTuple
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
from sklearn.metrics import mean_absolute_percentage_error
|
||||
from core.Logger import logger
|
||||
|
||||
# Exact set of keys train_model expects in its `hyperparameters` dict.
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
# Default file name used when writing evaluation metrics.
METRIC_FILENAME = "metrics.csv"


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol.

    Wraps an AutoGluon ``TabularPredictor`` and exposes loading, training,
    prediction, evaluation and deployment optimisation.

    Attributes:
        model: the loaded/trained predictor, or None until one is available.
        output_filepath: folder passed to the predictor as its ``path`` when
            training.
        predictions: predictions produced by the last ``model_evaluation``
            call, or None.
    """

    def __init__(self, output_filepath: Path = None) -> None:
        self.model = None
        self.output_filepath = output_filepath
        self.predictions = None

    def load_model(self, filepath: Path) -> None:
        """
        Load the predictor stored at ``filepath`` into ``self.model``.
        """
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Path = None) -> None:
        """
        Kept for interface compatibility; this method only logs.

        ``output_filepath`` is ignored. NOTE(review): the log message implies
        AutoGluon has already persisted the model to the predictor path
        during training - confirm against the AutoGluon documentation.
        """
        logger.info("Using AutoGluon Model - Model saving already occured")

    def train_model(
        self,
        data: pd.DataFrame,
        target_column: str,
        hyperparameters: dict = None
    ) -> None:
        """
        Train a predictor on ``data`` for ``target_column``.

        ``hyperparameters`` must contain exactly the keys listed in
        AUTOGLUON_HYPERPARAMETERS. The process exits with status 1 when no
        ``output_filepath`` was given at construction or when the
        hyperparameters are missing or malformed.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            # SystemExit(1) is what the site-provided exit(1) raises, without
            # depending on the `site` module being loaded.
            raise SystemExit(1)

        # `hyperparameters` defaults to None; reject it here instead of
        # failing with AttributeError on `.keys()`.
        if hyperparameters is None or set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters):
            logger.error("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
            raise SystemExit(1)

        training_data = TabularDataset(data=data)

        predictor = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric']
        )
        self.model = predictor.fit(
            training_data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types']
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return the model's predictions for ``data``.

        Exits with status 1 when no model has been loaded or trained.
        NOTE(review): annotated as DataFrame; confirm the actual type
        returned by the predictor's ``predict``.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        return self.model.predict(data)

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Path = None,
        metric_filename: str = METRIC_FILENAME
    ) -> pd.DataFrame:
        """
        Evaluate the model on ``validation_data`` and persist the metrics.

        The predictor's own evaluation metrics are extended with a ``mape``
        entry, written as a one-row CSV to
        ``metrics_location / metric_filename`` and returned as a DataFrame.
        The predictions used are kept in ``self.predictions``.

        When ``metrics_location`` is None the current folder is used.
        Exits with status 1 when no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            # Actually fall back to the current folder, as the warning says.
            metrics_location = Path(".")
        metrics_location = Path(metrics_location)

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        performance = self.model.evaluate(validation_data)
        predictions = self.generate_predictions(validation_data)

        logger.info("Prediction used for evaluations are saved in self.prediction")
        self.predictions = predictions

        # TODO: Can have a custom metric class that defines all different metrics we want
        performance['mape'] = mean_absolute_percentage_error(
            validation_data[target_column], predictions
        )

        metrics_path = metrics_location / metric_filename
        logger.info("Saving metric file as %s", metrics_path)
        # parents=True so a missing intermediate folder does not abort the
        # run after the (expensive) evaluation has completed.
        metrics_location.mkdir(parents=True, exist_ok=True)

        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(metrics_path)

        return metrics_df

    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
        """
        Create a deployment-optimised clone of the model at
        ``deployment_path`` and return its location.

        Exits with status 1 when there is no model or no deployment path.
        """
        if self.model is None:
            logger.error("No model to optimise for deployment")
            raise SystemExit(1)

        if deployment_path is None:
            logger.error("Deployment path required")
            raise SystemExit(1)

        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
0
model_data/simulation_system/MLModel/__init__.py
Normal file
0
model_data/simulation_system/MLModel/__init__.py
Normal file
21
model_data/simulation_system/core/DataLoader.py
Normal file
21
model_data/simulation_system/core/DataLoader.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
import pandas as pd
|
||||
from core.Logger import logger
|
||||
|
||||
class DataLoader():
    """
    Loads datasets from disk into pandas DataFrames.
    """

    @staticmethod
    def load(filepath: str, index_col: str = None) -> pd.DataFrame:
        """
        Load a dataset from a ``.parquet`` or ``.csv`` file.

        Args:
            filepath: location of the file; a string or a path-like object.
                The extension is matched case-insensitively.
            index_col: optional column to use as the DataFrame index.

        Returns:
            The loaded DataFrame, indexed by ``index_col`` when given.

        Exits with status 1 when the extension is not supported.
        """
        # str() lets pathlib.Path inputs through; lower() accepts upper-case
        # extensions.
        extension_probe = str(filepath).lower()

        if extension_probe.endswith('.parquet'):
            df = pd.read_parquet(filepath)
            if index_col is not None:
                df = df.set_index(index_col)
            return df

        if extension_probe.endswith('.csv'):
            return pd.read_csv(filepath, index_col=index_col)

        logger.error('Not implemented!')
        # SystemExit(1) is what the site-provided exit(1) raises, without
        # depending on the `site` module being loaded.
        raise SystemExit(1)
|
||||
|
|
@ -2,7 +2,7 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
from simulation_system.Settings import (
|
||||
from simulation_system.core.Settings import (
|
||||
DATA_PROCESSOR_SETTINGS,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
70
model_data/simulation_system/core/FeatureProcessor.py
Normal file
70
model_data/simulation_system/core/FeatureProcessor.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
"""
|
||||
Create additional features from the dataset
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List
|
||||
from core.Logger import logger
|
||||
|
||||
# Columns removed before modelling, keyed by which target is being predicted.
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']

# Seed used when subsampling rows.
RANDOM_SEED = 0


class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
        """
        Remove the columns that are not used for the given target.

        For "RDSAP_CHANGE" and "HEAT_DEMAND_CHANGE" the matching drop list is
        removed; for any other ``target_column`` the frame is returned
        unchanged.
        """
        if target_column == "RDSAP_CHANGE":
            return df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
        if target_column == "HEAT_DEMAND_CHANGE":
            return df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
        return df

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Keep only the columns listed in ``features``.

        With ``features=None`` every column is kept. Exits with status 1 when
        a requested feature is not a column of ``df``.
        """
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            # SystemExit(1) is what the site-provided exit(1) raises, without
            # depending on the `site` module being loaded.
            raise SystemExit(1)

        return df[features]

    @staticmethod
    def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
        """
        Sample data to reduce number of rows for model building if needed.

        A falsy ``subsample_amount`` (None or 0) returns ``df`` untouched.
        Requests larger than the frame are clamped to its length, because
        sampling more rows than exist raises ValueError.
        """
        if not subsample_amount:
            return df

        rows_to_draw = min(subsample_amount, len(df))
        return df.sample(rows_to_draw, random_state=RANDOM_SEED)

    def process(
        self,
        df: pd.DataFrame,
        target_column: str = "RDSAP_CHANGE",
        features: List[str] = None,
        subsample_amount: int = None
    ) -> pd.DataFrame:
        """
        Pipeline to get data ready for building a model.

        Runs, in order: row subsampling, removal of unused columns for
        ``target_column``, then restriction to ``features``.
        """
        sampled = self.subsample_data(df, subsample_amount=subsample_amount)
        trimmed = self.drop_unused_columns(sampled, target_column=target_column)
        return self.retain_features(trimmed, features=features)
|
||||
|
|
@ -1,3 +1,7 @@
|
|||
"""
|
||||
Logger that will be used throughout the application
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
def setup_logger():
|
||||
|
|
@ -1,5 +1,18 @@
|
|||
# Using a simple Python file as settings for now
|
||||
# TODO: migrate to dynaconf
|
||||
from pathlib import Path
|
||||
|
||||
RANDOM_SEED = 0
|
||||
|
||||
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
|
||||
TEST_DATA_NAME = 'test_data.parquet'
|
||||
|
||||
REGISTRY_FILE = "model_registry.csv"
|
||||
MODEL_DIRECTORY = "model_directory"
|
||||
REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
|
||||
PREDICTION_LOCATION = Path("predictions")
|
||||
PREDICTION_FILE = 'prediction.json'
|
||||
METADATA_FILE = 'metadata.json'
|
||||
|
||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
0
model_data/simulation_system/core/__init__.py
Normal file
0
model_data/simulation_system/core/__init__.py
Normal file
|
|
@ -1,5 +1,5 @@
|
|||
from pathlib import Path
|
||||
from Settings import (
|
||||
from core.SettingsSettings import (
|
||||
RDSAP_RESPONSE,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
|
|
|
|||
|
|
@ -1,23 +1,22 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
from pathlib import Path
|
||||
from model_data.simulation_system.Settings import (
|
||||
from core.Settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES,
|
||||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP
|
||||
COLUMNS_TO_MERGE_ON
|
||||
)
|
||||
from DataProcessor import DataProcessor
|
||||
from core.DataProcessor import DataProcessor
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
|
||||
|
||||
# TODO: Have a look at temporal features
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
|
|
@ -77,9 +76,6 @@ def app():
|
|||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
|
||||
if len(vals) == 0:
|
||||
wrong_var
|
||||
|
||||
fixed_data[field] = np.mean(vals)
|
||||
|
||||
|
|
@ -2,63 +2,127 @@
|
|||
Script to load MLModel class and generate predictions
|
||||
"""
|
||||
|
||||
from Logger import logger
|
||||
from MLModel import AutogluonModel
|
||||
from DataLoader import DataLoader
|
||||
import json
|
||||
import argparse
|
||||
from MLModel.Models import AutogluonModel
|
||||
from core.Logger import logger
|
||||
from core.DataLoader import DataLoader
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
from core.Settings import (
|
||||
REGISTRY_PATH,
|
||||
PREDICTION_LOCATION,
|
||||
PREDICTION_FILE,
|
||||
METADATA_FILE
|
||||
)
|
||||
|
||||
# These will be provided in some configuration setup
|
||||
HYPERPARAMETERS = {
|
||||
'problem_type': 'regression',
|
||||
'output_path': 'agModels-predictRDSAP',
|
||||
'eval_metric': 'mean_absolute_error',
|
||||
'time_limit': 8000,
|
||||
'presets': 'best_quality',
|
||||
'excluded_model_types': ['KNN']
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
|
||||
}
|
||||
# FOR TESTING
|
||||
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
|
||||
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
|
||||
# DATA = TEST_DATA.sample(1)
|
||||
|
||||
def main(model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
Helper function to take in arguments from script start
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
|
||||
parser.add_argument('--data', type=str, help='Location of Parquet dataset to load for testing')
|
||||
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
|
||||
"""
|
||||
Main pipeline function
|
||||
"""
|
||||
|
||||
if model_path is None:
|
||||
logger.error("No model path provided")
|
||||
if registry_path is None:
|
||||
logger.error("No registry path provided")
|
||||
exit(1)
|
||||
|
||||
if model_path is not None:
|
||||
logger.info("User specified a model to load - ignoring registry")
|
||||
model_location = model_path
|
||||
model_type = model_path
|
||||
model_name = model_path
|
||||
else:
|
||||
# TODO: Think about where registry will sit/ type
|
||||
logger.info("Loading best model from registry")
|
||||
registry_df = pd.read_csv(registry_path)
|
||||
best_model_df = registry_df[registry_df['best_model']]
|
||||
|
||||
model_location = best_model_df['model_location'].values[0]
|
||||
model_type = best_model_df['model_type'].values[0]
|
||||
model_name = best_model_df['model_name'].values[0]
|
||||
|
||||
logger.info("--- Model Info: ---")
|
||||
logger.info(f"Model type: {model_type}")
|
||||
logger.info(f"Model name: {model_name}")
|
||||
logger.info(f"Model location: {model_location}")
|
||||
|
||||
logger.info("--- Loading Data ---")
|
||||
if data is None and data_path is None:
|
||||
logger.error("No Data/Data Path passed")
|
||||
exit(1)
|
||||
|
||||
if data_path and data is None:
|
||||
logger.info("--- Loading Data ---")
|
||||
data = DataLoader().load()
|
||||
logger.info("Loading data from provided path")
|
||||
data = DataLoader().load(filepath=data_path, index_col="UPRN")
|
||||
|
||||
# TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
|
||||
data = data.sample(1)
|
||||
else:
|
||||
logger.warning('Ignoring data_path and loading data provided')
|
||||
logger.info('Using data provided')
|
||||
data = json.loads(data)
|
||||
data = pd.DataFrame([data])
|
||||
print(data)
|
||||
|
||||
logger.info("--- Loading Model ---")
|
||||
model = AutogluonModel()
|
||||
model.load_model(filepath=model_path)
|
||||
|
||||
# model.train_model(
|
||||
# data=data,
|
||||
# target_column='RDSAP_CHANGE',
|
||||
# hyperparameters=HYPERPARAMETERS
|
||||
# )
|
||||
model.load_model(filepath=model_location)
|
||||
|
||||
logger.info("--- Generating Predictions ---")
|
||||
prediction = model.generate_predictions(data=data)
|
||||
|
||||
# Save prediction some where?
|
||||
prediction.to_csv("s3?")
|
||||
# prediction.to_csv("s3?")
|
||||
|
||||
# TODO: Check how we want to structure outputs
|
||||
# For now, just categorise by uprn and timestamp
|
||||
# Assume one uprn coming in for now
|
||||
uprn = data.index.values[0]
|
||||
|
||||
# Saving prediction local for now
|
||||
logger.info("--- Outputting prediction and metadata --- ")
|
||||
output_base = PREDICTION_LOCATION / uprn / TIMESTAMP
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
|
||||
prediction_metadata = {
|
||||
"model_type": model_type,
|
||||
"model_name": model_name,
|
||||
"model_location": model_location,
|
||||
"model_settings": model.model.info()
|
||||
}
|
||||
|
||||
pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)
|
||||
|
||||
return json_prediction
|
||||
|
||||
if __name__ == "__main__":
|
||||
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
|
||||
data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
|
||||
data_for_prediction = data.sample(1)
|
||||
|
||||
main(filepath="", data=data_for_prediction)
|
||||
args = ingest_arguments()
|
||||
|
||||
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
||||
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
|
|
@ -1,9 +1,12 @@
|
|||
from Logger import logger
|
||||
from core.Logger import logger
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
RANDOM_SEED = 0
|
||||
from core.Settings import (
|
||||
RANDOM_SEED,
|
||||
TRAIN_AND_VALIDATION_DATA_NAME,
|
||||
TEST_DATA_NAME
|
||||
)
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
|
|
@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
|
|||
|
||||
logger.info('--- Saving data ---')
|
||||
|
||||
train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
|
||||
test_data.to_parquet(Path(output_folder)/'test_data.parquet')
|
||||
train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME)
|
||||
test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME)
|
||||
|
||||
logger.info(' ---Pipeline complete---')
|
||||
|
||||
|
|
|
|||
|
|
@ -1,19 +1,37 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from Logger import logger
|
||||
from DataLoader import DataLoader
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
from core.Logger import logger
|
||||
from core.DataLoader import DataLoader
|
||||
from core.FeatureProcessor import FeatureProcessor
|
||||
from MLModel.Models import AutogluonModel
|
||||
import pandas as pd
|
||||
from core.Settings import (
|
||||
MODEL_DIRECTORY,
|
||||
REGISTRY_PATH
|
||||
)
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
|
||||
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
|
||||
FEATURE_COLUMNS = None
|
||||
RANDOM_SEED = 0
|
||||
# Can move to a hyperparameters file
|
||||
# If anything we might want to have a file that can be loaded and sent to this script
|
||||
HYPERPARAMETERS = {
|
||||
'problem_type': 'regression',
|
||||
'eval_metric': 'mean_absolute_error',
|
||||
'time_limit': 60,
|
||||
'presets': 'medium_quality',
|
||||
'excluded_model_types': None
|
||||
}
|
||||
|
||||
# FOR TESTING
|
||||
train_filepath = "./model_build_data/train_validation_data.parquet"
|
||||
test_filepath = "./model_build_data/test_data.parquet"
|
||||
train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
||||
test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
|
||||
target_column = "RDSAP_CHANGE"
|
||||
model_type = "autogluon"
|
||||
hyperparameter = HYPERPARAMETERS
|
||||
subsample_factor = 200
|
||||
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
|
|
@ -23,98 +41,112 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
|
||||
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
|
||||
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
class FeatureProcessor:
|
||||
"""
|
||||
Handle all feature manipulation before modelling
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
|
||||
df = df.drop(columns=[drop_columns])
|
||||
return df
|
||||
|
||||
def retain_features(df: pd.DataFrame, features: List[str] = None):
|
||||
"""
|
||||
Determine which columns to keep ofr modelling
|
||||
"""
|
||||
if features is None:
|
||||
features = df.columns
|
||||
else:
|
||||
if not set(features).issubset(df.columns):
|
||||
logger.error('Features defined is not contained in data')
|
||||
exit(1)
|
||||
|
||||
df = df[features]
|
||||
|
||||
return df
|
||||
|
||||
def process(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
|
||||
df = self.retain_features(df, features=FEATURE_COLUMNS)
|
||||
return df
|
||||
|
||||
|
||||
|
||||
def training(train_filepath: str, test_filepath: str) -> None:
|
||||
def training(
|
||||
train_filepath: str,
|
||||
test_filepath: str,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_type: str = "autogluon",
|
||||
hyperparameter: dict = HYPERPARAMETERS
|
||||
) -> None:
|
||||
"""
|
||||
Pipeline to run training on the dataset
|
||||
"""
|
||||
|
||||
logger.info('Loading data')
|
||||
logger.info('--- Loading data ---')
|
||||
dataloader = DataLoader()
|
||||
train_df = dataloader.load(filepath=train_filepath)
|
||||
test_df = dataloader.load(filepath=test_filepath)
|
||||
|
||||
# df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
|
||||
|
||||
logger.info('Feature processing')
|
||||
logger.info('--- Feature processing ---')
|
||||
|
||||
feature_processor = FeatureProcessor()
|
||||
train_df = feature_processor.process(train_df)
|
||||
test_df = feature_processor.process(test_df)
|
||||
|
||||
# logger.info('Split data into train and validation')
|
||||
subsample_amount = round(len(train_df)/subsample_factor)
|
||||
|
||||
logger.info('Build Model')
|
||||
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
|
||||
test_df = feature_processor.process(test_df, target_column=target_column)
|
||||
|
||||
logger.info('--- Build Model ---')
|
||||
if model_type == "autogluon":
|
||||
model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
|
||||
output_base = Path(MODEL_DIRECTORY) / model_type / model_root
|
||||
|
||||
model_folder = "model"
|
||||
metrics_folder = "metrics"
|
||||
|
||||
model = AutogluonModel(
|
||||
output_filepath = output_base / model_folder
|
||||
)
|
||||
else:
|
||||
logger.error("No alternative model implemented yet")
|
||||
exit(1)
|
||||
|
||||
data = TabularDataset(data=train_filepath)
|
||||
data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
|
||||
TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
|
||||
# top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
|
||||
model.train_model(
|
||||
data=train_df,
|
||||
target_column=target_column,
|
||||
hyperparameters=hyperparameter
|
||||
)
|
||||
|
||||
logger.info("--- Save Model ---")
|
||||
model.save_model(output_filepath=model.output_filepath)
|
||||
|
||||
data = data[['RDSAP_CHANGE'] + top_features.to_list()]
|
||||
# data = TabularDataset(data=train_df)
|
||||
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
|
||||
subsample_size = round(len(data)/20)
|
||||
data = data.sample(subsample_size, random_state=RANDOM_SEED)
|
||||
logger.info('--- Generate evaluation metrics ---')
|
||||
metrics_df = model.model_evaluation(
|
||||
validation_data=test_df,
|
||||
target_column=target_column,
|
||||
metrics_location = output_base / metrics_folder
|
||||
)
|
||||
|
||||
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
|
||||
# Imagining for now that the model trained here is the best model amongst all models built
|
||||
|
||||
# Add custom metric class MAPE
|
||||
# Have a look at temporal features
|
||||
logger.info("--- Optimising model for deployment ---")
|
||||
optimised_folder = "deployment"
|
||||
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder)
|
||||
logger.info("Optimised version of best model can be found at: {deployment_model_path}")
|
||||
|
||||
target_column = 'RDSAP_CHANGE'
|
||||
predictor_RDSAP = TabularPredictor(
|
||||
label=target_column,
|
||||
path="agModels-predictRDSAP",
|
||||
problem_type="regression",
|
||||
eval_metric='mean_absolute_error'
|
||||
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
|
||||
# TODO: Need a model registry - for now have this as a CSV
|
||||
# Save this in the model directory
|
||||
logger.info("--- Append registry with new model ---")
|
||||
|
||||
if REGISTRY_PATH.exists():
|
||||
logger.info("Registry file found - Loading into Dataframe")
|
||||
registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
|
||||
else:
|
||||
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
|
||||
|
||||
model_details_df = pd.DataFrame(
|
||||
[{
|
||||
'model_type': model_type,
|
||||
'model_name': model_root,
|
||||
'model_location': deployment_model_path
|
||||
}]
|
||||
)
|
||||
|
||||
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
|
||||
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
|
||||
|
||||
# TODO: will need a metric rebuild script - i.e. if we add new metrics, we will want to load the models and regenerate the metrics
|
||||
# TODO: decide metric to optimise to
|
||||
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
|
||||
registry_df['best_model'] = [False]*len(registry_df)
|
||||
registry_df.loc[0, 'best_model'] = True
|
||||
|
||||
logger.info('Evaluate matrics')
|
||||
logger.info("--- Saving new model to registry ---")
|
||||
registry_df.to_csv(REGISTRY_PATH, index=False)
|
||||
|
||||
test_data = TabularDataset('./model_build_data/test_data.parquet')
|
||||
performance = predictor_RDSAP.evaluate(test_data)
|
||||
predictions = predictor_RDSAP.predict(test_data)
|
||||
logger.info("--- Training Pipeline Complete --- ")
|
||||
|
||||
test_data['predictions'] = predictions
|
||||
test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
|
@ -123,4 +155,10 @@ if __name__ == "__main__":
|
|||
logger.info('---Ingest Arguments---')
|
||||
args = ingest_arguments()
|
||||
|
||||
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
|
||||
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
|
||||
training(
|
||||
train_filepath=args.train_filepath,
|
||||
test_filepath=args.test_filepath,
|
||||
target_column=args.target_column,
|
||||
model_type=args.model_type
|
||||
)
|
||||
Loading…
Add table
Reference in a new issue