Merge pull request #167 from Hestia-Homes/model-env

Model env
This commit is contained in:
KhalimCK 2023-08-29 21:04:10 +01:00 committed by GitHub
commit 724e6bc608
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 141 additions and 126 deletions

1
.gitignore vendored
View file

@ -252,6 +252,7 @@ backend/.idea
open_uprn/.idea/
conservation_areas/.idea/
model_data/.idea/
model_data/simulation_system/.idea/
model_data/simulation_system/data*

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (simulation_system_prediction)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system_prediction)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

0
__init__.py Normal file
View file

View file

@ -13,15 +13,17 @@ from pathlib import Path
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error
from core.Logger import logger
from model_data.simulation_system.core.Logger import logger
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
METRIC_FILENAME = "metrics.csv"
class AutogluonModel:
"""
Autogluon model that implements the MLModel Protocol
"""
def __init__(self, output_filepath: Path = None) -> None:
self.model = None
self.output_filepath = output_filepath
@ -40,10 +42,10 @@ class AutogluonModel:
logger.info("Using AutoGluon Model - Model saving already occured")
def train_model(
self,
data: pd.DataFrame,
target_column: str,
hyperparameters: dict = None) -> None:
self,
data: pd.DataFrame,
target_column: str,
hyperparameters: dict = None) -> None:
"""
For the given data and hyperparameters, a model is trained
"""
@ -58,17 +60,16 @@ class AutogluonModel:
AGdata = TabularDataset(data=data)
self.model = TabularPredictor(
label=target_column,
path=self.output_filepath,
label=target_column,
path=self.output_filepath,
problem_type=hyperparameters['problem_type'],
eval_metric=hyperparameters['eval_metric']
).fit(
AGdata,
time_limit=hyperparameters['time_limit'],
presets=hyperparameters['presets'],
).fit(
AGdata,
time_limit=hyperparameters['time_limit'],
presets=hyperparameters['presets'],
excluded_model_types=hyperparameters['excluded_model_types']
)
)
def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
"""
@ -84,12 +85,12 @@ class AutogluonModel:
return predictions
def model_evaluation(
self,
validation_data: pd.DataFrame,
target_column: str,
metrics_location: Path = None,
metric_filename: str = METRIC_FILENAME
) -> pd.DataFrame:
self,
validation_data: pd.DataFrame,
target_column: str,
metrics_location: Path = None,
metric_filename: str = METRIC_FILENAME
) -> pd.DataFrame:
"""
For any validation data, a set of predictions and metrics are returned
"""
@ -105,7 +106,7 @@ class AutogluonModel:
logger.info("Prediction used for evaluations are saved in self.prediction")
self.predictions = predictions
# TODO: Can have a custom metric class that defines all different metrics we want
metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
@ -117,26 +118,19 @@ class AutogluonModel:
metrics_df = pd.DataFrame([performance])
metrics_df.to_csv(metrics_location / metric_filename)
markdown_filename = metric_filename.split(".")[0] + ".md"
metrics_df.to_markdown(metrics_location/ markdown_filename)
metrics_df.to_markdown(metrics_location / markdown_filename)
return metrics_df
def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
"""
We can optimise the deployment for an autogluon model
"""
if self.model is None:
logger.error("No model to optimise for deployment")
exit(1)
raise ValueError("No model to optimise for deployment")
if deployment_path is None:
logger.error("Deployment path required")
exit(1)
raise ValueError("Deployment path required")
# This will return a string path of the location
return self.model.clone_for_deployment(deployment_path)

View file

@ -1,13 +1,18 @@
import pandas as pd
from core.Logger import logger
import os
class DataLoader():
class DataLoader:
@staticmethod
def load(filepath: str, index_col: str = None) -> pd.DataFrame:
"""
Load different datasets
"""
if not os.path.exists(filepath):
raise FileNotFoundError(f"File not found: {filepath}")
if filepath.endswith('.parquet'):
df = pd.read_parquet(filepath)
if index_col is not None:
@ -15,7 +20,6 @@ class DataLoader():
elif filepath.endswith('.csv'):
df = pd.read_csv(filepath, index_col=index_col)
else:
logger.error('Not implemented!')
exit(1)
raise ValueError(f"File format not supported for file: {filepath}")
return df
return df

View file

@ -23,6 +23,7 @@ class DataProcessor:
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
self.data = None
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)

View file

@ -4,13 +4,14 @@ Create additional features from the dataset
import pandas as pd
from typing import List
from core.Logger import logger
from model_data.simulation_system.core.Logger import logger
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
RANDOM_SEED = 0
RANDOM_SEED = 0
class FeatureProcessor:
"""
Handle all feature manipulation before modelling
@ -38,11 +39,11 @@ class FeatureProcessor:
if not set(features).issubset(df.columns):
logger.error('Features defined is not contained in data')
exit(1)
df = df[features]
return df
@staticmethod
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
"""
@ -53,14 +54,13 @@ class FeatureProcessor:
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
return df
def process(
self,
df: pd.DataFrame,
target_column: str = "RDSAP_CHANGE",
features: List[str] = None,
subsample_amount: int = None
) -> pd.DataFrame:
self,
df: pd.DataFrame,
target_column: str = "RDSAP_CHANGE",
features: List[str] = None,
subsample_amount: int = None
) -> pd.DataFrame:
"""
Pipeline to get data ready for building a model
"""

View file

@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions
import json
import argparse
from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from pathlib import Path
from model_data.simulation_system.MLModel.Models import AutogluonModel
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
import pandas as pd
from typing import Optional
from datetime import datetime
from core.Settings import (
from model_data.simulation_system.core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
PREDICTION_LOCATION,
@ -19,10 +18,12 @@ from core.Settings import (
METADATA_FILE
)
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# FOR TESTING
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
# DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace:
"""
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
parser.add_argument('--model-path', type=str,
help='If you wish to use a specific model, specify the model path here')
parser.add_argument('--data', type=str, help='Json data for predictions')
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
args = parser.parse_args()
return args
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
data_path: Optional[str] = None):
"""
Main pipeline function
"""
@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
logger.info("--- Loading Model ---")
model = AutogluonModel()
model.load_model(filepath=model_location)
logger.info("--- Generating Predictions ---")
@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
return json_prediction
if __name__ == "__main__":
if __name__ == "__main__":
args = ingest_arguments()
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
# Data path can be passed as so: python3 predictions.py --data-path
# ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)

View file

@ -0,0 +1,2 @@
autogluon==0.8.2
pandas==1.5.3

View file

@ -0,0 +1,3 @@
autogluon==0.8.2
pandas==1.5.3
seaborn==0.12.2

View file

@ -1,16 +1,13 @@
import argparse
# import boto3
import os
from pathlib import Path
from datetime import datetime
from typing import List
from core.Logger import logger
from core.DataLoader import DataLoader
from core.FeatureProcessor import FeatureProcessor
from MLModel.Models import AutogluonModel
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
from model_data.simulation_system.MLModel.Models import AutogluonModel
import pandas as pd
from core.Settings import (
from model_data.simulation_system.core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
REGISTRY_FILE,
@ -23,7 +20,8 @@ from core.Settings import (
import seaborn as sns
import matplotlib.pyplot as plt
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# FOR TESTING
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
@ -52,23 +50,27 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
required=True)
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
required=True)
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
default="autogluon")
parser.add_argument('--target-column', type=str, help='The response variable',
choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
return args
def training(
train_filepath: str,
test_filepath: str,
target_column: str = "RDSAP_CHANGE",
model_type: str = "autogluon",
hyperparameters: dict = None
) -> None:
train_filepath: str,
test_filepath: str,
target_column: str = "RDSAP_CHANGE",
model_type: str = "autogluon",
hyperparameters: dict = None
) -> None:
"""
Pipeline to run training on the dataset
"""
@ -77,12 +79,12 @@ def training(
dataloader = DataLoader()
train_df = dataloader.load(filepath=train_filepath)
test_df = dataloader.load(filepath=test_filepath)
logger.info('--- Feature processing ---')
feature_processor = FeatureProcessor()
subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
test_df = feature_processor.process(test_df, target_column=target_column)
@ -98,71 +100,68 @@ def training(
if model_type == "autogluon":
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
model = AutogluonModel(
output_filepath = output_base / MODEL_FOLDER
)
else:
logger.error("No alternative model implemented yet")
exit(1)
model.train_model(
data=train_df,
target_column=target_column,
hyperparameters=hyperparameters
output_filepath=output_base / MODEL_FOLDER
)
else:
raise ValueError("No alternative model implemented yet")
model.train_model(
data=train_df,
target_column=target_column,
hyperparameters=hyperparameters
)
logger.info("--- Save Model ---")
model.save_model(output_filepath=model.output_filepath)
logger.info('--- Generate evaluation metrics ---')
metrics_df = model.model_evaluation(
validation_data=test_df,
validation_data=test_df,
target_column=target_column,
metrics_location = output_base / METRICS_FOLDER
)
metrics_location=output_base / METRICS_FOLDER
)
logger.info("--- Generate metric outputs using predictions ---")
# TODO: can have a model.metric_outputs method
# For now just do it here
residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
# image formatting
# TODO: move to settings file , AXIS_FONT, TITLE_FONT
axis_fs = 18 #fontsize
title_fs = 22 #fontsize
axis_fs = 18 # fontsize
title_fs = 22 # fontsize
sns.set(style="whitegrid")
ax = sns.scatterplot(x="true", y="pred",data=residual_df)
ax = sns.scatterplot(x="true", y="pred", data=residual_df)
ax.set_aspect('equal')
ax.set_xlabel(f'True {target_column}',fontsize = axis_fs)
ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
ax.set_title('Residuals', fontsize = title_fs)
ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel
ax.set_title('Residuals', fontsize=title_fs)
# Square aspect ratio
ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
plt.tight_layout()
RESIDUAL_FILE = "residuals.png"
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
# TODO: for cml, we might want to have class that outputs all data and plots to add to the report
# If we want residual plot/ any plots, we will need to self host
# plt.savefig(RESIDUAL_FILE, dpi=120)
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
# Imagining for now that the model trained here is the best model amongst all models built
logger.info("--- Optimising model for deployment ---")
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists():
@ -170,43 +169,49 @@ def training(
registry_df = pd.read_csv(registry_path, index_col=None)
else:
# TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
registry_df = pd.DataFrame(
columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
model_details_df = pd.DataFrame(
[{
'model_type': model_type,
'model_name': model_root,
'model_type': model_type,
'model_name': model_root,
'model_location': deployment_model_path
}]
)
)
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and
# regenerate new metrics
# TODO: decide metric to optimise to
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
registry_df['best_model'] = [False]*len(registry_df)
registry_df['best_model'] = [False] * len(registry_df)
registry_df.loc[0, 'best_model'] = True
logger.info("--- Saving new model to registry ---")
# Ensure the directory exists
registry_path.parent.mkdir(parents=True, exist_ok=True)
registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ")
if __name__ == "__main__":
logger.info('---Begin Pipeline---')
logger.info('---Ingest Arguments---')
args = ingest_arguments()
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
# To run script: python3 training.py --train-filepath
# ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
# ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
train_filepath=args.train_filepath,
test_filepath=args.test_filepath,
target_column=args.target_column,
train_filepath=args.train_filepath,
test_filepath=args.test_filepath,
target_column=args.target_column,
model_type=args.model_type
)
)