diff --git a/.gitignore b/.gitignore
index cb17846e..be9da3aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,6 +252,7 @@ backend/.idea
open_uprn/.idea/
conservation_areas/.idea/
model_data/.idea/
+model_data/simulation_system/.idea/
model_data/simulation_system/data*
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..0ded8e60 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..ae87bfde 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
-
+
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py
index 137f2f20..ccf6fdf8 100644
--- a/model_data/simulation_system/MLModel/Models.py
+++ b/model_data/simulation_system/MLModel/Models.py
@@ -13,15 +13,17 @@ from pathlib import Path
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error
-from core.Logger import logger
+from model_data.simulation_system.core.Logger import logger
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
METRIC_FILENAME = "metrics.csv"
+
class AutogluonModel:
"""
Autogluon model that implements the MLModel Protocol
"""
+
def __init__(self, output_filepath: Path = None) -> None:
self.model = None
self.output_filepath = output_filepath
@@ -40,10 +42,10 @@ class AutogluonModel:
logger.info("Using AutoGluon Model - Model saving already occured")
def train_model(
- self,
- data: pd.DataFrame,
- target_column: str,
- hyperparameters: dict = None) -> None:
+ self,
+ data: pd.DataFrame,
+ target_column: str,
+ hyperparameters: dict = None) -> None:
"""
For the given data and hyperparameters, a model is trained
"""
@@ -58,17 +60,16 @@ class AutogluonModel:
AGdata = TabularDataset(data=data)
self.model = TabularPredictor(
- label=target_column,
- path=self.output_filepath,
+ label=target_column,
+ path=self.output_filepath,
problem_type=hyperparameters['problem_type'],
eval_metric=hyperparameters['eval_metric']
- ).fit(
- AGdata,
- time_limit=hyperparameters['time_limit'],
- presets=hyperparameters['presets'],
+ ).fit(
+ AGdata,
+ time_limit=hyperparameters['time_limit'],
+ presets=hyperparameters['presets'],
excluded_model_types=hyperparameters['excluded_model_types']
- )
-
+ )
def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
"""
@@ -84,12 +85,12 @@ class AutogluonModel:
return predictions
def model_evaluation(
- self,
- validation_data: pd.DataFrame,
- target_column: str,
- metrics_location: Path = None,
- metric_filename: str = METRIC_FILENAME
- ) -> pd.DataFrame:
+ self,
+ validation_data: pd.DataFrame,
+ target_column: str,
+ metrics_location: Path = None,
+ metric_filename: str = METRIC_FILENAME
+ ) -> pd.DataFrame:
"""
For any validation data, a set of predictions and metrics are return
"""
@@ -105,7 +106,7 @@ class AutogluonModel:
logger.info("Prediction used for evaluations are saved in self.prediction")
self.predictions = predictions
-
+
# TODO: Can have a custom metric class that defines all different metrics we want
metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
@@ -117,26 +118,19 @@ class AutogluonModel:
metrics_df = pd.DataFrame([performance])
metrics_df.to_csv(metrics_location / metric_filename)
markdown_filename = metric_filename.split(".")[0] + ".md"
- metrics_df.to_markdown(metrics_location/ markdown_filename)
+ metrics_df.to_markdown(metrics_location / markdown_filename)
return metrics_df
- def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
+ def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
"""
We can optimise the deployment for a autogluon model
"""
if self.model is None:
- logger.error("No model to optimise for deployment")
- exit(1)
+ raise ValueError("No model to optimise for deployment")
if deployment_path is None:
- logger.error("Deployment path required")
- exit(1)
+ raise ValueError("Deployment path required")
# This will return a string path of the location
return self.model.clone_for_deployment(deployment_path)
-
-
-
-
-
\ No newline at end of file
diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py
index 1e811f8d..dcd7af16 100644
--- a/model_data/simulation_system/core/DataLoader.py
+++ b/model_data/simulation_system/core/DataLoader.py
@@ -1,13 +1,18 @@
import pandas as pd
-from core.Logger import logger
+import os
-class DataLoader():
+
+class DataLoader:
@staticmethod
def load(filepath: str, index_col: str = None) -> pd.DataFrame:
"""
Load different datasets
"""
+
+ if not os.path.exists(filepath):
+ raise FileNotFoundError(f"File not found: {filepath}")
+
if filepath.endswith('.parquet'):
df = pd.read_parquet(filepath)
if index_col is not None:
@@ -15,7 +20,6 @@ class DataLoader():
elif filepath.endswith('.csv'):
df = pd.read_csv(filepath, index_col=index_col)
else:
- logger.error('Not implemented!')
- exit(1)
+ raise ValueError(f"File format not supported for file: {filepath}")
- return df
\ No newline at end of file
+ return df
diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py
index 1ac53517..7b50f486 100644
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -23,6 +23,7 @@ class DataProcessor:
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
+ self.data = None
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py
index aef9605f..cefcee9b 100644
--- a/model_data/simulation_system/core/FeatureProcessor.py
+++ b/model_data/simulation_system/core/FeatureProcessor.py
@@ -4,13 +4,14 @@ Create additional features from the dataset
import pandas as pd
from typing import List
-from core.Logger import logger
+from model_data.simulation_system.core.Logger import logger
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
-RANDOM_SEED = 0
-
+RANDOM_SEED = 0
+
+
class FeatureProcessor:
"""
Handle all feature manipulation before modelling
@@ -38,11 +39,11 @@ class FeatureProcessor:
if not set(features).issubset(df.columns):
logger.error('Features defined is not contained in data')
exit(1)
-
+
df = df[features]
return df
-
+
@staticmethod
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
"""
@@ -53,14 +54,13 @@ class FeatureProcessor:
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
return df
-
def process(
- self,
- df: pd.DataFrame,
- target_column: str = "RDSAP_CHANGE",
- features: List[str] = None,
- subsample_amount: int = None
- ) -> pd.DataFrame:
+ self,
+ df: pd.DataFrame,
+ target_column: str = "RDSAP_CHANGE",
+ features: List[str] = None,
+ subsample_amount: int = None
+ ) -> pd.DataFrame:
"""
Pipeline to get data ready for building a model
"""
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
index bc1b113b..aa6c2d0f 100644
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions
import json
import argparse
-from MLModel.Models import AutogluonModel
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from pathlib import Path
+from model_data.simulation_system.MLModel.Models import AutogluonModel
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
import pandas as pd
from typing import Optional
from datetime import datetime
-from core.Settings import (
+from model_data.simulation_system.core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
PREDICTION_LOCATION,
@@ -19,10 +18,12 @@ from core.Settings import (
METADATA_FILE
)
-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
# FOR TESTING
-# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
+# DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
@@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace:
"""
parser = argparse.ArgumentParser(description='Inputs for training script')
- parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
- parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
+ parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
+ choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
+ parser.add_argument('--model-path', type=str,
+ help='If you wish to use a specific model, specify the model path here')
parser.add_argument('--data', type=str, help='Json data for predictions')
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
args = parser.parse_args()
return args
-
-def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
+def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
+ data_path: Optional[str] = None):
"""
Main pipeline function
"""
@@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
logger.info("--- Loading Model ---")
model = AutogluonModel()
+
model.load_model(filepath=model_location)
logger.info("--- Generating Predictions ---")
@@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
return json_prediction
-if __name__ == "__main__":
+if __name__ == "__main__":
args = ingest_arguments()
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
- # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
- prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
\ No newline at end of file
+ # Data path can be passed like so: python3 predictions.py --data-path
+ # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
+ prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt
new file mode 100644
index 00000000..f9ce32bf
--- /dev/null
+++ b/model_data/simulation_system/requirements/prediction.txt
@@ -0,0 +1,2 @@
+autogluon==0.8.2
+pandas==1.5.3
\ No newline at end of file
diff --git a/model_data/simulation_system/requirements/training.txt b/model_data/simulation_system/requirements/training.txt
new file mode 100644
index 00000000..17e4c8da
--- /dev/null
+++ b/model_data/simulation_system/requirements/training.txt
@@ -0,0 +1,3 @@
+autogluon==0.8.2
+pandas==1.5.3
+seaborn==0.12.2
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index b37e7154..2a1dfcfa 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,16 +1,13 @@
-
import argparse
# import boto3
-import os
from pathlib import Path
from datetime import datetime
-from typing import List
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from core.FeatureProcessor import FeatureProcessor
-from MLModel.Models import AutogluonModel
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
+from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
+from model_data.simulation_system.MLModel.Models import AutogluonModel
import pandas as pd
-from core.Settings import (
+from model_data.simulation_system.core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
REGISTRY_FILE,
@@ -23,7 +20,8 @@ from core.Settings import (
import seaborn as sns
import matplotlib.pyplot as plt
-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
# FOR TESTING
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
@@ -52,23 +50,27 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
- parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
- parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
- parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
- parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
+ parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
+ required=True)
+ parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
+ required=True)
+ parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
+ default="autogluon")
+ parser.add_argument('--target-column', type=str, help='The response variable',
+ choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
return args
-
+
def training(
- train_filepath: str,
- test_filepath: str,
- target_column: str = "RDSAP_CHANGE",
- model_type: str = "autogluon",
- hyperparameters: dict = None
- ) -> None:
+ train_filepath: str,
+ test_filepath: str,
+ target_column: str = "RDSAP_CHANGE",
+ model_type: str = "autogluon",
+ hyperparameters: dict = None
+) -> None:
"""
Pipeline to run training on the dataset
"""
@@ -77,12 +79,12 @@ def training(
dataloader = DataLoader()
train_df = dataloader.load(filepath=train_filepath)
test_df = dataloader.load(filepath=test_filepath)
-
+
logger.info('--- Feature processing ---')
feature_processor = FeatureProcessor()
- subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
+ subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
test_df = feature_processor.process(test_df, target_column=target_column)
@@ -98,71 +100,68 @@ def training(
if model_type == "autogluon":
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
- output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
+ output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
model = AutogluonModel(
- output_filepath = output_base / MODEL_FOLDER
- )
- else:
- logger.error("No alternative model implemented yet")
- exit(1)
-
- model.train_model(
- data=train_df,
- target_column=target_column,
- hyperparameters=hyperparameters
+ output_filepath=output_base / MODEL_FOLDER
)
-
+ else:
+ raise ValueError("No alternative model implemented yet")
+
+ model.train_model(
+ data=train_df,
+ target_column=target_column,
+ hyperparameters=hyperparameters
+ )
+
logger.info("--- Save Model ---")
model.save_model(output_filepath=model.output_filepath)
logger.info('--- Generate evaluation metrics ---')
metrics_df = model.model_evaluation(
- validation_data=test_df,
+ validation_data=test_df,
target_column=target_column,
- metrics_location = output_base / METRICS_FOLDER
- )
-
+ metrics_location=output_base / METRICS_FOLDER
+ )
+
logger.info("--- Generate metric outputs using predictions ---")
# TODO: can have a model.metric_outputs method
# FOr not just do it here
residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
-
+
# image formatting
# TODO: move to settings file , AXIS_FONT, TITLE_FONT
- axis_fs = 18 #fontsize
- title_fs = 22 #fontsize
+ axis_fs = 18 # fontsize
+ title_fs = 22 # fontsize
sns.set(style="whitegrid")
- ax = sns.scatterplot(x="true", y="pred",data=residual_df)
+ ax = sns.scatterplot(x="true", y="pred", data=residual_df)
ax.set_aspect('equal')
- ax.set_xlabel(f'True {target_column}',fontsize = axis_fs)
- ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
- ax.set_title('Residuals', fontsize = title_fs)
+ ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
+ ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel
+ ax.set_title('Residuals', fontsize=title_fs)
# Square aspect ratio
ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
plt.tight_layout()
RESIDUAL_FILE = "residuals.png"
- plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
+ plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
# TODO: for cml, we might want to have class that outputs all data and plots to add to the report
# If we want residual plot/ any plots, we will need to self host
# plt.savefig(RESIDUAL_FILE, dpi=120)
-
# TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment
# Imagining for now that the model trained here is the best model amongst all models built
logger.info("--- Optimising model for deployment ---")
- deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
+ deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
-
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists():
@@ -170,43 +169,49 @@ def training(
registry_df = pd.read_csv(registry_path, index_col=None)
else:
# TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns
- registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
+ registry_df = pd.DataFrame(
+ columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
+ 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
model_details_df = pd.DataFrame(
[{
- 'model_type': model_type,
- 'model_name': model_root,
+ 'model_type': model_type,
+ 'model_name': model_root,
'model_location': deployment_model_path
}]
- )
-
+ )
+
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
- # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
+ # TODO: will need a metric rebuild script - i.e. if we add new metrics, we will want to load models and
+ # regenerate the metrics
# TODO: decide metric to optimise to
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
- registry_df['best_model'] = [False]*len(registry_df)
+ registry_df['best_model'] = [False] * len(registry_df)
registry_df.loc[0, 'best_model'] = True
logger.info("--- Saving new model to registry ---")
+ # Ensure the directory exists
+ registry_path.parent.mkdir(parents=True, exist_ok=True)
registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ")
if __name__ == "__main__":
-
logger.info('---Begin Pipeline---')
logger.info('---Ingest Arguments---')
args = ingest_arguments()
- # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
+ # To run script: python3 training.py --train-filepath
+ # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
+ # ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
- train_filepath=args.train_filepath,
- test_filepath=args.test_filepath,
- target_column=args.target_column,
+ train_filepath=args.train_filepath,
+ test_filepath=args.test_filepath,
+ target_column=args.target_column,
model_type=args.model_type
- )
+ )