diff --git a/.gitignore b/.gitignore
index cb17846e..2da626a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,6 +127,7 @@ venv/
ENV/
env.bak/
venv.bak/
+.training_env/
# Spyder project settings
.spyderproject
@@ -252,6 +253,7 @@ backend/.idea
open_uprn/.idea/
conservation_areas/.idea/
model_data/.idea/
+model_data/simulation_system/.idea/
model_data/simulation_system/data*
-
+model_data/simulation_system/model_directory/
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..0ded8e60 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..ae87bfde 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
-
+
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py
index d5b25e64..fcb25654 100644
--- a/model_data/simulation_system/MLModel/Models.py
+++ b/model_data/simulation_system/MLModel/Models.py
@@ -129,17 +129,15 @@ class AutogluonModel:
return metrics_df
- def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
+ def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
"""
We can optimise the deployment for a autogluon model
"""
if self.model is None:
- logger.error("No model to optimise for deployment")
- exit(1)
+ raise ValueError("No model to optimise for deployment")
if deployment_path is None:
- logger.error("Deployment path required")
- exit(1)
+ raise ValueError("Deployment path required")
# This will return a string path of the location
return self.model.clone_for_deployment(deployment_path)
diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py
index 1e811f8d..dcd7af16 100644
--- a/model_data/simulation_system/core/DataLoader.py
+++ b/model_data/simulation_system/core/DataLoader.py
@@ -1,13 +1,18 @@
import pandas as pd
-from core.Logger import logger
+import os
-class DataLoader():
+
+class DataLoader:
@staticmethod
def load(filepath: str, index_col: str = None) -> pd.DataFrame:
"""
Load different datasets
"""
+
+ if not os.path.exists(filepath):
+ raise FileNotFoundError(f"File not found: {filepath}")
+
if filepath.endswith('.parquet'):
df = pd.read_parquet(filepath)
if index_col is not None:
@@ -15,7 +20,6 @@ class DataLoader():
elif filepath.endswith('.csv'):
df = pd.read_csv(filepath, index_col=index_col)
else:
- logger.error('Not implemented!')
- exit(1)
+ raise ValueError(f"File format not supported for file: {filepath}")
- return df
\ No newline at end of file
+ return df
diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py
index 1ac53517..7b50f486 100644
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -23,6 +23,7 @@ class DataProcessor:
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
+ self.data = None
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py
index aef9605f..8b53cb14 100644
--- a/model_data/simulation_system/core/FeatureProcessor.py
+++ b/model_data/simulation_system/core/FeatureProcessor.py
@@ -6,18 +6,21 @@ import pandas as pd
from typing import List
from core.Logger import logger
-RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
-HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
+RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"]
+HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"]
+
+RANDOM_SEED = 0
+
-RANDOM_SEED = 0
-
class FeatureProcessor:
"""
Handle all feature manipulation before modelling
"""
@staticmethod
- def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
+ def drop_unused_columns(
+ df: pd.DataFrame, target_column: str = "RDSAP_CHANGE"
+ ) -> pd.DataFrame:
"""
Remove the unused columns for RDS
"""
@@ -36,13 +39,13 @@ class FeatureProcessor:
features = df.columns
else:
if not set(features).issubset(df.columns):
- logger.error('Features defined is not contained in data')
+ logger.error("Features defined is not contained in data")
exit(1)
-
+
df = df[features]
return df
-
+
@staticmethod
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
"""
@@ -53,14 +56,13 @@ class FeatureProcessor:
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
return df
-
def process(
- self,
- df: pd.DataFrame,
- target_column: str = "RDSAP_CHANGE",
- features: List[str] = None,
- subsample_amount: int = None
- ) -> pd.DataFrame:
+ self,
+ df: pd.DataFrame,
+ target_column: str = "RDSAP_CHANGE",
+ features: List[str] = None,
+ subsample_amount: int = None,
+ ) -> pd.DataFrame:
"""
Pipeline to get data ready for building a model
"""
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
index 591b85c7..bc85b74a 100644
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@@ -4,13 +4,12 @@ Script to load MLModel class and generate predictions
import json
import argparse
-from MLModel.Models import AutogluonModel
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from pathlib import Path
import pandas as pd
from typing import Optional
from datetime import datetime
+from MLModel.Models import AutogluonModel
+from core.Logger import logger
+from core.DataLoader import DataLoader
from core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
@@ -23,7 +22,8 @@ from core.Settings import (
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
# FOR TESTING
-# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+# For now we just load the data first and then pass it into the function (i.e. as if we had received JSON data and
+# converted it to a DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
@@ -110,6 +110,7 @@ def prediction(
logger.info("--- Loading Model ---")
model = AutogluonModel()
+
model.load_model(filepath=model_location)
logger.info("--- Generating Predictions ---")
@@ -143,7 +144,6 @@ def prediction(
if __name__ == "__main__":
-
args = ingest_arguments()
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index 6a9dae31..c2ed5c21 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -3,11 +3,13 @@ import argparse
# import boto3
from pathlib import Path
from datetime import datetime
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from core.FeatureProcessor import FeatureProcessor
-from MLModel.Models import AutogluonModel
-import pandas as pd
from core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
@@ -30,8 +32,6 @@ from core.Settings import (
SEABORN_RESIDUAL_LINE_COLOUR,
SEABORN_RESIDUAL_LINE_WIDTH,
)
-import seaborn as sns
-import matplotlib.pyplot as plt
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
@@ -137,8 +137,7 @@ def training(
model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER)
else:
- logger.error("No alternative model implemented yet")
- exit(1)
+ raise ValueError("No alternative model implemented yet")
model.train_model(
data=train_df, target_column=target_column, hyperparameters=hyperparameters
@@ -207,7 +206,6 @@ def training(
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
-
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists():
@@ -244,7 +242,8 @@ def training(
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
- # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
+ # TODO: will need a script to rebuild metrics - i.e. if we add new metrics, we will want to load models and
+ # regenerate the metrics
# TODO: decide metric to optimise to
registry_df = registry_df.sort_values(
"mean_absolute_error", ascending=False
@@ -253,6 +252,8 @@ def training(
registry_df.loc[0, "best_model"] = True
logger.info("--- Saving new model to registry ---")
+ # Ensure the registry directory exists before writing the CSV
+ registry_path.parent.mkdir(parents=True, exist_ok=True)
registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ")
@@ -265,7 +266,9 @@ if __name__ == "__main__":
logger.info("---Ingest Arguments---")
args = ingest_arguments()
- # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
+ # To run script (one command, wrapped here for line length):
+ # python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet \
+ # --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
train_filepath=args.train_filepath,