mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
commit
724e6bc608
12 changed files with 141 additions and 126 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -252,6 +252,7 @@ backend/.idea
|
|||
open_uprn/.idea/
|
||||
conservation_areas/.idea/
|
||||
model_data/.idea/
|
||||
model_data/simulation_system/.idea/
|
||||
|
||||
model_data/simulation_system/data*
|
||||
|
||||
|
|
|
|||
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (simulation_system_prediction)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system_prediction)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
0
__init__.py
Normal file
0
__init__.py
Normal file
|
|
@ -13,15 +13,17 @@ from pathlib import Path
|
|||
import pandas as pd
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
from sklearn.metrics import mean_absolute_percentage_error
|
||||
from core.Logger import logger
|
||||
from model_data.simulation_system.core.Logger import logger
|
||||
|
||||
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
|
||||
METRIC_FILENAME = "metrics.csv"
|
||||
|
||||
|
||||
class AutogluonModel:
|
||||
"""
|
||||
Autogluon model that implements the MLModel Protocol
|
||||
"""
|
||||
|
||||
def __init__(self, output_filepath: Path = None) -> None:
|
||||
self.model = None
|
||||
self.output_filepath = output_filepath
|
||||
|
|
@ -40,10 +42,10 @@ class AutogluonModel:
|
|||
logger.info("Using AutoGluon Model - Model saving already occured")
|
||||
|
||||
def train_model(
|
||||
self,
|
||||
data: pd.DataFrame,
|
||||
target_column: str,
|
||||
hyperparameters: dict = None) -> None:
|
||||
self,
|
||||
data: pd.DataFrame,
|
||||
target_column: str,
|
||||
hyperparameters: dict = None) -> None:
|
||||
"""
|
||||
For the given data and hyperparameters, a model is trained
|
||||
"""
|
||||
|
|
@ -58,17 +60,16 @@ class AutogluonModel:
|
|||
AGdata = TabularDataset(data=data)
|
||||
|
||||
self.model = TabularPredictor(
|
||||
label=target_column,
|
||||
path=self.output_filepath,
|
||||
label=target_column,
|
||||
path=self.output_filepath,
|
||||
problem_type=hyperparameters['problem_type'],
|
||||
eval_metric=hyperparameters['eval_metric']
|
||||
).fit(
|
||||
AGdata,
|
||||
time_limit=hyperparameters['time_limit'],
|
||||
presets=hyperparameters['presets'],
|
||||
).fit(
|
||||
AGdata,
|
||||
time_limit=hyperparameters['time_limit'],
|
||||
presets=hyperparameters['presets'],
|
||||
excluded_model_types=hyperparameters['excluded_model_types']
|
||||
)
|
||||
|
||||
)
|
||||
|
||||
def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
|
|
@ -84,12 +85,12 @@ class AutogluonModel:
|
|||
return predictions
|
||||
|
||||
def model_evaluation(
|
||||
self,
|
||||
validation_data: pd.DataFrame,
|
||||
target_column: str,
|
||||
metrics_location: Path = None,
|
||||
metric_filename: str = METRIC_FILENAME
|
||||
) -> pd.DataFrame:
|
||||
self,
|
||||
validation_data: pd.DataFrame,
|
||||
target_column: str,
|
||||
metrics_location: Path = None,
|
||||
metric_filename: str = METRIC_FILENAME
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
For any validation data, a set of predictions and metrics are returned
|
||||
"""
|
||||
|
|
@ -105,7 +106,7 @@ class AutogluonModel:
|
|||
|
||||
logger.info("Prediction used for evaluations are saved in self.prediction")
|
||||
self.predictions = predictions
|
||||
|
||||
|
||||
# TODO: Can have a custom metric class that defines all different metrics we want
|
||||
metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
|
||||
|
||||
|
|
@ -117,26 +118,19 @@ class AutogluonModel:
|
|||
metrics_df = pd.DataFrame([performance])
|
||||
metrics_df.to_csv(metrics_location / metric_filename)
|
||||
markdown_filename = metric_filename.split(".")[0] + ".md"
|
||||
metrics_df.to_markdown(metrics_location/ markdown_filename)
|
||||
metrics_df.to_markdown(metrics_location / markdown_filename)
|
||||
|
||||
return metrics_df
|
||||
|
||||
def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
|
||||
def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
|
||||
"""
|
||||
We can optimise the deployment for an autogluon model
|
||||
"""
|
||||
if self.model is None:
|
||||
logger.error("No model to optimise for deployment")
|
||||
exit(1)
|
||||
raise ValueError("No model to optimise for deployment")
|
||||
|
||||
if deployment_path is None:
|
||||
logger.error("Deployment path required")
|
||||
exit(1)
|
||||
raise ValueError("Deployment path required")
|
||||
|
||||
# This will return a string path of the location
|
||||
return self.model.clone_for_deployment(deployment_path)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,13 +1,18 @@
|
|||
import pandas as pd
|
||||
from core.Logger import logger
|
||||
import os
|
||||
|
||||
class DataLoader():
|
||||
|
||||
class DataLoader:
|
||||
|
||||
@staticmethod
|
||||
def load(filepath: str, index_col: str = None) -> pd.DataFrame:
|
||||
"""
|
||||
Load different datasets
|
||||
"""
|
||||
|
||||
if not os.path.exists(filepath):
|
||||
raise FileNotFoundError(f"File not found: {filepath}")
|
||||
|
||||
if filepath.endswith('.parquet'):
|
||||
df = pd.read_parquet(filepath)
|
||||
if index_col is not None:
|
||||
|
|
@ -15,7 +20,6 @@ class DataLoader():
|
|||
elif filepath.endswith('.csv'):
|
||||
df = pd.read_csv(filepath, index_col=index_col)
|
||||
else:
|
||||
logger.error('Not implemented!')
|
||||
exit(1)
|
||||
raise ValueError(f"File format not supported for file: {filepath}")
|
||||
|
||||
return df
|
||||
return df
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ class DataProcessor:
|
|||
|
||||
def __init__(self, filepath: Path) -> None:
|
||||
self.filepath = filepath
|
||||
self.data = None
|
||||
|
||||
def load_data(self, low_memory=False) -> None:
|
||||
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
|
||||
|
|
|
|||
|
|
@ -4,13 +4,14 @@ Create additional features from the dataset
|
|||
|
||||
import pandas as pd
|
||||
from typing import List
|
||||
from core.Logger import logger
|
||||
from model_data.simulation_system.core.Logger import logger
|
||||
|
||||
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
|
||||
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
|
||||
|
||||
RANDOM_SEED = 0
|
||||
|
||||
RANDOM_SEED = 0
|
||||
|
||||
|
||||
class FeatureProcessor:
|
||||
"""
|
||||
Handle all feature manipulation before modelling
|
||||
|
|
@ -38,11 +39,11 @@ class FeatureProcessor:
|
|||
if not set(features).issubset(df.columns):
|
||||
logger.error('Features defined is not contained in data')
|
||||
exit(1)
|
||||
|
||||
|
||||
df = df[features]
|
||||
|
||||
return df
|
||||
|
||||
|
||||
@staticmethod
|
||||
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
|
||||
"""
|
||||
|
|
@ -53,14 +54,13 @@ class FeatureProcessor:
|
|||
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
|
||||
return df
|
||||
|
||||
|
||||
def process(
|
||||
self,
|
||||
df: pd.DataFrame,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
features: List[str] = None,
|
||||
subsample_amount: int = None
|
||||
) -> pd.DataFrame:
|
||||
self,
|
||||
df: pd.DataFrame,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
features: List[str] = None,
|
||||
subsample_amount: int = None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Pipeline to get data ready for building a model
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions
|
|||
|
||||
import json
|
||||
import argparse
|
||||
from MLModel.Models import AutogluonModel
|
||||
from core.Logger import logger
|
||||
from core.DataLoader import DataLoader
|
||||
from pathlib import Path
|
||||
from model_data.simulation_system.MLModel.Models import AutogluonModel
|
||||
from model_data.simulation_system.core.Logger import logger
|
||||
from model_data.simulation_system.core.DataLoader import DataLoader
|
||||
import pandas as pd
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
from core.Settings import (
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
BASE_REGISTRY_PATH,
|
||||
REGISTRY_FILE,
|
||||
PREDICTION_LOCATION,
|
||||
|
|
@ -19,10 +18,12 @@ from core.Settings import (
|
|||
METADATA_FILE
|
||||
)
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
|
||||
|
||||
# FOR TESTING
|
||||
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
|
||||
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
|
||||
# DataFrame)
|
||||
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
|
||||
# DATA = TEST_DATA.sample(1)
|
||||
|
||||
|
|
@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
|
||||
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
|
||||
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
|
||||
choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
|
||||
parser.add_argument('--model-path', type=str,
|
||||
help='If you wish to use a specific model, specify the model path here')
|
||||
parser.add_argument('--data', type=str, help='Json data for predictions')
|
||||
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
|
||||
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
|
||||
data_path: Optional[str] = None):
|
||||
"""
|
||||
Main pipeline function
|
||||
"""
|
||||
|
|
@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
|
|||
|
||||
logger.info("--- Loading Model ---")
|
||||
model = AutogluonModel()
|
||||
|
||||
model.load_model(filepath=model_location)
|
||||
|
||||
logger.info("--- Generating Predictions ---")
|
||||
|
|
@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
|
|||
|
||||
return json_prediction
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = ingest_arguments()
|
||||
|
||||
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
||||
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
# Data path can be passed as so: python3 predictions.py --data-path
|
||||
# ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
|
|
|
|||
2
model_data/simulation_system/requirements/prediction.txt
Normal file
2
model_data/simulation_system/requirements/prediction.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
autogluon==0.8.2
|
||||
pandas==1.5.3
|
||||
3
model_data/simulation_system/requirements/training.txt
Normal file
3
model_data/simulation_system/requirements/training.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
autogluon==0.8.2
|
||||
pandas==1.5.3
|
||||
seaborn==0.12.2
|
||||
|
|
@ -1,16 +1,13 @@
|
|||
|
||||
import argparse
|
||||
# import boto3
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from core.Logger import logger
|
||||
from core.DataLoader import DataLoader
|
||||
from core.FeatureProcessor import FeatureProcessor
|
||||
from MLModel.Models import AutogluonModel
|
||||
from model_data.simulation_system.core.Logger import logger
|
||||
from model_data.simulation_system.core.DataLoader import DataLoader
|
||||
from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
|
||||
from model_data.simulation_system.MLModel.Models import AutogluonModel
|
||||
import pandas as pd
|
||||
from core.Settings import (
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
MODEL_DIRECTORY,
|
||||
BASE_REGISTRY_PATH,
|
||||
REGISTRY_FILE,
|
||||
|
|
@ -23,7 +20,8 @@ from core.Settings import (
|
|||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
|
||||
|
||||
# FOR TESTING
|
||||
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
||||
|
|
@ -52,23 +50,27 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
|
||||
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
|
||||
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
|
||||
required=True)
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
|
||||
required=True)
|
||||
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
|
||||
default="autogluon")
|
||||
parser.add_argument('--target-column', type=str, help='The response variable',
|
||||
choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
def training(
|
||||
train_filepath: str,
|
||||
test_filepath: str,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_type: str = "autogluon",
|
||||
hyperparameters: dict = None
|
||||
) -> None:
|
||||
train_filepath: str,
|
||||
test_filepath: str,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_type: str = "autogluon",
|
||||
hyperparameters: dict = None
|
||||
) -> None:
|
||||
"""
|
||||
Pipeline to run training on the dataset
|
||||
"""
|
||||
|
|
@ -77,12 +79,12 @@ def training(
|
|||
dataloader = DataLoader()
|
||||
train_df = dataloader.load(filepath=train_filepath)
|
||||
test_df = dataloader.load(filepath=test_filepath)
|
||||
|
||||
|
||||
logger.info('--- Feature processing ---')
|
||||
|
||||
feature_processor = FeatureProcessor()
|
||||
|
||||
subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
|
||||
subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
|
||||
|
||||
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
|
||||
test_df = feature_processor.process(test_df, target_column=target_column)
|
||||
|
|
@ -98,71 +100,68 @@ def training(
|
|||
|
||||
if model_type == "autogluon":
|
||||
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
|
||||
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
|
||||
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
|
||||
|
||||
model = AutogluonModel(
|
||||
output_filepath = output_base / MODEL_FOLDER
|
||||
)
|
||||
else:
|
||||
logger.error("No alternative model implemented yet")
|
||||
exit(1)
|
||||
|
||||
model.train_model(
|
||||
data=train_df,
|
||||
target_column=target_column,
|
||||
hyperparameters=hyperparameters
|
||||
output_filepath=output_base / MODEL_FOLDER
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError("No alternative model implemented yet")
|
||||
|
||||
model.train_model(
|
||||
data=train_df,
|
||||
target_column=target_column,
|
||||
hyperparameters=hyperparameters
|
||||
)
|
||||
|
||||
logger.info("--- Save Model ---")
|
||||
model.save_model(output_filepath=model.output_filepath)
|
||||
|
||||
logger.info('--- Generate evaluation metrics ---')
|
||||
metrics_df = model.model_evaluation(
|
||||
validation_data=test_df,
|
||||
validation_data=test_df,
|
||||
target_column=target_column,
|
||||
metrics_location = output_base / METRICS_FOLDER
|
||||
)
|
||||
|
||||
metrics_location=output_base / METRICS_FOLDER
|
||||
)
|
||||
|
||||
logger.info("--- Generate metric outputs using predictions ---")
|
||||
# TODO: can have a model.metric_outputs method
|
||||
# For now just do it here
|
||||
residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
|
||||
|
||||
|
||||
# image formatting
|
||||
# TODO: move to settings file , AXIS_FONT, TITLE_FONT
|
||||
axis_fs = 18 #fontsize
|
||||
title_fs = 22 #fontsize
|
||||
axis_fs = 18 # fontsize
|
||||
title_fs = 22 # fontsize
|
||||
sns.set(style="whitegrid")
|
||||
ax = sns.scatterplot(x="true", y="pred",data=residual_df)
|
||||
ax = sns.scatterplot(x="true", y="pred", data=residual_df)
|
||||
ax.set_aspect('equal')
|
||||
ax.set_xlabel(f'True {target_column}',fontsize = axis_fs)
|
||||
ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
|
||||
ax.set_title('Residuals', fontsize = title_fs)
|
||||
ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
|
||||
ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel
|
||||
ax.set_title('Residuals', fontsize=title_fs)
|
||||
|
||||
# Square aspect ratio
|
||||
ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
|
||||
|
||||
plt.tight_layout()
|
||||
RESIDUAL_FILE = "residuals.png"
|
||||
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
|
||||
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
|
||||
|
||||
# TODO: for cml, we might want to have class that outputs all data and plots to add to the report
|
||||
# If we want residual plot/ any plots, we will need to self host
|
||||
# plt.savefig(RESIDUAL_FILE, dpi=120)
|
||||
|
||||
|
||||
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
|
||||
# Imagining for now that the model trained here is the best model amongst all models built
|
||||
|
||||
logger.info("--- Optimising model for deployment ---")
|
||||
|
||||
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
|
||||
deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
|
||||
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
|
||||
|
||||
# TODO: Need a model registry - for now have this as a CSV
|
||||
# Save this in the model directory
|
||||
logger.info("--- Append registry with new model ---")
|
||||
|
||||
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
|
||||
|
||||
if registry_path.exists():
|
||||
|
|
@ -170,43 +169,49 @@ def training(
|
|||
registry_df = pd.read_csv(registry_path, index_col=None)
|
||||
else:
|
||||
# TODO: Move columns into settings: MODEL_DETAILS and Metrics class columns
|
||||
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
|
||||
registry_df = pd.DataFrame(
|
||||
columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
|
||||
'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
|
||||
|
||||
model_details_df = pd.DataFrame(
|
||||
[{
|
||||
'model_type': model_type,
|
||||
'model_name': model_root,
|
||||
'model_type': model_type,
|
||||
'model_name': model_root,
|
||||
'model_location': deployment_model_path
|
||||
}]
|
||||
)
|
||||
|
||||
)
|
||||
|
||||
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
|
||||
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
|
||||
|
||||
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
|
||||
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and
|
||||
# regenerate new metrics
|
||||
# TODO: decide metric to optimise to
|
||||
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
|
||||
registry_df['best_model'] = [False]*len(registry_df)
|
||||
registry_df['best_model'] = [False] * len(registry_df)
|
||||
registry_df.loc[0, 'best_model'] = True
|
||||
|
||||
logger.info("--- Saving new model to registry ---")
|
||||
# Ensure the directory exists
|
||||
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
registry_df.to_csv(registry_path, index=False)
|
||||
|
||||
logger.info("--- Training Pipeline Complete --- ")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info('---Begin Pipeline---')
|
||||
|
||||
logger.info('---Ingest Arguments---')
|
||||
args = ingest_arguments()
|
||||
|
||||
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
# To run script: python3 training.py --train-filepath
|
||||
# ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
|
||||
# ./model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
|
||||
training(
|
||||
train_filepath=args.train_filepath,
|
||||
test_filepath=args.test_filepath,
|
||||
target_column=args.target_column,
|
||||
train_filepath=args.train_filepath,
|
||||
test_filepath=args.test_filepath,
|
||||
target_column=args.target_column,
|
||||
model_type=args.model_type
|
||||
)
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue