mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
cleaned up settings
This commit is contained in:
parent
2e5c423562
commit
3e23240afe
4 changed files with 53 additions and 32 deletions
|
|
@ -44,6 +44,10 @@ Steps for pipeline:
|
|||
- a `metadata.json`
|
||||
- This is all the metadata from the model (can change this if needed)
|
||||
|
||||
- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file
|
||||
- It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify.
|
||||
- E.g. the model hyperparameters currently live here, but will move into a separate configuration file
|
||||
|
||||
|
||||
# TODO:
|
||||
- Structure/ MLOps:
|
||||
|
|
@ -54,6 +58,7 @@ Steps for pipeline:
|
|||
- Sort out Model Registry
|
||||
- Sort out Data version control
|
||||
- Data Science:
|
||||
- Implement a metrics class to hold all metrics
|
||||
- Rebuild metrics script (Could be a one off but good to have)
|
||||
- Determine metrics
|
||||
- Implement and test custom model (Tensorflow Decision Trees etc)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,20 @@
|
|||
# TODO: migrate to dynaconf
|
||||
from pathlib import Path
|
||||
|
||||
# Can move to a hyperparameters file
|
||||
# If anything we might want to have a file that can be loaded and sent to this script
|
||||
MODEL_HYPERPARAMETERS = {
|
||||
"autogluon": {
|
||||
'problem_type': 'regression',
|
||||
'eval_metric': 'mean_absolute_error',
|
||||
'time_limit': 75,
|
||||
'presets': 'medium_quality',
|
||||
'excluded_model_types': None
|
||||
}
|
||||
}
|
||||
|
||||
RANDOM_SEED = 0
|
||||
SUBSAMPLE_FACTOR = 200
|
||||
|
||||
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
|
||||
TEST_DATA_NAME = 'test_data.parquet'
|
||||
|
|
@ -13,6 +26,9 @@ BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
|
|||
PREDICTION_LOCATION = Path("predictions")
|
||||
PREDICTION_FILE = 'prediction.json'
|
||||
METADATA_FILE = 'metadata.json'
|
||||
MODEL_FOLDER = "model"
|
||||
METRICS_FOLDER = "metrics"
|
||||
DEPLOYMENT_FOLDER = "deployment"
|
||||
|
||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
|
|
|
|||
|
|
@ -33,8 +33,9 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
|
||||
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
|
||||
parser.add_argument('--data', type=str, help='Location of Parquet dataset to load for testing')
|
||||
parser.add_argument('--data', type=str, help='Json data for predictions')
|
||||
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
@ -128,4 +129,4 @@ if __name__ == "__main__":
|
|||
|
||||
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
||||
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
|
|
@ -13,28 +13,23 @@ import pandas as pd
|
|||
from core.Settings import (
|
||||
MODEL_DIRECTORY,
|
||||
BASE_REGISTRY_PATH,
|
||||
REGISTRY_FILE
|
||||
REGISTRY_FILE,
|
||||
MODEL_FOLDER,
|
||||
METRICS_FOLDER,
|
||||
DEPLOYMENT_FOLDER,
|
||||
SUBSAMPLE_FACTOR,
|
||||
MODEL_HYPERPARAMETERS
|
||||
)
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
|
||||
# Can move to a hyperparameters file
|
||||
# If anything we might want to have a file that can be loaded and sent to this script
|
||||
HYPERPARAMETERS = {
|
||||
'problem_type': 'regression',
|
||||
'eval_metric': 'mean_absolute_error',
|
||||
'time_limit': 60,
|
||||
'presets': 'medium_quality',
|
||||
'excluded_model_types': None
|
||||
}
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
|
||||
# FOR TESTING
|
||||
train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
||||
test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
|
||||
target_column = "RDSAP_CHANGE"
|
||||
model_type = "autogluon"
|
||||
hyperparameter = HYPERPARAMETERS
|
||||
subsample_factor = 200
|
||||
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
||||
# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
|
||||
# target_column = "RDSAP_CHANGE"
|
||||
# model_type = "autogluon"
|
||||
# hyperparameter = HYPERPARAMETERS
|
||||
# SUBSAMPLE_FACTOR = 200
|
||||
|
||||
# SESSION = boto3.Session()
|
||||
|
||||
|
|
@ -58,7 +53,7 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
|
||||
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
|
||||
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE')
|
||||
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
@ -70,7 +65,7 @@ def training(
|
|||
test_filepath: str,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_type: str = "autogluon",
|
||||
hyperparameter: dict = HYPERPARAMETERS
|
||||
hyperparameters: dict = None
|
||||
) -> None:
|
||||
"""
|
||||
Pipeline to run training on the dataset
|
||||
|
|
@ -85,22 +80,26 @@ def training(
|
|||
|
||||
feature_processor = FeatureProcessor()
|
||||
|
||||
subsample_amount = round(len(train_df)/subsample_factor)
|
||||
subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
|
||||
|
||||
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
|
||||
test_df = feature_processor.process(test_df, target_column=target_column)
|
||||
|
||||
logger.info('--- Build Model ---')
|
||||
|
||||
model_folder = "model"
|
||||
metrics_folder = "metrics"
|
||||
logger.info("--- Load Hyperparameters ---")
|
||||
|
||||
if hyperparameters is None:
|
||||
logger.info("Use base hyperparameters in settings")
|
||||
hyperparameters = MODEL_HYPERPARAMETERS[model_type]
|
||||
logger.info(f'Hyperparameters are: {hyperparameters}')
|
||||
|
||||
if model_type == "autogluon":
|
||||
model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
|
||||
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
|
||||
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
|
||||
|
||||
model = AutogluonModel(
|
||||
output_filepath = output_base / model_folder
|
||||
output_filepath = output_base / MODEL_FOLDER
|
||||
)
|
||||
else:
|
||||
logger.error("No alternative model implemented yet")
|
||||
|
|
@ -109,7 +108,7 @@ def training(
|
|||
model.train_model(
|
||||
data=train_df,
|
||||
target_column=target_column,
|
||||
hyperparameters=hyperparameter
|
||||
hyperparameters=hyperparameters
|
||||
)
|
||||
|
||||
logger.info("--- Save Model ---")
|
||||
|
|
@ -119,15 +118,15 @@ def training(
|
|||
metrics_df = model.model_evaluation(
|
||||
validation_data=test_df,
|
||||
target_column=target_column,
|
||||
metrics_location = output_base / metrics_folder
|
||||
metrics_location = output_base / METRICS_FOLDER
|
||||
)
|
||||
|
||||
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
|
||||
# Imagining for now that the model trained here is the best model amongst all models built
|
||||
|
||||
logger.info("--- Optimising model for deployment ---")
|
||||
optimised_folder = "deployment"
|
||||
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder)
|
||||
|
||||
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
|
||||
logger.info("Optimised version of best model can be found at: {deployment_model_path}")
|
||||
|
||||
# TODO: Need a model registry - for now have this as a CSV
|
||||
|
|
@ -135,7 +134,7 @@ def training(
|
|||
logger.info("--- Append registry with new model ---")
|
||||
|
||||
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
|
||||
|
||||
|
||||
if registry_path.exists():
|
||||
logger.info("Registry file found - Loading into Dataframe")
|
||||
registry_df = pd.read_csv(registry_path, index_col=None)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue