From 961773f58af2e21537e81cea59b3cf6dc092b363 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 3 Oct 2023 22:29:55 +0000 Subject: [PATCH] add identifier column to datasets --- .../ml-pipeline/src/pipeline/2_build_model.py | 17 ++++++++++++----- .../src/pipeline/configs/settings.yaml | 3 ++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index f7746f9..44d72cd 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -6,7 +6,7 @@ Once we have the features, we build a model import os import yaml import pandas as pd -from typing import Union +from typing import Union, List from pathlib import Path from core.Logger import logger from core.interface.InterfaceMetrics import MLMetrics @@ -31,6 +31,9 @@ generate_metrics_params = settings.generate_metrics model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +identifier_columns = feature_process_params["feature_processor_config"][ + "identifier_columns" +] model_save_location = build_model_params["model_save_filepath"] model_hyperparameters = build_model_params[model_type] train_filepath = prepare_data_params["output_train_filepath"] @@ -62,6 +65,7 @@ def build_model( model: MLModel, metrics: MLMetrics, target: str, + identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, fit_metrics_filepath: str, @@ -89,18 +93,20 @@ def build_model( logger.info("--- Training model ---") logger.info("----------------------") + model_train_data = train_data.drop(columns=identifier_columns) + model.train_model( - data=train_data, target=target, model_hyperparameters=model_hyperparameters + data=model_train_data, + target=target, + model_hyperparameters=model_hyperparameters, ) logger.info("----------------------------------") logger.info("--- Generating fit predictions ---") logger.info("----------------------------------") - prediction_data = train_data.drop(columns=target) - fit_predictions = model.predict( - data=prediction_data, post_prediction_logic=post_prediction_logic + data=train_data, post_prediction_logic=post_prediction_logic ) logger.info("------------------------------") @@ -142,6 +148,7 @@ if __name__ == "__main__": model=model, metrics=metrics, target=target, + identifier_columns=identifier_columns, model_save_location=model_save_location, model_hyperparameters=model_hyperparameters, train_filepath=train_filepath, diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 1a3c58a..a84c095 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -32,7 +32,8 @@ default: subsample_amount: null subsample_seed: 0 target: SAP_ENDING - drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + identifier_columns: ["UPRN"] + drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null