mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
Merge pull request #85 from Hestia-Homes/carbon-dev-model
Carbon dev model
This commit is contained in:
commit
5f3d9efa92
15 changed files with 51 additions and 146 deletions
2
.github/workflows/Deploy.yml
vendored
2
.github/workflows/Deploy.yml
vendored
|
|
@ -2,7 +2,7 @@ name: Sap Change Model Deploy
|
|||
|
||||
on:
|
||||
push:
|
||||
branches: [ sap-dev, sap-prod ]
|
||||
branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
|
|
|||
|
|
@ -69,9 +69,7 @@ def handler(event, context):
|
|||
|
||||
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Initiate MLModel ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
build_model_params = settings.build_model
|
||||
client_params = settings.client
|
||||
|
|
@ -80,17 +78,13 @@ def handler(event, context):
|
|||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Input DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
input_dataclient = dataclient_factory(
|
||||
dataclient_type="aws-s3",
|
||||
dataclient_config=client_params["aws-s3"],
|
||||
)
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Output DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
output_dataclient = dataclient_factory(
|
||||
dataclient_type="aws-s3",
|
||||
dataclient_config=client_params["aws-s3"],
|
||||
|
|
|
|||
|
|
@ -9,16 +9,16 @@ init: dev-conda
|
|||
.PHONY: dev-conda
|
||||
dev-conda:
|
||||
# conda deactivate || echo "Not in conda environment"
|
||||
# conda remove --name $CONDA_ENV --all -y || echo "No environment created previously"
|
||||
conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y
|
||||
# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
|
||||
conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
|
||||
conda init bash
|
||||
conda run -vvvv -n $CONDA_ENV pip install --upgrade pip
|
||||
conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt
|
||||
conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt
|
||||
conda run -vvvv -n $CONDA_ENV pre-commit install
|
||||
conda run -vvvv -n $CONDA_ENV pip install ipykernel
|
||||
conda run -v -n ${CONDA_ENV} pip install --upgrade pip
|
||||
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
|
||||
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
|
||||
conda run -v -n ${CONDA_ENV} pre-commit install
|
||||
conda run -v -n ${CONDA_ENV} pip install ipykernel
|
||||
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
|
||||
echo "conda activate $CONDA_ENV"
|
||||
echo "conda activate ${CONDA_ENV}"
|
||||
|
||||
|
||||
.PHONY: dev-pyenv
|
||||
|
|
|
|||
|
|
@ -1,3 +1,3 @@
|
|||
# The generic reproducible ML-pipeline
|
||||
# The generic reproducible ML-pipeline!
|
||||
|
||||
Pipeline that builds a model and produces an output, which is then hashed via DVC
|
||||
|
|
|
|||
|
|
@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
|
|||
Remove the directory where artefacts are stored
|
||||
"""
|
||||
|
||||
logger.info("---------------------")
|
||||
logger.info(f"--- Run Clean up ---")
|
||||
logger.info("---------------------")
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Delete artefacts ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
artefact_directory_path = Path(artefacts_directory)
|
||||
|
||||
|
|
@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
|
|||
logger.info(f"Removing the directory: {artefacts_directory}")
|
||||
shutil.rmtree(artefact_directory_path)
|
||||
|
||||
logger.info("-----------------------")
|
||||
logger.info(f"--- Delete metrics ---")
|
||||
logger.info("-----------------------")
|
||||
|
||||
metrics_directory_path = Path(metrics_directory)
|
||||
|
||||
|
|
@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- {__file__} - Start! ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
run_cleanup(
|
||||
artefacts_directory=startup_cleanup_params["artefacts"],
|
||||
metrics_directory=startup_cleanup_params["metrics"],
|
||||
)
|
||||
|
||||
logger.info("-------------------------------")
|
||||
logger.info(f"--- {__file__} - Complete! ---")
|
||||
logger.info("-------------------------------")
|
||||
|
|
|
|||
|
|
@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory
|
|||
from core.FeatureProcessor import feature_processor_factory
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
|
|
@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"]
|
|||
output_test_filepath = prepare_data_params["output_test_filepath"]
|
||||
feature_processor_config = feature_process_params["feature_processor_config"]
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
input_dataclient_type = prepare_data_params["input_dataclient_type"]
|
||||
output_dataclient_type = prepare_data_params["output_dataclient_type"]
|
||||
|
|
@ -49,9 +45,7 @@ output_dataclient = dataclient_factory(
|
|||
dataclient_config=client_params[output_dataclient_type],
|
||||
)
|
||||
|
||||
logger.info("----------------------------------")
|
||||
logger.info(f"--- Initiate FeatureProcessor ---")
|
||||
logger.info("----------------------------------")
|
||||
|
||||
feature_processor = feature_processor_factory(
|
||||
feature_process_params["feature_processor_type"]
|
||||
|
|
@ -76,15 +70,11 @@ def prepare_data(
|
|||
:param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode
|
||||
"""
|
||||
|
||||
logger.info("--------------------")
|
||||
logger.info("--- Loading data ---")
|
||||
logger.info("--------------------")
|
||||
|
||||
data = input_dataclient.load_data(location=data_filepath, load_config={})
|
||||
|
||||
logger.info("--------------------------")
|
||||
logger.info("--- Feature Processing ---")
|
||||
logger.info("--------------------------")
|
||||
|
||||
data = feature_processor.feature_process(
|
||||
data,
|
||||
|
|
@ -93,9 +83,7 @@ def prepare_data(
|
|||
new_feature_funcs=new_feature_funcs,
|
||||
)
|
||||
|
||||
logger.info("----------------------")
|
||||
logger.info("--- Splitting data ---")
|
||||
logger.info("----------------------")
|
||||
|
||||
if train_proportion == 1:
|
||||
train = data
|
||||
|
|
@ -108,9 +96,7 @@ def prepare_data(
|
|||
|
||||
train = train.reset_index(drop=True)
|
||||
|
||||
logger.info("-----------------------")
|
||||
logger.info("--- Outputting data ---")
|
||||
logger.info("-----------------------")
|
||||
|
||||
output_dataclient.save_data(
|
||||
obj=train, location=output_train_filepath, save_config=None
|
||||
|
|
@ -126,13 +112,9 @@ def prepare_data(
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- {__file__} - Start! ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
logger.info("---------------------------")
|
||||
logger.info(f"--- Prepare Data Stage ---")
|
||||
logger.info("---------------------------")
|
||||
|
||||
prepare_data(
|
||||
input_dataclient=input_dataclient,
|
||||
|
|
@ -147,6 +129,4 @@ if __name__ == "__main__":
|
|||
new_feature_funcs=new_feature_funcs,
|
||||
)
|
||||
|
||||
logger.info("-------------------------------")
|
||||
logger.info(f"--- {__file__} - Complete! ---")
|
||||
logger.info("-------------------------------")
|
||||
|
|
|
|||
|
|
@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory
|
|||
from configs.post_prediction_logic import post_prediction_logic
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
|
|
@ -40,22 +38,16 @@ train_filepath = prepare_data_params["output_train_filepath"]
|
|||
test_filepath = prepare_data_params["output_test_filepath"]
|
||||
fit_metrics_filepath = build_model_params["fit_metrics_filepath"]
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
# The data is located wherever the previous prepare-data step wrote its output
|
||||
dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"])
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Initiate MLModel ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
model = model_factory(model_type)
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Initiate Metrics ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
metrics = metrics_factory(generate_metrics_params["metrics_type"])
|
||||
|
||||
|
|
@ -75,9 +67,7 @@ def build_model(
|
|||
test_data: Union[pd.DataFrame, None] = None,
|
||||
pipeline_mode: bool = False,
|
||||
):
|
||||
logger.info("--------------------------------------")
|
||||
logger.info("--- Loading Data for build process ---")
|
||||
logger.info("--------------------------------------")
|
||||
|
||||
if train_data is None:
|
||||
if train_filepath is None:
|
||||
|
|
@ -89,9 +79,7 @@ def build_model(
|
|||
raise ValueError(f"Need {test_filepath} if no data supplied")
|
||||
test_data = dataclient.load_data(location=test_filepath, load_config=None)
|
||||
|
||||
logger.info("----------------------")
|
||||
logger.info("--- Training model ---")
|
||||
logger.info("----------------------")
|
||||
|
||||
model.train_model(
|
||||
data=train_data.drop(columns=identifier_columns),
|
||||
|
|
@ -99,32 +87,24 @@ def build_model(
|
|||
model_hyperparameters=model_hyperparameters,
|
||||
)
|
||||
|
||||
logger.info("----------------------------------")
|
||||
logger.info("--- Generating fit predictions ---")
|
||||
logger.info("----------------------------------")
|
||||
|
||||
fit_predictions = model.predict(
|
||||
data=train_data, post_prediction_logic=post_prediction_logic
|
||||
)
|
||||
|
||||
logger.info("------------------------------")
|
||||
logger.info("--- Generating fit metrics ---")
|
||||
logger.info("------------------------------")
|
||||
|
||||
metrics_output = metrics.generate_metrics(
|
||||
target=train_data[target],
|
||||
predictions=pd.Series(fit_predictions),
|
||||
)
|
||||
|
||||
logger.info("--------------------")
|
||||
logger.info("--- Saving model ---")
|
||||
logger.info("--------------------")
|
||||
|
||||
model.save_model(path=Path(model_save_location))
|
||||
|
||||
logger.info("--------------------------")
|
||||
logger.info("--- Saving fit metrics ---")
|
||||
logger.info("--------------------------")
|
||||
|
||||
dataclient.save_data(
|
||||
obj=metrics_output, location=fit_metrics_filepath, save_config=None
|
||||
|
|
@ -133,13 +113,9 @@ def build_model(
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- {__file__} - Start! ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
logger.info("--------------------------")
|
||||
logger.info(f"--- Build Model Stage ---")
|
||||
logger.info("--------------------------")
|
||||
|
||||
build_model(
|
||||
dataclient=dataclient,
|
||||
|
|
@ -154,6 +130,4 @@ if __name__ == "__main__":
|
|||
fit_metrics_filepath=fit_metrics_filepath,
|
||||
)
|
||||
|
||||
logger.info("-------------------------------")
|
||||
logger.info(f"--- {__file__} - Complete! ---")
|
||||
logger.info("-------------------------------")
|
||||
|
|
|
|||
|
|
@ -10,9 +10,7 @@ from core.Logger import logger
|
|||
from config import settings
|
||||
from generate_predictions import generate_predictions
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
|
|
@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"]
|
|||
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
|
||||
predictions_column_name = generate_predictions_params["predictions_column_name"]
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Initiate MLModel ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
# Data may be loaded from different locations, so we use the data client specified in generate_predictions.yaml
|
||||
# e.g. for metric runs, this will be a local data client
|
||||
|
|
@ -59,13 +53,9 @@ output_dataclient = dataclient_factory(
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- {__file__} - Start! ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
logger.info("----------------------------------")
|
||||
logger.info(f"--- Generate Predictions Stage---")
|
||||
logger.info("----------------------------------")
|
||||
|
||||
generate_predictions(
|
||||
input_dataclient=input_dataclient,
|
||||
|
|
@ -78,6 +68,4 @@ if __name__ == "__main__":
|
|||
predictions_column_name=predictions_column_name,
|
||||
)
|
||||
|
||||
logger.info("-------------------------------")
|
||||
logger.info(f"--- {__file__} - Complete! ---")
|
||||
logger.info("-------------------------------")
|
||||
|
|
|
|||
|
|
@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory
|
|||
from core.Logger import logger
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
|
|
@ -36,15 +34,11 @@ predictions_column_name = generate_predictions_params["predictions_column_name"]
|
|||
metrics_output_filepath = generate_metrics_params["metrics_output_filepath"]
|
||||
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Initiate MLModel ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
# Use data client for input and output, as we use dvc to cache later to the cloud
|
||||
dataclient_type = generate_metrics_params["dataclient_type"]
|
||||
|
|
@ -53,9 +47,7 @@ dataclient = dataclient_factory(
|
|||
dataclient_config=client_params[dataclient_type],
|
||||
)
|
||||
|
||||
logger.info("---------------------------")
|
||||
logger.info(f"--- Initiate MLMetrics ---")
|
||||
logger.info("---------------------------")
|
||||
|
||||
metrics = metrics_factory(generate_metrics_params["metrics_type"])
|
||||
|
||||
|
|
@ -75,34 +67,26 @@ def generate_metrics(
|
|||
For a given model, we generate predictions and evaluate them against the true target
|
||||
"""
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info("--- Loading test data ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
test_data = input_dataclient.load_data(
|
||||
location=test_data_filepath, load_config=None
|
||||
)
|
||||
|
||||
logger.info("---------------------------")
|
||||
logger.info("--- Loading predictions ---")
|
||||
logger.info("---------------------------")
|
||||
|
||||
predictions = input_dataclient.load_data(
|
||||
location=predictions_output_filepath, load_config=None
|
||||
)
|
||||
|
||||
logger.info("--------------------------")
|
||||
logger.info("--- Generating metrics ---")
|
||||
logger.info("--------------------------")
|
||||
|
||||
metrics_output = metrics.generate_metrics(
|
||||
target=test_data[target],
|
||||
predictions=pd.Series(predictions[predictions_column_name]),
|
||||
)
|
||||
|
||||
logger.info("----------------------")
|
||||
logger.info("--- Saving metrics ---")
|
||||
logger.info("----------------------")
|
||||
|
||||
output_dataclient.save_data(
|
||||
obj=metrics_output, location=metrics_output_filepath, save_config=None
|
||||
|
|
@ -111,13 +95,9 @@ def generate_metrics(
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- {__file__} - Start! ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
logger.info("------------------------------")
|
||||
logger.info(f"--- Generate Metrics Stage---")
|
||||
logger.info("------------------------------")
|
||||
|
||||
generate_metrics(
|
||||
input_dataclient=dataclient,
|
||||
|
|
@ -131,6 +111,4 @@ if __name__ == "__main__":
|
|||
metrics_output_filepath=metrics_output_filepath,
|
||||
)
|
||||
|
||||
logger.info("-------------------------------")
|
||||
logger.info(f"--- {__file__} - Complete! ---")
|
||||
logger.info("-------------------------------")
|
||||
|
|
|
|||
|
|
@ -16,3 +16,5 @@ default:
|
|||
time_limit: 400
|
||||
presets: medium_quality
|
||||
excluded_model_types: ['KNN', 'RF']
|
||||
infer_limit: 0.05
|
||||
infer_limit_batch_size: 10000
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ default:
|
|||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
|
||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
|
||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
|
||||
train_proportion: 0.9
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ def setup_logger():
|
|||
|
||||
# Add the stream handler to the logger
|
||||
logger.addHandler(stream_handler)
|
||||
logger.propagate = False
|
||||
|
||||
return logger
|
||||
|
||||
|
|
|
|||
|
|
@ -149,6 +149,8 @@ class AutogluonAutoML:
|
|||
"time_limit",
|
||||
"presets",
|
||||
"excluded_model_types",
|
||||
"infer_limit",
|
||||
"infer_limit_batch_size",
|
||||
]
|
||||
|
||||
def load_model(self, path: Union[Path, str]) -> None:
|
||||
|
|
@ -203,6 +205,8 @@ class AutogluonAutoML:
|
|||
time_limit=model_hyperparameters["time_limit"],
|
||||
presets=model_hyperparameters["presets"],
|
||||
excluded_model_types=model_hyperparameters["excluded_model_types"],
|
||||
infer_limit=model_hyperparameters["infer_limit"],
|
||||
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
|
||||
)
|
||||
|
||||
def predict(
|
||||
|
|
|
|||
|
|
@ -5,8 +5,8 @@ stages:
|
|||
deps:
|
||||
- path: 1_prepare_data.py
|
||||
hash: md5
|
||||
md5: c9f030df733e318b80d1fa91b7732f79
|
||||
size: 5132
|
||||
md5: 896d3d88a4a9f68d174efe71dc089517
|
||||
size: 4222
|
||||
params:
|
||||
configs/settings.yaml:
|
||||
default.feature_processor.feature_processor_config.drop_columns:
|
||||
|
|
@ -20,7 +20,7 @@ stages:
|
|||
default.feature_processor.feature_processor_config.subsample_seed: 0
|
||||
default.feature_processor.feature_processor_config.target: CARBON_ENDING
|
||||
default.feature_processor.feature_processor_type: dataframe
|
||||
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
||||
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
|
||||
default.prepare_data.input_dataclient_type: aws-s3
|
||||
default.prepare_data.output_dataclient_type: local
|
||||
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
|
@ -29,20 +29,20 @@ stages:
|
|||
outs:
|
||||
- path: data/prepared_data/
|
||||
hash: md5
|
||||
md5: 5fd3c01804ee2994ee77fc501d178be4.dir
|
||||
size: 30137355
|
||||
md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
|
||||
size: 32943109
|
||||
nfiles: 2
|
||||
build_model:
|
||||
cmd: python 2_build_model.py
|
||||
deps:
|
||||
- path: 2_build_model.py
|
||||
hash: md5
|
||||
md5: 84699d208874c52accaff61c6af9bb0a
|
||||
size: 5359
|
||||
md5: b824822475c222521516493e68eef9c5
|
||||
size: 4149
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 5fd3c01804ee2994ee77fc501d178be4.dir
|
||||
size: 30137355
|
||||
md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
|
||||
size: 32943109
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/build_model.yaml:
|
||||
|
|
@ -63,32 +63,34 @@ stages:
|
|||
excluded_model_types:
|
||||
- KNN
|
||||
- RF
|
||||
infer_limit: 0.05
|
||||
infer_limit_batch_size: 10000
|
||||
outs:
|
||||
- path: data/model/
|
||||
hash: md5
|
||||
md5: 4b49c12395a645e35e50a9de8840f08d.dir
|
||||
size: 282024140
|
||||
md5: dee1a60e6a9f4695272da8127196f714.dir
|
||||
size: 326732699
|
||||
nfiles: 24
|
||||
- path: metrics/fit_metrics.json
|
||||
hash: md5
|
||||
md5: a6d139fa59f5ddf75023bb7d3364f6d2
|
||||
size: 225
|
||||
md5: 1fefa99c7bc50d09c31bf175d5b9ee9c
|
||||
size: 226
|
||||
generate_predictions:
|
||||
cmd: python 3_generate_predictions.py
|
||||
deps:
|
||||
- path: 3_generate_predictions.py
|
||||
hash: md5
|
||||
md5: 5ef2856a5a977304f1ec01f9b4205262
|
||||
size: 3028
|
||||
md5: 0a70ad4dfe99414a75d1261c75a177b9
|
||||
size: 2464
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: 4b49c12395a645e35e50a9de8840f08d.dir
|
||||
size: 282024140
|
||||
md5: dee1a60e6a9f4695272da8127196f714.dir
|
||||
size: 326732699
|
||||
nfiles: 24
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 5fd3c01804ee2994ee77fc501d178be4.dir
|
||||
size: 30137355
|
||||
md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
|
||||
size: 32943109
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/settings.yaml:
|
||||
|
|
@ -100,25 +102,25 @@ stages:
|
|||
outs:
|
||||
- path: data/predictions/
|
||||
hash: md5
|
||||
md5: 8f724261b3d17bf87067e91a1ff99077.dir
|
||||
size: 441423
|
||||
md5: d2da3b713811952b66e2c5f8c95f5407.dir
|
||||
size: 410646
|
||||
nfiles: 1
|
||||
generate_metrics:
|
||||
cmd: python 4_generate_metrics.py
|
||||
deps:
|
||||
- path: 4_generate_metrics.py
|
||||
hash: md5
|
||||
md5: 2c9fb78955a8c19cff0a098976f81d1b
|
||||
size: 4487
|
||||
md5: d09a80dd55f1f69e2a832b1991b3c406
|
||||
size: 3485
|
||||
- path: data/predictions
|
||||
hash: md5
|
||||
md5: 8f724261b3d17bf87067e91a1ff99077.dir
|
||||
size: 441423
|
||||
md5: d2da3b713811952b66e2c5f8c95f5407.dir
|
||||
size: 410646
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 5fd3c01804ee2994ee77fc501d178be4.dir
|
||||
size: 30137355
|
||||
md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
|
||||
size: 32943109
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/settings.yaml:
|
||||
|
|
@ -128,15 +130,15 @@ stages:
|
|||
outs:
|
||||
- path: metrics/metrics.json
|
||||
hash: md5
|
||||
md5: 38787835f838f65c6cc75654843eb311
|
||||
size: 223
|
||||
md5: 4ed2edc06b4dad3c094a2d1be374a5de
|
||||
size: 224
|
||||
startup_cleanup:
|
||||
cmd: python 0_startup_cleanup.py
|
||||
deps:
|
||||
- path: 0_startup_cleanup.py
|
||||
hash: md5
|
||||
md5: fbb7e3b1b98b517c870f3e1df3e7f695
|
||||
size: 1676
|
||||
md5: b1b12f6b6393fbf8b83d23684df0a3d4
|
||||
size: 1220
|
||||
params:
|
||||
configs/settings.yaml:
|
||||
default.startup_cleanup.artefacts: ./data
|
||||
|
|
|
|||
|
|
@ -20,23 +20,17 @@ def generate_predictions(
|
|||
For a given model, we generate prediction and evaluate this against the true target
|
||||
"""
|
||||
|
||||
logger.info("-------------------------")
|
||||
logger.info("--- Loading test data ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
test_data = input_dataclient.load_data(
|
||||
location=test_data_filepath, load_config=None
|
||||
)
|
||||
|
||||
logger.info("---------------------")
|
||||
logger.info("--- Loading model ---")
|
||||
logger.info("---------------------")
|
||||
|
||||
model.load_model(model_filepath)
|
||||
|
||||
logger.info("------------------------------")
|
||||
logger.info("--- Generating predictions ---")
|
||||
logger.info("------------------------------")
|
||||
|
||||
prediction_data = (
|
||||
test_data.drop(columns=target) if target in test_data.columns else test_data
|
||||
|
|
@ -46,9 +40,7 @@ def generate_predictions(
|
|||
data=prediction_data, post_prediction_logic=post_prediction_logic
|
||||
)
|
||||
|
||||
logger.info("--------------------------")
|
||||
logger.info("--- Saving predictions ---")
|
||||
logger.info("--------------------------")
|
||||
|
||||
predictions_df = pd.DataFrame(predictions)
|
||||
predictions_df.columns = [predictions_column_name]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue