added autogluon model

This commit is contained in:
Michael Duong 2023-09-12 14:24:45 +01:00
parent b3c9bc8fd7
commit 72334aeb44
9 changed files with 192 additions and 25 deletions

View file

@ -1,7 +1,15 @@
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/model.joblib
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
SKLearnLinearRegression: null
SKLearnSVMRegression:
kernel: "linear"
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 200
presets: medium_quality
excluded_model_types: null

View file

@ -4,4 +4,5 @@ feature_processor_config:
subsample_seed: 0
target: RDSAP_CHANGE
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"]
retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
# retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
retain_features: null

View file

@ -0,0 +1,2 @@
artefacts: ./data
metrics: ./metrics

View file

@ -13,7 +13,9 @@ from pathlib import Path
from typing import Union, List
from sklearn import linear_model
from sklearn.svm import SVR
from autogluon.tabular import TabularDataset, TabularPredictor
from core.interface.InterfaceModels import MLModel
from core.Logger import logger
def model_factory(model_type: str) -> MLModel:
@ -23,6 +25,7 @@ def model_factory(model_type: str) -> MLModel:
models = {
"SKLearnLinearRegression": SKLearnLinearRegression(),
"SKLearnSVMRegression": SKLearnSVMRegression(),
"AutogluonAutoML": AutogluonAutoML()
# ADD OTHER MODELS HERE
}
@ -131,3 +134,78 @@ class SKLearnSVMRegression:
"""
self.predictions = pd.Series(self.model.predict(data))
return self.predictions
class AutogluonAutoML:
    """
    Wrapper around ``autogluon.tabular.TabularPredictor``.

    Exposes ``load_model`` / ``save_model`` / ``train_model`` / ``predict``
    (presumably the ``MLModel`` interface used by ``model_factory`` - confirm
    against ``core.interface.InterfaceModels``).
    """

    # Keys that ``train_model`` accepts in ``model_hyperparameters``; passed to
    # ``validate_dict_keys`` before training starts.
    ACCEPTED_MODEL_HYPERPAREMETERS = [
        "output_filepath",
        "problem_type",
        "eval_metric",
        "time_limit",
        "presets",
        "excluded_model_types",
    ]

    def __init__(self) -> None:
        """
        Start with no predictor attached.

        Without this default the ``self.model is None`` guards in
        ``save_model`` and ``predict`` raise ``AttributeError`` on a fresh
        instance instead of reporting that no model is available.
        """
        # Holds the TabularPredictor once load_model / train_model has run.
        self.model = None

    def load_model(self, path: Union[Path, str]) -> None:
        """
        Method to load a model

        :param path: directory of a previously trained AutoGluon predictor
        """
        filepath = str(path)
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, path: Path) -> str:
        """
        Method to save a model

        Nothing is written here: the predictor is given its output directory
        (``output_filepath``) at training time, so this method only checks that
        a model exists and logs that saving already happened.

        :param path: location reported back to the caller
        :return: ``path`` converted to ``str``
        :raises KeyError: if no model has been trained or loaded
        """
        if self.model is None:
            raise KeyError("No model trained/ loaded - unable to save")
        logger.info("In local development mode - no need for s3 client")
        logger.info("Using AutoGluon Model - Model saving already occured")
        return str(path)

    def train_model(
        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
    ) -> None:
        """
        Method to train a model

        :param data: training data, including the ``target`` column
        :param target: name of the label column
        :param model_hyperparameters: dict whose keys are checked against
            ``ACCEPTED_MODEL_HYPERPAREMETERS``
        :raises SystemExit: with code 1 if ``output_filepath`` is ``None``
        """
        validate_dict_keys(
            keys_1=list(model_hyperparameters.keys()),
            keys_2=self.ACCEPTED_MODEL_HYPERPAREMETERS,
            config_type="Model Hyperparameters",
        )
        if model_hyperparameters["output_filepath"] is None:
            logger.error("Please specify a output_filepath in order to train a model")
            # Same exit code as before, but without relying on the ``exit``
            # helper that the ``site`` module injects for interactive use.
            raise SystemExit(1)

        training_data = TabularDataset(data=data)
        predictor = TabularPredictor(
            label=target,
            path=model_hyperparameters["output_filepath"],
            problem_type=model_hyperparameters["problem_type"],
            eval_metric=model_hyperparameters["eval_metric"],
        )
        self.model = predictor.fit(
            training_data,
            time_limit=model_hyperparameters["time_limit"],
            presets=model_hyperparameters["presets"],
            excluded_model_types=model_hyperparameters["excluded_model_types"],
        )

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
        Method to predict

        :param data: rows to score
        :return: predictions wrapped in a ``pd.Series``
        :raises SystemExit: with code 1 if no model has been trained or loaded
        """
        if self.model is None:
            # Report through the logger, as train_model does, not stdout.
            logger.error("No model loaded/ trained")
            raise SystemExit(1)
        return pd.Series(self.model.predict(data))

View file

@ -23,8 +23,8 @@ stages:
deps:
- path: build_model.py
hash: md5
md5: 58315ea127dcc127e2c22ab1205fddb2
size: 3925
md5: 662cd6b1562fbbc2c7d30dd0f2375a66
size: 3948
- path: data/prepared_data
hash: md5
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
@ -32,25 +32,32 @@ stages:
nfiles: 2
params:
configs/build_model.yaml:
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 200
presets: medium_quality
excluded_model_types:
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
model_save_filepath: ./data/model/model.joblib
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/autogluonmodel/
model_type: AutogluonAutoML
outs:
- path: data/model/
hash: md5
md5: 40fa511f4f401f9d2c7da814afe198ef.dir
size: 920
nfiles: 1
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
size: 1264795580
nfiles: 28
generate_predictions:
cmd: python generate_predictions.py
deps:
- path: data/model
hash: md5
md5: 40fa511f4f401f9d2c7da814afe198ef.dir
size: 920
nfiles: 1
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
size: 1264795580
nfiles: 28
- path: data/prepared_data
hash: md5
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
@ -58,8 +65,8 @@ stages:
nfiles: 2
- path: generate_predictions.py
hash: md5
md5: 13e920c0bae8ac51dd907631578f7045
size: 4126
md5: 76c45e7575ec979e6c4c8e2cf754a720
size: 4225
params:
configs/generate_predictions.yaml:
input_dataclient_type: local
@ -70,16 +77,16 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
size: 945933
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
size: 672577
nfiles: 1
generate_metrics:
cmd: python generate_metrics.py
deps:
- path: data/predictions
hash: md5
md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
size: 945933
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
size: 672577
nfiles: 1
- path: data/prepared_data
hash: md5
@ -88,8 +95,8 @@ stages:
nfiles: 2
- path: generate_metrics.py
hash: md5
md5: 6276995b5e860d0f0bb4545aa5f5d347
size: 4259
md5: cc368845f62523575a9ed5c791e27815
size: 4329
params:
configs/generate_metrics.yaml:
dataclient_type: local
@ -100,5 +107,16 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 995ccf3c6c3f6a975d22aa9bc9f4964e
size: 181
md5: 3f03e50a419af6730351a5016e2ae98a
size: 182
startup_cleanup:
cmd: python startup_cleanup.py
deps:
- path: startup_cleanup.py
hash: md5
md5: f7fe2ca33004b34530da0a3ab48c1790
size: 1458
params:
configs/startup_cleanup.yaml:
artefacts: ./data
metrics: ./metrics

View file

@ -1,4 +1,12 @@
stages:
startup_cleanup:
cmd: python startup_cleanup.py
deps:
- startup_cleanup.py
params:
- configs/startup_cleanup.yaml:
- artefacts
- metrics
prepare_data:
cmd: python prepare_data.py
deps:

View file

@ -77,7 +77,9 @@ def generate_predictions(
logger.info("--- Saving predictions ---")
logger.info("--------------------------")
predictions_df = pd.DataFrame(predictions, columns=[predictions_column_name])
predictions_df = pd.DataFrame(predictions)
predictions_df.columns = [predictions_column_name]
datahandler.save_data(
dataclient=output_dataclient,
obj=predictions_df,

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
scikit-learn==1.3.0
autogluon==0.8.2
pyarrow==13.0.0
pre-commit==3.3.3
sphinx==7.2.5

View file

@ -0,0 +1,50 @@
"""
We remove all previous artefacts in the data folder for a dvc run
"""
import shutil
import yaml
from pathlib import Path
from core.Logger import logger
# Cleanup configuration lives in ./configs next to this script.
startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"

# Read the config once at import time; the context manager closes the file
# handle, which the bare ``open(...)`` call previously left open.
with open(startup_cleanup_path) as startup_cleanup_file:
    startup_cleanup_params = yaml.safe_load(startup_cleanup_file)
def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
    """
    Delete the artefacts directory, then the metrics directory.

    A directory that does not exist is skipped silently; each one that does
    exist is logged and removed recursively.

    :param artefacts_directory: path of the artefacts directory
    :param metrics_directory: path of the metrics directory
    """
    for directory in (artefacts_directory, metrics_directory):
        target = Path(directory)
        if not target.exists():
            continue
        logger.info(f"Removing the directory: {directory}")
        shutil.rmtree(target)
# Script entry point: log a start banner, remove the directories named in the
# cleanup config, then log a completion banner.
if __name__ == "__main__":
    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")

    logger.info("---------------------")
    # Plain literal: there is nothing to interpolate on this line.
    logger.info("--- Run Clean up ---")
    logger.info("---------------------")

    # Both paths come from configs/startup_cleanup.yaml.
    run_cleanup(
        artefacts_directory=startup_cleanup_params["artefacts"],
        metrics_directory=startup_cleanup_params["metrics"],
    )

    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")