mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
added autogluon model
This commit is contained in:
parent
b3c9bc8fd7
commit
72334aeb44
9 changed files with 192 additions and 25 deletions
|
|
@ -1,7 +1,15 @@
|
|||
model_type: SKLearnLinearRegression
|
||||
model_save_filepath: ./data/model/model.joblib
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
|
||||
SKLearnLinearRegression: null
|
||||
|
||||
SKLearnSVMRegression:
|
||||
kernel: "linear"
|
||||
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 200
|
||||
presets: medium_quality
|
||||
excluded_model_types: null
|
||||
|
|
|
|||
|
|
@ -4,4 +4,5 @@ feature_processor_config:
|
|||
subsample_seed: 0
|
||||
target: RDSAP_CHANGE
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"]
|
||||
retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
|
||||
# retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
|
||||
retain_features: null
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
artefacts: ./data
|
||||
metrics: ./metrics
|
||||
|
|
@ -13,7 +13,9 @@ from pathlib import Path
|
|||
from typing import Union, List
|
||||
from sklearn import linear_model
|
||||
from sklearn.svm import SVR
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.Logger import logger
|
||||
|
||||
|
||||
def model_factory(model_type: str) -> MLModel:
|
||||
|
|
@ -23,6 +25,7 @@ def model_factory(model_type: str) -> MLModel:
|
|||
models = {
|
||||
"SKLearnLinearRegression": SKLearnLinearRegression(),
|
||||
"SKLearnSVMRegression": SKLearnSVMRegression(),
|
||||
"AutogluonAutoML": AutogluonAutoML()
|
||||
# ADD OTHER MODELS HERE
|
||||
}
|
||||
|
||||
|
|
@ -131,3 +134,78 @@ class SKLearnSVMRegression:
|
|||
"""
|
||||
self.predictions = pd.Series(self.model.predict(data))
|
||||
return self.predictions
|
||||
|
||||
|
||||
class AutogluonAutoML:
    """
    AutoGluon tabular AutoML model.

    Wraps an AutoGluon ``TabularPredictor`` behind the load / save / train /
    predict methods used by the other model classes in this module.
    """

    ACCEPTED_MODEL_HYPERPAREMETERS = [
        "output_filepath",
        "problem_type",
        "eval_metric",
        "time_limit",
        "presets",
        "excluded_model_types",
    ]

    # Class-level default so that the ``self.model is None`` guards below work
    # on a freshly constructed instance instead of raising AttributeError.
    model = None

    def load_model(self, path: Union[Path, str]) -> None:
        """
        Method to load a model

        :param path: Directory that a trained AutoGluon predictor was
            written to.
        """
        filepath = str(path)
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, path: Path) -> str:
        """
        Method to save a model

        Nothing is written here: the predictor is created with its output
        path in ``train_model``, so this method only checks that a model
        exists and reports where it lives.

        :param path: Location of the model artefacts.
        :return: ``path`` as a string.
        :raises KeyError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise KeyError("No model trained/ loaded - unable to save")

        logger.info("In local development mode - no need for s3 client")
        logger.info("Using AutoGluon Model - Model saving already occured")

        return str(path)

    def train_model(
        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
    ) -> None:
        """
        Method to train a model

        :param data: Training data, including the ``target`` column.
        :param target: Name of the label column to predict.
        :param model_hyperparameters: Settings keyed by the names listed in
            ``ACCEPTED_MODEL_HYPERPAREMETERS``. ``output_filepath`` is
            required; the remaining keys fall back to ``None`` when absent.
        :raises SystemExit: If ``output_filepath`` is missing or ``None``.
        """

        validate_dict_keys(
            keys_1=list(model_hyperparameters.keys()),
            keys_2=self.ACCEPTED_MODEL_HYPERPAREMETERS,
            config_type="Model Hyperparameters",
        )

        # ``.get`` so that an absent key is reported with the same message as
        # an explicit null, rather than surfacing as a bare KeyError.
        output_filepath = model_hyperparameters.get("output_filepath")
        if output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            # Same exception ``exit(1)`` raised, without depending on the
            # interactive-only ``exit`` helper injected by the ``site`` module.
            raise SystemExit(1)

        AGdata = TabularDataset(data=data)

        self.model = TabularPredictor(
            label=target,
            path=output_filepath,
            problem_type=model_hyperparameters.get("problem_type"),
            eval_metric=model_hyperparameters.get("eval_metric"),
        ).fit(
            AGdata,
            time_limit=model_hyperparameters.get("time_limit"),
            presets=model_hyperparameters.get("presets"),
            excluded_model_types=model_hyperparameters.get("excluded_model_types"),
        )

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
        Method to predict

        :param data: Feature rows to score.
        :return: The predictor's output wrapped in a ``pd.Series``.
        :raises SystemExit: If no model has been trained or loaded.
        """

        if self.model is None:
            # Logged (not printed) to match the error path in train_model.
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        predictions = pd.Series(self.model.predict(data))

        return predictions
|
||||
|
|
|
|||
|
|
@ -23,8 +23,8 @@ stages:
|
|||
deps:
|
||||
- path: build_model.py
|
||||
hash: md5
|
||||
md5: 58315ea127dcc127e2c22ab1205fddb2
|
||||
size: 3925
|
||||
md5: 662cd6b1562fbbc2c7d30dd0f2375a66
|
||||
size: 3948
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
|
||||
|
|
@ -32,25 +32,32 @@ stages:
|
|||
nfiles: 2
|
||||
params:
|
||||
configs/build_model.yaml:
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 200
|
||||
presets: medium_quality
|
||||
excluded_model_types:
|
||||
SKLearnLinearRegression:
|
||||
SKLearnSVMRegression:
|
||||
kernel: linear
|
||||
model_save_filepath: ./data/model/model.joblib
|
||||
model_type: SKLearnLinearRegression
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_type: AutogluonAutoML
|
||||
outs:
|
||||
- path: data/model/
|
||||
hash: md5
|
||||
md5: 40fa511f4f401f9d2c7da814afe198ef.dir
|
||||
size: 920
|
||||
nfiles: 1
|
||||
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
|
||||
size: 1264795580
|
||||
nfiles: 28
|
||||
generate_predictions:
|
||||
cmd: python generate_predictions.py
|
||||
deps:
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: 40fa511f4f401f9d2c7da814afe198ef.dir
|
||||
size: 920
|
||||
nfiles: 1
|
||||
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
|
||||
size: 1264795580
|
||||
nfiles: 28
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
|
||||
|
|
@ -58,8 +65,8 @@ stages:
|
|||
nfiles: 2
|
||||
- path: generate_predictions.py
|
||||
hash: md5
|
||||
md5: 13e920c0bae8ac51dd907631578f7045
|
||||
size: 4126
|
||||
md5: 76c45e7575ec979e6c4c8e2cf754a720
|
||||
size: 4225
|
||||
params:
|
||||
configs/generate_predictions.yaml:
|
||||
input_dataclient_type: local
|
||||
|
|
@ -70,16 +77,16 @@ stages:
|
|||
outs:
|
||||
- path: data/predictions/
|
||||
hash: md5
|
||||
md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
|
||||
size: 945933
|
||||
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
|
||||
size: 672577
|
||||
nfiles: 1
|
||||
generate_metrics:
|
||||
cmd: python generate_metrics.py
|
||||
deps:
|
||||
- path: data/predictions
|
||||
hash: md5
|
||||
md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
|
||||
size: 945933
|
||||
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
|
||||
size: 672577
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
|
|
@ -88,8 +95,8 @@ stages:
|
|||
nfiles: 2
|
||||
- path: generate_metrics.py
|
||||
hash: md5
|
||||
md5: 6276995b5e860d0f0bb4545aa5f5d347
|
||||
size: 4259
|
||||
md5: cc368845f62523575a9ed5c791e27815
|
||||
size: 4329
|
||||
params:
|
||||
configs/generate_metrics.yaml:
|
||||
dataclient_type: local
|
||||
|
|
@ -100,5 +107,16 @@ stages:
|
|||
outs:
|
||||
- path: metrics/metrics.json
|
||||
hash: md5
|
||||
md5: 995ccf3c6c3f6a975d22aa9bc9f4964e
|
||||
size: 181
|
||||
md5: 3f03e50a419af6730351a5016e2ae98a
|
||||
size: 182
|
||||
startup_cleanup:
|
||||
cmd: python startup_cleanup.py
|
||||
deps:
|
||||
- path: startup_cleanup.py
|
||||
hash: md5
|
||||
md5: f7fe2ca33004b34530da0a3ab48c1790
|
||||
size: 1458
|
||||
params:
|
||||
configs/startup_cleanup.yaml:
|
||||
artefacts: ./data
|
||||
metrics: ./metrics
|
||||
|
|
|
|||
|
|
@ -1,4 +1,12 @@
|
|||
stages:
|
||||
startup_cleanup:
|
||||
cmd: python startup_cleanup.py
|
||||
deps:
|
||||
- startup_cleanup.py
|
||||
params:
|
||||
- configs/startup_cleanup.yaml:
|
||||
- artefacts
|
||||
- metrics
|
||||
prepare_data:
|
||||
cmd: python prepare_data.py
|
||||
deps:
|
||||
|
|
|
|||
|
|
@ -77,7 +77,9 @@ def generate_predictions(
|
|||
logger.info("--- Saving predictions ---")
|
||||
logger.info("--------------------------")
|
||||
|
||||
predictions_df = pd.DataFrame(predictions, columns=[predictions_column_name])
|
||||
predictions_df = pd.DataFrame(predictions)
|
||||
predictions_df.columns = [predictions_column_name]
|
||||
|
||||
datahandler.save_data(
|
||||
dataclient=output_dataclient,
|
||||
obj=predictions_df,
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
joblib==1.3.2
|
||||
boto3==1.28.17
|
||||
pandas==1.5.3
|
||||
scikit-learn==1.3.0
|
||||
autogluon==0.8.2
|
||||
pyarrow==13.0.0
|
||||
pre-commit==3.3.3
|
||||
sphinx==7.2.5
|
||||
|
|
|
|||
50
modules/ml-pipeline/src/pipeline/src/startup_cleanup.py
Normal file
50
modules/ml-pipeline/src/pipeline/src/startup_cleanup.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
"""
|
||||
Remove all artefacts and metrics left over from previous runs before a dvc run
|
||||
"""
|
||||
|
||||
import shutil
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from core.Logger import logger
|
||||
|
||||
# The config path is built relative to this script's own location.
startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"

# Read inside a context manager so the file handle is closed deterministically
# rather than being left for the garbage collector.
with open(startup_cleanup_path) as startup_cleanup_file:
    startup_cleanup_params = yaml.safe_load(startup_cleanup_file)
|
||||
|
||||
|
||||
def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
    """
    Delete the artefacts directory and then the metrics directory.

    Each directory is only removed if it exists at the moment it is checked;
    a log line is written before each removal.

    :param artefacts_directory: Path of the directory holding run artefacts.
    :param metrics_directory: Path of the directory holding run metrics.
    """
    # Artefacts first, metrics second; each existence check happens just
    # before that directory's own removal.
    for directory in (artefacts_directory, metrics_directory):
        directory_path = Path(directory)

        if not directory_path.exists():
            continue

        logger.info(f"Removing the directory: {directory}")
        shutil.rmtree(directory_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # Start-of-script banner followed by the clean-up stage banner; every
    # entry is emitted as its own log record, in order.
    opening_banner = (
        "----------------------------",
        f"--- {__file__} - Start! ---",
        "----------------------------",
        "---------------------",
        "--- Run Clean up ---",
        "---------------------",
    )
    for banner_line in opening_banner:
        logger.info(banner_line)

    # Directories to remove come from the module-level loaded config.
    run_cleanup(
        artefacts_directory=startup_cleanup_params["artefacts"],
        metrics_directory=startup_cleanup_params["metrics"],
    )

    # End-of-script banner.
    closing_banner = (
        "-------------------------------",
        f"--- {__file__} - Complete! ---",
        "-------------------------------",
    )
    for banner_line in closing_banner:
        logger.info(banner_line)
|
||||
Loading…
Add table
Reference in a new issue