all scripts working

This commit is contained in:
Michael Duong 2023-09-09 19:33:27 +00:00
parent 14887054be
commit 860ea8f098
20 changed files with 182 additions and 66 deletions

View file

@ -0,0 +1,2 @@
['remote "myremote"']
url = /tmp/dvcstore

View file

@ -1,3 +1,2 @@
.dev_env/
data/
__pycache__/

View file

@ -29,8 +29,8 @@ def build_model(
target: str,
model_save_location: str,
model_hyperparameters: dict,
train_location: Union[str, None] = None,
test_location: Union[str, None] = None,
train_filepath: Union[str, None] = None,
test_filepath: Union[str, None] = None,
train_data: Union[pd.DataFrame, None] = None,
test_data: Union[pd.DataFrame, None] = None,
pipeline_mode: bool = False,
@ -41,15 +41,15 @@ def build_model(
if train_data is None:
# TODO: replace this with the data client to load
if train_location is None:
raise ValueError(f"Need {train_location}")
train_data = pd.read_parquet(train_location)
if train_filepath is None:
raise ValueError(f"Need {train_filepath}")
train_data = pd.read_parquet(train_filepath)
if test_data is None:
# TODO: replace this with the data client to load
if test_location is None:
raise ValueError(f"Need {test_location}")
test_data = pd.read_parquet(test_location)
if test_filepath is None:
raise ValueError(f"Need {test_filepath}")
test_data = pd.read_parquet(test_filepath)
logger.info("----------------------")
logger.info("--- Training model ---")
@ -79,7 +79,7 @@ if __name__ == "__main__":
logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
dataclient = dataclient_factory(prepare_data_params["client_type"])
dataclient = dataclient_factory(prepare_data_params["dataclient_type"])
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---")
@ -98,8 +98,8 @@ if __name__ == "__main__":
target=build_model_params["target"],
model_save_location=build_model_params["model_save_location"],
model_hyperparameters=build_model_params[model_type],
train_location=prepare_data_params["output_train_filename"],
test_location=prepare_data_params["output_test_filename"],
train_filepath=prepare_data_params["output_train_filepath"],
test_filepath=prepare_data_params["output_test_filepath"],
)
logger.info("-------------------------------")

View file

@ -4,5 +4,8 @@ target: target
test_location: ./data/prepared_data/test.parquet
model_save_location: ./data/model/model.joblib
SKLearnLinearRegression: null
SKLearnSVMRegression:
kernel: "linear"

View file

@ -1,4 +1,2 @@
metrics_type: Regression
test_data_location: ./data/prepared_data/
predictions_output_location: ./data/predictions/predictions.csv
metrics_output_location: ./metrics/metrics.json
metrics_output_filepath: ./metrics/metrics.json

View file

@ -0,0 +1,2 @@
test_data_filepath: ./data/prepared_data/test.parquet
predictions_output_filepath: ./data/predictions/predictions.parquet

View file

@ -1,8 +1,7 @@
dataclient_type: minio
data_location: s3://dev_bucket
train_proportion: 0.8
output_location: ./data/prepared_data/
output_train_filename: train.parquet
output_test_filename: test.parquet
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
# cache_o

View file

@ -0,0 +1 @@
/model.joblib

View file

@ -0,0 +1,5 @@
outs:
- md5: 1edb002d64bf2b5e9853b1f81ab4c47c
size: 1096
hash: md5
path: model.joblib

View file

@ -0,0 +1 @@
/predictions.parquet

View file

@ -0,0 +1,5 @@
outs:
- md5: dc6e327c8048a3b485baf73bd5a0284c
size: 2531
hash: md5
path: predictions.parquet

View file

@ -0,0 +1,2 @@
/test.parquet
/train.parquet

View file

@ -0,0 +1,5 @@
outs:
- md5: 13c6c812693b10cfb32daf286a8dd372
size: 13587
hash: md5
path: test.parquet

View file

@ -0,0 +1,5 @@
outs:
- md5: 92b2085257087bf509e437b23cf51270
size: 22565
hash: md5
path: train.parquet

View file

@ -25,6 +25,11 @@ prepare_data_params = yaml.safe_load(open(prepare_data_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
@ -34,10 +39,9 @@ def generate_metrics(
model: MLModel,
metrics: MLMetrics,
target: str,
model_location: str,
test_data_location: str,
predictions_output_location: str,
metrics_output_location: str,
test_data_filepath: str,
predictions_output_filepath: str,
metrics_output_filepath: str,
):
"""
For a given model, we generate prediction and evaluate this against the true target
@ -48,35 +52,14 @@ def generate_metrics(
logger.info("-------------------------")
# TODO: replace with client loader here
test_data = pd.read_parquet(test_data_location)
test_data = pd.read_parquet(test_data_filepath)
logger.info("---------------------")
logger.info("--- Loading model ---")
logger.info("---------------------")
logger.info("---------------------------")
logger.info("--- Loading predictions ---")
logger.info("---------------------------")
model.load_model(model_location)
logger.info("------------------------------")
logger.info("--- Generating predictions ---")
logger.info("------------------------------")
# Clean test data for now
prediction_data = (
test_data.drop(columns=target) if target in test_data.columns else test_data
)
predictions = model.predict(data=prediction_data)
logger.info("--------------------------")
logger.info("--- Saving predictions ---")
logger.info("--------------------------")
# TODO: replace with client
if not Path(predictions_output_location).parent.exists():
os.mkdir(Path(predictions_output_location).parent)
predictions.to_json(predictions_output_location)
# TODO: replace with client loader here
predictions = pd.read_parquet(predictions_output_filepath)
logger.info("--------------------------")
logger.info("--- Generating metrics ---")
@ -92,10 +75,10 @@ def generate_metrics(
# TODO: replace with client
if not Path(metrics_output_location).parent.exists():
os.mkdir(Path(metrics_output_location).parent)
if not Path(metrics_output_filepath).parent.exists():
os.mkdir(Path(metrics_output_filepath).parent)
with open(metrics_output_location, "w") as f:
with open(metrics_output_filepath, "w") as f:
json.dump(metrics_output, f)
@ -114,10 +97,9 @@ if __name__ == "__main__":
model=model,
metrics=metrics,
target=build_model_params["target"],
model_location=build_model_params["model_save_location"],
test_data_location=generate_metrics_params["test_data_location"],
predictions_output_location=generate_metrics_params[
"predictions_output_location"
test_data_filepath=generate_predictions_params["test_data_filepath"],
predictions_output_filepath=generate_predictions_params[
"predictions_output_filepath"
],
metrics_output_location=generate_metrics_params["metrics_output_location"],
metrics_output_filepath=generate_metrics_params["metrics_output_filepath"],
)

View file

@ -0,0 +1,101 @@
"""
Third part of the pipeline:
After the model is built, we can evaluate its performance
"""
import os
import yaml
import json
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from core.Logger import logger
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
def generate_predictions(
dataclient: DataClient,
model: MLModel,
target: str,
model_location: str,
test_data_filepath: str,
predictions_output_filepath: str,
):
"""
For a given model, we generate prediction and evaluate this against the true target
"""
logger.info("-------------------------")
logger.info("--- Loading test data ---")
logger.info("-------------------------")
# TODO: replace with client loader here
test_data = pd.read_parquet(test_data_filepath)
logger.info("---------------------")
logger.info("--- Loading model ---")
logger.info("---------------------")
model.load_model(model_location)
logger.info("------------------------------")
logger.info("--- Generating predictions ---")
logger.info("------------------------------")
# Clean test data for now
prediction_data = (
test_data.drop(columns=target) if target in test_data.columns else test_data
)
predictions = model.predict(data=prediction_data)
logger.info("--------------------------")
logger.info("--- Saving predictions ---")
logger.info("--------------------------")
# TODO: replace with client
if not Path(predictions_output_filepath).parent.exists():
os.mkdir(Path(predictions_output_filepath).parent)
pd.DataFrame(predictions, columns=["predictions"]).to_parquet(
predictions_output_filepath
)
if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
model = model_factory(build_model_params["model_type"])
dataclient = dataclient_factory(prepare_data_params["dataclient_type"])
generate_predictions(
dataclient=dataclient,
model=model,
target=build_model_params["target"],
model_location=build_model_params["model_save_location"],
test_data_filepath=generate_predictions_params["test_data_filepath"],
predictions_output_filepath=generate_predictions_params[
"predictions_output_filepath"
],
)

View file

@ -0,0 +1 @@
{"mean_absolute_error": 44.778045422034474, "median_absolute_error": 34.43498827111321, "mean_squared_error": 3157.322503544217, "mean_absolute_percentage_error": 0.3775608927188337}

View file

@ -33,9 +33,8 @@ def use_dummy_data() -> pd.DataFrame:
def prepare_data(
dataclient: DataClient,
train_proportion: float,
output_location: str,
output_train_filename: str = "train.parquet",
output_test_filename: str = "test.parquet",
output_train_filepath: str = "train.parquet",
output_test_filepath: str = "test.parquet",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Given a client and location, load data into the pipeline
@ -67,13 +66,17 @@ def prepare_data(
logger.info("-----------------------")
# TODO: REPLACE WITH CLIENT
output_path = Path(output_location)
if not output_path.exists():
os.makedirs(output_path)
output_directory = Path(output_train_filepath)
if not output_directory.parent.exists():
os.makedirs(output_directory.parent)
output_directory = Path(output_test_filepath)
if not output_directory.parent.exists():
os.makedirs(output_directory.parent)
logger.info("--- Outputting train and test data ---")
train.to_csv(output_path / output_train_filename, index=False)
test.to_csv(output_path / output_test_filename, index=False)
train.to_parquet(output_train_filepath)
test.to_parquet(output_test_filepath)
# client.save_data(obj=train)
# client.save_data(obj=test)
@ -100,7 +103,8 @@ if __name__ == "__main__":
prepare_data(
dataclient=dataclient,
train_proportion=params["train_proportion"],
output_location=params["output_location"],
output_train_filepath=params["output_train_filepath"],
output_test_filepath=params["output_test_filepath"],
)
logger.info("-------------------------------")

View file

@ -4,6 +4,7 @@ pandas==1.5.3
dvc==3.18.0
gto==1.0.4
scikit-learn==1.3.0
pyarrow==13.0.0
pre-commit==3.3.3
sphinx==7.2.5
sphinx_rtd_theme==1.3.0