Compare commits

...

12 commits

Author SHA1 Message Date
KhalimCK
5f3d9efa92
Merge pull request #85 from Hestia-Homes/carbon-dev-model
Carbon dev model
2023-11-27 19:20:40 +00:00
Michael Duong
f29d6af6a2 change readme 2023-11-27 19:13:23 +00:00
Michael Duong
7afc4b06b2 Merge branch 'master' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-11-27 19:12:40 +00:00
Michael Duong
217fb3dca8 add inference speed check 2023-11-27 18:52:47 +00:00
Michael Duong
9a04ffde3b Merge branch 'master' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-11-27 18:30:10 +00:00
Michael Duong
e6c7b2f58c Merge branch 'carbon-dev' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-10-12 08:39:24 +00:00
Michael Duong
f2cc32f4b4 using good model 4000s 2023-10-12 08:38:55 +00:00
Github-Bot
2f9092f447 Update Registry 2023-10-11 15:48:52 +00:00
Github-Bot
bb2db16f61 Update Registry 2023-10-11 15:48:04 +00:00
quandanrepo
5aaebd7f44
Merge pull request #71 from Hestia-Homes/carbon-dev-model
400 second model
2023-10-11 16:47:13 +01:00
Michael Duong
680e879503 400 second model 2023-10-11 15:38:55 +00:00
Michael Duong
f4e91162ec initial model 2023-10-11 13:23:54 +00:00
16 changed files with 70 additions and 150 deletions

View file

@ -22,5 +22,13 @@
}, },
"registered": true, "registered": true,
"active": true "active": true
},
"carbon": {
"version": "v0.0.1",
"stage": {
"dev": "v0.0.1"
},
"registered": true,
"active": true
} }
} }

View file

@ -69,9 +69,7 @@ def handler(event, context):
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
build_model_params = settings.build_model build_model_params = settings.build_model
client_params = settings.client client_params = settings.client
@ -80,17 +78,13 @@ def handler(event, context):
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
logger.info("----------------------------")
logger.info(f"--- Initiate Input DataClient ---") logger.info(f"--- Initiate Input DataClient ---")
logger.info("----------------------------")
input_dataclient = dataclient_factory( input_dataclient = dataclient_factory(
dataclient_type="aws-s3", dataclient_type="aws-s3",
dataclient_config=client_params["aws-s3"], dataclient_config=client_params["aws-s3"],
) )
logger.info("----------------------------")
logger.info(f"--- Initiate Output DataClient ---") logger.info(f"--- Initiate Output DataClient ---")
logger.info("----------------------------")
output_dataclient = dataclient_factory( output_dataclient = dataclient_factory(
dataclient_type="aws-s3", dataclient_type="aws-s3",
dataclient_config=client_params["aws-s3"], dataclient_config=client_params["aws-s3"],

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline # The generic reproducible ML-pipeline!
Pipeline required to build a model to produce an output, that gets hashed via DVC Pipeline required to build a model to produce an output, that gets hashed via DVC

View file

@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
Remove the directory where artefacts are stored Remove the directory where artefacts are stored
""" """
logger.info("---------------------")
logger.info(f"--- Run Clean up ---") logger.info(f"--- Run Clean up ---")
logger.info("---------------------")
logger.info("-------------------------")
logger.info(f"--- Delete artefacts ---") logger.info(f"--- Delete artefacts ---")
logger.info("-------------------------")
artefact_directory_path = Path(artefacts_directory) artefact_directory_path = Path(artefacts_directory)
@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
logger.info(f"Removing the directory: {artefacts_directory}") logger.info(f"Removing the directory: {artefacts_directory}")
shutil.rmtree(artefact_directory_path) shutil.rmtree(artefact_directory_path)
logger.info("-----------------------")
logger.info(f"--- Delete metrics ---") logger.info(f"--- Delete metrics ---")
logger.info("-----------------------")
metrics_directory_path = Path(metrics_directory) metrics_directory_path = Path(metrics_directory)
@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
run_cleanup( run_cleanup(
artefacts_directory=startup_cleanup_params["artefacts"], artefacts_directory=startup_cleanup_params["artefacts"],
metrics_directory=startup_cleanup_params["metrics"], metrics_directory=startup_cleanup_params["metrics"],
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory
from core.FeatureProcessor import feature_processor_factory from core.FeatureProcessor import feature_processor_factory
from config import settings from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"]
output_test_filepath = prepare_data_params["output_test_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"]
feature_processor_config = feature_process_params["feature_processor_config"] feature_processor_config = feature_process_params["feature_processor_config"]
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
input_dataclient_type = prepare_data_params["input_dataclient_type"] input_dataclient_type = prepare_data_params["input_dataclient_type"]
output_dataclient_type = prepare_data_params["output_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"]
@ -49,9 +45,7 @@ output_dataclient = dataclient_factory(
dataclient_config=client_params[output_dataclient_type], dataclient_config=client_params[output_dataclient_type],
) )
logger.info("----------------------------------")
logger.info(f"--- Initiate FeatureProcessor ---") logger.info(f"--- Initiate FeatureProcessor ---")
logger.info("----------------------------------")
feature_processor = feature_processor_factory( feature_processor = feature_processor_factory(
feature_process_params["feature_processor_type"] feature_process_params["feature_processor_type"]
@ -76,15 +70,11 @@ def prepare_data(
:param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode
""" """
logger.info("--------------------")
logger.info("--- Loading data ---") logger.info("--- Loading data ---")
logger.info("--------------------")
data = input_dataclient.load_data(location=data_filepath, load_config={}) data = input_dataclient.load_data(location=data_filepath, load_config={})
logger.info("--------------------------")
logger.info("--- Feature Processing ---") logger.info("--- Feature Processing ---")
logger.info("--------------------------")
data = feature_processor.feature_process( data = feature_processor.feature_process(
data, data,
@ -93,9 +83,7 @@ def prepare_data(
new_feature_funcs=new_feature_funcs, new_feature_funcs=new_feature_funcs,
) )
logger.info("----------------------")
logger.info("--- Splitting data ---") logger.info("--- Splitting data ---")
logger.info("----------------------")
if train_proportion == 1: if train_proportion == 1:
train = data train = data
@ -108,9 +96,7 @@ def prepare_data(
train = train.reset_index(drop=True) train = train.reset_index(drop=True)
logger.info("-----------------------")
logger.info("--- Outputting data ---") logger.info("--- Outputting data ---")
logger.info("-----------------------")
output_dataclient.save_data( output_dataclient.save_data(
obj=train, location=output_train_filepath, save_config=None obj=train, location=output_train_filepath, save_config=None
@ -126,13 +112,9 @@ def prepare_data(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("---------------------------")
logger.info(f"--- Prepare Data Stage ---") logger.info(f"--- Prepare Data Stage ---")
logger.info("---------------------------")
prepare_data( prepare_data(
input_dataclient=input_dataclient, input_dataclient=input_dataclient,
@ -147,6 +129,4 @@ if __name__ == "__main__":
new_feature_funcs=new_feature_funcs, new_feature_funcs=new_feature_funcs,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory
from configs.post_prediction_logic import post_prediction_logic from configs.post_prediction_logic import post_prediction_logic
from config import settings from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -40,22 +38,16 @@ train_filepath = prepare_data_params["output_train_filepath"]
test_filepath = prepare_data_params["output_test_filepath"] test_filepath = prepare_data_params["output_test_filepath"]
fit_metrics_filepath = build_model_params["fit_metrics_filepath"] fit_metrics_filepath = build_model_params["fit_metrics_filepath"]
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
# The data lives wherever the previous prepare-data step wrote its output # The data lives wherever the previous prepare-data step wrote its output
dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"])
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
model = model_factory(model_type) model = model_factory(model_type)
logger.info("-------------------------")
logger.info(f"--- Initiate Metrics ---") logger.info(f"--- Initiate Metrics ---")
logger.info("-------------------------")
metrics = metrics_factory(generate_metrics_params["metrics_type"]) metrics = metrics_factory(generate_metrics_params["metrics_type"])
@ -75,9 +67,7 @@ def build_model(
test_data: Union[pd.DataFrame, None] = None, test_data: Union[pd.DataFrame, None] = None,
pipeline_mode: bool = False, pipeline_mode: bool = False,
): ):
logger.info("--------------------------------------")
logger.info("--- Loading Data for build process ---") logger.info("--- Loading Data for build process ---")
logger.info("--------------------------------------")
if train_data is None: if train_data is None:
if train_filepath is None: if train_filepath is None:
@ -89,9 +79,7 @@ def build_model(
raise ValueError(f"Need {test_filepath} if no data supplied") raise ValueError(f"Need {test_filepath} if no data supplied")
test_data = dataclient.load_data(location=test_filepath, load_config=None) test_data = dataclient.load_data(location=test_filepath, load_config=None)
logger.info("----------------------")
logger.info("--- Training model ---") logger.info("--- Training model ---")
logger.info("----------------------")
model.train_model( model.train_model(
data=train_data.drop(columns=identifier_columns), data=train_data.drop(columns=identifier_columns),
@ -99,32 +87,24 @@ def build_model(
model_hyperparameters=model_hyperparameters, model_hyperparameters=model_hyperparameters,
) )
logger.info("----------------------------------")
logger.info("--- Generating fit predictions ---") logger.info("--- Generating fit predictions ---")
logger.info("----------------------------------")
fit_predictions = model.predict( fit_predictions = model.predict(
data=train_data, post_prediction_logic=post_prediction_logic data=train_data, post_prediction_logic=post_prediction_logic
) )
logger.info("------------------------------")
logger.info("--- Generating fit metrics ---") logger.info("--- Generating fit metrics ---")
logger.info("------------------------------")
metrics_output = metrics.generate_metrics( metrics_output = metrics.generate_metrics(
target=train_data[target], target=train_data[target],
predictions=pd.Series(fit_predictions), predictions=pd.Series(fit_predictions),
) )
logger.info("--------------------")
logger.info("--- Saving model ---") logger.info("--- Saving model ---")
logger.info("--------------------")
model.save_model(path=Path(model_save_location)) model.save_model(path=Path(model_save_location))
logger.info("--------------------------")
logger.info("--- Saving fit metrics ---") logger.info("--- Saving fit metrics ---")
logger.info("--------------------------")
dataclient.save_data( dataclient.save_data(
obj=metrics_output, location=fit_metrics_filepath, save_config=None obj=metrics_output, location=fit_metrics_filepath, save_config=None
@ -133,13 +113,9 @@ def build_model(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("--------------------------")
logger.info(f"--- Build Model Stage ---") logger.info(f"--- Build Model Stage ---")
logger.info("--------------------------")
build_model( build_model(
dataclient=dataclient, dataclient=dataclient,
@ -154,6 +130,4 @@ if __name__ == "__main__":
fit_metrics_filepath=fit_metrics_filepath, fit_metrics_filepath=fit_metrics_filepath,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -10,9 +10,7 @@ from core.Logger import logger
from config import settings from config import settings
from generate_predictions import generate_predictions from generate_predictions import generate_predictions
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
predictions_column_name = generate_predictions_params["predictions_column_name"] predictions_column_name = generate_predictions_params["predictions_column_name"]
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
# We may have different locations of loading hence why we use one specified in generate_predictions.yaml # We may have different locations of loading hence why we use one specified in generate_predictions.yaml
# I.e. for metric runs, this will be a local data client # I.e. for metric runs, this will be a local data client
@ -59,13 +53,9 @@ output_dataclient = dataclient_factory(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("----------------------------------")
logger.info(f"--- Generate Predictions Stage---") logger.info(f"--- Generate Predictions Stage---")
logger.info("----------------------------------")
generate_predictions( generate_predictions(
input_dataclient=input_dataclient, input_dataclient=input_dataclient,
@ -78,6 +68,4 @@ if __name__ == "__main__":
predictions_column_name=predictions_column_name, predictions_column_name=predictions_column_name,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory
from core.Logger import logger from core.Logger import logger
from config import settings from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -36,15 +34,11 @@ predictions_column_name = generate_predictions_params["predictions_column_name"]
metrics_output_filepath = generate_metrics_params["metrics_output_filepath"] metrics_output_filepath = generate_metrics_params["metrics_output_filepath"]
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
# Use data client for input and output, as we use dvc to cache later to the cloud # Use data client for input and output, as we use dvc to cache later to the cloud
dataclient_type = generate_metrics_params["dataclient_type"] dataclient_type = generate_metrics_params["dataclient_type"]
@ -53,9 +47,7 @@ dataclient = dataclient_factory(
dataclient_config=client_params[dataclient_type], dataclient_config=client_params[dataclient_type],
) )
logger.info("---------------------------")
logger.info(f"--- Initiate MLMetrics ---") logger.info(f"--- Initiate MLMetrics ---")
logger.info("---------------------------")
metrics = metrics_factory(generate_metrics_params["metrics_type"]) metrics = metrics_factory(generate_metrics_params["metrics_type"])
@ -75,34 +67,26 @@ def generate_metrics(
For a given model, we generate prediction and evaluate this against the true target For a given model, we generate prediction and evaluate this against the true target
""" """
logger.info("-------------------------")
logger.info("--- Loading test data ---") logger.info("--- Loading test data ---")
logger.info("-------------------------")
test_data = input_dataclient.load_data( test_data = input_dataclient.load_data(
location=test_data_filepath, load_config=None location=test_data_filepath, load_config=None
) )
logger.info("---------------------------")
logger.info("--- Loading predictions ---") logger.info("--- Loading predictions ---")
logger.info("---------------------------")
predictions = input_dataclient.load_data( predictions = input_dataclient.load_data(
location=predictions_output_filepath, load_config=None location=predictions_output_filepath, load_config=None
) )
logger.info("--------------------------")
logger.info("--- Generating metrics ---") logger.info("--- Generating metrics ---")
logger.info("--------------------------")
metrics_output = metrics.generate_metrics( metrics_output = metrics.generate_metrics(
target=test_data[target], target=test_data[target],
predictions=pd.Series(predictions[predictions_column_name]), predictions=pd.Series(predictions[predictions_column_name]),
) )
logger.info("----------------------")
logger.info("--- Saving metrics ---") logger.info("--- Saving metrics ---")
logger.info("----------------------")
output_dataclient.save_data( output_dataclient.save_data(
obj=metrics_output, location=metrics_output_filepath, save_config=None obj=metrics_output, location=metrics_output_filepath, save_config=None
@ -111,13 +95,9 @@ def generate_metrics(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("------------------------------")
logger.info(f"--- Generate Metrics Stage---") logger.info(f"--- Generate Metrics Stage---")
logger.info("------------------------------")
generate_metrics( generate_metrics(
input_dataclient=dataclient, input_dataclient=dataclient,
@ -131,6 +111,4 @@ if __name__ == "__main__":
metrics_output_filepath=metrics_output_filepath, metrics_output_filepath=metrics_output_filepath,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -13,6 +13,8 @@ default:
output_filepath: ./data/model/allmodels/ output_filepath: ./data/model/allmodels/
problem_type: regression problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000 time_limit: 400
presets: medium_quality presets: medium_quality
excluded_model_types: ['KNN', 'RF'] excluded_model_types: ['KNN', 'RF']
infer_limit: 0.05
infer_limit_batch_size: 10000

View file

@ -18,6 +18,11 @@ def remove_starting_columns(df):
return df return df
def keep_negative_carbon_change(df):
    """Keep only the rows whose ``CARBON_CHANGE`` value is strictly negative.

    :param df: DataFrame that contains a ``CARBON_CHANGE`` column.
    :return: The subset of ``df`` where ``CARBON_CHANGE < 0``; the original
        index labels of the surviving rows are preserved.
    :raises KeyError: if ``df`` has no ``CARBON_CHANGE`` column.
    """
    # Rows where the comparison is False (including zero and NaN values,
    # since NaN < 0 evaluates to False) are dropped by the boolean mask.
    negative_change_mask = df["CARBON_CHANGE"] < 0
    return df[negative_change_mask]
# def keep_ending_columns(df): # def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list() # keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +32,7 @@ def remove_starting_columns(df):
# return df # return df
business_logic = { business_logic = {
"keep_negative_carbon_change": keep_negative_carbon_change
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -5,17 +5,18 @@ import pandas as pd
def clip_predictions_to_minimum_value( def clip_predictions_to_minimum_value(
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1 data: pd.DataFrame,
predictions: pd.Series,
) -> pd.Series: ) -> pd.Series:
series_name = predictions.name series_name = predictions.name
predictions.name = "predictions" predictions.name = "predictions"
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least a one-point improvement # We expect all predictions to be at least a one-point improvement
replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] replace_index = predictions_df["predictions"] > predictions_df["CARBON_STARTING"]
predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[
predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value replace_index, "CARBON_STARTING"
) ]
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]
predictions_new.name = series_name predictions_new.name = series_name

View file

@ -21,7 +21,7 @@ default:
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
train_proportion: 0.9 train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet output_test_filepath: ./data/prepared_data/test.parquet
@ -31,9 +31,9 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: SAP_ENDING target: CARBON_ENDING
identifier_columns: ["UPRN"] identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "SAP_ENDING"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null retain_features: null

View file

@ -21,6 +21,7 @@ def setup_logger():
# Add the stream handler to the logger # Add the stream handler to the logger
logger.addHandler(stream_handler) logger.addHandler(stream_handler)
logger.propagate = False
return logger return logger

View file

@ -149,6 +149,8 @@ class AutogluonAutoML:
"time_limit", "time_limit",
"presets", "presets",
"excluded_model_types", "excluded_model_types",
"infer_limit",
"infer_limit_batch_size",
] ]
def load_model(self, path: Union[Path, str]) -> None: def load_model(self, path: Union[Path, str]) -> None:
@ -203,6 +205,8 @@ class AutogluonAutoML:
time_limit=model_hyperparameters["time_limit"], time_limit=model_hyperparameters["time_limit"],
presets=model_hyperparameters["presets"], presets=model_hyperparameters["presets"],
excluded_model_types=model_hyperparameters["excluded_model_types"], excluded_model_types=model_hyperparameters["excluded_model_types"],
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
) )
def predict( def predict(

View file

@ -5,8 +5,8 @@ stages:
deps: deps:
- path: 1_prepare_data.py - path: 1_prepare_data.py
hash: md5 hash: md5
md5: c9f030df733e318b80d1fa91b7732f79 md5: 896d3d88a4a9f68d174efe71dc089517
size: 5132 size: 4222
params: params:
configs/settings.yaml: configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns: default.feature_processor.feature_processor_config.drop_columns:
@ -14,13 +14,13 @@ stages:
- CARBON_CHANGE - CARBON_CHANGE
- RDSAP_CHANGE - RDSAP_CHANGE
- HEAT_DEMAND_ENDING - HEAT_DEMAND_ENDING
- CARBON_ENDING - SAP_ENDING
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING default.feature_processor.feature_processor_config.target: CARBON_ENDING
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -29,20 +29,20 @@ stages:
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
size: 33881619 size: 32943109
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
deps: deps:
- path: 2_build_model.py - path: 2_build_model.py
hash: md5 hash: md5
md5: 84699d208874c52accaff61c6af9bb0a md5: b824822475c222521516493e68eef9c5
size: 5359 size: 4149
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
size: 33881619 size: 32943109
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -58,37 +58,39 @@ stages:
output_filepath: ./data/model/allmodels/ output_filepath: ./data/model/allmodels/
problem_type: regression problem_type: regression
eval_metric: mean_squared_error eval_metric: mean_squared_error
time_limit: 4000 time_limit: 400
presets: medium_quality presets: medium_quality
excluded_model_types: excluded_model_types:
- KNN - KNN
- RF - RF
infer_limit: 0.05
infer_limit_batch_size: 10000
outs: outs:
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir md5: dee1a60e6a9f4695272da8127196f714.dir
size: 473398662 size: 326732699
nfiles: 27 nfiles: 24
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: 2bb16ac67de8778fbc08171d562b34d5 md5: 1fefa99c7bc50d09c31bf175d5b9ee9c
size: 184 size: 226
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
- path: 3_generate_predictions.py - path: 3_generate_predictions.py
hash: md5 hash: md5
md5: 5ef2856a5a977304f1ec01f9b4205262 md5: 0a70ad4dfe99414a75d1261c75a177b9
size: 3028 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir md5: dee1a60e6a9f4695272da8127196f714.dir
size: 473398662 size: 326732699
nfiles: 27 nfiles: 24
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
size: 33881619 size: 32943109
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -100,25 +102,25 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir md5: d2da3b713811952b66e2c5f8c95f5407.dir
size: 374532 size: 410646
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
deps: deps:
- path: 4_generate_metrics.py - path: 4_generate_metrics.py
hash: md5 hash: md5
md5: 2c9fb78955a8c19cff0a098976f81d1b md5: d09a80dd55f1f69e2a832b1991b3c406
size: 4487 size: 3485
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir md5: d2da3b713811952b66e2c5f8c95f5407.dir
size: 374532 size: 410646
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir
size: 33881619 size: 32943109
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -128,15 +130,15 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 2e13ae67759a64261d03224f1c0d4bf4 md5: 4ed2edc06b4dad3c094a2d1be374a5de
size: 185 size: 224
startup_cleanup: startup_cleanup:
cmd: python 0_startup_cleanup.py cmd: python 0_startup_cleanup.py
deps: deps:
- path: 0_startup_cleanup.py - path: 0_startup_cleanup.py
hash: md5 hash: md5
md5: fbb7e3b1b98b517c870f3e1df3e7f695 md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1676 size: 1220
params: params:
configs/settings.yaml: configs/settings.yaml:
default.startup_cleanup.artefacts: ./data default.startup_cleanup.artefacts: ./data

View file

@ -20,23 +20,17 @@ def generate_predictions(
For a given model, we generate prediction and evaluate this against the true target For a given model, we generate prediction and evaluate this against the true target
""" """
logger.info("-------------------------")
logger.info("--- Loading test data ---") logger.info("--- Loading test data ---")
logger.info("-------------------------")
test_data = input_dataclient.load_data( test_data = input_dataclient.load_data(
location=test_data_filepath, load_config=None location=test_data_filepath, load_config=None
) )
logger.info("---------------------")
logger.info("--- Loading model ---") logger.info("--- Loading model ---")
logger.info("---------------------")
model.load_model(model_filepath) model.load_model(model_filepath)
logger.info("------------------------------")
logger.info("--- Generating predictions ---") logger.info("--- Generating predictions ---")
logger.info("------------------------------")
prediction_data = ( prediction_data = (
test_data.drop(columns=target) if target in test_data.columns else test_data test_data.drop(columns=target) if target in test_data.columns else test_data
@ -46,9 +40,7 @@ def generate_predictions(
data=prediction_data, post_prediction_logic=post_prediction_logic data=prediction_data, post_prediction_logic=post_prediction_logic
) )
logger.info("--------------------------")
logger.info("--- Saving predictions ---") logger.info("--- Saving predictions ---")
logger.info("--------------------------")
predictions_df = pd.DataFrame(predictions) predictions_df = pd.DataFrame(predictions)
predictions_df.columns = [predictions_column_name] predictions_df.columns = [predictions_column_name]