diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py deleted file mode 100644 index df60298b..00000000 --- a/etl/bill_savings/training.py +++ /dev/null @@ -1,57 +0,0 @@ -from pprint import pprint -import msgpack -from utils.s3 import read_from_s3 -from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel - - -def handler(): - """ - This function is used to train the model and store the final models in s3 as pickles - :return: - """ - - dataset_version = "2024-07-08" - - # Usage: - cleaned = read_from_s3( - s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-dev" - ) - - cleaned = msgpack.unpackb(cleaned, raw=False) - - model = EnergyConsumptionModel(cleaned=cleaned, n_jobs=2) - model.read_dataset(f'energy_consumption/{dataset_version}/energy_consumption_dataset.parquet') - model.feature_engineering() - model.save_dummy_schema(dataset_version=dataset_version) - - # For heating_kwh - model.split_dataset(target='heating_kwh') - model.fit_model(target='heating_kwh') - model.re_train_final_model(target='heating_kwh') - evaluation_results = model.evaluate_model(target='heating_kwh') - - pprint(evaluation_results["train"]) - pprint(evaluation_results["test"]) - - model.save_model(target='heating_kwh', dataset_version=dataset_version) - - # importance_df = evaluation_results["train"]["Feature Importance"] - # testing_predictions = model.testing_predictions["heating_kwh"] - # testing_predictions = testing_predictions.sort_values("residual", ascending=False) - # training_predictions = model.training_predictions["heating_kwh"] - # training_predictions = training_predictions.sort_values("residual", ascending=False) - # # Merge on model.input_data, by the index - # merged_data = testing_predictions.merge(model.input_data, left_index=True, right_index=True) - # merged_data_train = training_predictions.merge(model.input_data, left_index=True, right_index=True) - - # For hot_water_kwh - model.split_dataset(target='hot_water_kwh') - model.fit_model(target='hot_water_kwh') - model.re_train_final_model(target='hot_water_kwh') - evaluation_results = model.evaluate_model(target='hot_water_kwh') - - pprint(evaluation_results["train"]) - pprint(evaluation_results["test"]) - - model.save_model(target='hot_water_kwh', dataset_version=dataset_version)