Model/etl/bill_savings/training_data.py
2024-08-12 10:32:26 +01:00

24 lines
785 B
Python

import msgpack
from etl.bill_savings.KwhData import KwhData
from utils.s3 import read_from_s3
def app():
    """
    Build the bill-savings training data from files already stored in S3.

    Steps, in order:

    1. Fetch the cleaned EPC payload from S3 and decode it with msgpack.
    2. Create a ``KwhData`` client and call ``combine()`` on it.
    3. Call ``transform()`` with the client's own ``data``, the decoded
       cleaned records and ``save=True`` (presumably this persists the
       collated result back to S3 for analysis -- confirm in ``KwhData``).

    :return: None
    """
    # NOTE(review): the object key ends in ".bson" but the payload is decoded
    # as msgpack below -- confirm the extension is just a naming leftover.
    packed_epc = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev",
    )
    # raw=False asks msgpack to return str objects instead of raw bytes.
    epc_records = msgpack.unpackb(packed_epc, raw=False)

    # Known suspect input, should problematic data show up:
    # s3://retrofit-datalake-dev/energy_consumption_data/2024-08-10 18:48:06.866647.pkl
    # NOTE(review): this bucket differs from the one the cleaned EPC data is
    # read from ("retrofit-data-dev") -- verify that is intentional.
    kwh = KwhData(bucket="retrofit-datalake-dev")
    kwh.combine()
    # `kwh.data` is read only after combine() has run, as in the original flow.
    kwh.transform(data=kwh.data, cleaned=epc_records, save=True)