diff --git a/.idea/Model.iml b/.idea/Model.iml index 4413bb06..b0f9c00d 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..ca0e1cd9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/etl/testing_data/__init__.py b/etl/testing_data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/testing_data/engine_inputs.py b/etl/testing_data/engine_inputs.py new file mode 100644 index 00000000..507208e3 --- /dev/null +++ b/etl/testing_data/engine_inputs.py @@ -0,0 +1,71 @@ +""" +This script will create an input csv for the recommendation engine and upload it to S3, which can be used for +testing +""" +import os + +import numpy as np +import pandas as pd +from epc_api.client import EpcClient +from utils.s3 import save_csv_to_s3 + +FILE_SIZE = 100 +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) +USER_ID = 2 +PORTFOLIO_ID = 47 + + +def app(): + starting_csv = pd.read_csv("input_property_list.csv") + + remaining_files_to_sample = FILE_SIZE - len(starting_csv) + + # For the remaining addresses, 80% of them will be EPC D and below and the remaining 20% will be EPC A and above + n_epc_d_below = int(remaining_files_to_sample * 0.8) + n_epc_c_above = remaining_files_to_sample - n_epc_d_below + + n_g = int(np.ceil(n_epc_d_below / 4)) + n_f = int(np.ceil(n_epc_d_below / 4)) + n_e = int(np.ceil(n_epc_d_below / 4)) + n_d = int(np.ceil(n_epc_d_below / 4)) + n_c = int(np.ceil(n_epc_c_above / 3)) + n_b = int(np.ceil(n_epc_c_above / 3)) + n_a = int(np.ceil(n_epc_c_above / 3)) + + epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN) + + g_data = epc_client.domestic.search(params={"energy-band": "g"}, size=n_g) + f_data = epc_client.domestic.search(params={"energy-band": "f"}, size=n_f) + e_data = epc_client.domestic.search(params={"energy-band": "e"}, size=n_e) + d_data = epc_client.domestic.search(params={"energy-band": "d"}, size=n_d) + c_data = epc_client.domestic.search(params={"energy-band": "c"}, size=n_c) + b_data = epc_client.domestic.search(params={"energy-band": "b"}, size=n_b) + a_data = epc_client.domestic.search(params={"energy-band": "a"}, size=n_a) + + # Combine the final data + final_data = ( + g_data["rows"] + f_data["rows"] + e_data["rows"] + d_data["rows"] + c_data["rows"] + b_data["rows"] + + a_data["rows"] + ) + + final_csv_data = pd.DataFrame( + [{"address": x["address"], "postcode": x["postcode"], "Notes": None} for x in final_data] + ) + + final_csv_data = pd.concat([starting_csv, final_csv_data]).reset_index(drop=True) + + # Store the data in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/test_inputs.csv" + save_csv_to_s3( + dataframe=final_csv_data, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increase EPC", + "goal_value": "B", + "trigger_file_path": filename + } diff --git a/utils/s3.py b/utils/s3.py index 7414da3f..e63b7192 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -1,5 +1,5 @@ import boto3 -from io import BytesIO +from io import BytesIO, StringIO from botocore.exceptions import NoCredentialsError, PartialCredentialsError import pandas as pd from utils.logger import setup_logger @@ -113,3 +113,33 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key): df = pd.read_parquet(parquet_buffer) return df + + +def save_csv_to_s3(dataframe, bucket_name, file_name): + """ + Save a Pandas DataFrame to a CSV file in an S3 bucket. + + Parameters: + dataframe (pd.DataFrame): The Pandas DataFrame to save. + bucket_name (str): The name of the S3 bucket. + file_name (str): The name of the file to save in the S3 bucket. + + Returns: + bool: True if the file was successfully saved, False otherwise. + """ + # Initialize S3 client + s3 = boto3.client('s3') + + # Create an in-memory text stream + csv_buffer = StringIO() + + # Save DataFrame to buffer + dataframe.to_csv(csv_buffer, index=False) + + # Upload buffer contents to S3 + try: + s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=file_name) + return True + except Exception as e: + print(f"An error occurred: {e}") + return False