Created script to create sample input file

2026-06-08 11:17:27 +00:00 · 2023-10-11 13:19:23 +08:00 · 2023-10-11 13:19:23 +08:00 · fec6fab42a
commit fec6fab42a
parent b2142a7f8e
5 changed files with 104 additions and 3 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/etl/testing_data/init.py
+++ b/etl/testing_data/init.py
--- a/etl/testing_data/engine_inputs.py
+++ b/etl/testing_data/engine_inputs.py
@ -0,0 +1,71 @@
+"""
+This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
+testing
+"""
+import os
+
+import numpy as np
+import pandas as pd
+from epc_api.client import EpcClient
+from utils.s3 import save_csv_to_s3
+
+FILE_SIZE = 100
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
+USER_ID = 2
+PORTFOLIO_ID = 47
+
+
+def app():
+    """
+    Build the sample input CSV, upload it to S3 and return a request body that references it.
+
+    The rows of input_property_list.csv are kept and enough EPC search results are requested to
+    reach FILE_SIZE: 80% from bands G-D and 20% from bands C-A, spread evenly within each group.
+
+    Returns:
+        dict: Body pointing at the uploaded file; this script itself does not send it anywhere.
+
+    Raises:
+        RuntimeError: If save_csv_to_s3 reports that the upload failed.
+    """
+    starting_csv = pd.read_csv("input_property_list.csv")
+
+    remaining_files_to_sample = max(FILE_SIZE - len(starting_csv), 0)  # a larger seed file needs no top-up
+
+    # For the remaining addresses, 80% of them will be EPC D and below and the remaining 20% will be EPC C and above
+    n_epc_d_below = int(remaining_files_to_sample * 0.8)
+    n_epc_c_above = remaining_files_to_sample - n_epc_d_below
+
+    # Rounding every band up with np.ceil overshot FILE_SIZE; divmod gives the first `extra` bands one more row
+    band_sizes = {}
+    for bands, group_total in (("gfed", n_epc_d_below), ("cba", n_epc_c_above)):
+        share, extra = divmod(group_total, len(bands))
+        for position, band in enumerate(bands):
+            band_sizes[band] = share + (1 if position < extra else 0)
+
+    epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)
+
+    # Query band by band, G first and A last; bands that were allotted no rows are skipped
+    final_data = []
+    for band, size in band_sizes.items():
+        if size > 0:
+            final_data += epc_client.domestic.search(params={"energy-band": band}, size=size)["rows"]
+
+    final_csv_data = pd.DataFrame(
+        [{"address": x["address"], "postcode": x["postcode"], "Notes": None} for x in final_data],
+        columns=["address", "postcode", "Notes"],
+    )
+    final_csv_data = pd.concat([starting_csv, final_csv_data]).reset_index(drop=True)
+
+    # Store the data in s3; save_csv_to_s3 returns False on failure, so stop rather than reference a missing file
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/test_inputs.csv"
+    if not save_csv_to_s3(dataframe=final_csv_data, bucket_name="retrofit-plan-inputs-dev", file_name=filename):
+        raise RuntimeError(f"Failed to upload {filename} to S3")
+
+    return {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increase EPC",
+        "goal_value": "B",
+        "trigger_file_path": filename
+    }
--- a/utils/s3.py
+++ b/utils/s3.py
@ -1,5 +1,5 @@
 import boto3
-from io import BytesIO
+from io import BytesIO, StringIO
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 import pandas as pd
 from utils.logger import setup_logger
@ -113,3 +113,33 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key):
    df = pd.read_parquet(parquet_buffer)

    return df
+
+
+def save_csv_to_s3(dataframe, bucket_name, file_name):
+    """
+    Serialise a Pandas DataFrame as CSV and upload it to an S3 bucket.
+
+    Parameters:
+        dataframe (pd.DataFrame): Frame to write; its index is left out of the CSV.
+        bucket_name (str): Destination S3 bucket.
+        file_name (str): Object key to store the CSV under.
+
+    Returns:
+        bool: True when the upload succeeded, False when it raised any exception.
+    """
+    # S3 client created with boto3's default configuration
+    client = boto3.client('s3')
+
+    # Render the CSV into an in-memory text buffer and take its contents as one string
+    buffer = StringIO()
+    dataframe.to_csv(buffer, index=False)
+    payload = buffer.getvalue()
+
+    try:
+        client.put_object(Body=payload, Bucket=bucket_name, Key=file_name)
+    except Exception as error:
+        # Failures are reported on stdout and signalled through the return value
+        print(f"An error occurred: {error}")
+        return False
+
+    return True