Model/etl/testing_data/engine_inputs.py
2023-10-11 20:42:04 +08:00

75 lines
2.5 KiB
Python

"""
This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
testing
"""
import os
import numpy as np
import pandas as pd
from epc_api.client import EpcClient
from utils.s3 import save_csv_to_s3
# Target total number of rows in the generated input CSV (starting rows + sampled rows).
FILE_SIZE = 100
# Auth token for the EPC API, taken from the environment; None when the variable is unset.
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")
# User and portfolio identifiers; together they form the S3 key prefix of the uploaded file.
USER_ID = 2
PORTFOLIO_ID = 47
def _split_evenly(total, parts):
    """Split ``total`` into ``parts`` non-negative integers that sum exactly to ``total``.

    Any remainder is handed out one unit at a time to the first buckets, so the
    buckets differ by at most one. A negative ``total`` is treated as zero.
    """
    base, extra = divmod(max(total, 0), parts)
    return [base + 1 if i < extra else base for i in range(parts)]


def app():
    """Build a test input CSV for the recommendation engine and upload it to S3.

    Steps:
      1. Read the seed rows from ``input_property_list.csv``.
      2. Top the list up towards ``FILE_SIZE`` rows with addresses returned by the
         EPC domestic search: 80% of the top-up from bands G/F/E/D and the
         remaining 20% from bands C/B/A, spread evenly within each group.
      3. Drop park homes from the sampled rows, append the rest to the seed rows
         and upload the result to the ``retrofit-plan-inputs-dev`` bucket under
         ``{USER_ID}/{PORTFOLIO_ID}/test_inputs.csv``.
      4. Print a dict describing the uploaded file (presumably the request body
         used to trigger the engine -- confirm against the consumer).

    Because park homes are removed after sampling, the final file can contain
    fewer than ``FILE_SIZE`` rows.
    """
    starting_csv = pd.read_csv("input_property_list.csv")

    # Clamp at zero: if the seed file already holds FILE_SIZE rows or more there is
    # nothing left to sample, and negative sizes must never reach the API.
    remaining_files_to_sample = max(FILE_SIZE - len(starting_csv), 0)

    # For the remaining addresses, 80% of them will be EPC D and below and the
    # remaining 20% will be EPC C and above.
    n_epc_d_below = int(remaining_files_to_sample * 0.8)
    n_epc_c_above = remaining_files_to_sample - n_epc_d_below

    # Per-band sample sizes. Rounding every band up independently would overshoot
    # the budget, so each group is split so that its parts sum exactly to its total.
    # Insertion order (g, f, e, d, c, b, a) is also the order of the final rows.
    band_sizes = dict(zip("gfed", _split_evenly(n_epc_d_below, 4)))
    band_sizes.update(zip("cba", _split_evenly(n_epc_c_above, 3)))

    epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    # Combine the final data
    final_data = []
    for band, size in band_sizes.items():
        if size == 0:
            # Nothing requested for this band; skip the pointless API round-trip.
            continue
        result = epc_client.domestic.search(params={"energy-band": band}, size=size)
        final_data.extend(result["rows"])

    # TODO: For the moment, don't use park homes
    final_csv_data = pd.DataFrame(
        [
            {"address": row["address"], "postcode": row["postcode"], "Notes": None}
            for row in final_data
            if row["property-type"] != "Park home"
        ]
    )
    final_csv_data = pd.concat([starting_csv, final_csv_data]).reset_index(drop=True)

    # Store the data in s3
    filename = f"{USER_ID}/{PORTFOLIO_ID}/test_inputs.csv"
    save_csv_to_s3(
        dataframe=final_csv_data,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename,
    )

    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Social",
        "goal": "Increase EPC",
        "goal_value": "B",
        "trigger_file_path": filename,
    }
    print(body)