Model/sfr/principal_pitch/1_prepare_data.py
2025-06-25 14:08:22 +01:00

124 lines
4 KiB
Python

"""
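This script prepares the data for the principal pitch modelling.

It reads the candidate property list, keeps the non-flat properties rated
EPC D or below, pulls their EPC data, non-invasive recommendations and
patches, uploads the asset list, recommendations and patches to S3 as CSVs,
and prints the `body` dict that references the uploaded files.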
"""
import os
import pandas as pd
from dotenv import load_dotenv
from utils.s3 import save_csv_to_s3
from etl.find_my_epc.AssetListEpcData import AssetListEpcData
# Load environment variables from backend/.env (path is relative to the
# current working directory) before reading the EPC token from them.
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Identifiers used to build the S3 keys and the plan request further down
USER_ID = 8
PORTFOLIO_ID = 206

# Target EPC rating
EPC_TARGET = "C"
# Read the input file
# NOTE(review): this is an absolute path on one developer's machine - the
# script only runs where this workbook exists.
properties = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/Birmingham_price_top300.xlsx"
)

# Keep just the D's and below, and focus on houses: a row is dropped when
# either property-type column marks it as a flat. The three conditions are
# combined into a single mask and the selection is copied once.
rated_d_or_below = properties["current_energy_rating"].isin(["D", "E", "F", "G"])
not_flat_std = properties["property_type_std"] != "Flat"
not_flat_raw = properties["property_type"] != "flat"
properties = properties[rated_d_or_below & not_flat_std & not_flat_raw].copy()

# Rename the key columns
column_renames = {
    "address1": "address",
    "number_of_bathrooms": "n_bathrooms",
    "num_beds": "n_bedrooms",
}
properties = properties.rename(columns=column_renames)

# Every remaining row is flagged with patch=True
properties["patch"] = True
# Pull the non-invasive recommendations
asset_list_epc_client = AssetListEpcData(asset_list=properties, epc_auth_token=EPC_AUTH_TOKEN)

# The fetch steps run in this fixed order: data, recommendations, patches
asset_list_epc_client.get_data()
asset_list_epc_client.get_non_invasive_recommendations()
asset_list_epc_client.get_patch()

# Tabulate what the client collected so it can be filtered and merged below
epc_df = pd.DataFrame(data=asset_list_epc_client.epc_data)
extracted_df = pd.DataFrame(data=asset_list_epc_client.extracted_data)
# Find examples where patches are different to the api
#
# For every patch, look up the matching row (by UPRN) in the extracted data
# and in the EPC data and record both efficiency scores side by side.
compare_columns = ["uprn", "address", "postcode", "api_epc", "fme_epc"]
compare_rows = []
for patch in asset_list_epc_client.patches:
    uprn = patch["uprn"]
    # Take the first matching row explicitly. `.squeeze()` only yields a row
    # when exactly one row matches; with duplicate UPRNs it returns a
    # DataFrame and the int() conversions below fail. A UPRN with no match in
    # either frame still raises (IndexError) rather than being silently dropped.
    extracted = extracted_df[extracted_df["uprn"] == uprn].iloc[0]
    epc = epc_df[epc_df["uprn"] == uprn].iloc[0]
    compare_rows.append(
        {
            "uprn": extracted["uprn"],
            "address": extracted["address"],
            "postcode": extracted["postcode"],
            "api_epc": int(extracted["current_epc_efficiency"]),
            "fme_epc": int(epc["current-energy-efficiency"]),
        }
    )

# Declare the columns up front so the frame has them even when there are no
# patches; otherwise the comparison below raises KeyError on an empty frame.
compare_epc = pd.DataFrame(compare_rows, columns=compare_columns)
diff = compare_epc[compare_epc["api_epc"] != compare_epc["fme_epc"]]
# Compare matched addresses to make sure they are the same
# The EPC-side columns are prefixed with "epc_" so both versions of the
# address and postcode sit side by side after the join on UPRN.
epc_addresses = epc_df[["uprn", "address1", "postcode"]].rename(
    columns={"address1": "epc_address1", "postcode": "epc_postcode"}
)
compare_addresses = pd.merge(
    extracted_df[["address", "postcode", "uprn"]],
    epc_addresses,
    how="left",
    on=["uprn"],
)
# Add on uprn
# Left join keeps every property row; the UPRN is looked up by the
# (address, postcode) pair taken from the extracted data.
uprn_lookup = extracted_df[["address", "postcode", "uprn"]]
properties = pd.merge(
    properties,
    uprn_lookup,
    how="left",
    on=["address", "postcode"],
)
# All inputs are written to the same bucket under "<user id>/<portfolio id>/"
input_bucket = "retrofit-plan-inputs-dev"
s3_prefix = f"{USER_ID}/{PORTFOLIO_ID}"

# Store the asset list in s3
filename = f"{s3_prefix}/asset_list.csv"
save_csv_to_s3(dataframe=properties, bucket_name=input_bucket, file_name=filename)

# Store non-invasive recommendations in S3
non_invasive_recommendations_filename = f"{s3_prefix}/non_invasive_recommendations.csv"
recommendations_df = pd.DataFrame(asset_list_epc_client.non_invasive_recommendations)
save_csv_to_s3(
    dataframe=recommendations_df,
    bucket_name=input_bucket,
    file_name=non_invasive_recommendations_filename,
)

# Store patches in S3
# The patches file is only written when there is at least one patch;
# otherwise the path stays an empty string.
has_patches = bool(asset_list_epc_client.patches)
patches_filename = f"{s3_prefix}/patches.csv" if has_patches else ""
if has_patches:
    patches_df = pd.DataFrame(asset_list_epc_client.patches)
    save_csv_to_s3(dataframe=patches_df, bucket_name=input_bucket, file_name=patches_filename)
# Assemble the plan parameters that reference the files uploaded above and
# print them. The goal value and scenario name are derived from EPC_TARGET so
# the target rating is defined in one place only.
body = {
    "portfolio_id": str(PORTFOLIO_ID),
    "housing_type": "Private",
    "goal": "Increasing EPC",
    "goal_value": EPC_TARGET,
    "trigger_file_path": filename,
    "already_installed_file_path": "",
    # Empty string when no patches file was uploaded
    "patches_file_path": patches_filename,
    "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
    "valuation_file_path": "",
    "scenario_name": f"EPC {EPC_TARGET}",
    "multi_plan": True,
    "budget": None,
    "ashp_cop": 3.5,
    # This is new - when optimising, we drop scores by a few points to account for SAP 10
    "simulate_sap_10": True,
    "exclusions": ["external_wall_insulation"],
    "required_measures": ["cavity_wall_insulation", "loft_insulation"],
}
print(body)