Model/etl/sfr/epc_average_by_postcode.py
Khalim Conn-Kowlessar 603b3e1db2 Stonewater WIP
2024-07-23 17:07:22 +01:00

80 lines
2.9 KiB
Python

import os
from tqdm import tqdm
import pandas as pd
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from backend.app.utils import sap_to_epc
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def app():
"""
This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score
:return:
"""
source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
source_file["row_id"] = source_file.index
# Split out the town, which is the final portion of the string, separated by commas
source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
source_file["Address"] = source_file["Address"].apply(
lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
)
unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
# for each postcode, pull EPC data
collected_data = []
no_data_found = []
no_data_after_filters = []
for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
searcher = SearchEpc(
postcode=config["Postcode"],
address1=address1,
auth_token=EPC_AUTH_TOKEN,
os_api_key=""
)
while True:
params = {
"postcode": config["Postcode"],
"address": address1,
}
results = searcher.client.domestic.search(params=params, size=10000)
if not results:
# We strip back address1
address1 = " ".join(address1.split(" ")[:-1])
if not address1:
break
else:
break
if not results:
no_data_found.append(config)
continue
data = pd.DataFrame(results["rows"])
data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
# Take EPCs post 2023
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
data = data[data["lodgement-date"] >= "2023-01-01"]
# Take private nrentals
data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
if data.empty:
no_data_after_filters.append(config)
continue
agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
agg.insert(0, "Postcode", config["Postcode"])
agg.insert(0, "Address", address1)
collected_data.append(agg)
collected_df = pd.concat(collected_data)
collected_df.to_csv("EPC Averages SFR.csv", index=False)