Refactoring PRS and OO data pulls

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-05 14:19:17 +00:00
parent cb4b597272
commit 2f930e3fa2
2 changed files with 113 additions and 34 deletions

View file

@ -1,10 +1,20 @@
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from urllib.parse import urlencode
from epc_api.client import EpcClient
from utils.logger import setup_logger
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from recommendations.recommendation_utils import (
estimate_perimeter,
estimate_external_wall_area,
estimate_number_of_floors
)
# Module-level logger used for the progress messages emitted by this script.
logger = setup_logger()
# Load environment variables from the backend .env file before reading the token.
load_dotenv(dotenv_path="backend/.env")
# Auth token handed to EpcClient; os.getenv returns None when the variable is unset.
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -64,6 +74,89 @@ ROOF_DESCRIPTIONS = [
# Tenure values treated as social housing; rows with these tenures are dropped
# by filter_and_prepare_epcs.
SOCIAL_TENURES = ["Rented (social)", "rental (social)"]
def process_postcode_epcs(postcode, client):
    """Fetch the EPCs for a postcode and keep the newest certificate per property.

    Args:
        postcode: Postcode to search for; sent as the ``postcode`` query
            parameter.
        client: EPC API client. ``client.domestic.host`` and
            ``client.domestic.call`` are used to query the domestic search
            endpoint.

    Returns:
        A DataFrame with one row per property, holding the certificate with
        the latest ``lodgement-date``. Rows with no UPRN are keyed on their
        address instead. An empty DataFrame is returned when the search
        yields no rows.
    """
    params = {"postcode": postcode}
    # Join with "/" explicitly: os.path.join is OS-dependent and would build
    # the URL with "\" on Windows.
    url = client.domestic.host.rstrip("/") + "/search?" + urlencode({"size": 1000})
    response = client.domestic.call(method="get", url=url, params=params)
    postcode_epcs = pd.DataFrame(response["rows"])

    if postcode_epcs.empty:
        # No certificates for this postcode: the frame has no columns, so the
        # column lookups below would raise KeyError.
        return postcode_epcs

    # Fall back to the address as the de-duplication key when the UPRN is missing.
    postcode_epcs["uprn"] = np.where(
        pd.isnull(postcode_epcs["uprn"]),
        postcode_epcs["address"],
        postcode_epcs["uprn"],
    )

    # Newest lodgement first, then keep the first row seen for each property.
    postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False)
    postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first")
    return postcode_epcs
def filter_and_prepare_epcs(epcs):
    """Flag candidate properties and drop the rows that are not of interest.

    Two boolean columns are added:

    - ``"Is Cavity Property"``: the wall description is in
      ``CAVITY_WALL_DESCRIPTIONS`` and the SAP score is at most 72.
    - ``"Solar and Loft"``: the roof description is in ``ROOF_DESCRIPTIONS``,
      ``photo-supply`` is one of ``"0"``, ``""`` or ``"0.0"``, and the SAP
      score is at most 68.

    Rows matching neither flag are removed, as are rows whose tenure is in
    ``SOCIAL_TENURES``.

    Args:
        epcs: DataFrame of EPC rows using the raw API column names.

    Returns:
        A new, filtered DataFrame; the frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame does not gain the flag columns.
    epcs = epcs.copy()

    # Convert the score once. Blank or non-numeric scores become NaN, which
    # fails both threshold comparisons instead of raising as astype(int) did.
    sap_score = pd.to_numeric(epcs["current-energy-efficiency"], errors="coerce")

    epcs["Is Cavity Property"] = epcs["walls-description"].isin(CAVITY_WALL_DESCRIPTIONS) & (
        sap_score <= 72
    )
    epcs["Solar and Loft"] = (
        epcs["roof-description"].isin(ROOF_DESCRIPTIONS)
        & epcs["photo-supply"].isin(["0", "", "0.0"])
        & (sap_score <= 68)
    )

    epcs = epcs[epcs["Is Cavity Property"] | epcs["Solar and Loft"]]
    # Remove any social properties.
    epcs = epcs[~epcs["tenure"].isin(SOCIAL_TENURES)]
    return epcs
def _as_float(value, default=None):
    """Return ``value`` as a float, or ``default`` if it is missing, blank or non-numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return default if pd.isnull(number) else number


def _estimated_perimeter(row):
    """Estimate a row's perimeter, or return None when the floor count is unusable."""
    floors = _as_float(row["Estimated Number of Floors"])
    if not floors:
        # Missing or zero floor count: the per-floor figures cannot be derived.
        return None
    # Area and room counts may arrive as strings; convert before dividing.
    # Missing values are passed on as NaN.
    area = _as_float(row["Property Floor Area"], default=float("nan"))
    rooms = _as_float(row["Number of Habitable Rooms"], default=float("nan"))
    return estimate_perimeter(area / floors, rooms / floors)


def _estimated_wall_area(row):
    """Estimate a row's external wall area, or return None when inputs are missing."""
    floors = row["Estimated Number of Floors"]
    perimeter = row["Estimated Perimeter (m)"]
    if pd.isnull(floors) or pd.isnull(perimeter):
        return None
    raw_height = row["Property Floor Height"]
    if pd.isnull(raw_height) or not raw_height:
        # Missing, blank or zero height: fall back to 2.5. NaN is truthy, so it
        # has to be tested explicitly.
        height = 2.5
    else:
        height = _as_float(raw_height, default=2.5)
    return estimate_external_wall_area(floors, height, perimeter, row["Archetype"])


def rename_and_add_columns(epcs):
    """Rename EPC columns to their report headings and add derived columns.

    The derived columns are:

    - ``"Estimated Number of Floors"``: ``estimate_number_of_floors`` applied
      to the property type.
    - ``"Estimated Perimeter (m)"``: ``estimate_perimeter`` applied to the
      per-floor area and per-floor habitable room count.
    - ``"Estimated Heat Loss Perimeter (m2)"``: the result of
      ``estimate_external_wall_area``, using a floor height of 2.5 when none
      is recorded.
    - ``"Roof Insulation Thickness"``: the ``"insulation_thickness"`` entry
      produced by ``RoofAttributes`` for the roof description.

    A derived value is None when the inputs it needs are missing.

    Args:
        epcs: DataFrame of EPC rows using the raw API column names.

    Returns:
        A new DataFrame with renamed and derived columns.
    """
    epcs = epcs.rename(
        columns={
            "address": "Address",
            "postcode": "Postcode",
            "inspection-date": "Date of last EPC",
            "current-energy-efficiency": "SAP score on register",
            "current-energy-rating": "EPC rating on register",
            "property-type": "Property Type",
            "built-form": "Archetype",
            "total-floor-area": "Property Floor Area",
            "construction-age-band": "Property Age Band",
            "floor-height": "Property Floor Height",
            "number-habitable-rooms": "Number of Habitable Rooms",
            "walls-description": "Wall Construction",
            "roof-description": "Roof Construction",
            "mainheat-description": "Heating Type",
            "secondheat-description": "Secondary Heating",
            "transaction-type": "Reason for last EPC",
            "energy-consumption-current": "Heat Demand (kWh/m2)",
            "tenure": "Tenure",
        }
    )

    if epcs.empty:
        # DataFrame.apply(axis=1) on an empty frame probes the function with a
        # dummy row and may return a DataFrame, which cannot be assigned to a
        # single column. Add the empty columns directly instead.
        for column in (
            "Estimated Number of Floors",
            "Estimated Perimeter (m)",
            "Estimated Heat Loss Perimeter (m2)",
            "Roof Insulation Thickness",
        ):
            epcs[column] = None
        return epcs

    epcs["Estimated Number of Floors"] = epcs.apply(
        lambda x: estimate_number_of_floors(x["Property Type"]) if pd.notnull(x["Property Type"]) else None,
        axis=1,
    )
    epcs["Estimated Perimeter (m)"] = epcs.apply(_estimated_perimeter, axis=1)
    epcs["Estimated Heat Loss Perimeter (m2)"] = epcs.apply(_estimated_wall_area, axis=1)
    epcs["Roof Insulation Thickness"] = epcs.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"]
        if pd.notnull(x["Roof Construction"])
        else None,
        axis=1,
    )
    return epcs
def main():
"""
This application is used to identify additional units that are private rentals or owner occupies that can be
@ -73,7 +166,13 @@ def main():
- An excel file that contains one or many tabs that include the addresses to be visited
"""
# This should be set:
output_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024"
client = EpcClient(auth_token=EPC_AUTH_TOKEN)
writer = pd.ExcelWriter(output_filepath, engine="xlsxwriter")
for config in CONFIG:
logger.info("Processing %s", config["tab"])
# Read in the data
route_march_addresses = pd.read_excel(
config["filepath"],
@ -84,39 +183,18 @@ def main():
postcodes = route_march_addresses[config["postcode_column"]].unique()
epcs = []
for postcode in postcodes:
# Get the EPCs in this postcode
params = {"postcode": postcode}
client = EpcClient(auth_token=EPC_AUTH_TOKEN)
url = os.path.join(client.domestic.host, "search")
url += "?" + urlencode({k: v for k, v in {"size": 1000}.items() if v})
response = client.domestic.call(method="get", url=url, params=params)
postcode_epcs = pd.DataFrame(response["rows"])
# Get the newest EPC, per UPRN
postcode_epcs["uprn"] = np.where(
pd.isnull(postcode_epcs["uprn"]),
postcode_epcs["address"],
postcode_epcs["uprn"]
)
postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False)
postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first")
postcode_epcs["Is Cavity Property"] = postcode_epcs["walls-description"].isin(
CAVITY_WALL_DESCRIPTIONS
) & (postcode_epcs["current-energy-efficiency"].astype(int) <= 72)
postcode_epcs["Solar and Loft"] = (postcode_epcs["roof-description"].isin(ROOF_DESCRIPTIONS)) & (
postcode_epcs["photo-supply"].isin(["0", "", "0.0"])) & (
postcode_epcs["current-energy-efficiency"].astype(int) <= 68
)
postcode_epcs = postcode_epcs[postcode_epcs["Is Cavity Property"] | postcode_epcs["Solar and Loft"]]
# Remove any social properties
postcode_epcs = postcode_epcs[~postcode_epcs["tenure"].isin(SOCIAL_TENURES)]
for postcode in tqdm(postcodes):
postcode_epcs = process_postcode_epcs(postcode, client)
epcs.append(postcode_epcs)
# Concatenate all postcodes' data and filter it
epcs = pd.concat(epcs)
epcs = filter_and_prepare_epcs(epcs)
epcs = rename_and_add_columns(epcs)
sheet_name = config["tab"][:31] # Excel sheet names max length of 31 characters
epcs.to_excel(writer, sheet_name=sheet_name, index=False)
# Save and close the writer outside the loop
writer.close()
logger.info("Data successfully written to %s", output_filepath)

View file

@ -6,4 +6,5 @@ usaddress==0.5.11
fuzzywuzzy==0.18.0
boto3==1.35.44
python-dotenv
tqdm
tqdm
xlsxwriter