diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py
index 345f0afe..c1b562ea 100644
--- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py
+++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py
@@ -1,10 +1,20 @@
 import os
 import pandas as pd
 import numpy as np
+from tqdm import tqdm
 from dotenv import load_dotenv
 from urllib.parse import urlencode
 
 from epc_api.client import EpcClient
+from utils.logger import setup_logger
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+from recommendations.recommendation_utils import (
+    estimate_perimeter,
+    estimate_external_wall_area,
+    estimate_number_of_floors
+)
+
+logger = setup_logger()
 
 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@@ -64,6 +74,134 @@ ROOF_DESCRIPTIONS = [
 SOCIAL_TENURES = ["Rented (social)", "rental (social)"]
 
 
+def process_postcode_epcs(postcode, client):
+    """
+    Fetch the EPCs lodged for a postcode and keep the newest record per property.
+
+    :param postcode: postcode passed to the domestic EPC search
+    :param client: EPC API client whose ``domestic`` endpoint performs the search
+    :return: DataFrame with one row per UPRN (or per address where the UPRN is missing);
+        an empty DataFrame when the search returns no rows
+    """
+    params = {"postcode": postcode}
+    url = os.path.join(client.domestic.host, "search") + "?" + urlencode({"size": 1000})
+    response = client.domestic.call(method="get", url=url, params=params)
+
+    # Nothing to de-duplicate when the search comes back empty; return early rather than
+    # fail on the missing "uprn" column below
+    rows = response["rows"] if response else []
+    if not rows:
+        return pd.DataFrame()
+    postcode_epcs = pd.DataFrame(rows)
+
+    # Fall back to the address as the property key when there is no UPRN. A blank string is
+    # treated as missing too, as it already is for "photo-supply" and "floor-height" in this file
+    uprn = postcode_epcs["uprn"]
+    postcode_epcs["uprn"] = np.where(
+        pd.isnull(uprn) | (uprn == ""),
+        postcode_epcs["address"],
+        uprn
+    )
+    # Newest lodgement first, so keeping the first duplicate keeps the latest EPC
+    postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False)
+    postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first")
+    return postcode_epcs
+
+
+def filter_and_prepare_epcs(epcs):
+    """
+    Flag cavity-wall and solar/loft candidates, then drop unflagged and social-tenure records.
+
+    :param epcs: DataFrame of EPC records using the register's hyphenated column names
+    :return: filtered DataFrame with the "Is Cavity Property" and "Solar and Loft" flags added
+    """
+    # Work on a copy so the caller's frame does not silently gain the flag columns
+    epcs = epcs.copy()
+    # The score is checked against two thresholds, so convert it once
+    sap_score = epcs["current-energy-efficiency"].astype(int)
+
+    epcs["Is Cavity Property"] = epcs["walls-description"].isin(CAVITY_WALL_DESCRIPTIONS) & (
+        sap_score <= 72
+    )
+    epcs["Solar and Loft"] = (
+        epcs["roof-description"].isin(ROOF_DESCRIPTIONS)
+    ) & (
+        epcs["photo-supply"].isin(["0", "", "0.0"])
+    ) & (
+        sap_score <= 68
+    )
+    epcs = epcs[epcs["Is Cavity Property"] | epcs["Solar and Loft"]]
+    epcs = epcs[~epcs["tenure"].isin(SOCIAL_TENURES)]
+    return epcs
+
+
+def rename_and_add_columns(epcs):
+    """
+    Give the EPC columns report-friendly names and add the estimated geometry columns.
+
+    :param epcs: DataFrame of EPC records using the register's hyphenated column names
+    :return: renamed DataFrame with the floor count, perimeter, wall area and roof insulation
+        columns added; the perimeter and wall area are None where the floor count is unknown
+    """
+    epcs = epcs.rename(
+        columns={
+            "address": "Address",
+            "postcode": "Postcode",
+            "inspection-date": "Date of last EPC",
+            "current-energy-efficiency": "SAP score on register",
+            "current-energy-rating": "EPC rating on register",
+            "property-type": "Property Type",
+            "built-form": "Archetype",
+            "total-floor-area": "Property Floor Area",
+            "construction-age-band": "Property Age Band",
+            "floor-height": "Property Floor Height",
+            "number-habitable-rooms": "Number of Habitable Rooms",
+            "walls-description": "Wall Construction",
+            "roof-description": "Roof Construction",
+            "mainheat-description": "Heating Type",
+            "secondheat-description": "Secondary Heating",
+            "transaction-type": "Reason for last EPC",
+            "energy-consumption-current": "Heat Demand (kWh/m2)",
+            "tenure": "Tenure"
+        }
+    )
+
+    # The derived columns are built as plain lists: DataFrame.apply(axis=1) can hand back a
+    # DataFrame instead of a Series when the frame is empty, which cannot be assigned to a column
+    floors = [
+        estimate_number_of_floors(property_type) if pd.notnull(property_type) else None
+        for property_type in epcs["Property Type"]
+    ]
+    epcs["Estimated Number of Floors"] = floors
+
+    # Other register fields in this file are handled as strings, so coerce before dividing
+    floor_areas = pd.to_numeric(epcs["Property Floor Area"], errors="coerce")
+    habitable_rooms = pd.to_numeric(epcs["Number of Habitable Rooms"], errors="coerce")
+    # No estimate when the floor count is missing (or zero) rather than dividing by it
+    perimeters = [
+        estimate_perimeter(area / n_floors, rooms / n_floors) if n_floors else None
+        for area, rooms, n_floors in zip(floor_areas, habitable_rooms, floors)
+    ]
+    epcs["Estimated Perimeter (m)"] = perimeters
+
+    epcs["Estimated Heat Loss Perimeter (m2)"] = [
+        estimate_external_wall_area(
+            n_floors,
+            float(height) if height else 2.5,  # 2.5 is used when no floor height is recorded
+            perimeter,
+            archetype
+        ) if n_floors else None
+        for n_floors, height, perimeter, archetype in zip(
+            floors, epcs["Property Floor Height"], perimeters, epcs["Archetype"]
+        )
+    ]
+    epcs["Roof Insulation Thickness"] = [
+        RoofAttributes(description=roof).process()["insulation_thickness"] if pd.notnull(roof) else None
+        for roof in epcs["Roof Construction"]
+    ]
+    return epcs
+
+
 def main():
     """
     This application is used to identify additional units that are private rentals or owner occupies that can be
@@ -73,7 +211,13 @@ def main():
     - An excel file that contains one or many tabs that include the addresses to be visited
     """
 
+    # This should be set (it must end in .xlsx, the only extension the xlsxwriter engine accepts):
+    output_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024.xlsx"
+    client = EpcClient(auth_token=EPC_AUTH_TOKEN)
+    writer = pd.ExcelWriter(output_filepath, engine="xlsxwriter")
+
     for config in CONFIG:
+        logger.info("Processing %s", config["tab"])
         # Read in the data
         route_march_addresses = pd.read_excel(
             config["filepath"],
@@ -84,39 +228,18 @@ def main():
         postcodes = route_march_addresses[config["postcode_column"]].unique()
 
         epcs = []
-        for postcode in postcodes:
-            # Get the EPCs in this postcode
-
-            params = {"postcode": postcode}
-            client = EpcClient(auth_token=EPC_AUTH_TOKEN)
-            url = os.path.join(client.domestic.host, "search")
-            url += "?" + urlencode({k: v for k, v in {"size": 1000}.items() if v})
-            response = client.domestic.call(method="get", url=url, params=params)
-
-            postcode_epcs = pd.DataFrame(response["rows"])
-            # Get the newest EPC, per UPRN
-            postcode_epcs["uprn"] = np.where(
-                pd.isnull(postcode_epcs["uprn"]),
-                postcode_epcs["address"],
-                postcode_epcs["uprn"]
-            )
-            postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False)
-            postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first")
-
-            postcode_epcs["Is Cavity Property"] = postcode_epcs["walls-description"].isin(
-                CAVITY_WALL_DESCRIPTIONS
-            ) & (postcode_epcs["current-energy-efficiency"].astype(int) <= 72)
-
-            postcode_epcs["Solar and Loft"] = (postcode_epcs["roof-description"].isin(ROOF_DESCRIPTIONS)) & (
-                postcode_epcs["photo-supply"].isin(["0", "", "0.0"])) & (
-                postcode_epcs["current-energy-efficiency"].astype(int) <= 68
-            )
-
-            postcode_epcs = postcode_epcs[postcode_epcs["Is Cavity Property"] | postcode_epcs["Solar and Loft"]]
-
-            # Remove any social properties
-            postcode_epcs = postcode_epcs[~postcode_epcs["tenure"].isin(SOCIAL_TENURES)]
-
+        for postcode in tqdm(postcodes):
+            postcode_epcs = process_postcode_epcs(postcode, client)
             epcs.append(postcode_epcs)
 
+        # Concatenate all postcodes' data and filter it
         epcs = pd.concat(epcs)
+        epcs = filter_and_prepare_epcs(epcs)
+        epcs = rename_and_add_columns(epcs)
+
+        sheet_name = config["tab"][:31]  # Excel sheet names max length of 31 characters
+        epcs.to_excel(writer, sheet_name=sheet_name, index=False)
+
+    # Save and close the writer outside the loop
+    writer.close()
+    logger.info("Data successfully written to %s", output_filepath)
diff --git a/etl/route_march/oo_prs_additional_units/requirements.txt b/etl/route_march/oo_prs_additional_units/requirements.txt
index fd763a3b..e2f4832c 100644
--- a/etl/route_march/oo_prs_additional_units/requirements.txt
+++ b/etl/route_march/oo_prs_additional_units/requirements.txt
@@ -6,4 +6,5 @@ usaddress==0.5.11
 fuzzywuzzy==0.18.0
 boto3==1.35.44
 python-dotenv
-tqdm
\ No newline at end of file
+tqdm
+xlsxwriter
\ No newline at end of file