From cb4b59727202b5ae10726f94c7e97bbe414cf9ab Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 13:54:14 +0000 Subject: [PATCH 1/6] setting up route march script --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/SearchEpc.py | 3 + .../oo_prs_additional_units.py | 122 ++++++++++++++++++ .../oo_prs_additional_units/requirements.txt | 9 ++ 5 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py create mode 100644 etl/route_march/oo_prs_additional_units/requirements.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..0e963140 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..35513387 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index f9e978c6..2d658c04 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -2,6 +2,7 @@ import os import time import re +from urllib.parse import urlencode import usaddress import pandas as pd import numpy as np @@ -257,6 +258,8 @@ class SearchEpc: params = {"address": self.address1, "postcode": self.postcode} url = os.path.join(self.client.domestic.host, "search") + if size: + url += "?" + urlencode({k: v for k, v in {"size": size}.items() if v}) for retry in range(self.max_retries): try: diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py new file mode 100644 index 00000000..345f0afe --- /dev/null +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -0,0 +1,122 @@ +import os +import pandas as pd +import numpy as np +from dotenv import load_dotenv +from urllib.parse import urlencode +from epc_api.client import EpcClient + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +CONFIG = [ + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "SETTLE GBIS x 242 ", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "ACIS GBIS x 76", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "SOUTHERN GBIS x 150", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "COMMUNITY HOUSING GBIS x 199", + "postcode_column": "Postcode", + }, + { + "filepath": "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing " + "11.11.2024.xlsx", + "tab": "EASTLIGHT GBIS x 42", + "postcode_column": "Postcode", + }, +] + +CAVITY_WALL_DESCRIPTIONS = [ + "Cavity wall, as built, no insulation (assumed)", + "Cavity wall, as built, partial insulation (assumed)", + "Cavity wall, as built, insulated (assumed)", + "Cavity wall, with internal insulation", + "Cavity wall, with external insulation", +] + +ROOF_DESCRIPTIONS = [ + "Pitched, no insulation", + "Pitched, no insulation (assumed)", + "Pitched, 25 mm loft insulation", + "Pitched, 50 mm loft insulation", + "Pitched, 75 mm loft insulation", + "Pitched, 100 mm loft insulation", + "Pitched, 150 mm loft insulation", + "Pitched, limited insulation (assumed)", + "Pitched, insulated (assumed)", +] + +SOCIAL_TENURES = ["Rented (social)", "rental (social)"] + + +def main(): + """ + This application is used to identify additional units that are private rentals or owner occupies that can be + included in the route marches + + Required inputs are the following: + - An excel file that contains one or many tabs that include the addresses to be visited + """ + + for config in CONFIG: + # Read in the data + route_march_addresses = pd.read_excel( + config["filepath"], + sheet_name=config["tab"], + engine="openpyxl" + ) + + postcodes = route_march_addresses[config["postcode_column"]].unique() + + epcs = [] + for postcode in postcodes: + # Get the EPCs in this postcode + + params = {"postcode": postcode} + client = EpcClient(auth_token=EPC_AUTH_TOKEN) + url = os.path.join(client.domestic.host, "search") + url += "?" + urlencode({k: v for k, v in {"size": 1000}.items() if v}) + response = client.domestic.call(method="get", url=url, params=params) + + postcode_epcs = pd.DataFrame(response["rows"]) + # Get the newest EPC, per UPRN + postcode_epcs["uprn"] = np.where( + pd.isnull(postcode_epcs["uprn"]), + postcode_epcs["address"], + postcode_epcs["uprn"] + ) + postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False) + postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first") + + postcode_epcs["Is Cavity Property"] = postcode_epcs["walls-description"].isin( + CAVITY_WALL_DESCRIPTIONS + ) & (postcode_epcs["current-energy-efficiency"].astype(int) <= 72) + + postcode_epcs["Solar and Loft"] = (postcode_epcs["roof-description"].isin(ROOF_DESCRIPTIONS)) & ( + postcode_epcs["photo-supply"].isin(["0", "", "0.0"])) & ( + postcode_epcs["current-energy-efficiency"].astype(int) <= 68 + ) + + postcode_epcs = postcode_epcs[postcode_epcs["Is Cavity Property"] | postcode_epcs["Solar and Loft"]] + + # Remove any social properties + postcode_epcs = postcode_epcs[~postcode_epcs["tenure"].isin(SOCIAL_TENURES)] + + epcs.append(postcode_epcs) + + epcs = pd.concat(epcs) diff --git a/etl/route_march/oo_prs_additional_units/requirements.txt b/etl/route_march/oo_prs_additional_units/requirements.txt new file mode 100644 index 00000000..fd763a3b --- /dev/null +++ b/etl/route_march/oo_prs_additional_units/requirements.txt @@ -0,0 +1,9 @@ +openpyxl +epc-api-python==1.0.2 +numpy==2.1.2 +pandas==2.2.3 +usaddress==0.5.11 +fuzzywuzzy==0.18.0 +boto3==1.35.44 +python-dotenv +tqdm \ No newline at end of file From 2f930e3fa278127c8d964f92761209d4ec4b23f4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:19:17 +0000 Subject: [PATCH 2/6] refactoring prs and oo data puls --- .../oo_prs_additional_units.py | 144 ++++++++++++++---- .../oo_prs_additional_units/requirements.txt | 3 +- 2 files changed, 113 insertions(+), 34 deletions(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 345f0afe..c1b562ea 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -1,10 +1,20 @@ import os import pandas as pd import numpy as np +from tqdm import tqdm from dotenv import load_dotenv from urllib.parse import urlencode from epc_api.client import EpcClient +from utils.logger import setup_logger +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +logger = setup_logger() load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") @@ -64,6 +74,89 @@ ROOF_DESCRIPTIONS = [ SOCIAL_TENURES = ["Rented (social)", "rental (social)"] +def process_postcode_epcs(postcode, client): + params = {"postcode": postcode} + url = os.path.join(client.domestic.host, "search") + "?" + urlencode({"size": 1000}) + response = client.domestic.call(method="get", url=url, params=params) + postcode_epcs = pd.DataFrame(response["rows"]) + + # Processing code here + postcode_epcs["uprn"] = np.where( + pd.isnull(postcode_epcs["uprn"]), + postcode_epcs["address"], + postcode_epcs["uprn"] + ) + postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False) + postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first") + return postcode_epcs + + +def filter_and_prepare_epcs(epcs): + epcs["Is Cavity Property"] = epcs["walls-description"].isin(CAVITY_WALL_DESCRIPTIONS) & ( + epcs["current-energy-efficiency"].astype(int) <= 72 + ) + epcs["Solar and Loft"] = ( + epcs["roof-description"].isin(ROOF_DESCRIPTIONS) + ) & ( + epcs["photo-supply"].isin(["0", "", "0.0"]) + ) & ( + epcs["current-energy-efficiency"].astype(int) <= 68 + ) + epcs = epcs[epcs["Is Cavity Property"] | epcs["Solar and Loft"]] + epcs = epcs[~epcs["tenure"].isin(SOCIAL_TENURES)] + return epcs + + +def rename_and_add_columns(epcs): + epcs = epcs.rename( + columns={ + "address": "Address", + "postcode": "Postcode", + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)", + "tenure": "Tenure" + } + ) + + # Add additional columns as in your original code + epcs["Estimated Number of Floors"] = epcs.apply( + lambda x: estimate_number_of_floors(x["Property Type"]) if pd.notnull(x["Property Type"]) else None, axis=1 + ) + epcs["Estimated Perimeter (m)"] = epcs.apply( + lambda x: estimate_perimeter( + x["Property Floor Area"] / x["Estimated Number of Floors"], + x["Number of Habitable Rooms"] / x["Estimated Number of Floors"] + ), axis=1 + ) + epcs["Estimated Heat Loss Perimeter (m2)"] = epcs.apply( + lambda x: estimate_external_wall_area( + x["Estimated Number of Floors"], + float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + x["Estimated Perimeter (m)"], + x["Archetype"] + ), axis=1 + ) + epcs["Roof Insulation Thickness"] = epcs.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()[ + "insulation_thickness"] if pd.notnull(x["Roof Construction"]) else None, + axis=1 + ) + return epcs + + def main(): """ This application is used to identify additional units that are private rentals or owner occupies that can be @@ -73,7 +166,13 @@ def main(): - An excel file that contains one or many tabs that include the addresses to be visited """ + # This should be set: + output_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024" + client = EpcClient(auth_token=EPC_AUTH_TOKEN) + writer = pd.ExcelWriter(output_filepath, engine="xlsxwriter") + for config in CONFIG: + logger.info("Processing %s", config["tab"]) # Read in the data route_march_addresses = pd.read_excel( config["filepath"], @@ -84,39 +183,18 @@ def main(): postcodes = route_march_addresses[config["postcode_column"]].unique() epcs = [] - for postcode in postcodes: - # Get the EPCs in this postcode - - params = {"postcode": postcode} - client = EpcClient(auth_token=EPC_AUTH_TOKEN) - url = os.path.join(client.domestic.host, "search") - url += "?" + urlencode({k: v for k, v in {"size": 1000}.items() if v}) - response = client.domestic.call(method="get", url=url, params=params) - - postcode_epcs = pd.DataFrame(response["rows"]) - # Get the newest EPC, per UPRN - postcode_epcs["uprn"] = np.where( - pd.isnull(postcode_epcs["uprn"]), - postcode_epcs["address"], - postcode_epcs["uprn"] - ) - postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False) - postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first") - - postcode_epcs["Is Cavity Property"] = postcode_epcs["walls-description"].isin( - CAVITY_WALL_DESCRIPTIONS - ) & (postcode_epcs["current-energy-efficiency"].astype(int) <= 72) - - postcode_epcs["Solar and Loft"] = (postcode_epcs["roof-description"].isin(ROOF_DESCRIPTIONS)) & ( - postcode_epcs["photo-supply"].isin(["0", "", "0.0"])) & ( - postcode_epcs["current-energy-efficiency"].astype(int) <= 68 - ) - - postcode_epcs = postcode_epcs[postcode_epcs["Is Cavity Property"] | postcode_epcs["Solar and Loft"]] - - # Remove any social properties - postcode_epcs = postcode_epcs[~postcode_epcs["tenure"].isin(SOCIAL_TENURES)] - + for postcode in tqdm(postcodes): + postcode_epcs = process_postcode_epcs(postcode, client) epcs.append(postcode_epcs) + # Concatenate all postcodes' data and filter it epcs = pd.concat(epcs) + epcs = filter_and_prepare_epcs(epcs) + epcs = rename_and_add_columns(epcs) + + sheet_name = config["tab"][:31] # Excel sheet names max length of 31 characters + epcs.to_excel(writer, sheet_name=sheet_name, index=False) + + # Save and close the writer outside the loop + writer.close() + logger.info("Data successfully written to %s", output_filepath) diff --git a/etl/route_march/oo_prs_additional_units/requirements.txt b/etl/route_march/oo_prs_additional_units/requirements.txt index fd763a3b..e2f4832c 100644 --- a/etl/route_march/oo_prs_additional_units/requirements.txt +++ b/etl/route_march/oo_prs_additional_units/requirements.txt @@ -6,4 +6,5 @@ usaddress==0.5.11 fuzzywuzzy==0.18.0 boto3==1.35.44 python-dotenv -tqdm \ No newline at end of file +tqdm +xlsxwriter \ No newline at end of file From 557c0b589862eb1391e96f1a447b3323ebace9db Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:24:50 +0000 Subject: [PATCH 3/6] debugging string data --- .../oo_prs_additional_units/oo_prs_additional_units.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index c1b562ea..69e08f9a 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -131,10 +131,14 @@ def rename_and_add_columns(epcs): } ) + epcs["Number of Habitable Rooms"] = epcs["Number of Habitable Rooms"].astype(int) + epcs["Property Floor Area"] = epcs["Property Floor Area"].astype(float) + # Add additional columns as in your original code epcs["Estimated Number of Floors"] = epcs.apply( lambda x: estimate_number_of_floors(x["Property Type"]) if pd.notnull(x["Property Type"]) else None, axis=1 ) + epcs["Estimated Perimeter (m)"] = epcs.apply( lambda x: estimate_perimeter( x["Property Floor Area"] / x["Estimated Number of Floors"], @@ -167,7 +171,9 @@ def main(): """ # This should be set: - output_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024" + output_filepath = ( + "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024.xlsx" + ) client = EpcClient(auth_token=EPC_AUTH_TOKEN) writer = pd.ExcelWriter(output_filepath, engine="xlsxwriter") From a7aecb24629519c028d0f2d144610a2cf8dc0e7a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:29:54 +0000 Subject: [PATCH 4/6] debugging data pull --- .../oo_prs_additional_units/oo_prs_additional_units.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 69e08f9a..2c63a788 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -75,9 +75,12 @@ SOCIAL_TENURES = ["Rented (social)", "rental (social)"] def process_postcode_epcs(postcode, client): - params = {"postcode": postcode} + params = {"postcode": postcode.rstrip().lstrip()} url = os.path.join(client.domestic.host, "search") + "?" + urlencode({"size": 1000}) response = client.domestic.call(method="get", url=url, params=params) + if "rows" not in response: + logger.warning("No EPCs found for postcode %s", postcode) + return pd.DataFrame() postcode_epcs = pd.DataFrame(response["rows"]) # Processing code here @@ -191,6 +194,8 @@ def main(): epcs = [] for postcode in tqdm(postcodes): postcode_epcs = process_postcode_epcs(postcode, client) + if postcode_epcs.empty: + continue epcs.append(postcode_epcs) # Concatenate all postcodes' data and filter it From 4443f1aa4b3b9216f3643de376d8838e6bd89a5b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:33:01 +0000 Subject: [PATCH 5/6] re-added dropping of columns and changed default floor height to 2.4 --- .../oo_prs_additional_units.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 2c63a788..93757051 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -111,6 +111,33 @@ def filter_and_prepare_epcs(epcs): def rename_and_add_columns(epcs): + # Retrieve just the data we need + epcs = epcs[ + [ + "uprn", + "address", + "postcode", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + "tenure" + ] + ] + epcs = epcs.rename( columns={ "address": "Address", @@ -151,7 +178,7 @@ def rename_and_add_columns(epcs): epcs["Estimated Heat Loss Perimeter (m2)"] = epcs.apply( lambda x: estimate_external_wall_area( x["Estimated Number of Floors"], - float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.4, x["Estimated Perimeter (m)"], x["Archetype"] ), axis=1 From 00bd1e0ce6ee788090baa98781979b06a313c812 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 14:56:28 +0000 Subject: [PATCH 6/6] prs and oo data pulled for now --- .../oo_prs_additional_units/oo_prs_additional_units.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py index 93757051..3bd87a8c 100644 --- a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py +++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py @@ -134,7 +134,9 @@ def rename_and_add_columns(epcs): "mainheat-description", # "energy-consumption-current", # kwh/m2 - "tenure" + "tenure", + "Is Cavity Property", + "Solar and Loft", ] ]