import logging
import time

import pandas as pd
from tqdm import tqdm

from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc

logger = logging.getLogger(__name__)

# Default location of the Hornsey Housing Trust EOI workbook
HORNSEY_ASSET_LIST_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
    "Trust.xlsx"
)

# Columns that are combined into the address / postcode we search with
ADDRESS_COLUMNS = ["Address letter or number", "Street address", "Postcode"]


def app(asset_list_path=HORNSEY_ASSET_LIST_PATH, request_delay=0.5):
    """
    This script prepares the asset lists for the additional housing associations, CAHA and Hornsey Housing Trust,
    that are forming a consortium led by AIHA.

    For every home in the asset list that has a starting EPC band, the newest EPC is retrieved via
    RetrieveFindMyEpc. Homes whose lookup raises a ValueError (e.g. "No EPC found") are logged and skipped so
    that one bad address does not throw away the data already collected.

    :param asset_list_path: Path of the Excel workbook holding the "Ksquared-All units information" sheet
    :param request_delay: Seconds to wait before each EPC lookup, so we don't hammer the website
    :return: List with one dictionary of EPC data per home that was successfully looked up
    """

    hornsey_asset_list = pd.read_excel(
        asset_list_path,
        sheet_name="Ksquared-All units information",
        header=3
    )

    # We don't need the first row
    hornsey_asset_list = hornsey_asset_list.iloc[1:]
    # Fill NA values with empty strings
    hornsey_asset_list = hornsey_asset_list.fillna("")
    # Trim the address parts and collapse any run of whitespace (double spaces etc.) into a single space
    for col in ADDRESS_COLUMNS:
        hornsey_asset_list[col] = (
            hornsey_asset_list[col].astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
        )

    extracted_data = []
    for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):
        # Some properties do not have an epc
        if not home["Energy starting band (EPC)"]:
            continue

        unit_number = home["Address letter or number"]
        street = home["Street address"]
        postcode = home["Postcode"]
        address = ", ".join(part for part in (unit_number, street) if part)

        # Only wait when we are actually about to make a request
        time.sleep(request_delay)
        searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
        try:
            epc_data = searcher.retrieve_newest_find_my_epc_data()
        except ValueError as err:
            # The retriever raises ValueError when no certificate matches or the page is missing data
            logger.warning("Could not retrieve EPC for %s, %s: %s", address, postcode, err)
            continue
        extracted_data.append(epc_data)

    return extracted_data
x["Address"].split(",")[0] + # If we end up with a number like "01" we need to remove the leading zero + house_no = house_no.lstrip("0") + house_numbers.append( + { + "Address ID": x["Address ID"], + "Number": house_no + } + ) + + house_numbers = pd.DataFrame(house_numbers) + additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID") + additional_properties["row_id"] = additional_properties["Address ID"].copy() # Pull the EPCs for these properties - for _, home in tqdm(additional_properties.iterrows()): - full_address = home["Address"] - postcode = home["Postcode"] - address1 = full_address.split(",")[0] + additional_properties_epcs, errors = get_data(additional_properties) + + # Save this data as a pickle + # import pickle + # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl", + # "wb") as f: + # pickle.dump(additional_properties_epcs, f) + + # We drop Full Address + additional_properties = additional_properties.drop(columns=["Full Address"]) + additional_properties2 = additional_properties[[ + "row_id", "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", + "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", + + ]].rename( + columns={ + "SAP": "Parity - Predicted SAP", + "SAP Band": "Parity - Predicted SAP Band", + "Age": "Parity - Build Age", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Wall Construction", + "Roofs": "Parity - Roof Construction", + "Glazing": "Parity - Glazing Type", + "Heating": "Parity - Heating Type", + "Main Fuel": "Parity - Main Fuel", + "Hot Water": "Parity - Hot Water", + "Renewables": "Parity - Renewables", + "Total Floor Area": "Parity - Total Floor Area" + } + ).merge( + pd.DataFrame(additional_properties_epcs)[ + [ + "row_id", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + 
"roof-description", + "walls-description", + "transaction-type", + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + "energy-consumption-current" + ] + ].rename( + columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)", + } + ), + how="left", + on="row_id" + ) + + # We save the data locally + stonewater_cavity_properties.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties.csv", + index=False + ) + additional_properties2.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties.csv", + index=False + ) + # Save the survey findings + needs_cwi.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", + index=False + ) diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 102f5930..3ad5d2c1 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -6,3 +6,5 @@ boto3 epc-api-python==1.0.2 usaddress==0.5.11 fuzzywuzzy==0.18.0 +python-dotenv + diff --git 
import logging
from datetime import datetime
from urllib.parse import quote_plus

logger = logging.getLogger(__name__)


class RetrieveFindMyEpc:
    """
    Retrieves the newest EPC for a single property from the find my epc website.

    The third party packages (requests, bs4) are imported lazily inside the method that performs the HTTP
    requests, so that the pure parsing / formatting helpers can be used without them being installed.
    """

    SEARCH_POSTCODE_URL = (
        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
    )
    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }

    # Seconds to wait for the website before giving up, so a stalled connection cannot hang a batch run
    REQUEST_TIMEOUT = 30

    # Format of the date shown against each certificate in the postcode search results
    EXPIRY_DATE_FORMAT = '%d %B %Y'

    # Maps the recommendation titles shown on the certificate to the measure types used in the engine
    MEASURE_MAP = {
        "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
        "Hot water cylinder insulation": ["hot_water_tank_insulation"],
        "Hot water cylinder thermostat": ["cylinder_thermostat"],
        "High performance external doors": ["insulated_doors"],
        "Floor insulation (solid floor)": ["solid_floor_insulation"],
        "Double glazed windows": ["double_glazing"],
        "Cavity wall insulation": ["cavity_wall_insulation"],
        "Replace boiler with new condensing boiler": ["boiler_upgrade"],
    }

    # Keys that must be present in the parsed assessment information
    EXPECTED_ASSESSMENT_KEYS = (
        'Assessor’s name',
        "Assessor's Telephone",
        "Assessor's Email",
        'Assessor’s ID',
        'Accreditation scheme',
        'Assessor’s declaration',
        "Accreditation Scheme's Telephone",
        "Accreditation Scheme's Email",
        'Date of assessment',
        'Date of certificate',
    )

    def __init__(self, address: str, postcode: str):
        """
        This class is tasked with retrieving the latest EPC data from the find my epc website
        :param address: The address of the property
        :param postcode: The postcode of the property
        """
        self.address = address
        self.postcode = postcode

        self.address_cleaned = self._clean_address(self.address)

    @staticmethod
    def _clean_address(address):
        """
        Normalises an address for comparison: commas and spaces are removed and the text is lower cased
        :param address: Address text
        :return: The normalised address
        """
        return address.replace(",", "").replace(" ", "").lower()

    @staticmethod
    def _parse_expiry_date(text):
        """
        Parses a date such as "6 November 2034"
        :param text: Date text, or None when the search result row had no date
        :return: The parsed datetime, or None if the text is missing or is not a date
        """
        if not text:
            return None
        try:
            return datetime.strptime(text.strip(), RetrieveFindMyEpc.EXPIRY_DATE_FORMAT)
        except ValueError:
            return None

    @staticmethod
    def _select_newest(candidates):
        """
        Picks the candidate with the most recent expiry date. Candidates without a date rank last and, for equal
        dates, the first one listed wins
        :param candidates: Non-empty list of dictionaries holding an "expiry_date" (datetime or None)
        :return: The chosen candidate
        """
        return max(candidates, key=lambda candidate: candidate["expiry_date"] or datetime.min)

    @staticmethod
    def _parse_rating_sentence(sentence):
        """
        Pulls the EPC band and the score out of one sentence of the rating description. The band is read as the
        sixth word from the end and the score as the last word, e.g. "... is D with a score of 60"
        :param sentence: One sentence of the rating description, without the full stop
        :return: Tuple of (band, score)
        :raises ValueError: If the sentence is too short or does not end with a whole number
        """
        words = sentence.split()
        if len(words) < 6:
            raise ValueError(f"Unexpected rating text: {sentence!r}")
        try:
            score = int(words[-1])
        except ValueError as err:
            raise ValueError(f"Unexpected rating text: {sentence!r}") from err
        return words[-6], score

    def _fetch_page(self, url):
        """
        Downloads a page and parses it
        :param url: The url to request
        :return: The parsed page (BeautifulSoup)
        :raises requests.RequestException: If the request times out or the website returns an error status
        """
        import requests
        from bs4 import BeautifulSoup

        response = requests.get(url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an error page rather than trying to parse it as a certificate
        response.raise_for_status()
        return BeautifulSoup(response.text, features="html.parser")

    def _find_certificate_path(self, search_page):
        """
        Finds the certificate link for our address in the postcode search results
        :param search_page: Parsed postcode search results page
        :return: The href of the matching certificate with the most recent date
        :raises ValueError: If no result row matches the address
        """
        candidates = []
        for row in search_page.find_all('tr', class_='govuk-table__row'):
            # Extract the address and URL
            address_tag = row.find('a', class_='govuk-link')
            if address_tag is None:
                continue
            extracted_address = address_tag.text.strip()
            if not self._clean_address(extracted_address).startswith(self.address_cleaned):
                continue

            # If the address is a match, we can extract the expiry date. Rows without one are kept, they just
            # rank behind the dated rows
            expiry_text = None
            expiry_date_tag = row.find('td', class_='govuk-table__cell date')
            if expiry_date_tag is not None:
                date_span = expiry_date_tag.parent.find('span')
                if date_span is not None:
                    expiry_text = date_span.text

            candidates.append(
                {
                    "extracted_address": extracted_address,
                    "extracted_address_url": address_tag['href'],
                    "expiry_date": self._parse_expiry_date(expiry_text),
                }
            )

        if not candidates:
            raise ValueError("No EPC found")

        # We take the one with the most recent expiry date
        return self._select_newest(candidates)['extracted_address_url']

    def _parse_ratings(self, certificate_page):
        """
        Reads the current and potential rating from the description of the rating graphic
        :param certificate_page: Parsed certificate page
        :return: Tuple of ((current band, current score), (potential band, potential score))
        :raises ValueError: If the description is missing or not in the expected form
        """
        description = certificate_page.find('desc', {'id': 'svg-desc'})
        if description is None:
            raise ValueError("Rating description not found")
        # The first sentence describes the current rating and the second the potential rating
        sentences = description.text.split(".")
        if len(sentences) < 2:
            raise ValueError(f"Unexpected rating text: {description.text!r}")
        return self._parse_rating_sentence(sentences[0]), self._parse_rating_sentence(sentences[1])

    @staticmethod
    def _parse_bills(certificate_page):
        """
        Reads the heating and hot water consumption text
        :param certificate_page: Parsed certificate page
        :return: Tuple of (heating text, hot water text), each None when not on the certificate
        """
        bills = certificate_page.find('div', {'id': 'bills-affected'})
        # If there is nothing here, it's usually because the EPC was very old. Early EPCs did not have this
        # information
        bills_list = bills.find_all('li') if bills is not None else []
        heating_text = bills_list[0].text if len(bills_list) > 0 else None
        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None
        return heating_text, hot_water_text

    @staticmethod
    def _parse_recommendations(certificate_page, current_sap):
        """
        Reads the recommended improvement steps and the SAP points each one adds
        :param certificate_page: Parsed certificate page
        :param current_sap: The current SAP score, which the first step is measured against
        :return: List of dictionaries with step, measure, new_rating, new_epc and sap_points
        """
        recommendations = []
        recommendations_div = certificate_page.find('div', class_='epb-recommended-improvements')
        if recommendations_div is None:
            return recommendations

        # Find all h3 headers for each step and extract their related information
        step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
        previous_sap_score = current_sap
        for step_num, step_header in enumerate(step_headers, start=1):
            # Extract the step title (the measure)
            measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")

            # Find the next potential rating div after this header
            potential_rating_div = step_header.find_next(
                'div', class_='epb-recommended-improvements__potential-rating'
            )
            if potential_rating_div is None:
                continue
            # The rating lives in the SVG text element, as "<score> <band>"
            rating_tag = potential_rating_div.find('text', class_='govuk-!-font-weight-bold')
            if rating_tag is None:
                continue
            rating_parts = rating_tag.text.strip().split()
            new_rating = int(rating_parts[0])

            recommendations.append({
                "step": step_num,
                "measure": measure_title,
                "new_rating": new_rating,
                "new_epc": rating_parts[1],
                # Each step is cumulative, so the gain is measured against the previous step
                "sap_points": new_rating - previous_sap_score
            })
            previous_sap_score = new_rating

        return recommendations

    @staticmethod
    def _parse_assessment_information(certificate_page):
        """
        Reads the assessor and accreditation scheme details
        :param certificate_page: Parsed certificate page
        :return: Dictionary of the assessment information
        :raises ValueError: If the information section is missing
        """
        assessment_information = certificate_page.find('div', {'id': 'information'})
        if assessment_information is None:
            raise ValueError("Assessment information not found")

        assessment_data = {}
        for row in assessment_information.find_all('div', class_='govuk-summary-list__row'):
            key_tag = row.find('dt')
            value_tag = row.find('dd')
            if key_tag is None or value_tag is None:
                continue
            key = key_tag.text.strip()
            if key == "Type of assessment":
                # We dont reliably extract this
                continue

            link = value_tag.find('a')
            span = value_tag.find('span')
            if link is not None:
                # The value contains a link (email)
                value = link.text.strip()
            elif value_tag.find('summary') is not None and span is not None:
                value = span.text.strip()
            else:
                value = value_tag.text.strip()

            # These are keys that we have for both the surveyor and the accreditation scheme. The first
            # occurrence is stored as the assessor's and any later one as the accreditation scheme's
            if key in ["Telephone", "Email"]:
                if "Assessor's " + key not in assessment_data:
                    assessment_data["Assessor's " + key] = value
                else:
                    assessment_data["Accreditation Scheme's " + key] = value
                continue

            assessment_data[key] = value

        return assessment_data

    def retrieve_newest_find_my_epc_data(self):
        """
        For a post code and address, we pull out all the required data from the find my epc website

        Key data we retrieve:
        1) Rating
        2) Bills estimates
        3) Recommendations and SAP points
        4) Assessment information

        :return: Dictionary with the certificate number, current / potential rating and efficiency, heating and
            hot water text, the formatted recommendations and the assessment information
        :raises ValueError: If no certificate matches the address, or the certificate page is missing
            expected data
        """

        postcode_input = quote_plus(self.postcode.strip())
        search_page = self._fetch_page(self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input))

        chosen_epc = self.BASE_ENERGY_URL + self._find_certificate_path(search_page)
        epc_certificate = chosen_epc.split('/')[-1]
        certificate_page = self._fetch_page(chosen_epc)

        (current_band, current_sap), (potential_band, potential_sap) = self._parse_ratings(certificate_page)
        heating_text, hot_water_text = self._parse_bills(certificate_page)
        recommendations = self._parse_recommendations(certificate_page, current_sap)
        assessment_data = self._parse_assessment_information(certificate_page)

        # Check we have all the expected keys
        for key in self.EXPECTED_ASSESSMENT_KEYS:
            if key not in assessment_data:
                raise ValueError(f"Missing key: {key}")

        return {
            'epc_certificate': epc_certificate,
            'current_epc_rating': current_band,
            'current_epc_efficiency': current_sap,
            'potential_epc_rating': potential_band,
            "potential_epc_efficiency": potential_sap,
            "heating_text": heating_text,
            "hot_water_text": hot_water_text,
            # Finally, we format the recommendations
            "recommendations": self.format_recommendations(recommendations),
            **assessment_data
        }

    def format_recommendations(self, recommendations):
        """
        This function converts the recommendations to a format that we can use in the engine as a non-intrusive
        survey. A recommendation that maps to several measures produces one entry per measure, each with the
        same SAP points. Recommendations we have no mapping for are logged and left out
        :param recommendations: List of dictionaries holding at least "measure" and "sap_points"
        :return: List of dictionaries with type, sap_points and survey
        """

        formatted_recommendations = []
        for rec in recommendations:
            mapped = self.MEASURE_MAP.get(rec["measure"])
            if mapped is None:
                logger.warning("No measure mapping for EPC recommendation %r, skipping", rec["measure"])
                continue
            formatted_recommendations.extend(
                {
                    "type": measure,
                    "sap_points": rec["sap_points"],
                    "survey": True
                }
                for measure in mapped
            )

        return formatted_recommendations