debugging retrieve_newest_find_my_epc_data

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-07 12:46:46 +00:00
parent 7c4e32abc9
commit b40f72216f
5 changed files with 396 additions and 8 deletions

View file

@ -0,0 +1,47 @@
import time
from tqdm import tqdm
import pandas as pd
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
def app():
    """
    This script prepares the asset lists for the additional housing associations, CAHA and Hornsey Housing Trust,
    that are forming a consortium led by AIHA.

    It reads the Hornsey Housing Trust asset list, tidies the address columns and, for every property that has a
    starting EPC band, retrieves the newest EPC data from the find my epc website.
    :return: A list holding the EPC data returned by RetrieveFindMyEpc for each property that was looked up
    """
    hornsey_asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
        "Trust.xlsx",
        sheet_name="Ksquared-All units information",
        header=3
    )
    # We don't need the first row
    hornsey_asset_list = hornsey_asset_list.iloc[1:]
    # Fill NA values with empty strings
    hornsey_asset_list = hornsey_asset_list.fillna("")

    # Normalise the address columns: cast to string, trim, then collapse any run of spaces into a single space
    for col in ["Address letter or number", "Street address", "Postcode"]:
        hornsey_asset_list[col] = (
            hornsey_asset_list[col].astype(str).str.strip().str.replace(r" {2,}", " ", regex=True)
        )

    extracted_data = []
    for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):
        # Some properties do not have an epc
        if not home["Energy starting band (EPC)"]:
            continue

        # Only pause when we are about to hit the website, so skipped rows do not slow the run down
        time.sleep(0.5)

        unit_number = home["Address letter or number"]
        street = home["Street address"]
        postcode = home["Postcode"]

        # Leave out empty parts so we never build an address with a leading ", "
        address = ", ".join([x for x in [unit_number, street] if x])

        searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
        epc_data = searcher.retrieve_newest_find_my_epc_data()
        extracted_data.append(epc_data)

    return extracted_data

View file

@ -236,6 +236,8 @@ def app():
epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False)
epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn")
stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
# Merge the EPCs on, with the data we need
stonewater_cavity_properties = stonewater_cavity_properties.rename(
columns={
@ -265,14 +267,111 @@ def app():
# Filter on as built cavity properties
additional_properties = additional_properties[
additional_properties["Walls"].isin(
cavity_descriptions +
["Cavity: FilledCavity", "Cavity: External", "Cavity: Internal"]
)
additional_properties["Walls"].isin(cavity_descriptions)
]
additional_properties["Full Address"] = additional_properties["Address"].copy()
house_numbers = []
for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
house_no = SearchEpc.get_house_number(x["Address"].split(",")[0], x["Postcode"])
if house_no is None:
house_no = x["Address"].split(",")[0]
# If we end up with a number like "01" we need to remove the leading zero
house_no = house_no.lstrip("0")
house_numbers.append(
{
"Address ID": x["Address ID"],
"Number": house_no
}
)
house_numbers = pd.DataFrame(house_numbers)
additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
additional_properties["row_id"] = additional_properties["Address ID"].copy()
# Pull the EPCs for these properties
for _, home in tqdm(additional_properties.iterrows()):
full_address = home["Address"]
postcode = home["Postcode"]
address1 = full_address.split(",")[0]
additional_properties_epcs, errors = get_data(additional_properties)
# Save this data as a pickle
# import pickle
# with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
# "wb") as f:
# pickle.dump(additional_properties_epcs, f)
# We drop Full Address
additional_properties = additional_properties.drop(columns=["Full Address"])
additional_properties2 = additional_properties[[
"row_id", "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
"Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area",
]].rename(
columns={
"SAP": "Parity - Predicted SAP",
"SAP Band": "Parity - Predicted SAP Band",
"Age": "Parity - Build Age",
"Property Type": "Parity - Property Type",
"Walls": "Parity - Wall Construction",
"Roofs": "Parity - Roof Construction",
"Glazing": "Parity - Glazing Type",
"Heating": "Parity - Heating Type",
"Main Fuel": "Parity - Main Fuel",
"Hot Water": "Parity - Hot Water",
"Renewables": "Parity - Renewables",
"Total Floor Area": "Parity - Total Floor Area"
}
).merge(
pd.DataFrame(additional_properties_epcs)[
[
"row_id",
"property-type",
"built-form",
"inspection-date",
"current-energy-rating",
"current-energy-efficiency",
"roof-description",
"walls-description",
"transaction-type",
"secondheat-description",
"total-floor-area",
"construction-age-band",
"floor-height",
"number-habitable-rooms",
"mainheat-description",
"energy-consumption-current"
]
].rename(
columns={
"inspection-date": "Date of last EPC",
"current-energy-efficiency": "SAP score on register",
"current-energy-rating": "EPC rating on register",
"property-type": "Property Type",
"built-form": "Archetype",
"total-floor-area": "Property Floor Area",
"construction-age-band": "Property Age Band",
"floor-height": "Property Floor Height",
"number-habitable-rooms": "Number of Habitable Rooms",
"walls-description": "Wall Construction",
"roof-description": "Roof Construction",
"mainheat-description": "Heating Type",
"secondheat-description": "Secondary Heating",
"transaction-type": "Reason for last EPC",
"energy-consumption-current": "Heat Demand (kWh/m2)",
}
),
how="left",
on="row_id"
)
# We save the data locally
stonewater_cavity_properties.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties.csv",
index=False
)
additional_properties2.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties.csv",
index=False
)
# Save the survey findings
needs_cwi.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
index=False
)

View file

@ -6,3 +6,5 @@ boto3
epc-api-python==1.0.2
usaddress==0.5.11
fuzzywuzzy==0.18.0
python-dotenv

View file

@ -0,0 +1,238 @@
import requests
from bs4 import BeautifulSoup
from datetime import datetime
class RetrieveFindMyEpc:
    """
    Retrieves the newest EPC for an address by scraping the find my epc website: first the postcode search
    results page, then the certificate page of the best matching address.
    """
    SEARCH_POSTCODE_URL = (
        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
    )
    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    # Seconds to wait for the website before giving up, so a stalled connection cannot hang a batch run
    REQUEST_TIMEOUT = 30

    def __init__(self, address: str, postcode: str):
        """
        This class is tasked with retrieving the latest EPC data from the find my epc website
        :param address: The address of the property
        :param postcode: The postcode of the property
        """
        self.address = address
        self.postcode = postcode
        # Commas, spaces and casing are removed so the address can be prefix-matched against the search results
        self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()

    @staticmethod
    def _parse_expiry_date(expiry_text):
        """
        Parse an expiry date written as day, month name and year (format '%d %B %Y')
        :param expiry_text: The text of the expiry date, or None/empty if the row did not have one
        :return: The parsed datetime, or datetime.min when there is no text so that such rows sort as the oldest
        """
        if not expiry_text:
            return datetime.min
        return datetime.strptime(expiry_text.strip(), '%d %B %Y')

    def retrieve_newest_find_my_epc_data(self):
        """
        For a post code and address, we pull out all the required data from the find my epc website
        :return: A dictionary with the certificate number, the current and potential rating and efficiency, the
            heating and hot water text, the formatted recommendations and the assessment information
        :raises ValueError: If no search result matches the address, or an expected assessment key is missing
        """
        postcode_input = self.postcode.replace(" ", "+")
        postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
        postcode_response = requests.get(postcode_search, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")

        rows = postcode_res.find_all('tr', class_='govuk-table__row')
        extracted_table = []
        for row in rows:
            # Extract the address and URL; rows without a link (e.g. the table header) are skipped
            address_tag = row.find('a', class_='govuk-link')
            if address_tag is None:
                continue

            extracted_address = address_tag.text.strip()
            extracted_address_url = address_tag['href']
            extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
            if not extracted_address_cleaned.startswith(self.address_cleaned):
                continue

            # If the address is a match, we can extract the data
            # Extract the expiry date. A row without a date cell or span is kept, with the oldest possible date
            expiry_date_tag = row.find('td', class_='govuk-table__cell date')
            expiry_span = expiry_date_tag.parent.find('span') if expiry_date_tag is not None else None
            expiry_text = expiry_span.text if expiry_span is not None else None

            extracted_table.append(
                {
                    "extracted_address": extracted_address,
                    "extracted_address_url": extracted_address_url,
                    "expiry_date": self._parse_expiry_date(expiry_text),
                }
            )

        if not extracted_table:
            raise ValueError("No EPC found")

        # We take the one with the most recent expiry date (the first match wins on a tie)
        newest_match = max(extracted_table, key=lambda x: x['expiry_date'])
        chosen_epc = self.BASE_ENERGY_URL + newest_match['extracted_address_url']
        # The certificate number is the last segment of the certificate URL
        epc_certificate = chosen_epc.split('/')[-1]

        address_response = requests.get(chosen_epc, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        address_res = BeautifulSoup(address_response.text, features="html.parser")

        # Key data we want to retrieve:
        # 1) Rating
        # 2) Bills estimates
        # 3) Recommendations and SAP points
        # 4) Low and zero carbon energy sources
        # The description is split on "." into a current rating sentence and a potential rating sentence. Each
        # sentence is read positionally: the last word is the score and the sixth word from the end is the band
        ratings = address_res.find('desc', {'id': 'svg-desc'}).text
        current_rating = ratings.split(".")[0]
        potential_rating = ratings.split(".")[1]
        current_sap = int(current_rating.split(' ')[-1])

        # Retrieve the energy consumption
        # If the list is missing or short, it's usually because the EPC was very old. Early EPCs did not have this
        # information, so we fall back to None rather than failing
        bills = address_res.find('div', {'id': 'bills-affected'})
        bills_list = bills.find_all('li') if bills is not None else []
        heating_text = bills_list[0].text if len(bills_list) > 0 else None
        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None

        # Retrieve the recommendations and SAP points
        recommendations = []
        recommendations_div = address_res.find('div', class_='epb-recommended-improvements')
        if recommendations_div:
            # Find all h3 headers for each step and extract their related information
            step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
            # Each step's SAP points are the gain over the rating reached by the previous step
            previous_sap_score = current_sap
            for step_num, step_header in enumerate(step_headers, start=1):
                # Extract the step title (the measure)
                measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")

                # Find the div containing the potential rating within the same section
                potential_rating_div = step_header.find_next(
                    'div', class_='epb-recommended-improvements__potential-rating'
                )
                # Steps without a potential rating are left out
                if not potential_rating_div:
                    continue

                # Extract the rating text within the SVG text element
                rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip()
                # Parse the rating text to separate the numeric rating and EPC letter
                new_rating = int(rating_text.split()[0])
                new_epc = rating_text.split()[1]

                # Append the information as a dictionary to the recommendations list
                recommendations.append({
                    "step": step_num,
                    "measure": measure_title,
                    "new_rating": new_rating,
                    "new_epc": new_epc,
                    "sap_points": new_rating - previous_sap_score
                })
                previous_sap_score = new_rating

        # Search for the assessment information
        assessment_information = address_res.find('div', {'id': 'information'})
        # Parse this information
        rows = assessment_information.find_all('div', class_='govuk-summary-list__row')

        # Create a dictionary to hold the parsed information
        assessment_data = {}
        for row in rows:
            key = row.find('dt').text.strip()
            if key == "Type of assessment":
                # We dont reliably extract this
                continue

            value_tag = row.find('dd')
            # Check if value contains a link (email)
            if value_tag.find('a'):
                value = value_tag.find('a').text.strip()
            elif value_tag.find('summary'):
                value = value_tag.find('span').text.strip()
            else:
                value = value_tag.text.strip()

            # These are keys that we have for both the surveyor and the accreditation scheme. The first time we see
            # one it is stored as the assessor's; the next time it is stored as the accreditation scheme's
            if key in ["Telephone", "Email"]:
                if "Assessor's " + key not in assessment_data:
                    assessment_data["Assessor's " + key] = value
                else:
                    assessment_data["Accreditation Scheme's " + key] = value
                continue

            assessment_data[key] = value

        expected_keys = [
            'Assessors name',
            "Assessor's Telephone",
            "Assessor's Email",
            'Assessors ID',
            'Accreditation scheme',
            'Assessors declaration',
            "Accreditation Scheme's Telephone",
            "Accreditation Scheme's Email",
            'Date of assessment',
            'Date of certificate'
        ]
        # Check we have all the expected keys
        for key in expected_keys:
            if key not in assessment_data:
                raise ValueError(f"Missing key: {key}")

        # Finally, we format the recommendations
        recommendations = self.format_recommendations(recommendations)

        resulting_data = {
            'epc_certificate': epc_certificate,
            'current_epc_rating': current_rating.split(' ')[-6],
            'current_epc_efficiency': current_sap,
            'potential_epc_rating': potential_rating.split(' ')[-6],
            "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
            "heating_text": heating_text,
            "hot_water_text": hot_water_text,
            "recommendations": recommendations,
            **assessment_data
        }
        return resulting_data

    def format_recommendations(self, recommendations):
        """
        This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
        :param recommendations: A list of dictionaries, each with at least a "measure" and a "sap_points" key
        :return: A list of dictionaries with "type", "sap_points" and "survey" keys. A recommendation that maps to
            several measure types produces one entry per type, each carrying the full SAP points
        :raises KeyError: If a recommendation's measure has no entry in the measure map
        """
        measure_map = {
            "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
            "Hot water cylinder insulation": ["hot_water_tank_insulation"],
            "Hot water cylinder thermostat": ["cylinder_thermostat"],
            "High performance external doors": ["insulated_doors"],
            "Floor insulation (solid floor)": ["solid_floor_insulation"],
            "Double glazed windows": ["double_glazing"],
            "Cavity wall insulation": ["cavity_wall_insulation"],
            "Replace boiler with new condensing boiler": ["boiler_upgrade"],
        }

        formatted_recommendations = []
        for rec in recommendations:
            if rec["measure"] not in measure_map:
                # Still a KeyError, but one that says what is missing so the map can be extended
                raise KeyError(f"No mapping for recommended measure: {rec['measure']}")
            for measure in measure_map[rec["measure"]]:
                formatted_recommendations.append(
                    {
                        "type": measure,
                        "sap_points": rec["sap_points"],
                        "survey": True
                    }
                )
        return formatted_recommendations

View file

@ -0,0 +1,2 @@
pandas
beautifulsoup4