Model/etl/find_my_epc/RetrieveFindMyEpc.py
Khalim Conn-Kowlessar 77844c625e minor
2025-02-10 15:41:33 +00:00

357 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
class RetrieveFindMyEpc:
    """
    Retrieves the newest EPC for a property from the find-energy-certificate website.

    The postcode search page is scraped for rows whose address starts with the supplied address, the
    matching certificate with the latest expiry date is downloaded, and its ratings, bill notes,
    recommendations, assessment information, wall descriptions and low/zero carbon energy sources
    are parsed out of the HTML.
    """

    SEARCH_POSTCODE_URL = (
        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
    )
    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    # Seconds to wait on each HTTP request; without a timeout requests.get can block indefinitely
    REQUEST_TIMEOUT = 30

    # Keys that must be present in the parsed assessment information.
    # NOTE(review): confirm these labels match the page text character for character (including any
    # apostrophe characters), otherwise the "Missing key" check below will always fire.
    EXPECTED_ASSESSMENT_KEYS = (
        'Assessors name',
        "Assessor's Telephone",
        "Assessor's Email",
        'Assessors ID',
        'Accreditation scheme',
        'Assessors declaration',
        "Accreditation Scheme's Telephone",
        "Accreditation Scheme's Email",
        'Date of assessment',
        'Date of certificate'
    )

    # Maps the recommendation title shown on the certificate to the measure type(s) emitted by
    # format_recommendations. An empty list means the recommendation is recognised but produces no output.
    MEASURE_MAP = {
        "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
        "Hot water cylinder insulation": ["hot_water_tank_insulation"],
        "Hot water cylinder thermostat": ["cylinder_thermostat"],
        "High performance external doors": ["insulated_doors"],
        "Floor insulation (solid floor)": ["solid_floor_insulation"],
        "Floor insulation (suspended floor)": ["suspended_floor_insulation"],
        "Double glazed windows": ["double_glazing"],
        "Cavity wall insulation": ["cavity_wall_insulation"],
        "Replace boiler with new condensing boiler": ["boiler_upgrade"],
        "Floor insulation": ["floor_insulation"],  # Recommendation typically associated to older EPCs
        "Heating controls (programmer, room thermostat and TRVs)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Low energy lighting": ["low_energy_lighting"],
        "Increase loft insulation to 270 mm": ["loft_insulation"],
        "Heating controls (thermostatic radiator valves)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Solar water heating": ["solar_water_heating"],
        "Solar photovoltaic panels, 2.5 kWp": ["solar_pv"],
        "Heating controls (room thermostat and TRVs)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Change heating to gas condensing boiler": ["boiler_upgrade"],
        "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heater"],
        "Flat roof or sloping ceiling insulation": ["flat_roof_insulation"],
        "Heating controls (room thermostat)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Band A condensing boiler": ["boiler_upgrade"],
        "Double glazing": ["double_glazing"],
        "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"],
        "Wind turbine": ["wind_turbine"],
        "Loft insulation": ["loft_insulation"],
        "Solar photovoltaic (PV) panels": ["solar_pv"],
        "Party wall insulation": ["party_wall_insulation"],
        'Draught proofing': ["draught_proofing"],
        "Roof insulation recommendation": [],
        "Cavity wall insulation recommendation": [],
        "Windows draught proofing": [],
        "Low energy lighting for all fixed outlets": ["low_energy_lighting"],
        "Cylinder thermostat recommendation": [],
        "Heating controls recommendation": [],
        "Replace boiler with Band A condensing boiler": ["boiler_upgrade"],
        "Band A condensing gas boiler": ["boiler_upgrade"],
        "Solar panel recommendation": [],
        "Double glazing recommendation": [],
        "Solid wall insulation recommendation": [],
        "Fuel change recommendation": [],
        "PV Cells recommendation": [],
        "Replacement glazing units": ["double_glazing"],
        "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"],
        "High heat retention storage heaters": ["high_heat_retention_storage_heater"],
        "Gas condensing boiler": ["boiler_upgrade"],
        "Change room heaters to condensing boiler": ["boiler_upgrade"],
        "Cylinder thermostat": ["cylinder_thermostat"],
        "Heat recovery system for mixer showers": ["heat_recovery_shower"],
        "Room-in-roof insulation": ["room_in_roof_insulation"],
        "Fan assisted storage heaters": [],
        "Fan-assisted storage heaters": [],
        "Step 1:": [],
        "Step 2:": [],
        'Step 3:': [],
        "Biomass stove with boiler": [],
        "Replace boiler with biomass boiler": [],
        "Heating controls (room thermostat and thermostatic radiator valves)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Heating controls (programmer, and thermostatic radiator valves)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Heating controls (programmer and TRVs)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Heating controls (programmer and room thermostat)": [
            "roomstat_programmer_trvs", "time_temperature_zone_control"
        ],
        "Replacement warm air unit": [],
        "Secondary glazing": ["secondary_glazing"]
    }

    def __init__(self, address: str, postcode: str):
        """
        This class is tasked with retrieving the latest EPC data from the find my epc website
        :param address: The address of the property
        :param postcode: The postcode of the property
        """
        self.address = address
        self.postcode = postcode
        # Commas, spaces and case are removed so the address can be prefix-matched against search results
        self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()
        # Populated by retrieve_newest_find_my_epc_data with the "Wall" descriptions from the certificate
        self.walls = []

    @staticmethod
    def extract_low_carbon_sources(soup):
        """
        Extracts the entries listed under the "Low and zero carbon energy sources" heading
        :param soup: The parsed certificate page
        :return: A dictionary mapping each listed source to True; empty if the heading or its list is absent
        """
        section_header = soup.find("h3", string="Low and zero carbon energy sources")
        if not section_header:
            return {}
        # Locate the list following the header; a header without a list yields no sources
        energy_list = section_header.find_next("ul")
        if energy_list is None:
            return {}
        return {item.get_text(strip=True): True for item in energy_list.find_all("li")}

    def retrieve_newest_find_my_epc_data(self, sap_2012_date=None):
        """
        For a post code and address, we pull out all the required data from the find my epc website
        :param sap_2012_date: Optional date passed through to format_recommendations
        :return: A dictionary with the certificate id, current/potential ratings, bill texts, formatted
            recommendations, the assessment information and the low/zero carbon energy sources
        :raises ValueError: If no search result matches the address ("No EPC found") or an expected
            assessment key is missing ("Missing key: ...")
        """
        chosen_epc = self._find_newest_certificate_url()
        epc_certificate = chosen_epc.split('/')[-1]
        address_response = requests.get(chosen_epc, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        address_res = BeautifulSoup(address_response.text, features="html.parser")

        # Key data we want to retrieve:
        # 1) Rating
        # 2) Bills estimates
        # 3) Recommendations and SAP points
        # 4) Low and zero carbon energy sources
        # 5) The wall types of the property - used for determining if we have an extension wall
        #    insulation recommendation

        # 1) Rating. The description is split on "." into a current and a potential sentence; in each
        # sentence the last word is read as the score and the sixth word from the end as the letter.
        ratings = address_res.find('desc', {'id': 'svg-desc'}).text
        rating_sentences = ratings.split(".")
        current_rating = rating_sentences[0]
        potential_rating = rating_sentences[1]
        current_sap = int(current_rating.split(' ')[-1])
        current_epc = current_rating.split(' ')[-6]

        # 2) Bills estimates
        heating_text, hot_water_text = self._extract_bill_texts(address_res)

        # 3) Recommendations and SAP points
        recommendations = self._parse_recommendation_steps(address_res, current_sap, current_epc)

        assessment_data = self._parse_assessment_information(address_res)

        # 5) The wall types of the property
        self.walls = self._extract_wall_types(address_res)

        # Finally, we format the recommendations
        recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)

        # 4) Low and zero carbon energy sources
        low_carbon_energy_sources = self.extract_low_carbon_sources(address_res)

        resulting_data = {
            'epc_certificate': epc_certificate,
            'current_epc_rating': current_epc,
            'current_epc_efficiency': current_sap,
            'potential_epc_rating': potential_rating.split(' ')[-6],
            "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
            "heating_text": heating_text,
            "hot_water_text": hot_water_text,
            "recommendations": recommendations,
            **assessment_data,
            **low_carbon_energy_sources
        }
        return resulting_data

    def _find_newest_certificate_url(self):
        """
        Searches by postcode and returns the URL of the matching certificate with the latest expiry date
        :return: The absolute URL of the chosen certificate
        :raises ValueError: If no search result starts with the cleaned address
        """
        postcode_input = self.postcode.replace(" ", "+")
        postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
        postcode_response = requests.get(postcode_search, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")

        extracted_table = []
        for row in postcode_res.find_all('tr', class_='govuk-table__row'):
            # Rows without an address link (e.g. header rows) carry no certificate
            address_tag = row.find('a', class_='govuk-link')
            if address_tag is None:
                continue
            extracted_address = address_tag.text.strip()
            extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
            if not extracted_address_cleaned.startswith(self.address_cleaned):
                continue
            # The address is a match, so we keep the link and its expiry date
            extracted_table.append(
                {
                    "extracted_address": extracted_address,
                    "extracted_address_url": address_tag['href'],
                    "expiry_date": self._extract_expiry_date(row),
                }
            )

        if not extracted_table:
            raise ValueError("No EPC found")
        # We take the one with the most recent expiry date; on ties the first row wins
        newest = max(extracted_table, key=lambda entry: entry['expiry_date'])
        return self.BASE_ENERGY_URL + newest['extracted_address_url']

    @staticmethod
    def _extract_expiry_date(row):
        """
        Reads the expiry date of a search result row
        :param row: A table row from the postcode search results
        :return: The parsed date, or datetime.min when the row has no date cell or no span to read it
            from, so that such rows sort behind every dated certificate instead of raising
        """
        expiry_date_tag = row.find('td', class_='govuk-table__cell date')
        if expiry_date_tag is None:
            return datetime.min
        # The date text is taken from the first span found under the cell's parent element
        expiry_span = expiry_date_tag.parent.find('span')
        if expiry_span is None:
            return datetime.min
        return datetime.strptime(expiry_span.text.strip(), '%d %B %Y')

    @staticmethod
    def _extract_bill_texts(address_res):
        """
        Retrieves the first two list items of the "bills-affected" section
        :param address_res: The parsed certificate page
        :return: A (heating_text, hot_water_text) tuple; an entry is None when it is not on the page
        """
        bills = address_res.find('div', {'id': 'bills-affected'})
        # If there are no items, it's usually because the EPC was very old. Early EPCs did not have
        # this information
        bills_list = bills.find_all('li') if bills is not None else []
        heating_text = bills_list[0].text if len(bills_list) > 0 else None
        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None
        return heating_text, hot_water_text

    @staticmethod
    def _parse_recommendation_steps(address_res, current_sap, current_epc):
        """
        Parses the recommended improvement steps and the SAP points gained by each of them
        :param address_res: The parsed certificate page
        :param current_sap: The current numeric rating, used as the baseline for the first step
        :param current_epc: The current rating letter, used when a step shows no rating of its own
        :return: A list of dictionaries with the keys step, measure, new_rating, new_epc and sap_points
        """
        recommendations = []
        recommendations_div = address_res.find('div', class_='epb-recommended-improvements')
        if not recommendations_div:
            return recommendations

        previous_sap_score = current_sap
        previous_epc = current_epc
        # Each h3 header introduces one step
        step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
        for step_num, step_header in enumerate(step_headers, start=1):
            # Extract the step title (the measure)
            measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")
            # Find the div containing the potential rating that follows this header
            potential_rating_div = step_header.find_next(
                'div', class_='epb-recommended-improvements__potential-rating'
            )
            # Steps without a potential rating div are skipped
            if not potential_rating_div:
                continue
            # Extract the rating text within the SVG text element; when it is absent the step is
            # recorded with the previous rating, i.e. zero SAP points
            extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold')
            if extracted_rating_text is not None:
                rating_text = extracted_rating_text.text.strip()
            else:
                rating_text = " ".join([str(previous_sap_score), previous_epc])
            # Parse the rating text to separate the numeric rating and EPC letter
            rating_parts = rating_text.split()
            new_rating = int(rating_parts[0])
            new_epc = rating_parts[1]
            recommendations.append({
                "step": step_num,
                "measure": measure_title,
                "new_rating": new_rating,
                "new_epc": new_epc,
                "sap_points": new_rating - previous_sap_score
            })
            previous_sap_score = new_rating
            previous_epc = new_epc
        return recommendations

    def _parse_assessment_information(self, address_res):
        """
        Parses the summary list in the "information" section into a dictionary
        :param address_res: The parsed certificate page
        :return: A dictionary of the assessment information keyed by the row label
        :raises ValueError: If any of EXPECTED_ASSESSMENT_KEYS is missing, which includes the case where
            the section itself is not on the page
        """
        assessment_information = address_res.find('div', {'id': 'information'})
        if assessment_information is not None:
            rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
        else:
            rows = []

        assessment_data = {}
        for row in rows:
            key = row.find('dt').text.strip()
            if key == "Type of assessment":
                # We dont reliably extract this
                continue
            value_tag = row.find('dd')
            link_tag = value_tag.find('a')
            # Check if value contains a link (email)
            if link_tag:
                value = link_tag.text.strip()
            elif value_tag.find('summary'):
                value = value_tag.find('span').text.strip()
            else:
                value = value_tag.text.strip()
            # These are keys that we have for both the surveyor and the accreditation scheme. The first
            # occurrence is stored for the assessor and any later one for the accreditation scheme
            if key in ("Telephone", "Email"):
                if "Assessor's " + key not in assessment_data:
                    assessment_data["Assessor's " + key] = value
                else:
                    assessment_data["Accreditation Scheme's " + key] = value
                continue
            assessment_data[key] = value

        # Check we have all the expected keys
        for key in self.EXPECTED_ASSESSMENT_KEYS:
            if key not in assessment_data:
                raise ValueError(f"Missing key: {key}")
        return assessment_data

    @staticmethod
    def _extract_wall_types(address_res):
        """
        Collects the description of every "Wall" row in the first table body of the certificate
        :param address_res: The parsed certificate page
        :return: A list of wall descriptions in page order; empty when the table is absent
        """
        property_features_table = address_res.find("tbody", class_="govuk-table__body")
        if property_features_table is None:
            return []
        walls = []
        for row in property_features_table.find_all("tr"):
            header = row.find("th")
            cells = row.find_all("td")
            # Rows lacking a header or a value cell cannot describe a wall
            if header is not None and cells and header.text.strip() == "Wall":
                walls.append(cells[0].text.strip())
        return walls

    def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):
        """
        This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
        :param recommendations: The recommendations from the EPC
        :param assessment_data: The assessment data from the EPC
        :param sap_2012_date: The date of the SAP 2012 update
        :return: A list of dictionaries with the keys type, sap_points and survey (plus suitable for solar_pv)
        :raises KeyError: If a recommendation's measure is not present in MEASURE_MAP
        """
        # Recommendations from certificates issued before sap_2012_date are flagged with survey=False
        survey = True
        if sap_2012_date is not None:
            certificate_date = datetime.strptime(assessment_data["Date of certificate"], "%d %B %Y")
            if certificate_date < pd.to_datetime(sap_2012_date):
                survey = False

        # Only the first recorded wall is inspected; with no walls recorded there is nothing to re-label
        first_wall_is_solid_brick = bool(self.walls) and "solid brick" in self.walls[0].lower()

        formatted_recommendations = []
        for rec in recommendations:
            for measure in self.MEASURE_MAP[rec["measure"]]:
                # A cavity wall recommendation on a property whose first wall is solid brick is
                # re-labelled as applying to an extension
                if measure == "cavity_wall_insulation" and first_wall_is_solid_brick:
                    measure = "extension_cavity_wall_insulation"
                to_append = {
                    "type": measure,
                    "sap_points": rec["sap_points"],
                    "survey": survey,
                }
                if measure == "solar_pv":
                    to_append["suitable"] = True
                formatted_recommendations.append(to_append)
        return formatted_recommendations