mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
debugging retrieve_newest_find_my_epc_data
This commit is contained in:
parent
7c4e32abc9
commit
b40f72216f
5 changed files with 396 additions and 8 deletions
47
etl/customers/ksquared/Wave3 Modelling.py
Normal file
47
etl/customers/ksquared/Wave3 Modelling.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import time
|
||||
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
|
||||
|
||||
|
||||
def app(
    asset_list_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
        "Trust.xlsx"
    ),
    request_delay=0.5,
):
    """
    This script prepares the asset lists for the additional housing associations, CAHA and Hornsey Housing Trust,
    that are forming a consortium led by AIHA.

    It reads the Hornsey Housing Trust asset list, tidies up the address columns and, for every home that has a
    starting EPC band, scrapes the newest EPC from the find my epc website.

    :param asset_list_path: Path to the asset list workbook. Defaults to the original hard-coded local path.
    :param request_delay: Seconds to pause before each lookup, so we don't hammer the website.
    :return: A list with one dictionary of scraped EPC data per home that was looked up
    """

    hornsey_asset_list = pd.read_excel(
        asset_list_path,
        sheet_name="Ksquared-All units information",
        header=3
    )

    # We don't need the first row
    hornsey_asset_list = hornsey_asset_list.iloc[1:]
    # Fill NA values with empty strings
    hornsey_asset_list = hornsey_asset_list.fillna("")

    # Trim each address column and collapse any run of whitespace (double spaces etc.) into a single space.
    # The previous `.str.replace(" ", " ")` swapped a space for a space, so it never removed anything.
    for col in ["Address letter or number", "Street address", "Postcode"]:
        hornsey_asset_list[col] = (
            hornsey_asset_list[col].astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
        )

    extracted_data = []
    for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):
        # Some properties do not have an epc
        if not home["Energy starting band (EPC)"]:
            continue

        # Only throttle when we are actually about to make a request
        time.sleep(request_delay)

        unit_number = home["Address letter or number"]
        street = home["Street address"]
        postcode = home["Postcode"]
        # Skip empty parts so we don't end up with a leading ", " when there is no unit number
        address = ", ".join([x for x in [unit_number, street] if x])

        searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
        epc_data = searcher.retrieve_newest_find_my_epc_data()
        extracted_data.append(epc_data)

    return extracted_data
|
||||
|
|
@ -236,6 +236,8 @@ def app():
|
|||
epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False)
|
||||
epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn")
|
||||
|
||||
stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
|
||||
|
||||
# Merge the EPCs on, with the data we need
|
||||
stonewater_cavity_properties = stonewater_cavity_properties.rename(
|
||||
columns={
|
||||
|
|
@ -265,14 +267,111 @@ def app():
|
|||
|
||||
# Filter on as built cavity properties
|
||||
additional_properties = additional_properties[
|
||||
additional_properties["Walls"].isin(
|
||||
cavity_descriptions +
|
||||
["Cavity: FilledCavity", "Cavity: External", "Cavity: Internal"]
|
||||
)
|
||||
additional_properties["Walls"].isin(cavity_descriptions)
|
||||
]
|
||||
additional_properties["Full Address"] = additional_properties["Address"].copy()
|
||||
house_numbers = []
|
||||
for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
|
||||
house_no = SearchEpc.get_house_number(x["Address"].split(",")[0], x["Postcode"])
|
||||
if house_no is None:
|
||||
house_no = x["Address"].split(",")[0]
|
||||
# If we end up with a number like "01" we need to remove the leading zero
|
||||
house_no = house_no.lstrip("0")
|
||||
house_numbers.append(
|
||||
{
|
||||
"Address ID": x["Address ID"],
|
||||
"Number": house_no
|
||||
}
|
||||
)
|
||||
|
||||
house_numbers = pd.DataFrame(house_numbers)
|
||||
additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
|
||||
additional_properties["row_id"] = additional_properties["Address ID"].copy()
|
||||
|
||||
# Pull the EPCs for these properties
|
||||
for _, home in tqdm(additional_properties.iterrows()):
|
||||
full_address = home["Address"]
|
||||
postcode = home["Postcode"]
|
||||
address1 = full_address.split(",")[0]
|
||||
additional_properties_epcs, errors = get_data(additional_properties)
|
||||
|
||||
# Save this data as a pickle
|
||||
# import pickle
|
||||
# with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
|
||||
# "wb") as f:
|
||||
# pickle.dump(additional_properties_epcs, f)
|
||||
|
||||
# We drop Full Address
|
||||
additional_properties = additional_properties.drop(columns=["Full Address"])
|
||||
additional_properties2 = additional_properties[[
|
||||
"row_id", "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
|
||||
"Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area",
|
||||
|
||||
]].rename(
|
||||
columns={
|
||||
"SAP": "Parity - Predicted SAP",
|
||||
"SAP Band": "Parity - Predicted SAP Band",
|
||||
"Age": "Parity - Build Age",
|
||||
"Property Type": "Parity - Property Type",
|
||||
"Walls": "Parity - Wall Construction",
|
||||
"Roofs": "Parity - Roof Construction",
|
||||
"Glazing": "Parity - Glazing Type",
|
||||
"Heating": "Parity - Heating Type",
|
||||
"Main Fuel": "Parity - Main Fuel",
|
||||
"Hot Water": "Parity - Hot Water",
|
||||
"Renewables": "Parity - Renewables",
|
||||
"Total Floor Area": "Parity - Total Floor Area"
|
||||
}
|
||||
).merge(
|
||||
pd.DataFrame(additional_properties_epcs)[
|
||||
[
|
||||
"row_id",
|
||||
"property-type",
|
||||
"built-form",
|
||||
"inspection-date",
|
||||
"current-energy-rating",
|
||||
"current-energy-efficiency",
|
||||
"roof-description",
|
||||
"walls-description",
|
||||
"transaction-type",
|
||||
"secondheat-description",
|
||||
"total-floor-area",
|
||||
"construction-age-band",
|
||||
"floor-height",
|
||||
"number-habitable-rooms",
|
||||
"mainheat-description",
|
||||
"energy-consumption-current"
|
||||
]
|
||||
].rename(
|
||||
columns={
|
||||
"inspection-date": "Date of last EPC",
|
||||
"current-energy-efficiency": "SAP score on register",
|
||||
"current-energy-rating": "EPC rating on register",
|
||||
"property-type": "Property Type",
|
||||
"built-form": "Archetype",
|
||||
"total-floor-area": "Property Floor Area",
|
||||
"construction-age-band": "Property Age Band",
|
||||
"floor-height": "Property Floor Height",
|
||||
"number-habitable-rooms": "Number of Habitable Rooms",
|
||||
"walls-description": "Wall Construction",
|
||||
"roof-description": "Roof Construction",
|
||||
"mainheat-description": "Heating Type",
|
||||
"secondheat-description": "Secondary Heating",
|
||||
"transaction-type": "Reason for last EPC",
|
||||
"energy-consumption-current": "Heat Demand (kWh/m2)",
|
||||
}
|
||||
),
|
||||
how="left",
|
||||
on="row_id"
|
||||
)
|
||||
|
||||
# We save the data locally
|
||||
stonewater_cavity_properties.to_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties.csv",
|
||||
index=False
|
||||
)
|
||||
additional_properties2.to_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties.csv",
|
||||
index=False
|
||||
)
|
||||
# Save the survey findings
|
||||
needs_cwi.to_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
|
||||
index=False
|
||||
)
|
||||
|
|
|
|||
|
|
@ -6,3 +6,5 @@ boto3
|
|||
epc-api-python==1.0.2
|
||||
usaddress==0.5.11
|
||||
fuzzywuzzy==0.18.0
|
||||
python-dotenv
|
||||
|
||||
|
|
|
|||
238
etl/find_my_epc/RetrieveFindMyEpc.py
Normal file
238
etl/find_my_epc/RetrieveFindMyEpc.py
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class RetrieveFindMyEpc:
    """
    Retrieves the newest EPC for a single property from the find my epc website
    (find-energy-certificate.service.gov.uk).

    Build an instance with the property's address and postcode, then call `retrieve_newest_find_my_epc_data`.
    """

    SEARCH_POSTCODE_URL = (
        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
    )
    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }

    # Seconds to wait for a response, so that a stalled connection cannot hang a batch run forever
    REQUEST_TIMEOUT = 30

    # Recommendation titles as they appear on the certificate page -> the measure types they translate to
    MEASURE_MAP = {
        "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
        "Hot water cylinder insulation": ["hot_water_tank_insulation"],
        "Hot water cylinder thermostat": ["cylinder_thermostat"],
        "High performance external doors": ["insulated_doors"],
        "Floor insulation (solid floor)": ["solid_floor_insulation"],
        "Double glazed windows": ["double_glazing"],
        "Cavity wall insulation": ["cavity_wall_insulation"],
        "Replace boiler with new condensing boiler": ["boiler_upgrade"],
    }

    # Keys that must be present once the assessment information has been parsed
    EXPECTED_ASSESSMENT_KEYS = [
        'Assessor’s name',
        "Assessor's Telephone",
        "Assessor's Email",
        'Assessor’s ID',
        'Accreditation scheme',
        'Assessor’s declaration',
        "Accreditation Scheme's Telephone",
        "Accreditation Scheme's Email",
        'Date of assessment',
        'Date of certificate'
    ]

    def __init__(self, address: str, postcode: str):
        """
        This class is tasked with retrieving the latest EPC data from the find my epc website
        :param address: The address of the property
        :param postcode: The postcode of the property
        """
        self.address = address
        self.postcode = postcode

        # NOTE: an empty address cleans to "", which every search result "starts with", so all rows would match
        self.address_cleaned = self._clean_address(self.address)

    @staticmethod
    def _clean_address(address):
        """
        Strips commas and spaces and lower-cases the address, so two addresses can be compared loosely
        """
        return address.replace(",", "").replace(" ", "").lower()

    def _fetch_page(self, url):
        """
        Downloads a page with the shared headers and timeout, and returns it parsed as BeautifulSoup
        """
        response = requests.get(url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, features="html.parser")

    def retrieve_newest_find_my_epc_data(self):
        """
        For a post code and address, we pull out all the required data from the find my epc website

        :return: A dictionary holding the certificate number, current and potential ratings, bills text,
            formatted recommendations and the assessment information
        :raises ValueError: If no certificate matches the address, or expected assessment information is missing
        """

        matches = self._find_matching_certificates()
        if not matches:
            raise ValueError("No EPC found")

        # We take the one with the most recent expiry date (the first one wins on a tie)
        newest = max(matches, key=lambda match: match['expiry_date'])

        chosen_epc = self.BASE_ENERGY_URL + newest['extracted_address_url']
        epc_certificate = chosen_epc.split('/')[-1]

        address_res = self._fetch_page(chosen_epc)

        # Key data we want to retrieve:
        # 1) Rating
        # 2) Bills estimates
        # 3) Recommendations and SAP points
        # 4) Low and zero carbon energy sources
        ratings = self._parse_ratings(address_res.find('desc', {'id': 'svg-desc'}).text)
        heating_text, hot_water_text = self._parse_bills(address_res)
        recommendations = self._parse_recommendations(address_res, ratings['current_epc_efficiency'])
        assessment_data = self._parse_assessment_information(address_res)

        # Finally, we format the recommendations
        recommendations = self.format_recommendations(recommendations)

        resulting_data = {
            'epc_certificate': epc_certificate,
            **ratings,
            "heating_text": heating_text,
            "hot_water_text": hot_water_text,
            "recommendations": recommendations,
            **assessment_data
        }

        return resulting_data

    def _find_matching_certificates(self):
        """
        Searches by postcode and returns the result rows whose address starts with the address we are looking for
        """
        postcode_input = self.postcode.replace(" ", "+")
        postcode_res = self._fetch_page(self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input))

        extracted_table = []
        for row in postcode_res.find_all('tr', class_='govuk-table__row'):
            # Extract the address and URL. Rows without a link (e.g. the header row) are skipped
            address_tag = row.find('a', class_='govuk-link')
            if address_tag is None:
                continue

            extracted_address = address_tag.text.strip()
            if not self._clean_address(extracted_address).startswith(self.address_cleaned):
                continue

            # If the address is a match, we can extract the data
            extracted_table.append(
                {
                    "extracted_address": extracted_address,
                    "extracted_address_url": address_tag['href'],
                    "expiry_date": self._parse_expiry_date(row),
                }
            )

        return extracted_table

    @staticmethod
    def _parse_expiry_date(row):
        """
        Extracts the expiry date of a search result row. If the row has no date we return datetime.min, so the
        certificate is still a candidate but loses to any certificate that does have a date
        """
        expiry_date_tag = row.find('td', class_='govuk-table__cell date')
        if expiry_date_tag is None:
            return datetime.min

        # The date text is read from the first span of the enclosing row
        date_span = expiry_date_tag.parent.find('span')
        if date_span is None:
            return datetime.min

        return datetime.strptime(date_span.text.strip(), '%d %B %Y')

    @staticmethod
    def _parse_ratings(ratings):
        """
        Splits the rating description into the current and potential rating letter and score. The first sentence
        describes the current rating and the second the potential rating; in each, the score is the last word and
        the letter is the sixth word from the end
        """
        sentences = ratings.split(".")
        current_rating = sentences[0]
        potential_rating = sentences[1]

        return {
            'current_epc_rating': current_rating.split(' ')[-6],
            'current_epc_efficiency': int(current_rating.split(' ')[-1]),
            'potential_epc_rating': potential_rating.split(' ')[-6],
            "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
        }

    @staticmethod
    def _parse_bills(address_res):
        """
        Retrieves the heating and hot water text from the bills section. Either value is None when it is missing
        """
        bills = address_res.find('div', {'id': 'bills-affected'})
        bills_list = bills.find_all('li') if bills is not None else []

        # If entries are missing, it's usually because the EPC was very old. Early EPCs did not have this
        # information
        heating_text = bills_list[0].text if len(bills_list) > 0 else None
        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None

        return heating_text, hot_water_text

    @staticmethod
    def _parse_recommendations(address_res, current_sap):
        """
        Retrieves the recommended steps, and the SAP points each one adds on top of the step before it
        """
        recommendations = []
        recommendations_div = address_res.find('div', class_='epb-recommended-improvements')
        if not recommendations_div:
            return recommendations

        # Find all h3 headers for each step and extract their related information
        step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
        previous_sap_score = current_sap
        for step_num, step_header in enumerate(step_headers, start=1):
            # Extract the step title (the measure)
            measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")

            # Find the div containing the potential rating within the same section
            potential_rating_div = step_header.find_next(
                'div', class_='epb-recommended-improvements__potential-rating'
            )
            if not potential_rating_div:
                continue

            # Extract the rating text within the SVG text element
            rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip()
            # Parse the rating text to separate the numeric rating and EPC letter
            new_rating = int(rating_text.split()[0])
            new_epc = rating_text.split()[1]

            recommendations.append({
                "step": step_num,
                "measure": measure_title,
                "new_rating": new_rating,
                "new_epc": new_epc,
                "sap_points": new_rating - previous_sap_score
            })
            previous_sap_score = new_rating

        return recommendations

    def _parse_assessment_information(self, address_res):
        """
        Parses the assessment information section into a dictionary
        :raises ValueError: If one of the expected keys was not found
        """
        # Search for the assessment information
        assessment_information = address_res.find('div', {'id': 'information'})
        rows = assessment_information.find_all('div', class_='govuk-summary-list__row')

        assessment_data = {}
        for row in rows:
            key = row.find('dt').text.strip()
            if key == "Type of assessment":
                # We don't reliably extract this
                continue
            value_tag = row.find('dd')

            # Check if value contains a link (email)
            if value_tag.find('a'):
                value = value_tag.find('a').text.strip()
            elif value_tag.find('summary'):
                value = value_tag.find('span').text.strip()
            else:
                value = value_tag.text.strip()

            # These are keys that we have for both the surveyor and the accreditation scheme. The first
            # occurrence is stored as the assessor's and the next as the accreditation scheme's, so we make
            # that information clear
            if key in ["Telephone", "Email"]:
                if "Assessor's " + key not in assessment_data:
                    assessment_data["Assessor's " + key] = value
                else:
                    assessment_data["Accreditation Scheme's " + key] = value
                continue

            assessment_data[key] = value

        # Check we have all the expected keys
        for key in self.EXPECTED_ASSESSMENT_KEYS:
            if key not in assessment_data:
                raise ValueError(f"Missing key: {key}")

        return assessment_data

    def format_recommendations(self, recommendations):
        """
        This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey

        A recommendation whose title we have no mapping for is skipped with a warning, rather than raising a
        KeyError and losing everything else that was scraped for the property.

        :param recommendations: List of dictionaries, each with at least a "measure" and a "sap_points" key
        :return: List of dictionaries with the keys "type", "sap_points" and "survey", one per mapped measure type
        """
        import logging

        logger = logging.getLogger(__name__)
        # Referenced through the class, as this method has never needed an instance
        measure_map = RetrieveFindMyEpc.MEASURE_MAP

        formatted_recommendations = []
        for rec in recommendations:
            mapped = measure_map.get(rec["measure"])
            if mapped is None:
                logger.warning("No measure mapping for EPC recommendation %r - skipping", rec["measure"])
                continue

            # One recommendation can map to several measure types, each carrying the same SAP points
            for measure in mapped:
                formatted_recommendations.append(
                    {
                        "type": measure,
                        "sap_points": rec["sap_points"],
                        "survey": True
                    }
                )

        return formatted_recommendations
|
||||
2
etl/find_my_epc/requirements.txt
Normal file
2
etl/find_my_epc/requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
pandas
|
||||
beautifulsoup4
|
||||
Loading…
Add table
Reference in a new issue