Added a patch method for scraping EPC data

This commit is contained in:
Khalim Conn-Kowlessar 2025-04-10 23:10:52 +01:00
parent fd2600b9ba
commit 2d71ad25ef
14 changed files with 564 additions and 621 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

File diff suppressed because it is too large Load diff

View file

@ -10,6 +10,7 @@ from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS
from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
from asset_list.mappings.heating_systems import HEATING_MAPPINGS
from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
from asset_list.mappings.roof import ROOF_CONSTRUCTION_MAPPINGS
from asset_list.utils import get_data
from dotenv import load_dotenv
@ -88,6 +89,63 @@ def app():
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
# - Or the insulation required is loft/cavity (floors should be solid)
# Torus
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1"
data_filename = "Torus Property Asset List - Phase 1.xlsx"
sheet_name = "TORUS"
postcode_column = 'Postcode'
fulladdress_column = None
address1_column = "AddressLine1"
address1_method = None
address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
missing_postcodes_method = None
landlord_year_built = "Property Age"
landlord_os_uprn = "NatUPRN"
landlord_property_type = "Property Type"
landlord_built_form = "Built Form"
landlord_wall_construction = "Wall Construction"
landlord_roof_construction = "Roof Construction"
landlord_heating_system = "Space Heating Source"
landlord_existing_pv = "Low Carbon Technology (Solar PV)"
landlord_property_id = "UPRN"
landlord_sap = "SAP Score"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
master_to_asset_list_filepath = None
phase = True
# Ealing - houses
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing"
data_filename = "Ealing_rechecked_cleaned_05042025.csv"
sheet_name = None
postcode_column = 'Postcode'
fulladdress_column = "Address"
address1_column = None
address1_method = "house_number_extraction"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = "Year Built"
landlord_os_uprn = None
landlord_property_type = "Property Type Code"
landlord_built_form = None
landlord_wall_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "Property ref"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
outcomes_address = None
master_filepaths = []
master_to_asset_list_filepath = None
# Southern Midlands
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025"
data_filename = "Southern Housing Midlands Property List - combined.xlsx"
@ -446,8 +504,11 @@ def app():
landlord_property_type=landlord_property_type,
landlord_built_form=landlord_built_form,
landlord_wall_construction=landlord_wall_construction,
landlord_roof_construction=landlord_roof_construction,
landlord_heating_system=landlord_heating_system,
landlord_existing_pv=landlord_existing_pv
landlord_existing_pv=landlord_existing_pv,
landlord_sap=landlord_sap,
phase=phase
)
asset_list.init_standardise()
@ -486,6 +547,13 @@ def app():
).items()
if k not in EXISTING_PV_MAPPINGS
}
new_roof_construction_map = {
k: v for k, v in (
asset_list.variable_mappings[asset_list.landlord_roof_construction] if
asset_list.landlord_roof_construction else {}
).items()
if k not in ROOF_CONSTRUCTION_MAPPINGS
}
asset_list.apply_standardiation()
@ -511,7 +579,7 @@ def app():
epc_api_only = False
force_retrieve_data = False
skip = None # Used to skip already completed chunks
chunk_size = 5000
chunk_size = 1000
filename = "Chunk {i}.csv"
download_folder = os.path.join(data_folder, "Chunks")
if not os.path.exists(download_folder):
@ -529,8 +597,6 @@ def app():
if any(x in folder_contents for x in downloaded_files):
skip = max([i for i in chunk_indexes if filename.format(i=i) in folder_contents])
# folder_contents = [f for f in folder_contents if "nodata" not in f and f.endswith(".csv")]
for i in range(0, len(asset_list.standardised_asset_list), chunk_size):
print(f"Processing chunk {i} to {i + chunk_size}")
if skip is not None and not force_retrieve_data:

View file

@ -80,5 +80,32 @@ BUILT_FORM_MAPPINGS = {
'House: MidTerrace': 'mid-terrace',
'House: EndTerrace': 'end-terrace',
'Bungalow: EndTerrace': 'end-terrace',
'Bungalow: MidTerrace': 'mid-terrace'
'Bungalow: MidTerrace': 'mid-terrace',
'Flat: Semi Detached: Mid Floor': 'semi-detached',
'Maisonette: Mid Terrace: Top Floor': 'mid-terrace',
'Flat: Enclosed Mid Terrace: Mid Floor': 'mid-terrace',
'Flat: Enclosed Mid Terrace: Ground Floor': 'mid-terrace',
'Flat: Detached: Ground Floor': 'detached',
'Flat: Detached: Mid Floor': 'detached',
'Flat: Detached: Top Floor': 'detached',
'Flat: Enclosed End Terrace: Mid Floor': 'end-terrace',
'Bungalow: Detached': 'detached',
'Maisonette: End Terrace: Mid Floor': 'end-terrace',
'Maisonette: Detached: Top Floor': 'detached',
'Flat: Enclosed End Terrace: Ground Floor': 'end-terrace',
'Flat: Enclosed Mid Terrace: Top Floor': 'mid-terrace',
'House: EnclosedEndTerrace': 'end-terrace',
'3 Ext. Wall Flat': 'semi-detached',
'Bungalow Detached': 'detached',
'Bungalow End Terrace': 'end-terrace',
'Bungalow Mid Terrace': 'mid-terrace',
'Bungalow Semi Detached': 'detached',
'Maisonette 2 Ext. Wall': 'mid-terrace',
'Maisonette 3 Ext. Wall': 'semi-detached',
'End-terrace': 'end-terrace',
'Mid-terrace': 'mid-terrace',
'Semi-detached': 'semi-detached',
'Detached': 'detached',
'Flat / maisonette': 'unknown',
'2014 onwards': 'unknown'
}

View file

@ -1,3 +1,5 @@
import numpy as np
STANDARD_EXISTING_PV = {
"already has PV", "no PV", "unknown"
}
@ -9,4 +11,10 @@ EXISTING_PV_MAPPINGS = {
"yes": "already has PV",
True: "already has PV",
False: "no PV",
np.nan: 'unknown',
'PV: 2kWp array': 'already has PV',
'PV: 25% roof area, PV: 3.6kWp array': 'already has PV',
'PV: 10% roof area, PV: 2kWp array': 'already has PV',
'PV: 50% roof area': 'already has PV',
'Solar PV': 'already has PV'
}

View file

@ -21,7 +21,9 @@ STANDARD_HEATING_SYSTEMS = {
'oil fuel',
'solid fuel',
'gas combi boiler',
'unknown'
'unknown',
"electric ceiling",
"electric underfloor"
}
HEATING_MAPPINGS = {
@ -143,5 +145,30 @@ HEATING_MAPPINGS = {
'Boiler: A rated Regular Boiler Electricity: Electricity': 'electric boiler',
'Community Heating Systems: Community boilers only (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler',
'Boiler: A rated Combi Gas: Mains Gas': 'gas condensing combi',
'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler'
'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler',
'Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C': 'ground source heat pump',
'Heat Pump: Electric Heat pumps: Ground source heat pump in other cases': 'ground source heat pump',
'Electric Storage Systems: High heat retention storage heaters': 'high heat retention storage heaters',
'Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C': 'air source heat pump',
'Electric (direct acting) room heaters: Panel, convector or radiant heaters': 'room heaters',
'Boiler: C rated Combi': 'gas combi boiler',
'Boiler: B rated Regular Boiler': 'gas condensing boiler',
'Boiler: E rated Combi': 'gas combi boiler',
'Boiler: A rated Combi': 'gas combi boiler',
'Boiler: E rated Regular Boiler': 'gas condensing boiler',
'Community Heating Systems: Community boilers only (RdSAP)': 'district heating',
'Boiler: C rated Regular Boiler': 'gas condensing boiler',
'Boiler: A rated Regular Boiler': 'gas condensing boiler',
'Electric Storage Systems: Fan storage heaters': 'electric storage heaters',
'Boiler: F rated Combi': 'gas combi boiler',
'Room heaters': 'room heaters',
'Room Heaters': 'room heaters',
'Boiler': 'gas condensing boiler',
'Heat Pump (Wet)': 'air source heat pump',
'Community Heating': 'district heating',
'Heat pump (wet)': 'air source heat pump',
'Electric ceiling heating': 'electric ceiling',
'Electric under floor heating': 'electric underfloor',
'Community heating': 'district heating'
}

View file

@ -136,5 +136,20 @@ PROPERTY_MAPPING = {
'Flat: Semi Detached: Top Floor': 'flat',
'Flat: Mid Terrace: Ground Floor': 'flat',
'Bungalow: MidTerrace': 'bungalow',
'Flat: Enclosed End Terrace: Top Floor': 'flat'
'Flat: Enclosed End Terrace: Top Floor': 'flat',
'Flat: Semi Detached: Mid Floor': 'flat',
'Maisonette: Mid Terrace: Top Floor': 'maisonette',
'House: EnclosedEndTerrace': 'house',
'Flat: Detached: Ground Floor': 'flat',
'Flat: Detached: Mid Floor': 'flat',
'Flat: Detached: Top Floor': 'flat',
'Bungalow: Detached': 'bungalow',
'Maisonette: End Terrace: Mid Floor': 'maisonette',
'Maisonette: Detached: Top Floor': 'maisonette',
'Flat: Enclosed Mid Terrace: Mid Floor': 'flat',
'Flat: Enclosed Mid Terrace: Ground Floor': 'flat',
'Flat: Enclosed End Terrace: Mid Floor': 'flat',
'Flat: Enclosed End Terrace: Ground Floor': 'flat',
'Flat: Enclosed Mid Terrace: Top Floor': 'flat',
'2013 onwards': 'unknown'
}

View file

@ -0,0 +1,26 @@
import numpy as np
# Canonical roof-construction labels; every value in ROOF_CONSTRUCTION_MAPPINGS
# below is one of these.
STANDARD_ROOF_CONSTRUCTIONS = {
    "pitched access to loft",
    "pitched no access to loft",
    "pitched unknown access to loft",
    # Was misspelt "piched unknown insulation"; no mapping referenced the misspelt label.
    "pitched unknown insulation",
    "pitched insulated",
    "another dwelling above",
    "flat unknown insulation",
    "unknown insulated",
    "unknown",
}

# Raw roof descriptions as they appear in asset lists -> canonical label.
ROOF_CONSTRUCTION_MAPPINGS = {
    'Flat': 'flat unknown insulation',
    'Pitched (access to loft)': 'pitched access to loft',
    'Pitched (no access to loft)': 'pitched no access to loft',
    'Another dwelling above': 'another dwelling above',
    'Same dwelling above': 'another dwelling above',
    'As-built': 'unknown',
    'ND (inferred)': 'unknown',
    '2018 onwards': 'unknown',
    'Pitched (vaulted ceiling)': 'pitched insulated',
    # NOTE(review): NaN never compares equal to itself, so this entry is only hit when the
    # looked-up key is the very same np.nan object - confirm callers normalise missing
    # values to np.nan before the lookup.
    np.nan: "unknown",
}

View file

@ -147,5 +147,15 @@ WALL_CONSTRUCTION_MAPPINGS = {
'Cavity: AsBuilt (1983-1995), Cavity: FilledCavity': 'filled cavity',
'SolidBrick: AsBuilt': 'solid brick unknown insulation',
'Cavity: FilledCavity': 'filled cavity',
'SolidBrick: Internal': 'insulated solid brick'
'SolidBrick: Internal': 'insulated solid brick',
'Cavity: External': 'filled cavity',
'Sandstone: Internal': 'sandstone or limestone',
'Cavity: AsBuilt (Pre 1976)': 'cavity unknown insulation',
'System build': 'system built',
'Solid brick': 'solid brick unknown insulation',
'Stone': 'sandstone or limestone',
'Timber frame': 'timber frame unknown insulation',
'2017 onwards': 'new build - average thermal transmittance',
'ND (inferred)': 'unknown',
'Flat / maisonette': 'other'
}

View file

@ -755,6 +755,10 @@ class SearchEpc:
"photo-supply"]
)
estimated_epc["co2-emiss-curr-per-floor-area"] = (
estimated_epc["co2-emissions-current"] / estimated_epc["total-floor-area"]
)
estimated_epc["postcode"] = self.postcode
if not self.uprn:
# Update self.uprn too

View file

@ -4,7 +4,7 @@ from dotenv import load_dotenv
from utils.s3 import save_csv_to_s3
from etl.find_my_epc.AssetListEpcData import AssetListEpcData
PORTFOLIO_ID = 138
PORTFOLIO_ID = 140
USER_ID = 8
load_dotenv(dotenv_path="backend/.env")
@ -19,14 +19,17 @@ def app():
asset_list = [
{
"address": "42 Rippolson Road",
"postcode": "SE18 1NS",
"uprn": 100020999275,
"address": "Brow Cottage",
"postcode": "YO18 7PZ",
"uprn": 10007630752,
"property_type": "House",
"built_form": "Semi-Detached",
"patch": True
},
{
"address": "66 Riverdale Road",
"postcode": "DA8 1PX",
"uprn": 100020235516
"address": "Wyburn",
"postcode": "DT1 2LL",
"uprn": 100040630290
},
]
asset_list = pd.DataFrame(asset_list)
@ -46,6 +49,7 @@ def app():
)
asset_list_epc_client.get_data()
asset_list_epc_client.get_non_invasive_recommendations()
asset_list_epc_client.get_patch()
# Store non-invasive recommendations in S3
non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
@ -55,14 +59,24 @@ def app():
file_name=non_invasive_recommendations_filename
)
# Store patches in S3
patches_filename = ""
if asset_list_epc_client.patches:
patches_filename = f"{USER_ID}/{PORTFOLIO_ID}/patches.csv"
save_csv_to_s3(
dataframe=pd.DataFrame(asset_list_epc_client.patches),
bucket_name="retrofit-plan-inputs-dev",
file_name=patches_filename
)
valuation_data = [
{
"valuation": 469_000,
"uprn": 100020999275,
"uprn": 10007630752,
},
{
"valuation": 382_000,
"uprn": 100020235516
"valuation": 373_000,
"uprn": 100040630290
},
]
# Store valuation data to s3
@ -80,7 +94,7 @@ def app():
"goal_value": "C",
"trigger_file_path": filename,
"already_installed_file_path": "",
"patches_file_path": "",
"patches_file_path": patches_filename,
"non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
"valuation_file_path": valuation_filename,
"scenario_name": "Full package remote assessment",

View file

@ -26,6 +26,7 @@ class AssetListEpcData:
self.extracted_data = None
self.non_invasive_recommendations = None
self.patches = None
@staticmethod
def check_asset_list(asset_list):
@ -52,6 +53,21 @@ class AssetListEpcData:
} for r in self.extracted_data
]
def get_patch(self):
    """
    Gather the patch of every extracted record into ``self.patches``.

    Each entry is the record's ``patch`` dict with the record's ``uprn`` added in front of it;
    records without a (non-empty) ``patch`` are left out.

    :return: None - the result is stored on ``self.patches``
    :raises ValueError: if ``self.extracted_data`` has not been populated yet
    """
    records = self.extracted_data
    if records is None:
        raise ValueError("extracted data is missing - run get_data first")

    collected = []
    for record in records:
        patch_fields = record.get("patch")
        if not patch_fields:
            # Nothing to patch for this property
            continue
        collected.append({"uprn": record.get("uprn"), **patch_fields})

    self.patches = collected
def get_data(self):
logger.info("Retrieving data for given asset list")
@ -67,11 +83,18 @@ class AssetListEpcData:
postcode=pc,
uprn=home.get("uprn"),
auth_token=self.epc_auth_token,
os_api_key=""
os_api_key="",
)
epc_searcher.ordnance_survey_client.property_type = home.get("property_type")
epc_searcher.ordnance_survey_client.built_form = home.get("built_form")
epc_searcher.find_property(skip_os=True)
if epc_searcher.newest_epc is None:
continue
if not pd.isnull(home.get("patch")):
epc_searcher.newest_epc["address1"] = add1
# Attempt both methods:
try:
find_epc_searcher = RetrieveFindMyEpc(
@ -89,14 +112,22 @@ class AssetListEpcData:
time.sleep(0.5)
# We need uprn
extracted_data.append(
{
to_append = {
"uprn": home.get("uprn"),
"address": home["address"],
"postcode": home["postcode"],
**find_epc_data,
}
)
if not pd.isnull(home.get("patch")):
to_append["patch"] = {
"current-energy-rating": find_epc_data["current_epc_rating"],
"current-energy-efficiency": find_epc_data["current_epc_efficiency"],
"potential-energy-rating": find_epc_data["potential_epc_rating"],
"potential-energy-efficiency": find_epc_data["potential_epc_efficiency"],
**find_epc_data["epc_data"]
}
extracted_data.append(to_append)
self.extracted_data = extracted_data
logger.info("Data Extrction complete")

View file

@ -1,3 +1,4 @@
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
@ -45,6 +46,85 @@ class RetrieveFindMyEpc:
sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")}
return sources
@staticmethod
def get_text(elem):
return elem.get_text(strip=True) if elem else None
def extract_epc_data(self, soup):
    """
    Extract the EPC fields from a parsed certificate page.

    :param soup: parsed page; it is queried through ``find`` and ``select``
        (presumably a BeautifulSoup document - confirm against the caller)
    :return: dict holding the total floor area, the description / rating of each building
        feature, the primary energy use and the current / potential CO2 emissions. A feature
        or figure that cannot be found on the page is returned as None
    :raises AttributeError: if the "Total floor area" entry is missing from the page
    :raises ValueError: if the floor area text does not start with an integer
    """
    results = {}

    # 1. Total floor area - the first space-separated token of the value cell is the number
    floor_area_label = soup.find("dt", string="Total floor area")
    floor_area_text = self.get_text(floor_area_label.find_next_sibling("dd"))
    results['total-floor-area'] = int(floor_area_text.split(" ")[0])

    # Table with features
    rows = soup.select("table.govuk-table tbody tr")

    # Only the two-word ratings need their capitalisation adjusted
    rating_map = {
        "Very poor": "Very Poor",
        "Very good": "Very Good"
    }

    def get_feature_row_text(feature_name, index=0):
        # The header test is a substring match, so "Main heating" also matches a
        # "Main heating control" row; ``index`` picks among the matches in page order.
        matches = []
        for row in rows:
            header = row.find("th")
            if header and feature_name in header.text:
                matches.append(row)
        if len(matches) <= index:
            return None, None

        cells = matches[index].find_all("td")
        # A row with fewer than two cells yields None instead of an IndexError
        description = self.get_text(cells[0]) if len(cells) > 0 else None
        rating = self.get_text(cells[1]) if len(cells) > 1 else None
        return description, rating_map.get(rating, rating)

    # 2-17. (row header, description key, rating key); a rating key of None means that
    # only the description is kept for the feature.
    feature_fields = (
        ("Wall", 'walls-description', 'walls-energy-eff'),
        ("Roof", 'roof-description', 'roof-energy-eff'),
        ("Window", 'windows-description', 'windows-energy-eff'),
        ("Main heating", 'mainheat-description', 'mainheat-energy-eff'),
        ("Main heating control", 'mainheatcont-description', 'mainheatc-energy-eff'),
        # Was 'hot-water-energy-ef'; every other rating key ends in '-energy-eff'
        ("Hot water", 'hotwater-description', 'hot-water-energy-eff'),
        ("Lighting", 'lighting-description', 'lighting-energy-eff'),
        ("Floor", 'floor-description', None),
        ("Secondary heating", 'secondheat-description', None),
    )
    for feature_name, description_key, rating_key in feature_fields:
        description, rating = get_feature_row_text(feature_name)
        results[description_key] = description
        if rating_key is not None:
            results[rating_key] = rating

    # 18. Primary energy use
    p_energy = soup.find(
        string=lambda t: bool(t) and "primary energy use for this property per year" in t.lower()
    )
    # Accept thousands separators ("1,234 kilowatt ...") and tolerate a missing sentence
    match = re.search(r"(\d[\d,]*)\s+kilowatt", p_energy) if p_energy else None
    results['energy-consumption-current'] = int(match.group(1).replace(",", "")) if match else None

    def first_number(elem):
        # First run of digits / dots in the element's text as a float; None when the
        # element is missing or holds no number.
        if elem is None:
            return None
        found = re.search(r"([\d.]+)", elem.text)
        return float(found.group(1)) if found else None

    # 19. Current CO2 emissions
    results['co2-emissions-current'] = first_number(soup.find("dd", id="eir-property-produces"))

    # Need co2-emiss-curr-per-floor-area

    # 20. Potential CO2 emissions
    results['co2-emissions-potential'] = first_number(soup.find("dd", id="eir-potential-production"))

    return results
def retrieve_newest_find_my_epc_data(self, sap_2012_date=None):
"""
For a post code and address, we pull out all the required data from the find my epc website
@ -115,6 +195,9 @@ class RetrieveFindMyEpc:
potential_rating = ratings.split(".")[1]
current_sap = int(current_rating.split(' ')[-1])
# Floor area
address_res.find()
# Retrieve the energy consumption
bills = address_res.find('div', {'id': 'bills-affected'})
bills_list = bills.find_all('li')
@ -232,6 +315,9 @@ class RetrieveFindMyEpc:
# 4) Low and zero carbon energy sources
low_carbon_energy_sources = self.extract_low_carbon_sources(address_res)
# 5) Pull out the EPC data
epc_data = self.extract_epc_data(address_res)
resulting_data = {
'epc_certificate': epc_certificate,
'current_epc_rating': current_rating.split(' ')[-6],
@ -241,8 +327,9 @@ class RetrieveFindMyEpc:
"heating_text": heating_text,
"hot_water_text": hot_water_text,
"recommendations": recommendations,
"epc_data": epc_data,
**assessment_data,
**low_carbon_energy_sources
**low_carbon_energy_sources,
}
return resulting_data