Model/etl/find_my_epc/RetrieveFindMyEpc.py

import time
import re
import requests
import pandas as pd
from copy import deepcopy
from bs4 import BeautifulSoup
from datetime import datetime

from utils.logger import setup_logger

# Module-level logger used throughout this file, created via the project's setup_logger helper
logger = setup_logger()


class RetrieveFindMyEpc:
    """
    Scrapes EPC data for a property from the find-energy-certificate.service.gov.uk website.

    A certificate page is reached either directly from a known RRN or by searching on the postcode and matching
    the address against the search results. Pages are fetched with requests and parsed with BeautifulSoup.
    """

    # Postcode search endpoint; the {postcode_input} placeholder is filled in with str.format
    SEARCH_POSTCODE_URL = (
        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
    )
    # Root of the site, used to build certificate URLs and to prefix the relative links scraped from pages
    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

    # Browser-like User-Agent header passed with the requests made by this class
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }

    def __init__(
        self, address: str, postcode: str, rrn: str = None, address_postal_town: str = "", sap_rating: int = None
    ):
        """
        This class is tasked with retrieving the latest EPC data from the find my epc website
        :param address: The address of the property
        :param postcode: The postcode of the property
        :param rrn: The RRN of the EPC (if known)
        """
        self.address = address
        self.postcode = postcode
        self.rrn = rrn

        self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()

        # Containers for the extracted components
        self.walls = []

        self.address_postal_town = address_postal_town
        if self.address_postal_town:
            self.address_postal_town = self.address_postal_town.replace(",", "").replace(" ", "").lower()

        self.sap_rating = sap_rating

    @staticmethod
    def extract_low_carbon_sources(soup):
        # Find the section header
        section_header = soup.find("h3", string="Low and zero carbon energy sources")
        if not section_header:
            return {}

        # Locate the list following the header
        energy_list = section_header.find_next("ul")

        # Extract the list items
        sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")}
        return sources

    @staticmethod
    def get_text(elem):
        return elem.get_text(strip=True) if elem else None

    def extract_epc_data(self, soup):

        results = {}

        # 1. Total floor area
        # We have some isntances of very old EPCs where the total floor area is not available
        tfa = self.get_text(
            soup.find("dt", string="Total floor area").find_next_sibling("dd")
        ).split(" ")[0]
        results['total-floor-area'] = int(tfa) if tfa != "Not" else None

        # Table with features
        rows = soup.select("table.govuk-table tbody tr")

        rating_map = {
            "Very poor": "Very Poor",
            "Very good": "Very Good"
        }

        def get_feature_row_text(feature_name, index=0):
            matches = [row for row in rows if row.find("th") and feature_name in row.find("th").text]
            if len(matches) > index:
                # A commonly seen case is when feature_name is Main heating and we want to make sure we get
                # main heating and not main heating control
                if feature_name == "Main heating":
                    matches = [
                        row for row in matches if row.find("th") and row.find("th").text.strip() == "Main heating"
                    ]
                cells = matches[index].find_all("td")
                description = self.get_text(cells[0])
                rating = self.get_text(cells[1])
                return description, rating_map.get(rating, rating)
            return None, None

        # 2-3. First wall description and rating
        results['walls-description'], results['walls-energy-eff'] = get_feature_row_text("Wall", 0)

        # 4-5. First roof description and rating
        results['roof-description'], results['roof-energy-eff'] = get_feature_row_text("Roof", 0)

        # 6-7. Windows description and rating
        results['windows-description'], results['windows-energy-eff'] = get_feature_row_text("Window")

        # 8-9. Main heating description and rating
        results['mainheat-description'], results['mainheat-energy-eff'] = get_feature_row_text("Main heating")

        # 10-11. Main heating control description and rating
        results['mainheatcont-description'], results['mainheatc-energy-eff'] = get_feature_row_text(
            "Main heating control"
        )

        # 12-13. Hot water description and rating
        results['hotwater-description'], results['hot-water-energy-ef'] = get_feature_row_text("Hot water")

        # 14-15. Lighting description and rating
        results['lighting-description'], results['lighting-energy-eff'] = get_feature_row_text("Lighting")

        # 16. Floor description
        results['floor-description'], _ = get_feature_row_text("Floor")

        # 17. Secondary heating description
        results['secondheat-description'], _ = get_feature_row_text("Secondary heating")

        # 18. Primary energy use
        p_energy = soup.find(string=lambda t: "primary energy use for this property per year" in t.lower())
        # We should always have this
        match = re.search(r"(\d+)\s+kilowatt", p_energy)
        results['energy-consumption-current'] = int(match.group(1)) if match else None

        # 19. Current CO2 emissions
        co2_now = soup.find("dd", id="eir-property-produces")
        # We should always have this
        match = re.search(r"([\d.]+)", co2_now.text)
        results['co2-emissions-current'] = float(match.group(1)) if match else None
        # Need co2-emiss-curr-per-floor-area

        # 20. Potential CO2 emissions
        co2_pot = soup.find("dd", id="eir-potential-production")
        match = re.search(r"([\d.]+)", co2_pot.text)
        results['co2-emissions-potential'] = float(match.group(1)) if match else None

        return results

    def _extract_epc_from_soup(self, soup, epc_certificate, sap_2012_date=None):
        """
        Parse a single EPC certificate page into a dictionary.
        As a side effect, self.walls is replaced with the wall descriptions found on this page
        :param soup: The parsed certificate page
        :param epc_certificate: The certificate number to record against the extracted data
        :param sap_2012_date: Passed through to format_recommendations
        :return: Dictionary of the current/potential ratings, the bill texts, the formatted recommendations, the
            EPC data, the assessment details and any low and zero carbon energy sources
        :raises ValueError: If any of the expected assessment details is missing from the page
        """

        # The chart description holds two sentences: the current rating, then the potential rating. Within each,
        # the band letter is the sixth token from the end and the score is the last token
        ratings = soup.find('desc', {'id': 'svg-desc'}).text
        current_rating = ratings.split(".")[0]
        potential_rating = ratings.split(".")[1]
        current_tokens = current_rating.split(' ')
        potential_tokens = potential_rating.split(' ')
        current_sap = int(current_tokens[-1])

        # Retrieve the energy consumption
        # Early EPCs did not have this information, so the section, or either of its two entries, may be missing.
        # This matters here in particular as this function is also run over historic certificates
        bills = soup.find('div', {'id': 'bills-affected'})
        bills_list = bills.find_all('li') if bills is not None else []
        heating_text = bills_list[0].text if len(bills_list) > 0 else None
        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None

        # Retrieve the recommendations and SAP points
        recommendations = []
        recommendations_div = soup.find('div', class_='epb-recommended-improvements')
        if recommendations_div:
            # Find all h3 headers for each step and extract their related information
            step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
            previous_sap_score = current_sap
            previous_epc = current_tokens[-6]
            for step_num, step_header in enumerate(step_headers, start=1):
                # Extract the step title (the measure)
                measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")

                # Find the div containing the potential rating within the same section
                potential_rating_div = step_header.find_next(
                    'div', class_='epb-recommended-improvements__potential-rating'
                )

                # Steps with no potential rating div are left out of the recommendations
                if potential_rating_div:
                    # Extract the rating text within the SVG text element
                    extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold')
                    if extracted_rating_text is not None:
                        rating_text = extracted_rating_text.text.strip()
                    else:
                        # No rating shown for this step, so we carry the previous rating forward (0 SAP points)
                        rating_text = " ".join([str(previous_sap_score), previous_epc])
                    # Parse the rating text to separate the numeric rating and EPC letter
                    new_rating = int(rating_text.split()[0])
                    new_epc = rating_text.split()[1]

                    # Append the information as a dictionary to the recommendations list
                    recommendations.append({
                        "step": step_num,
                        "measure": measure_title,
                        "new_rating": new_rating,
                        "new_epc": new_epc,
                        "sap_points": new_rating - previous_sap_score
                    })
                    previous_sap_score = new_rating
                    previous_epc = new_epc

        # Search for the assessment information
        assessment_information = soup.find('div', {'id': 'information'})
        # Parse this information
        rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
        # Create a dictionary to hold the parsed information
        assessment_data = {}
        for row in rows:
            key = row.find('dt').text.strip()
            if key == "Type of assessment":
                # We don't reliably extract this
                continue
            value_tag = row.find('dd')

            # Check if value contains a link (email)
            if value_tag.find('a'):
                value = value_tag.find('a').text.strip()
            elif value_tag.find('summary'):
                value = value_tag.find('span').text.strip()
            else:
                value = value_tag.text.strip()

            # These keys appear for both the assessor and the accreditation scheme. The first occurrence is
            # taken to be the assessor's and any later one the accreditation scheme's
            if key in ["Telephone", "Email"]:
                if "Assessor's " + key not in assessment_data:
                    assessment_data["Assessor's " + key] = value
                else:
                    assessment_data["Accreditation Scheme's " + key] = value
                continue

            assessment_data[key] = value

        expected_keys = [
            'Assessor’s name',
            "Assessor's Telephone",
            "Assessor's Email",
            'Assessor’s ID',
            'Accreditation scheme',
            'Assessor’s declaration',
            "Accreditation Scheme's Telephone",
            "Accreditation Scheme's Email",
            'Date of assessment',
            'Date of certificate'
        ]
        # Check we have all the expected keys
        for key in expected_keys:
            if key not in assessment_data:
                raise ValueError(f"Missing key: {key}")

        # The wall types of the property, taken from the first table body on the page
        property_features_table = soup.find("tbody", class_="govuk-table__body")
        property_features_table = property_features_table.find_all("tr")

        property_components = self.extract_property_components(property_features_table)

        # Extract walls
        self.walls = [x["description"] for x in property_components if x["component_name"] == "Wall"]

        # Finally, we format the recommendations
        recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)

        # 4) Low and zero carbon energy sources
        low_carbon_energy_sources = self.extract_low_carbon_sources(soup)

        # 5) Pull out the EPC data
        epc_data = self.extract_epc_data(soup)

        resulting_data = {
            'epc_certificate': epc_certificate,
            'current_epc_rating': current_tokens[-6],
            'current_epc_efficiency': current_sap,
            'potential_epc_rating': potential_tokens[-6],
            "potential_epc_efficiency": int(potential_tokens[-1]),
            "heating_text": heating_text,
            "hot_water_text": hot_water_text,
            "recommendations": recommendations,
            "epc_data": epc_data,
            **assessment_data,
            **low_carbon_energy_sources,
        }

        return resulting_data

    def retrieve_all_find_my_epc_data(self, sap_2012_date=None):
        """
        This is a quick function to retrieve all the data from the find my epc website for a given postcode and address.
        Using this to fulfil a short term need to retrieve all history for a property.

        The starting certificate is the one for self.rrn when set, otherwise the one found by the address search.
        Every certificate linked from the "other certificates" section of that page is then fetched as well
        :param sap_2012_date: Passed through to the recommendation formatting
        :return: List with one dictionary per certificate, the starting certificate first. Each dictionary is
            labelled with its own certificate number. The list is empty when no certificate could be found
        """

        if self.rrn:
            # We build the URL directly
            epc_certificate = self.rrn
            chosen_epc = f"{self.BASE_ENERGY_URL}/energy-certificate/{epc_certificate}"
        else:
            chosen_epc, epc_certificate = self._find_epc_page()
            if chosen_epc is None:
                # The search can legitimately find nothing, in which case there is no history to return
                logger.info("No EPC found for address %s, postcode %s", self.address, self.postcode)
                return []

        address_response = requests.get(chosen_epc, headers=self.HEADERS)
        address_res = BeautifulSoup(address_response.text, features="html.parser")

        # We check the section on "Other certificates for this property" and collect the links in it
        # (anchor tags within a govuk-summary-list). A page without that section has no further certificates
        other_cert_section = address_res.find('div', id='other_certificates_and_reports')
        other_cert_links = []
        if other_cert_section is not None:
            other_cert_links = other_cert_section.select('dd.govuk-summary-list__value a')

        # Always include the currently selected EPC first. Each page is kept alongside its own certificate
        # number so that historic certificates are not labelled with the number of the selected one
        pages = [(epc_certificate, address_res)]

        # Add additional historic certificates
        for link in other_cert_links:
            cert_number = link.text.strip()
            cert_url = f"{self.BASE_ENERGY_URL}{link['href'].strip()}"
            response = requests.get(cert_url, headers=self.HEADERS)
            # Brief pause between requests
            time.sleep(0.3)
            pages.append((cert_number, BeautifulSoup(response.text, features="html.parser")))

        return [
            self._extract_epc_from_soup(soup, cert_number, sap_2012_date)
            for cert_number, soup in pages
        ]

    def _find_epc_page(self):
        """
        This function is used to find the EPC page for a given address and postcode.
        It is done by fetching the search results for the postcode and then matching the addresses on the page
        to the address we have been given. Addresses are compared in cleaned form (commas and spaces removed,
        lower case) and a listed address matches when it starts with:
          - our address, or
          - our postal town address, when one was given, or
          - as a fallback used only if nothing else matched, our address with its leading "flat" removed
        When several rows match, the one with the most recent expiry date is chosen
        :return: Tuple of (certificate page URL, certificate number), or (None, None) when nothing matches
        """

        postcode_input = self.postcode.replace(" ", "+")
        postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
        postcode_response = requests.get(postcode_search, headers=self.HEADERS)

        postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
        rows = postcode_res.find_all('tr', class_='govuk-table__row')

        extracted_table, backup_flat = [], []
        for row in rows:
            # Extract the address and URL. Rows without a link (e.g. the header row) are skipped
            address_tag = row.find('a', class_='govuk-link')
            if address_tag is None:
                continue

            extracted_address = address_tag.text.strip()
            extracted_address_url = address_tag['href']
            extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()

            primary_match = extracted_address_cleaned.startswith(self.address_cleaned)
            postal_town_match = bool(self.address_postal_town) and extracted_address_cleaned.startswith(
                self.address_postal_town
            )

            is_backup = not primary_match and not postal_town_match
            if is_backup:
                # We only have a fallback for flat addresses: we retry with the leading "flat" dropped from our
                # address, which covers listings that give the number without the word "Flat"
                if not self.address_cleaned.startswith("flat"):
                    continue
                flat_removed_address = self.address_cleaned[4:]
                if not extracted_address_cleaned.startswith(flat_removed_address):
                    continue

            # Extract the expiry date. It stays None when the row does not show one
            expiry_date = None
            expiry_date_tag = row.find('td', class_='govuk-table__cell date')
            if expiry_date_tag is not None:
                expiry_span = expiry_date_tag.parent.find('span')
                if expiry_span is not None:
                    expiry_date = datetime.strptime(expiry_span.text.strip(), '%d %B %Y')

            # Backup matches carry the expiry date too, so that they can be ranked in the same way
            candidate = {
                "extracted_address": extracted_address,
                "extracted_address_url": extracted_address_url,
                "expiry_date": expiry_date,
            }
            if is_backup:
                backup_flat.append(candidate)
            else:
                extracted_table.append(candidate)

        # Backup matches are only considered when there is no proper match
        candidates = extracted_table if extracted_table else backup_flat

        if not candidates:
            # This is a relatively new change, as of November 2025, but we see cases where properties do not
            # have data appearing on the find my EPC website, particularly for older EPCs. In this case, we allow
            # for us to not find any information and return nothing
            return None, None

        if len(candidates) > 1:
            # We take the one with the most recent expiry date. Rows without a date rank last
            candidates = sorted(candidates, key=lambda x: x['expiry_date'] or datetime.min, reverse=True)

        chosen_epc = self.BASE_ENERGY_URL + candidates[0]['extracted_address_url']
        # The certificate number is the last segment of the certificate URL
        epc_certificate = chosen_epc.split('/')[-1]

        return chosen_epc, epc_certificate

    @staticmethod
    def extract_property_components(property_features_table: list):
        """
        Function to pull out a table for property components, marking their appearance index
        :param property_features_table: The table of property features, as extracted by BeautifulSoup
        :return: List of property components with appearance index
        """
        property_components = []
        for row in property_features_table:
            cells = row.find_all("td")
            component_name = row.find("th").text.strip()
            property_components.append(
                {
                    "component_name": component_name,
                    "description": cells[0].text.strip(),
                    "efficiency": cells[1].text.strip(),
                }
            )
        # Add an appearance index, which will indicate if the component appears multiple times, so this
        # becomes a reference for the building part the component is associated to (main, extensions, etc)
        # We want to inject this appearance index into the component dictionaries
        component_count = {}
        for component in property_components:
            name = component['component_name']
            if name not in component_count:
                component_count[name] = 0
            component['appearance_index'] = component_count[name]
            component_count[name] += 1

        return property_components

    def retrieve_newest_find_my_epc_data(
        self, sap_2012_date=None, return_page=False, epc_page_source=None, rrn=None
    ):
        """
        For a post code and address, we pull out all the required data from the find my epc website.

        The certificate page comes from one of three places:
          - epc_page_source, when given, is parsed as it is and no request is made. An RRN is needed to label
            it, taken from the rrn argument or, failing that, from the RRN given to the constructor
          - otherwise, when rrn is given, the page of that certificate is fetched directly
          - otherwise the postcode search is used to find the page matching our address
        As a side effect, self.walls is replaced with the wall descriptions found on the page
        :param sap_2012_date: Passed through to format_recommendations
        :param return_page: If True, a tuple of (data, page source) is returned instead of just the data
        :param epc_page_source: The HTML of a previously retrieved certificate page
        :param rrn: The RRN of the certificate to fetch, or of the page given in epc_page_source
        :return: Dictionary of the extracted data. An empty dictionary is returned when the search finds no
            certificate. A dictionary with only epc_certificate and page_source, both None, is returned when
            a certificate found by the search does not have the SAP rating we were told to expect
        :raises ValueError: If epc_page_source is given without any RRN, or if any of the expected assessment
            details is missing from the page
        """
        # NOTE(review): the two early returns below ignore return_page, so callers that unpack a tuple need to
        # check for them first. Left as is to keep the return values callers may already handle

        # Only a certificate we had to search for is subject to the SAP rating sanity check further down
        certificate_from_search = False

        if epc_page_source is not None:
            epc_certificate = rrn if rrn else self.rrn
            if not epc_certificate:
                raise ValueError("rrn must be provided if epc_page_source is provided")
            address_res = BeautifulSoup(epc_page_source, features="html.parser")
        elif rrn:
            epc_certificate = rrn
            chosen_epc = f"{self.BASE_ENERGY_URL}/energy-certificate/{epc_certificate}"
            address_response = requests.get(chosen_epc, headers=self.HEADERS)
            epc_page_source = address_response.text
            address_res = BeautifulSoup(address_response.text, features="html.parser")
        else:
            # NOTE(review): the RRN given to the constructor is not consulted on this path; the address search
            # is always used. Kept as is since callers may rely on getting the newest certificate - confirm
            certificate_from_search = True
            chosen_epc, epc_certificate = self._find_epc_page()
            if chosen_epc is None:
                # We have no resulting data
                logger.info("No EPC found for address %s, postcode %s", self.address, self.postcode)
                return {}

            address_response = requests.get(chosen_epc, headers=self.HEADERS)
            epc_page_source = address_response.text
            address_res = BeautifulSoup(address_response.text, features="html.parser")

        # Key data we want to retrieve:
        # 1) Rating
        # 2) Bills estimates
        # 3) Recommendations and SAP points
        # 4) Low and zero carbon energy sources
        # 5) The wall types of the property - used for determining if we have an extension wall insulation
        #    recommendation

        # The chart description holds two sentences: the current rating, then the potential rating. Within each,
        # the band letter is the sixth token from the end and the score is the last token
        ratings = address_res.find('desc', {'id': 'svg-desc'}).text
        current_rating = ratings.split(".")[0]
        potential_rating = ratings.split(".")[1]
        current_tokens = current_rating.split(' ')
        potential_tokens = potential_rating.split(' ')
        current_sap = int(current_tokens[-1])

        if self.sap_rating and certificate_from_search and current_sap != self.sap_rating:
            # This means we likely have the wrong data. If we are in this scenario, we return nothing.
            # A certificate identified by an RRN we were given is trusted, so it is not checked
            return {
                "epc_certificate": None,
                "page_source": None,
            }

        # Retrieve the energy consumption
        # Early EPCs did not have this information, so the section, or either of its two entries, may be missing
        bills = address_res.find('div', {'id': 'bills-affected'})
        bills_list = bills.find_all('li') if bills is not None else []
        heating_text = bills_list[0].text if len(bills_list) > 0 else None
        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None

        # Retrieve the recommendations and SAP points
        recommendations = []
        recommendations_div = address_res.find('div', class_='epb-recommended-improvements')
        if recommendations_div:
            # Find all h3 headers for each step and extract their related information
            step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
            previous_sap_score = current_sap
            previous_epc = current_tokens[-6]
            for step_num, step_header in enumerate(step_headers, start=1):
                # Extract the step title (the measure)
                measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")

                # Find the div containing the potential rating within the same section
                potential_rating_div = step_header.find_next(
                    'div', class_='epb-recommended-improvements__potential-rating'
                )

                # Steps with no potential rating div are left out of the recommendations
                if potential_rating_div:
                    # Extract the rating text within the SVG text element
                    extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold')
                    if extracted_rating_text is not None:
                        rating_text = extracted_rating_text.text.strip()
                    else:
                        # No rating shown for this step, so we carry the previous rating forward (0 SAP points)
                        rating_text = " ".join([str(previous_sap_score), previous_epc])
                    # Parse the rating text to separate the numeric rating and EPC letter
                    new_rating = int(rating_text.split()[0])
                    new_epc = rating_text.split()[1]

                    # Append the information as a dictionary to the recommendations list
                    recommendations.append({
                        "step": step_num,
                        "measure": measure_title,
                        "new_rating": new_rating,
                        "new_epc": new_epc,
                        "sap_points": new_rating - previous_sap_score
                    })
                    previous_sap_score = new_rating
                    previous_epc = new_epc

        # Search for the assessment information
        assessment_information = address_res.find('div', {'id': 'information'})
        # Parse this information
        rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
        # Create a dictionary to hold the parsed information
        assessment_data = {}
        for row in rows:
            key = row.find('dt').text.strip()
            if key == "Type of assessment":
                # We don't reliably extract this
                continue
            value_tag = row.find('dd')

            # Check if value contains a link (email)
            if value_tag.find('a'):
                value = value_tag.find('a').text.strip()
            elif value_tag.find('summary'):
                value = value_tag.find('span').text.strip()
            else:
                value = value_tag.text.strip()

            # These keys appear for both the assessor and the accreditation scheme. The first occurrence is
            # taken to be the assessor's and any later one the accreditation scheme's
            if key in ["Telephone", "Email"]:
                if "Assessor's " + key not in assessment_data:
                    assessment_data["Assessor's " + key] = value
                else:
                    assessment_data["Accreditation Scheme's " + key] = value
                continue

            assessment_data[key] = value

        expected_keys = [
            'Assessor’s name',
            "Assessor's Telephone",
            "Assessor's Email",
            'Assessor’s ID',
            'Accreditation scheme',
            'Assessor’s declaration',
            "Accreditation Scheme's Telephone",
            "Accreditation Scheme's Email",
            'Date of assessment',
            'Date of certificate'
        ]
        # Check we have all the expected keys
        for key in expected_keys:
            if key not in assessment_data:
                raise ValueError(f"Missing key: {key}")

        # The wall types of the property, taken from the first table body on the page
        property_features_table = address_res.find("tbody", class_="govuk-table__body")
        property_features_table = property_features_table.find_all("tr")

        property_components = self.extract_property_components(property_features_table)

        # Extract walls
        self.walls = [x["description"] for x in property_components if x["component_name"] == "Wall"]

        # Finally, we format the recommendations
        recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)

        # 4) Low and zero carbon energy sources
        low_carbon_energy_sources = self.extract_low_carbon_sources(address_res)

        # 5) Pull out the EPC data
        epc_data = self.extract_epc_data(address_res)

        # Pull out the address information which can be found in the box with the class "epc-address"
        # We split it up on break tags: the first line is address1 and the last line is the postcode. The
        # second line only counts as address2 when there are more than two lines
        addr = address_res.find("p", class_="epc-address").get_text(separator="\n").strip()
        lines = addr.split("\n")
        address1 = lines[0]
        address2 = lines[1] if len(lines) > 2 else ""
        postcode = lines[-1]

        resulting_data = {
            'epc_certificate': epc_certificate,
            'current_epc_rating': current_tokens[-6],
            'current_epc_efficiency': current_sap,
            'potential_epc_rating': potential_tokens[-6],
            "potential_epc_efficiency": int(potential_tokens[-1]),
            "heating_text": heating_text,
            "hot_water_text": hot_water_text,
            "recommendations": recommendations,
            "property_components": property_components,
            "epc_data": epc_data,
            **assessment_data,
            **low_carbon_energy_sources,
            "page_source": epc_page_source,
            # Add in address and postcode from the page - covers use cases where we are given RRN
            "address1": address1,
            "address2": address2,
            "postcode": postcode,
        }

        if return_page:
            # We return the page text as well, which can be parsed again later
            return resulting_data, epc_page_source

        return resulting_data

    def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):
        """
        This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
        :param recommendations: The recommendations from the EPC
        :param assessment_data: The assessment data from the EPC
        :param sap_2012_date: The date of the SAP 2012 update
        """

        measure_map = {
            "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
            "Hot water cylinder insulation": ["hot_water_tank_insulation"],
            "Hot water cylinder thermostat": ["cylinder_thermostat"],
            "High performance external doors": ["insulated_doors"],
            "Floor insulation (solid floor)": ["solid_floor_insulation"],
            "Floor insulation (suspended floor)": ["suspended_floor_insulation"],
            "Double glazed windows": ["double_glazing"],
            "Cavity wall insulation": ["cavity_wall_insulation"],
            "Replace boiler with new condensing boiler": ["boiler_upgrade"],
            "Floor insulation": ["floor_insulation"],  # Recommendation typically associated to older EPCs
            "Heating controls (programmer, room thermostat and TRVs)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Low energy lighting": ["low_energy_lighting"],
            "Increase loft insulation to 270 mm": ["loft_insulation"],
            "Heating controls (thermostatic radiator valves)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Solar water heating": ["solar_water_heating"],
            "Solar photovoltaic panels, 2.5 kWp": ["solar_pv"],
            "Heating controls (room thermostat and TRVs)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Change heating to gas condensing boiler": ["boiler_upgrade"],
            "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heaters"],
            "Flat roof or sloping ceiling insulation": ["flat_roof_insulation", "sloping_ceiling_insulation"],
            "Heating controls (room thermostat)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Band A condensing boiler": ["boiler_upgrade"],
            "Double glazing": ["double_glazing"],
            "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"],
            "Wind turbine": ["wind_turbine"],
            "Loft insulation": ["loft_insulation"],
            "Solar photovoltaic (PV) panels": ["solar_pv"],
            "Party wall insulation": ["party_wall_insulation"],
            'Draught proofing': ["draught_proofing"],
            "Roof insulation recommendation": [],
            "Cavity wall insulation recommendation": [],
            "Windows draught proofing": [],
            "Low energy lighting for all fixed outlets": ["low_energy_lighting"],
            "Cylinder thermostat recommendation": [],
            "Heating controls recommendation": [],
            "Replace boiler with Band A condensing boiler": ["boiler_upgrade"],
            "Band A condensing gas boiler": ["boiler_upgrade"],
            "Install Band A condensing heating unit": ["boiler_upgrade"],
            "Solar panel recommendation": [],
            "Double glazing recommendation": [],
            "Solid wall insulation recommendation": [],
            "Fuel change recommendation": [],
            "PV Cells recommendation": [],
            "Replacement glazing units": ["double_glazing"],
            "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"],
            "High heat retention storage heaters": ["high_heat_retention_storage_heaters"],
            "Gas condensing boiler": ["boiler_upgrade"],
            "Change room heaters to condensing boiler": ["boiler_upgrade"],
            "Cylinder thermostat": ["cylinder_thermostat"],
            "Heat recovery system for mixer showers": ["heat_recovery_shower"],
            "Room-in-roof insulation": ["room_in_roof_insulation"],
            "Fan assisted storage heaters": [],
            "Fan-assisted storage heaters": [],
            "Step 1:": [],
            "Step 2:": [],
            'Step 3:': [],
            'Step 4:': [],
            'Step 5:': [],
            "Biomass stove with boiler": [],
            "Replace boiler with biomass boiler": [],
            "Heating controls (room thermostat and thermostatic radiator valves)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Heating controls (programmer, and thermostatic radiator valves)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Heating controls (programmer and TRVs)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Heating controls (programmer and room thermostat)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Replacement warm air unit": [],
            "Secondary glazing": ["secondary_glazing"],
            "Condensing heating unit": ["boiler_upgrade"],
            '???': [],
            'Solar photovoltaic panels, 2.5kWp': ["solar_pv"],
            'Heating controls (programmer, room thermostat and thermostatic radiator valves)': [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            'Translation missing: en.improvement_code.41.title': [],
            "Condensing boiler (separate from the range cooker)": ["boiler_upgrade"],
            "Heating controls (programmer and thermostatic radiator valves)": [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            'Heating controls (programmer room thermostat and thermostatic radiator valves)': [
                "roomstat_programmer_trvs", "time_temperature_zone_control"
            ],
            "Internal wall insulation": ["internal_wall_insulation"],
            "High heat retention storage heaters and dual immersion cylinder and dual rate meter": [
                "high_heat_retention_storage_heaters"
            ],
            "High heat retention storage heaters and dual rate meter": [
                "high_heat_retention_storage_heaters"
            ],
            "Increase loft insulation to 250mm": ["loft_insulation"],
            "Solar photovoltaics panels, 25% of roof area": ["solar_pv"],
            'Air or ground source heat pump': ["air_source_heat_pump"],
            "Add PV Battery": ["solar_pv_battery"],
            "Add PV diverter": ["solar_pv_diverter"],  # Don't have a recommendation yet
            "Draughtproof single-glazed windows": ["double_glazing"],
            "Upgrade heating controls": ["roomstat_programmer_trvs", "time_temperature_zone_control"],
            "Low energy lighting recommendation": ["low_energy_lighting"],
            "Install cavity wall insulation": ["cavity_wall_insulation"],
            "Install solar water heating": ["solar_water_heating"],
            'Install photovoltaics, 25% of roof area': ["solar_pv"],
        }

        survey = True
        if sap_2012_date is not None:
            certificate_date = datetime.strptime(assessment_data["Date of certificate"], "%d %B %Y")
            if certificate_date < pd.to_datetime(sap_2012_date):
                survey = False

        formatted_recommendations = []
        for rec in recommendations:
            mapped = measure_map[rec["measure"]]
            for measure in mapped:
                if measure == "cavity_wall_insulation" and "solid brick" in self.walls[0].lower():
                    measure = "extension_cavity_wall_insulation"
                to_append = {
                    "type": measure,
                    "sap_points": rec["sap_points"],
                    "survey": survey,
                }
                if measure == "solar_pv":
                    to_append["suitable"] = True
                formatted_recommendations.append(to_append)

        return formatted_recommendations

    @classmethod
    def get_from_epc(cls, epc, epc_page_source=None, rrn=None, address_postal_town=None, sap_rating=None):
        """
        Run a find-my-epc retrieval for one EPC record and split the result into four outputs.

        :param epc: EPC record; its "address", "postcode" and "uprn" entries are read
        :param epc_page_source: Optional page source forwarded to retrieve_newest_find_my_epc_data
        :param rrn: The RRN of the EPC; mandatory whenever epc_page_source is supplied
        :param address_postal_town: Optional postal town handed to the constructor
        :param sap_rating: Optional SAP rating handed to the constructor
        :return: Tuple of (non_invasive_recommendations, patch, page_source, property_components)
        :raises ValueError: If epc_page_source is supplied without an rrn
        """
        # A page source on its own cannot be used: the RRN must accompany it
        if rrn is None and epc_page_source is not None:
            raise ValueError("rrn must be provided if epc_page_source is provided")

        address, postcode = epc["address"], epc["postcode"]
        instance = cls(
            address=address,
            postcode=postcode,
            address_postal_town=address_postal_town,
            sap_rating=sap_rating,
        )
        scraped = instance.retrieve_newest_find_my_epc_data(epc_page_source=epc_page_source, rrn=rrn)

        non_invasive_recommendations = dict(
            uprn=epc["uprn"],
            address=address,
            postcode=postcode,
            recommendations=scraped.get("recommendations", []),
        )

        # The certificate date arrives as e.g. "5 March 2021"; a missing value is passed through untouched
        raw_date = scraped.get("Date of certificate", None)
        lodgement = raw_date if pd.isnull(raw_date) else str(datetime.strptime(str(raw_date), "%d %B %Y"))

        # Patch information: ratings first, then the scraped epc_data (which may override them),
        # and finally the lodgement date (which always wins)
        rating_fields = (
            ("current-energy-rating", "current_epc_rating"),
            ("current-energy-efficiency", "current_epc_efficiency"),
            ("potential-energy-rating", "potential_epc_rating"),
            ("potential-energy-efficiency", "potential_epc_efficiency"),
        )
        ratings = {patch_key: scraped.get(scraped_key) for patch_key, scraped_key in rating_fields}
        patch = {**ratings, **scraped.get("epc_data", {}), "lodgement-date": lodgement}

        page_source = dict(rrn=scraped.get("epc_certificate"), page_source=scraped.get("page_source"))

        return non_invasive_recommendations, patch, page_source, scraped.get("property_components", [])

    @classmethod
    def get_from_epc_with_fallback(
        cls, epc, epc_page, rrn, cleaned_address=None, config_address=None, address_postal_town=None
    ):
        """
        Attempt get_from_epc with:
        1) Original EPC
        2) EPC with cleaned address
        3) EPC with configured address
        in that order, returning the result of the first attempt that succeeds.

        :param epc: EPC record; "current-energy-efficiency" is read and converted to a float SAP rating
        :param epc_page: Page source passed to get_from_epc as its second positional argument
        :param rrn: The RRN of the EPC, forwarded to every attempt
        :param cleaned_address: Optional alternative address; skipped when empty or None
        :param config_address: Optional alternative address tried last; skipped when empty or None
        :param address_postal_town: Optional postal town forwarded to every attempt
        :return: Whatever get_from_epc returns for the first successful attempt
        :raises RuntimeError: If every attempt fails; the last underlying exception is attached as the cause
        """

        # The data we'll use to attempt retrieval, starting with the original record
        attempts = [epc]

        # Each alternative address gets its own deep copy so the caller's record is never mutated
        for alternative_address in (cleaned_address, config_address):
            if not alternative_address:
                continue
            modified = deepcopy(epc)
            for k in ["address", "address1"]:
                modified[k] = alternative_address
            attempts.append(modified)

        # Converted once, before any attempt: a missing or non-numeric value propagates to the caller
        sap_rating = float(epc["current-energy-efficiency"])

        # Iterate attempts
        last_error = None
        for idx, attempt in enumerate(attempts, start=1):
            try:
                return cls.get_from_epc(
                    attempt, epc_page, rrn=rrn, address_postal_town=address_postal_town, sap_rating=sap_rating
                )
            except Exception as e:
                # Deliberately broad: any failure moves on to the next candidate address
                last_error = e
                logger.error(f"Attempt {idx} failed: {e}")

        # Chain the final failure so its traceback is not lost behind the summary error
        raise RuntimeError(f"All EPC retrieval attempts failed: {last_error}") from last_error