# Model/etl/bill_savings/data_collection.py

import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import requests
import inspect
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
from utils.s3 import save_pickle_to_s3

# Path of this source file, taken from the code object of a throwaway lambda
src_file_path = inspect.getfile(lambda: None)

# Root of the website that serves the energy certificates
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
# Postcode search page; `postcode_input` is filled in per request
SEARCH_POSTCODE_URL = BASE_ENERGY_URL + "/find-a-certificate/search-by-postcode?postcode={postcode_input}"

# Folder next to this file whose sub-directories each hold a certificates.csv (see app())
EPC_DIRECTORY = Path(src_file_path).parent.joinpath("local_data", "all-domestic-certificates")


def calculate_expiry_date(lodgement_date):
    """
    Derive the expiry date text of an EPC from its lodgement date.

    The expiry date is computed as the lodgement date plus ten years, minus one day.

    :param lodgement_date: lodgement date as a 'YYYY-MM-DD' string
    :return: the expiry date as unpadded day, full month name and year, e.g. '14 March 2033' for '2023-03-15'
    :raises ValueError: if `lodgement_date` is not in 'YYYY-MM-DD' format
    """
    lodgement_date_dt = datetime.strptime(lodgement_date, '%Y-%m-%d')
    expiry_date_dt = lodgement_date_dt + relativedelta(years=10) - timedelta(days=1)
    # The '%-d' directive is a platform extension (it fails on Windows), so the unpadded day is taken from
    # the `day` attribute instead of being left to strftime
    return f"{expiry_date_dt.day} {expiry_date_dt.strftime('%B %Y')}"


def retrieve_find_my_epc_data(
    uprn: int, postcode: str, address: str, expected_expiry_date: str, timeout: float = 30.0
):
    """
    For a post code and address, we pull out all the required data from the find my epc website.

    The postcode search page is fetched and its result rows are narrowed down to those whose address starts with
    `address` (commas, spaces and case are ignored) and whose expiry date text equals `expected_expiry_date`.
    When exactly one row remains, the certificate page it links to is fetched and parsed.

    :param uprn: property reference, copied unchanged into the result as 'extracted_uprn'
    :param postcode: postcode to search for; spaces are sent as '+'
    :param address: start of the address to match; copied unchanged into the result as 'extracted_address'
    :param expected_expiry_date: expiry date text exactly as it appears in the search results
    :param timeout: seconds to wait on each HTTP request before requests raises a timeout error
    :return: dict with the certificate id, current/potential rating letter and score, heating and hot water
        texts and the assessment fields, or None when no row or more than one row matches
    :raises ValueError: if one of the expected assessment fields is missing from the certificate page
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    # Without a timeout a stalled connection would block the whole run indefinitely
    postcode_response = requests.get(postcode_search, headers=headers, timeout=timeout)

    address_cleaned = address.replace(",", "").replace(" ", "").lower()
    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
    rows = postcode_res.find_all('tr', class_='govuk-table__row')

    extracted_table = []
    for row in rows:
        # Extract the address and URL; rows without a link carry no certificate
        address_tag = row.find('a', class_='govuk-link')
        if address_tag is None:
            continue
        extracted_address = address_tag.text.strip()
        extracted_address_url = address_tag['href']

        extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
        if not extracted_address_cleaned.startswith(address_cleaned):
            continue

        # The address is a match, so extract the expiry date
        expiry_date_tag = row.find('td', class_='govuk-table__cell date')
        expiry_date = None
        if expiry_date_tag is not None:
            # NOTE(review): this reads the first <span> of the whole row, not of the date cell - confirm intended
            expiry_date = expiry_date_tag.parent.find('span').text.strip()

        # Only certificates expiring on the expected date are candidates
        if expiry_date != expected_expiry_date:
            continue

        extracted_table.append(
            {
                "extracted_address": extracted_address,
                "extracted_address_url": extracted_address_url,
                "expiry_date": expiry_date
            }
        )

    if len(extracted_table) > 1:
        print("Multiple candidates found, skipping for now")
        return None

    if not extracted_table:
        print("No candidates found, skipping for now")
        return None

    chosen_epc = BASE_ENERGY_URL + extracted_table[0]['extracted_address_url']
    # The certificate id is the last path segment of the certificate URL
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers, timeout=timeout)
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    # The first sentence of the description is used for the current rating, the second for the potential one
    rating_sentences = address_res.find('desc', {'id': 'svg-desc'}).text.split(".")
    current_rating_words = rating_sentences[0].split(' ')
    potential_rating_words = rating_sentences[1].split(' ')

    # Retrieve the energy consumption
    bills = address_res.find('div', {'id': 'bills-affected'})
    bills_list = bills.find_all('li') if bills is not None else []
    # If entries are missing, it's usually because the EPC was very old. Early EPCs did not have this information
    heating_text = bills_list[0].text if len(bills_list) > 0 else None
    hot_water_text = bills_list[1].text if len(bills_list) > 1 else None

    # Search for the assessment information
    assessment_information = address_res.find('div', {'id': 'information'})
    # Parse this information
    rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
    # Create a dictionary to hold the parsed information
    assessment_data = {}
    for row in rows:
        key = row.find('dt').text.strip()
        if key == "Type of assessment":
            # We dont reliably extract this
            continue
        value_tag = row.find('dd')

        # Check if value contains a link (email)
        if value_tag.find('a'):
            value = value_tag.find('a').text.strip()
        elif value_tag.find('summary'):
            value = value_tag.find('span').text.strip()
        else:
            value = value_tag.text.strip()

        assessment_data[key] = value

    expected_keys = [
        'Assessor’s name', 'Telephone', 'Email', 'Accreditation scheme', 'Assessor’s ID', 'Assessor’s declaration',
        'Date of assessment', 'Date of certificate'
    ]
    # Check we have all the expected keys
    for key in expected_keys:
        if key not in assessment_data:
            raise ValueError(f"Missing key: {key}")

    # The rating letter is the sixth word from the end of each sentence and the score is the last word
    resulting_data = {
        'extracted_uprn': uprn,
        'extracted_address': address,
        'epc_certificate': epc_certificate,
        'current_epc_rating': current_rating_words[-6],
        'current_epc_efficiency': int(current_rating_words[-1]),
        'potential_epc_rating': potential_rating_words[-6],
        "potential_epc_efficiency": int(potential_rating_words[-1]),
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
        **assessment_data
    }

    return resulting_data


def app(start_index: int = 256, sample_size: int = 500):
    """
    This application is tasked with pulling a large quantity of data from the find my epc website, containing the
    estimated energy consumption for properties.

    For every sub-directory of EPC_DIRECTORY (after skipping the first `start_index` entries of the listing), the
    newest certificate per UPRN is kept, a random sample of properties is looked up on the website and the
    scraped data is collected. All collected records are pickled to S3 once every directory has been processed.

    :param start_index: number of leading entries of the directory listing to skip
    :param sample_size: maximum number of properties sampled from each directory
    :return:
    """

    # NOTE: iterdir() yields entries in arbitrary order, so which directories `start_index` skips depends on the
    # order in which the filesystem lists them
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        if i < start_index:
            continue

        try:
            data = pd.read_csv(directory / "certificates.csv", low_memory=False)
            # Rename the columns to the same format as the api returns
            data.columns = [c.replace("_", "-").lower() for c in data.columns]

            # Keep only certificates lodged on or after the date threshold
            data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

            data = data[~pd.isnull(data["uprn"])]
            # Take just the newest EPC per uprn, based on lodgement-date
            data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")

            # Sampling without replacement fails when more rows are requested than exist, so cap the sample at
            # the number of rows available
            data = data.sample(min(sample_size, len(data)), replace=False)

            # We use the address data to find the related information
            for _, property_data in data.iterrows():
                # Random pause between lookups
                time.sleep(np.random.uniform(0.2, 1.5))

                uprn = int(property_data["uprn"])
                address = property_data["address1"]
                postcode = property_data["postcode"]
                expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])

                response = retrieve_find_my_epc_data(
                    uprn=uprn,
                    postcode=postcode,
                    address=address,
                    expected_expiry_date=expected_expiry_date
                )
                if response is None:
                    continue
                # Append straight away so that records already scraped survive a later failure in this directory
                energy_consumption_data.append(
                    {
                        **response,
                        "epc": property_data.to_dict(),
                        "epc_directory": str(directory)
                    }
                )
        except Exception as e:
            print(f"Error for directory {directory}: {e}")
            # If we have an error, then we wait for a bit since it's likely due to timeout
            time.sleep(300)

    # Store the pickle in s3
    save_time = datetime.now()
    save_pickle_to_s3(
        energy_consumption_data, bucket_name="retrofit-datalake-dev",
        s3_file_name=f"energy_consumption_data/{save_time}.pkl"
    )