created basic data collection process

2026-07-27 23:35:01 +00:00 · 2024-07-02 11:06:11 +01:00 · 2024-07-02 11:06:11 +01:00 · 1db6dfebdf
commit 1db6dfebdf
parent eac2046765
1 changed files with 86 additions and 24 deletions
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@ -1,13 +1,79 @@
+import time
+
+import requests
 import inspect
 import pandas as pd
 from tqdm import tqdm
-from etl.epc_clean.EpcClean import EpcClean
+from bs4 import BeautifulSoup
 from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
+import numpy as np

 src_file_path = inspect.getfile(lambda: None)

 EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
+SEARCH_POSTCODE_URL = (
+    "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
+)
+BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
+
+
+def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
+    """
+    For a postcode and address, pull the required data from the find-energy-certificate website.
+
+    The postcode search page is fetched first. The single search result whose link text starts with
+    ``address`` (compared with commas and spaces removed, case-insensitively) is selected, and the
+    certificate page behind that link is then fetched and parsed.
+
+    :param uprn: property reference; not used for the lookup, only copied into the result
+    :param postcode: postcode to search for; spaces are replaced with "+" in the query string
+    :param address: start of the address as it appears in the search results
+    :return: dict with the keys uprn, address, epc_certificate, current_epc_rating,
+        current_epc_efficiency, potential_epc_rating, potential_epc_efficiency, heating_text and
+        hot_water_text; or None when the address matches no search result or more than one
+    :raises requests.exceptions.Timeout: when either page does not respond within the timeout
+    """
+
+    # Without a timeout, requests waits forever on a stalled connection
+    request_timeout_seconds = 30
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/111.0.0.0 Safari/537.36'
+    }
+    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode.replace(" ", "+"))
+    postcode_response = requests.get(postcode_search, headers=headers, timeout=request_timeout_seconds)
+
+    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
+    address_links_full = postcode_res.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
+    # Map the displayed address text to the absolute url of its certificate page
+    address_links = {element.text.strip(): BASE_ENERGY_URL + element['href'] for element in address_links_full}
+
+    address_cleaned = address.replace(",", "").replace(" ", "").lower()
+    matching_links = [
+        link
+        for link_text, link in address_links.items()
+        if link_text.replace(",", "").replace(" ", "").lower().startswith(address_cleaned)
+    ]
+    if len(matching_links) != 1:
+        # With no match there is nothing to scrape, and with two or more we can't be sure which one is
+        # the correct one, so we exit for simplicity
+        return None
+    chosen_epc = matching_links[0]
+
+    # The last path segment of the certificate url is stored as the certificate identifier
+    epc_certificate = chosen_epc.split('/')[-1]
+
+    address_response = requests.get(chosen_epc, headers=headers, timeout=request_timeout_seconds)
+    address_res = BeautifulSoup(address_response.text, features="html.parser")
+
+    # The svg description holds the ratings; the first "."-separated part is treated as the current
+    # rating and the second as the potential rating
+    rating_sentences = address_res.find('desc', {'id': 'svg-desc'}).text.split(".")
+    current_rating_words = rating_sentences[0].split(' ')
+    potential_rating_words = rating_sentences[1].split(' ')
+
+    # Retrieve the energy consumption
+    bill_items = address_res.find('div', {'id': 'bills-affected'}).find_all('li')
+    heating_text = bill_items[0].text
+    hot_water_text = bill_items[1].text
+
+    # NOTE(review): the word positions below (-6 for the rating, -1 for the score) assume a fixed
+    # wording of the svg description - confirm against a live certificate page
+    resulting_data = {
+        'uprn': uprn,
+        'address': address,
+        'epc_certificate': epc_certificate,
+        'current_epc_rating': current_rating_words[-6],
+        'current_epc_efficiency': int(current_rating_words[-1]),
+        'potential_epc_rating': potential_rating_words[-6],
+        "potential_epc_efficiency": int(potential_rating_words[-1]),
+        "heating_text": heating_text,
+        "hot_water_text": hot_water_text,
+    }
+
+    return resulting_data


 def app():
@ -20,7 +86,9 @@ def app():
    cleaned_data = {}
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

-    data = []
+    sample_size = 100
+
+    energy_consumption_data = []
    for directory in tqdm(epc_directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
@ -28,29 +96,23 @@ def app():
        # Keep only certificates lodged on or after the date threshold
        data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

-        data = data[~pd.isnull(data["uprn"])]
-        data = data[data["mains-gas-flag"] == "N"]
-        data = data[data["main-fuel"] == "electricity (not community)"]
-        data[data["current-energy-efficiency"].astype(float) > 80]["uprn"].astype(int)
+        data = data.sample(sample_size)
+        # We use the address data to find the related information

-        # Convert to list of dictioaries as returned by the api
-        data = data.to_dict("records")
+        collected_data = []
+        for _, property_data in data.iterrows():
+            # Sleep for a random time between 0.1 and 1.5 seconds
+            time.sleep(np.random.uniform(0.1, 1.5))

-        # Incorporate input data into cleaning
-        cleaner = EpcClean(data)
+            uprn = int(property_data["uprn"])
+            address = property_data["address1"]
+            postcode = property_data["postcode"]

-        cleaner.clean()
-        # Extended cleaned_data
-        for k, data in cleaner.cleaned.items():
-            if k not in cleaned_data:
-                cleaned_data[k] = data
-            else:
-                existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
-                new_data = [x for x in data if x["original_description"] not in existing_descriptions]
-                cleaned_data[k].extend(new_data)
+            response = retrieve_find_my_epc_data(
+                uprn=uprn,
+                postcode=postcode,
+                address=address
+            )
+            collected_data.append(response)

-    # Basic check to make sure all descriptions are unique
-    for _, cleaned in cleaned_data.items():
-        descriptions = [x["original_description"] for x in cleaned]
-        if len(descriptions) != len(set(descriptions)):
-            raise ValueError("Duplicated descriptions found, check me")
+        energy_consumption_data.extend(energy_consumption_data)