From 591b9522251bcf138357b96128ff4cdebaa5607a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 30 Sep 2024 11:53:04 +0100 Subject: [PATCH] adding the assessment information to retrieve_find_my_epc_data --- etl/bill_savings/data_collection.py | 42 +++++++++++++++++-- etl/customers/aiha/epc_surveyor_list.py | 41 ++++++++++++++++++ .../MainheatControlAttributes.py | 7 +++- 3 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 etl/customers/aiha/epc_surveyor_list.py diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index 49bcff82..ee8a228f 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -100,9 +100,44 @@ def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_e bills = address_res.find('div', {'id': 'bills-affected'}) bills_list = bills.find_all('li') if not bills_list: - return None - heating_text = bills_list[0].text - hot_water_text = bills_list[1].text + # If this is the case, it's usually because the EPC was very old. 
Early EPCs did not have this information + heating_text = None + hot_water_text = None + else: + heating_text = bills_list[0].text + hot_water_text = bills_list[1].text + + # Search for the assessment information + assessment_information = address_res.find('div', {'id': 'information'}) + # Parse this information + rows = assessment_information.find_all('div', class_='govuk-summary-list__row') + # Create a dictionary to hold the parsed information + assessment_data = {} + for row in rows: + key = row.find('dt').text.strip() + if key == "Type of assessment": + # We don't reliably extract this + continue + value_tag = row.find('dd') + + # Check if value contains a link (email) + if value_tag.find('a'): + value = value_tag.find('a').text.strip() + elif value_tag.find('summary'): + value = value_tag.find('span').text.strip() + else: + value = value_tag.text.strip() + + assessment_data[key] = value + + expected_keys = [ + 'Assessor’s name', 'Telephone', 'Email', 'Accreditation scheme', 'Assessor’s ID', 'Assessor’s declaration', + 'Date of assessment', 'Date of certificate' + ] + # Check we have all the expected keys + for key in expected_keys: + if key not in assessment_data: + raise ValueError(f"Missing key: {key}") resulting_data = { 'extracted_uprn': uprn, @@ -114,6 +149,7 @@ def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_e "potential_epc_efficiency": int(potential_rating.split(' ')[-1]), "heating_text": heating_text, "hot_water_text": hot_water_text, + **assessment_data } return resulting_data diff --git a/etl/customers/aiha/epc_surveyor_list.py b/etl/customers/aiha/epc_surveyor_list.py new file mode 100644 index 00000000..b85139ae --- /dev/null +++ b/etl/customers/aiha/epc_surveyor_list.py @@ -0,0 +1,41 @@ +import pandas as pd +import numpy as np +import time +from tqdm import tqdm +from etl.bill_savings.data_collection import retrieve_find_my_epc_data, calculate_expiry_date + + +def main(): + """ + This script handles pulling the 
surveyor names and accreditation details for Surveyors who have completed + the newest EPC for AIHA's properties + """ + + epc_data = pd.read_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/epc_data.csv") + epc_data = epc_data[["uprn", "address", "address1", "postcode", "lodgement-date"]] + + epc_collected_data = [] + for _, unit in tqdm(epc_data.iterrows(), total=len(epc_data)): + time.sleep(np.random.uniform(0.2, 1.5)) + uprn = int(unit["uprn"]) + address = unit["address1"] + postcode = unit["postcode"] + expected_expiry_date = calculate_expiry_date(unit["lodgement-date"]) + + response = retrieve_find_my_epc_data( + uprn=uprn, + postcode=postcode, + address=address, + expected_expiry_date=expected_expiry_date + ) + if response is None: + raise Exception("fix me") + epc_collected_data.append(response) + + epc_collected_data = pd.DataFrame(epc_collected_data) + + for x in epc_collected_data: + keys = x.keys() + # Check for None keys + if any(k is None for k in keys): + frew diff --git a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py index 4a846498..a13823d2 100644 --- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py +++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py @@ -120,10 +120,15 @@ class MainheatControlAttributes(Definitions): "rheoli'r t l llaw": "manual charge control", } + NO_DATA_DESCRIPTIONS = [ + "SAP05:Main-Heating-Controls", + "SAP:Main-Heating-Controls", + ] + def __init__(self, description: str): self.description: str = clean_description(description.lower()).strip() self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or ( - description == "SAP05:Main-Heating-Controls" + description in self.NO_DATA_DESCRIPTIONS ) translation = self.WELSH_TEXT.get(self.description)