From b40f72216f97d644bdf48663a9f395589d2b124b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 7 Nov 2024 12:46:46 +0000
Subject: [PATCH] debugging retrieve_newest_find_my_epc_data

---
 etl/customers/ksquared/Wave3 Modelling.py     |  47 ++++
 .../stonewater/potential_eco_properties.py    | 115 ++++++++-
 .../requirements/requirements-wave-3-prep.txt |   2 +
 etl/find_my_epc/RetrieveFindMyEpc.py          | 238 ++++++++++++++++++
 etl/find_my_epc/requirements.txt              |   2 +
 5 files changed, 396 insertions(+), 8 deletions(-)
 create mode 100644 etl/customers/ksquared/Wave3 Modelling.py
 create mode 100644 etl/find_my_epc/RetrieveFindMyEpc.py
 create mode 100644 etl/find_my_epc/requirements.txt

diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py
new file mode 100644
index 00000000..bf9eb1e8
--- /dev/null
+++ b/etl/customers/ksquared/Wave3 Modelling.py	
@@ -0,0 +1,47 @@
+import time
+
+from tqdm import tqdm
+import pandas as pd
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+
+
+def app():
+    """
+    This script prepares the asset lists for the additional housing associations, CAHA and Hornsey Housing Trust,
+    that are forming a consortium led by AIHA, and pulls the newest find my epc data for each Hornsey property
+    :return: list of dicts, one per property whose EPC data could be retrieved
+    """
+
+    hornsey_asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
+        "Trust.xlsx",
+        sheet_name="Ksquared-All units information",
+        header=3
+    )
+
+    # We don't need the first row
+    hornsey_asset_list = hornsey_asset_list.iloc[1:]
+    # Fill NA values with empty strings
+    hornsey_asset_list = hornsey_asset_list.fillna("")
+    # Normalise the address columns: cast to string, trim whitespace and replace double spaces
+    for col in ["Address letter or number", "Street address", "Postcode"]:
+        hornsey_asset_list[col] = hornsey_asset_list[col].astype(str).str.strip().str.replace("  ", " ")
+
+    extracted_data = []
+    for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):
+        # Some properties do not have an epc
+        if not home["Energy starting band (EPC)"]:
+            continue
+        time.sleep(0.5)  # throttle only when we are about to make a request
+        unit_number = home["Address letter or number"]
+        street = home["Street address"]
+        postcode = home["Postcode"]
+        address = ", ".join([x for x in [unit_number, street] if x])
+        searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
+        try:
+            extracted_data.append(searcher.retrieve_newest_find_my_epc_data())
+        except ValueError as err:
+            # The retriever raises ValueError (e.g. "No EPC found"); one property must not abort the whole run
+            tqdm.write(f"Skipping {address}, {postcode}: {err}")
+
+    return extracted_data
diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py
index 26321a41..4fb89113 100644
--- a/etl/customers/stonewater/potential_eco_properties.py
+++ b/etl/customers/stonewater/potential_eco_properties.py
@@ -236,6 +236,8 @@ def app():
     epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False)
     epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn")
 
+    stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
+
     # Merge the EPCs on, with the data we need
     stonewater_cavity_properties = stonewater_cavity_properties.rename(
         columns={
@@ -265,14 +267,111 @@ def app():
 
     # Filter on as built cavity properties
     additional_properties = additional_properties[
-        additional_properties["Walls"].isin(
-            cavity_descriptions +
-            ["Cavity: FilledCavity", "Cavity: External", "Cavity: Internal"]
-        )
+        additional_properties["Walls"].isin(cavity_descriptions)
     ]
+    additional_properties["Full Address"] = additional_properties["Address"].copy()
+    house_numbers = []
+    for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
+        house_no = SearchEpc.get_house_number(x["Address"].split(",")[0], x["Postcode"])
+        if house_no is None:
+            house_no = x["Address"].split(",")[0]
+        # If we end up with a number like "01" we need to remove the leading zero
+        house_no = house_no.lstrip("0")
+        house_numbers.append(
+            {
+                "Address ID": x["Address ID"],
+                "Number": house_no
+            }
+        )
+
+    house_numbers = pd.DataFrame(house_numbers)
+    additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
+    additional_properties["row_id"] = additional_properties["Address ID"].copy()
 
     # Pull the EPCs for these properties
-    for _, home in tqdm(additional_properties.iterrows()):
-        full_address = home["Address"]
-        postcode = home["Postcode"]
-        address1 = full_address.split(",")[0]
+    additional_properties_epcs, errors = get_data(additional_properties)
+
+    # Save this data as a pickle
+    # import pickle
+    # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
+    # "wb") as f:
+    #     pickle.dump(additional_properties_epcs, f)
+
+    # We drop Full Address
+    additional_properties = additional_properties.drop(columns=["Full Address"])
+    additional_properties2 = additional_properties[[
+        "row_id", "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
+        "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area",
+
+    ]].rename(
+        columns={
+            "SAP": "Parity - Predicted SAP",
+            "SAP Band": "Parity - Predicted SAP Band",
+            "Age": "Parity - Build Age",
+            "Property Type": "Parity - Property Type",
+            "Walls": "Parity - Wall Construction",
+            "Roofs": "Parity - Roof Construction",
+            "Glazing": "Parity - Glazing Type",
+            "Heating": "Parity - Heating Type",
+            "Main Fuel": "Parity - Main Fuel",
+            "Hot Water": "Parity - Hot Water",
+            "Renewables": "Parity - Renewables",
+            "Total Floor Area": "Parity - Total Floor Area"
+        }
+    ).merge(
+        pd.DataFrame(additional_properties_epcs)[
+            [
+                "row_id",
+                "property-type",
+                "built-form",
+                "inspection-date",
+                "current-energy-rating",
+                "current-energy-efficiency",
+                "roof-description",
+                "walls-description",
+                "transaction-type",
+                "secondheat-description",
+                "total-floor-area",
+                "construction-age-band",
+                "floor-height",
+                "number-habitable-rooms",
+                "mainheat-description",
+                "energy-consumption-current"
+            ]
+        ].rename(
+            columns={
+                "inspection-date": "Date of last EPC",
+                "current-energy-efficiency": "SAP score on register",
+                "current-energy-rating": "EPC rating on register",
+                "property-type": "Property Type",
+                "built-form": "Archetype",
+                "total-floor-area": "Property Floor Area",
+                "construction-age-band": "Property Age Band",
+                "floor-height": "Property Floor Height",
+                "number-habitable-rooms": "Number of Habitable Rooms",
+                "walls-description": "Wall Construction",
+                "roof-description": "Roof Construction",
+                "mainheat-description": "Heating Type",
+                "secondheat-description": "Secondary Heating",
+                "transaction-type": "Reason for last EPC",
+                "energy-consumption-current": "Heat Demand (kWh/m2)",
+            }
+        ),
+        how="left",
+        on="row_id"
+    )
+
+    # We save the data locally
+    stonewater_cavity_properties.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties.csv",
+        index=False
+    )
+    additional_properties2.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties.csv",
+        index=False
+    )
+    # Save the survey findings
+    needs_cwi.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
+        index=False
+    )
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
index 102f5930..3ad5d2c1 100644
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -6,3 +6,5 @@ boto3
 epc-api-python==1.0.2
 usaddress==0.5.11
 fuzzywuzzy==0.18.0
+python-dotenv
+
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
new file mode 100644
index 00000000..a6696021
--- /dev/null
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -0,0 +1,238 @@
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+
+class RetrieveFindMyEpc:
+    SEARCH_POSTCODE_URL = (
+        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
+    )
+    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
+
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/111.0.0.0 Safari/537.36'
+    }
+
+    def __init__(self, address: str, postcode: str) -> None:
+        """
+        This class is tasked with retrieving the latest EPC data from the find my epc website
+        :param address: The address of the property, matched as a prefix of the addresses listed for the postcode
+        :param postcode: The postcode of the property, used to search the find my epc website
+        """
+        self.address = address
+        self.postcode = postcode
+        # Comma-free, space-free, lower-case form of the address used when comparing against search results
+        self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()
+
+    def retrieve_newest_find_my_epc_data(self):
+        """
+        For a post code and address, we pull out all the required data from the find my epc website
+        :return: dict with the certificate number, current/potential ratings, bills text, formatted
+            recommendations and the assessment details of the matching certificate with the latest expiry date
+        :raises ValueError: if no certificate matches the address or an expected assessment field is missing
+        """
+
+        postcode_input = self.postcode.replace(" ", "+")
+        postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
+        postcode_response = requests.get(postcode_search, headers=self.HEADERS, timeout=30)
+
+        postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
+        rows = postcode_res.find_all('tr', class_='govuk-table__row')
+
+        extracted_table = []
+        for row in rows:
+            # Extract the address and URL
+            address_tag = row.find('a', class_='govuk-link')
+            if address_tag is None:
+                continue
+            extracted_address = address_tag.text.strip()
+            extracted_address_url = address_tag['href']
+
+            extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
+            if not extracted_address_cleaned.startswith(self.address_cleaned):
+                continue
+
+            # If the address is a match, we can extract the data
+
+            # Extract the expiry date. A row without one falls back to datetime.min so that it sorts as the oldest
+            expiry_date_tag = row.find('td', class_='govuk-table__cell date')
+            expiry_date = datetime.min
+            if expiry_date_tag is not None:
+                expiry_date = datetime.strptime(expiry_date_tag.parent.find('span').text.strip(), '%d %B %Y')
+
+            extracted_table.append(
+                {
+                    "extracted_address": extracted_address,
+                    "extracted_address_url": extracted_address_url,
+                    "expiry_date": expiry_date,
+                }
+            )
+
+        if not extracted_table:
+            raise ValueError("No EPC found")
+
+        if len(extracted_table) > 1:
+            # We take the one with the most recent expiry date
+            extracted_table = sorted(extracted_table, key=lambda x: x['expiry_date'], reverse=True)
+
+        chosen_epc = self.BASE_ENERGY_URL + extracted_table[0]['extracted_address_url']
+        epc_certificate = chosen_epc.split('/')[-1]
+
+        address_response = requests.get(chosen_epc, headers=self.HEADERS, timeout=30)
+        address_res = BeautifulSoup(address_response.text, features="html.parser")
+
+        # Key data we want to retrieve:
+        # 1) Rating
+        # 2) Bills estimates
+        # 3) Recommendations and SAP points
+        # 4) Low and zero carbon energy sources
+
+        ratings = address_res.find('desc', {'id': 'svg-desc'}).text
+        current_rating = ratings.split(".")[0]
+        potential_rating = ratings.split(".")[1]
+        current_sap = int(current_rating.split(' ')[-1])
+
+        # Retrieve the energy consumption
+        bills = address_res.find('div', {'id': 'bills-affected'})
+        bills_list = bills.find_all('li') if bills is not None else []
+        if len(bills_list) < 2:
+            # If this is the case, it's usually because the EPC was very old. Early EPCs did not have this information
+            heating_text = None
+            hot_water_text = None
+        else:
+            heating_text = bills_list[0].text
+            hot_water_text = bills_list[1].text
+
+        # Retrieve the recommendations and SAP points
+        recommendations = []
+        recommendations_div = address_res.find('div', class_='epb-recommended-improvements')
+        if recommendations_div:
+            # Find all h3 headers for each step and extract their related information
+            step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
+            previous_sap_score = current_sap
+            for step_num, step_header in enumerate(step_headers, start=1):
+                # Extract the step title (the measure)
+                measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")
+
+                # Find the div containing the potential rating within the same section
+                potential_rating_div = step_header.find_next(
+                    'div', class_='epb-recommended-improvements__potential-rating'
+                )
+
+                # Check if the potential rating div is found
+                if potential_rating_div:
+                    # Extract the rating text within the SVG text element
+                    rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip()
+                    # Parse the rating text to separate the numeric rating and EPC letter
+                    new_rating = int(rating_text.split()[0])
+                    new_epc = rating_text.split()[1]
+
+                    # Append the information as a dictionary to the recommendations list
+                    recommendations.append({
+                        "step": step_num,
+                        "measure": measure_title,
+                        "new_rating": new_rating,
+                        "new_epc": new_epc,
+                        "sap_points": new_rating - previous_sap_score
+                    })
+                    previous_sap_score = new_rating
+
+        # Search for the assessment information
+        assessment_information = address_res.find('div', {'id': 'information'})
+        # Parse this information
+        rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
+        # Create a dictionary to hold the parsed information
+        assessment_data = {}
+        for row in rows:
+            key = row.find('dt').text.strip()
+            if key == "Type of assessment":
+                # We dont reliably extract this
+                continue
+            value_tag = row.find('dd')
+
+            # Check if value contains a link (email)
+            if value_tag.find('a'):
+                value = value_tag.find('a').text.strip()
+            elif value_tag.find('summary'):
+                value = value_tag.find('span').text.strip()
+            else:
+                value = value_tag.text.strip()
+
+            # These are keys that we have for both the surveyor and the accreditation scheme. Firstly, we'll
+            # get the surveyor's name and email so we make that information clear
+            if key in ["Telephone", "Email"]:
+                if "Assessor's " + key not in assessment_data:
+                    assessment_data["Assessor's " + key] = value
+                else:
+                    assessment_data["Accreditation Scheme's " + key] = value
+                continue
+
+            assessment_data[key] = value
+
+        expected_keys = [
+            'Assessor’s name',
+            "Assessor's Telephone",
+            "Assessor's Email",
+            'Assessor’s ID',
+            'Accreditation scheme',
+            'Assessor’s declaration',
+            "Accreditation Scheme's Telephone",
+            "Accreditation Scheme's Email",
+            'Date of assessment',
+            'Date of certificate'
+        ]
+        # Check we have all the expected keys
+        for key in expected_keys:
+            if key not in assessment_data:
+                raise ValueError(f"Missing key: {key}")
+
+        # Finally, we format the recommendations
+        recommendations = self.format_recommendations(recommendations)
+
+        resulting_data = {
+            'epc_certificate': epc_certificate,
+            'current_epc_rating': current_rating.split(' ')[-6],
+            'current_epc_efficiency': current_sap,
+            'potential_epc_rating': potential_rating.split(' ')[-6],
+            "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
+            "heating_text": heating_text,
+            "hot_water_text": hot_water_text,
+            "recommendations": recommendations,
+            **assessment_data
+        }
+
+        return resulting_data
+
+    def format_recommendations(self, recommendations):
+        """
+        This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
+        :param recommendations: list of dicts that each carry at least a "measure" and a "sap_points" key
+        :return: list of {"type", "sap_points", "survey"} dicts; measures with no entry in measure_map are skipped
+        """
+
+        measure_map = {
+            "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
+            "Hot water cylinder insulation": ["hot_water_tank_insulation"],
+            "Hot water cylinder thermostat": ["cylinder_thermostat"],
+            "High performance external doors": ["insulated_doors"],
+            "Floor insulation (solid floor)": ["solid_floor_insulation"],
+            "Double glazed windows": ["double_glazing"],
+            "Cavity wall insulation": ["cavity_wall_insulation"],
+            "Replace boiler with new condensing boiler": ["boiler_upgrade"],
+        }
+
+        formatted_recommendations = []
+        for rec in recommendations:
+            # A measure that is absent from measure_map is skipped: indexing the map directly raised a KeyError,
+            # which discarded every other field already scraped for the certificate
+            for measure in measure_map.get(rec["measure"], []):
+                formatted_recommendations.append(
+                    {
+                        "type": measure,
+                        "sap_points": rec["sap_points"],
+                        "survey": True
+                    }
+                )
+
+        return formatted_recommendations
diff --git a/etl/find_my_epc/requirements.txt b/etl/find_my_epc/requirements.txt
new file mode 100644
index 00000000..9a3fc73f
--- /dev/null
+++ b/etl/find_my_epc/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+beautifulsoup4
\ No newline at end of file