Merge pull request #372 from Hestia-Homes/settle-epc-data

Settle epc data
2026-07-27 23:35:01 +00:00 · 2024-11-12 15:50:06 +00:00 · 2024-11-12 15:50:06 +00:00 · 579d403301
commit 579d403301
parent e25d2f473f dfa37f86d4
4 changed files with 339 additions and 18 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/etl/customers/settle/route_march_2024_11_08.py
+++ b/etl/customers/settle/route_march_2024_11_08.py
@ -0,0 +1,226 @@
+import os
+import time
+
+import pandas as pd
+from tqdm import tqdm
+
+from dotenv import load_dotenv
+from utils.s3 import read_excel_from_s3
+from backend.SearchEpc import SearchEpc
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+    estimate_perimeter,
+    estimate_external_wall_area,
+    estimate_number_of_floors
+)
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(asset_list):
+    """
+    Looks up the newest EPC, and the recommendations lodged against it, for every row of the asset list.
+
+    Each row is expected to provide "Postcode", "AddressLine1", "AddressLine4", "AddressLine5" and "row_id".
+    Rows for which no EPC is found are skipped silently; rows whose lookup raises are reported back so the
+    caller can retry them.
+
+    :param asset_list: DataFrame of properties to search for
+    :return: tuple (epc_data, errors). epc_data is a list of dicts holding the newest EPC record plus the
+        "row_id" and "recommendations" keys; errors is the list of row_id values whose lookup raised
+    """
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            postcode = home["Postcode"]
+            house_number = home["AddressLine1"]
+            full_address = ", ".join([home["AddressLine1"], home["AddressLine4"], home["AddressLine5"]])
+
+            searcher = SearchEpc(
+                address1=str(house_number),
+                postcode=postcode,
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=full_address,
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Look for EPC recommendations. A failure here is not fatal: we keep the EPC and record no
+            # recommendations. We catch Exception rather than using a bare except so that Ctrl-C
+            # (KeyboardInterrupt) and SystemExit still stop the run.
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except Exception:
+                property_recommendations = {"rows": []}
+
+            epc = {
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"]
+            }
+
+            epc_data.append(epc)
+        except Exception:
+            # Remember the row so the caller can retry it, then pause before the next lookup
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
+def app():
+    """
+    This app pulls EPC data for the properties listed in the Settle proposed programme spreadsheet and
+    writes the enriched asset list back out as an excel file.
+
+    Data request contents:
+    Date of last EPC
+    Reason for EPC
+    SAP score on register
+    Property Type
+    Property Area
+    Property Age
+    Any Dimensions (HLP,PW,RH)
+    Property Wall Construction
+    Heating Type
+    Secondary Heating
+    Loft Insulation Depth
+
+    Additional if possible:
+    Heat loss calculations
+    EPC recommendations
+    Property UPRN
+
+    :return: None
+    """
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/SETTLE FULL PROPOSED PROGRAMME.xlsx",
+        header=0
+    )
+    # row_id is a temporary key used to join the EPC data back onto the asset list
+    asset_list["row_id"] = asset_list.index
+
+    epc_data, errors = get_data(asset_list)
+
+    # We now retrieve any failed properties (a single retry, second-time failures are dropped)
+    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
+    epc_data_failed, _ = get_data(asset_list_failed)
+
+    # Append the failed data to the main data
+    epc_data.extend(epc_data_failed)
+
+    epc_df = pd.DataFrame(epc_data)
+
+    # We expand out the recommendations into one True/False column per recommendation text
+    recommendations_df = epc_df[["row_id", "recommendations"]]
+
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    # Sort so the recommendation columns come out in the same order on every run (set order is not stable)
+    columns = ["row_id"] + sorted(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data["row_id"] = row["row_id"]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    # Drop the column that is "" (recommendations with no summary text). errors="ignore" because the
+    # column only exists when at least one recommendation had an empty summary
+    transformed_df = transformed_df.drop(columns=[""], errors="ignore")
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [
+            "row_id",
+            "uprn",
+            "property-type",
+            "built-form",
+            "inspection-date",
+            "current-energy-rating",
+            "current-energy-efficiency",
+            "roof-description",
+            "walls-description",
+            "transaction-type",
+            # New fields needed
+            "secondheat-description",
+            "total-floor-area",
+            "construction-age-band",
+            "floor-height",
+            "number-habitable-rooms",
+            "mainheat-description",
+            #
+            "energy-consumption-current",  # kwh/m2
+        ]
+    ]
+
+    # Left joins, so properties without an EPC stay in the output with empty EPC columns
+    asset_list = asset_list.merge(
+        epc_df,
+        how="left",
+        on="row_id"
+    ).merge(
+        transformed_df,
+        how="left",
+        on="row_id"
+    )
+
+    asset_list = asset_list.drop(columns=["row_id"])
+
+    # Rename the columns
+    asset_list = asset_list.rename(columns={
+        "inspection-date": "Date of last EPC",
+        "current-energy-efficiency": "SAP score on register",
+        "current-energy-rating": "EPC rating on register",
+        "property-type": "Property Type",
+        "built-form": "Archetype",
+        "total-floor-area": "Property Floor Area",
+        "construction-age-band": "Property Age Band",
+        "floor-height": "Property Floor Height",
+        "number-habitable-rooms": "Number of Habitable Rooms",
+        "walls-description": "Wall Construction",
+        "roof-description": "Roof Construction",
+        "mainheat-description": "Heating Type",
+        "secondheat-description": "Secondary Heating",
+        "transaction-type": "Reason for last EPC",
+        "energy-consumption-current": "Heat Demand (kWh/m2)"
+    })
+
+    asset_list["Estimated Number of Floors"] = asset_list.apply(
+        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
+            x["Property Type"]) else None, axis=1
+    )
+
+    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+    # Replace "" value with NaN. We pass NaN explicitly rather than None because the meaning of
+    # replace(..., None) differs between pandas versions (older versions forward-fill instead)
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", float("nan"))
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
+
+    # Floor area and room count are split evenly across the estimated number of floors
+    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
+        lambda x: estimate_perimeter(
+            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
+            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
+        ), axis=1
+    )
+
+    # Falsy floor heights (e.g. "") fall back to 2.5; note NaN is truthy so it is passed through as NaN
+    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
+        lambda x: estimate_external_wall_area(
+            num_floors=x["Estimated Number of Floors"],
+            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
+            perimeter=x["Estimated Perimeter (m)"],
+            built_form=x["Archetype"]
+        ),
+        axis=1
+    )
+
+    asset_list["Roof Insulation Thickness"] = asset_list.apply(
+        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
+            x["Roof Construction"]) else None,
+        axis=1
+    )
+
+    # Store as an excel
+    filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
+    asset_list.to_excel(filename, index=False)
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -8,7 +8,7 @@ from collections import Counter

 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
 SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
-NUM_FOLDERS = 14
+NUM_FOLDERS = 15


 def sap_to_epc(sap_points: int | float):
@ -871,7 +871,10 @@ def main():

    # We now merge on the coordinator data so that against each property, we can map the measures
    retrofit_packages_board = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater 3.0 Updated SAP Pre & Modelled 29.10.24.xlsx"),
+        os.path.join(
+            CUSTOMER_FOLDER_PATH,
+            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx"
+        ),
        header=4
    )
    retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
@ -902,13 +905,25 @@ def main():
        # '102 Cheaton Close': '',
        # 'Flat 16 Spring Gardens': '',
        # '4 Apple Close': '',
-        '25 Folly Lane': '',
-
+        # '25 Folly Lane': '',
+        '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS',
+        '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX',
+        '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX',
+        '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+        '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ',
+        '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
+        '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX',
+        "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX',
+        '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX',
+        '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ',
+        '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX',
+        '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA'
    }

    # We now match this retrofit packages board to the extracted data
    matching_lookup = []
    for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
+
        # Handle the case that has the wrong postcode in the asset data
        if home["Name"] in manual_filters:
            filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy()
@ -972,7 +987,11 @@ def main():
    missing_ids = list(missing_ids)
    if missing_ids:
        # We check that the missing ids have no data yet
-        if len(missing_ids) != 8:
+        # missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)]
+        # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv(
+        #     CUSTOMER_FOLDER_PATH + "/missed_debugging.csv")
+
+        if len(missing_ids) != 6:
            raise Exception("Unacceptable number of missings")

    if matching_lookup["Address ID"].duplicated().sum():
@ -1065,12 +1084,20 @@ def main():
    stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"])
    windows_data["Address ID"] = windows_data["Address ID"].astype(float)
    stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left")
+    stonewater_data = stonewater_data.sort_values("Archetype ID", ascending=True)

    if stonewater_data["Address ID"].duplicated().sum():
        raise Exception("Duplicate Address IDs")

+    for c in [
+        'Window attributes - Fitted/renewed date',
+        'Parent Asset Window attributes - Fitted/renewed date',
+        'Fitted/renewed date'
+    ]:
+        stonewater_data[c] = stonewater_data[c].astype(str)
+
    # Save this data to excel
-    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages.xlsx", index=False)
+    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V2.xlsx", index=False)

    cost_sheet = [
        {
@ -1155,7 +1182,7 @@ def main():

    create_proposed_wave_3_bid(
        costed_packages_filepath=os.path.join(
-            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx"
+            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) Single Model V3.xlsx"
        ),
        archetypes_sheet_filepath=os.path.join(
            CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
@ -1165,8 +1192,8 @@ def main():

 def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath):
    # We read in the costed packages
-    # Note: Header as 12 is for Matt Ratcliff's reviewed version
    costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages")
+    costed_packages = costed_packages[~pd.isnull(costed_packages["Address"])]

    archetypes_to_cost = costed_packages[
        [
@ -1195,16 +1222,11 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
                 'Existing Primary Heating System',
                 'Existing Primary Heating PCDF Reference'])

-    # We take properties that are EPC D and below (61% of units)
+    # We take properties that are EPC D and below (59% of units)
    archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])]

    archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"])

-    average_cost = archetypes_to_cost[
-        archetypes_to_cost["Has been modelled"]
-    ]['Total Cost of Measures inc Contingency'].mean()
-    print(average_cost)
-
    # These are the Arhetypes that will likely be suitable for Wave 3
    archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4)
    archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])]
@ -1218,7 +1240,21 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
        how="left"
    )

-    proposed_sample = archetypes_sheet[archetypes_sheet["Archetype ID"].isin(archetypes_to_cost["Archetype ID"])]
+    proposed_sample = archetypes_sheet[
+        archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str))
+    ]
+
+    not_proposed = archetypes_sheet[
+        ~archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str))
+    ]
+
+    # archetypes_without_survey = []
+    # for p in list(set(not_proposed)):
+    #     filtered = costed_packages[costed_packages["Archetype ID"].astype(int).astype(str) == p]
+    #     if filtered.empty:
+    #         archetypes_without_survey.append(p)
+
+    # Can we propose anything about archetypes that were not surveyed?

    proposed_sample = proposed_sample[
        [
@ -1229,6 +1265,8 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa

    # We classify into high and low confidence

+    archetypes_to_cost["Surveyed Main Roof"] = archetypes_to_cost["Surveyed Main Roof"].fillna("")
+
    match_classification = []
    for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)):

@ -1313,8 +1351,65 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa
        None, proposed_sample["Total Cost of Measures inc Contingency"]
    )

+    proposed_sample = proposed_sample.sort_values("Archetype ID", ascending=True)
+
    # Save excel
-    proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False)
+    proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid V2 (WIP).xlsx", index=False)
+
+    # For each postcode that's in the bid, we also summarise the number of units in the bid and number left out
+    proposed_sample_postcodes = proposed_sample["Postcode"].unique()
+
+    postcode_summary = []
+    for postcode in proposed_sample_postcodes:
+        in_proposal = proposed_sample[proposed_sample["Postcode"] == postcode]
+        not_in_proposal = not_proposed[not_proposed["Postcode"] == postcode]
+        postcode_summary.append(
+            {
+                "Postcode": postcode,
+                "Number of properties in Proposal": len(in_proposal),
+                "Number of properties not in Proposal": len(not_in_proposal)
+            }
+        )
+    postcode_summary = pd.DataFrame(postcode_summary)
+    postcode_summary = postcode_summary.sort_values(
+        "Number of properties not in Proposal",
+        ascending=False).reset_index(drop=True)
+
+    postcode_summary.to_excel(
+        CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid Postcode Summary.xlsx", index=False
+    )
+
+
+def find_remaining_surveys():
+    """
+    This compares a list of properties that have been surveyed against a list of properties that I have produced
+    costed retrofit packages for, so I know what needs to be downloaded from Sharepoint.
+
+    The surveyed properties that have no costed package yet are written to needed_surveys.csv in the
+    customer folder, sorted by "<Archetype ID>-<Arch. Group Rank>".
+
+    :return: None
+    :raises AssertionError: if the needed and costed row counts do not add up to the surveyed row count
+    """
+
+    surveyed = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 07.11.24.xlsx"),
+        header=4
+    )
+
+    costed = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx"
+        ),
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+    # Rows without an Address ID are dropped before comparing
+    costed = costed[~pd.isnull(costed["Address ID"])]
+
+    # Take a copy so that adding the "id" column below does not write into a slice of `surveyed`
+    needed = surveyed[~surveyed["Address ID"].isin(costed["Address ID"])].copy()
+
+    needed["id"] = needed["Archetype ID"].astype(str) + "-" + needed["Arch. Group Rank"].astype(str)
+    needed = needed.sort_values("id", ascending=True)
+    needed[["id", "Name", "Postcode"]].to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "needed_surveys.csv")
+    )
+
+    # Sanity check: every surveyed property should be either costed or still needed. Raised explicitly
+    # (rather than with an assert statement) so the check still runs under python -O
+    if needed.shape[0] + costed.shape[0] != surveyed.shape[0]:
+        raise AssertionError(
+            f"needed ({needed.shape[0]}) + costed ({costed.shape[0]}) != surveyed ({surveyed.shape[0]})"
+        )

 # if __name__ == "__main__":
 #     main()