Model/etl/customers/stonewater/potential_eco_properties.py

import os
import time
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from utils.s3 import read_from_s3, read_pickle_from_s3
import msoffcrypto
from io import BytesIO

load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")


def get_data(asset_list):
    """
    Look up the newest EPC, and its recommendations, for every property in the asset list.

    :param asset_list: DataFrame with one row per property. Each row is read for its "Postcode", "Number",
        "Full Address" and "row_id" values.
    :return: Tuple of (epc_data, errors). epc_data is a list of dicts, one per property for which an EPC was
        found, holding the row_id, every field of the newest EPC and a "recommendations" list. errors is a list
        of the row_ids whose lookup raised an exception. Properties without an EPC appear in neither list.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            postcode = home["Postcode"]
            house_number = home["Number"]
            full_address = home["Full Address"]

            searcher = SearchEpc(
                address1=str(house_number),
                postcode=postcode,
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                # No EPC on record for this property: nothing to collect
                continue

            # Look for EPC recommendations. This is best effort: a failed lookup leaves the property with an
            # empty recommendation list. Only Exception is caught so that Ctrl-C / SystemExit still stop the run.
            try:
                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
            except Exception:
                property_recommendations = {"rows": []}

            epc = {
                "row_id": home["row_id"],
                **searcher.newest_epc.copy(),
                "recommendations": property_recommendations["rows"]
            }

            epc_data.append(epc)
        except Exception:
            # Record the failing row and pause before moving on to the next property
            errors.append(home["row_id"])
            time.sleep(5)

    return epc_data, errors


def app():
    """
    Build the list of Stonewater cavity properties to prioritise for non-intrusive surveys, and write it to CSV.

    The steps, in order:
      1. Decrypt the password-protected ECO master workbook and drop rows whose 'INSTALL/CANCELLATION DATE'
         contains "CANCELLED".
      2. Load the archetyped properties and flag those whose "Wall Type" is one of the as-built cavity descriptions.
      3. Load the costed retrofit packages, pick out the surveyed properties whose "Main Wall Insulation" indicates
         CWI (or extract and re-fill), and flag every archetyped property sharing one of those Archetype IDs.
      4. Merge the Parity features onto the archetyped properties, keep the cavity / CWI-flagged rows and label
         each with a "Reason Included".
      5. Flag properties installed under ECO3, and properties sharing a postcode with such an install.
      6. Build a second list ("additional properties") of as-built cavity properties from the Parity features
         that are not in the archetyped list, with the same ECO3 flags.
      7. Combine both lists, drop anything installed under ECO3, keep the postal regions that contain suspected
         CWI properties and have more than 100 rows, and save the result as a CSV.

    All inputs and the output are read from / written to hard-coded local paths.

    :raises ValueError: If the final list still contains a property installed under ECO3.
    """

    # Read in the password protected master
    # TODO: This file should be deleted!

    # Path to the password-protected Excel file
    file_path = ("/Users/khalimconn-kowlessar/Downloads/STONEWATER MASTER SHEET - UPDATED 20.5.24 - K- PASSWORD "
                 "PROTECTED.xlsx")
    # NOTE(review): the workbook password is committed in source - consider reading it from the environment
    password = "STONE123"

    # Open the file and decrypt it
    with open(file_path, "rb") as f:
        decrypted_file = BytesIO()
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        office_file.decrypt(decrypted_file)

    # Read the decrypted file into a DataFrame
    eco_rolling_master = pd.read_excel(decrypted_file, sheet_name="Sheet1", engine="openpyxl")

    # Drop the cancelled installs
    # NOTE(review): str.contains returns NaN for missing / non-string cells, which `~` may not be able to invert -
    # confirm this column is fully populated with strings (or pass na=False)
    eco_rolling_master = eco_rolling_master[
        ~eco_rolling_master['INSTALL/CANCELLATION DATE'].str.contains("CANCELLED")
    ]

    archetyped_properties = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 - "
        "Archetyped V3.1.xlsx",
        header=4
    )

    # Wall descriptions that mark a property as an as-built (unfilled) cavity. The same list is applied to the
    # archetyped "Wall Type" column and, further down, to the Parity "Walls" column
    cavity_descriptions = [
        "Cavity: AsBuilt (1983-1995)",
        "Cavity: AsBuilt (Post 1995)",
        "Cavity: AsBuilt (Pre 1976)",
        "Cavity: AsBuilt (1976-1982)",
    ]

    archetyped_properties["Is Cavity Property"] = archetyped_properties["Wall Type"].isin(cavity_descriptions)
    # We also identify any properties where properties were found to need cavity wall insulation

    costed_packages = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages "
        "20241030 (WIP) Single Model V2.xlsx",
        sheet_name="Modelled Packages",
        header=13
    )

    # Surveyed properties whose modelled package includes CWI, or a possible extract and re-fill
    needs_cwi = costed_packages[
        costed_packages["Main Wall Insulation"].isin(
            [
                "Poss Extract CWI & Refill (issues identified)",
                "CWI RdSAP Default"
            ]
        )
    ][["Address ID", "Address", "Current SAP Rating", "Current EPC Band", "Postcode", "Archetype ID",
       "Main Wall Insulation",
       "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]]

    # We flag these properties: every property sharing an Archetype ID with a surveyed property needing CWI
    archetyped_properties["Survey shows CWI needed for Archetype"] = archetyped_properties["Archetype ID"].isin(
        needs_cwi["Archetype ID"]
    )

    # Remove rows with no Address ID, and rows that repeat the header text in the Address ID column
    archetyped_properties = archetyped_properties[~pd.isnull(archetyped_properties["Address ID"])]
    archetyped_properties = archetyped_properties[archetyped_properties["Address ID"] != "Address ID"]

    # this is the big list!!!
    features = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )
    features["Address ID"] = features["Address ID"].astype(str)

    features_to_merge = features[
        [
            "Address ID", "Organisation Reference", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating",
            "Main Fuel",
            "Hot Water",
            "Renewables", "Total Floor Area"
        ]
    ]

    # Attach the Parity features to the archetyped properties
    # NOTE(review): features["Address ID"] was cast to str above; presumably the archetyped "Address ID" values are
    # strings too, otherwise this left join will not match - verify
    stonewater_cavity_properties = archetyped_properties[
        ["Name", "Postcode", "Osm. ID", "Org. ref.", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no",
         "Street name",
         "Address line 2", "City/Town", "Is Cavity Property", "Survey shows CWI needed for Archetype"]
    ].merge(
        features_to_merge, how="left", on="Address ID"
    )

    # We filter this down to the properties that are cavity properties
    stonewater_cavity_properties = stonewater_cavity_properties[
        stonewater_cavity_properties["Is Cavity Property"] |
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"]
        ]

    # Label why each property is in the list. The assignments below are applied in order, so a later match
    # overwrites an earlier label: the property-level survey findings take precedence over the archetype-level ones
    stonewater_cavity_properties["Reason Included"] = "As Built Cavity Property"
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"] &
        ~stonewater_cavity_properties["Is Cavity Property"],
        "Survey revealed potential need for CWI or extract and re-fill",
        stonewater_cavity_properties["Reason Included"]
    )
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"] &
        stonewater_cavity_properties["Is Cavity Property"],
        "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
        stonewater_cavity_properties["Reason Included"]
    )
    # We indicate the exact properties that need CWI, based on survey findings
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Address ID"].isin(
            needs_cwi[needs_cwi["Main Wall Insulation"] == "CWI RdSAP Default"]["Address ID"].astype(int).astype(
                str).values
        ),
        "Survey showed this property needs CWI",
        stonewater_cavity_properties["Reason Included"]
    )

    # ... and the exact properties where the survey pointed to a possible extract and re-fill
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Address ID"].isin(
            needs_cwi[needs_cwi["Main Wall Insulation"] == "Poss Extract CWI & Refill (issues identified)"][
                "Address ID"].astype(int).astype(str).values
        ),
        "Survey showed this property could need extract and re-fill",
        stonewater_cavity_properties["Reason Included"]
    )

    # We flag units that were installed under ECO3
    # Only master rows with a usable STONEWATER UPRN can be matched: drop the "NOT ON ASSET LIST" marker and blanks
    numeric_ids = eco_rolling_master[eco_rolling_master["STONEWATER UPRN"] != "NOT ON ASSET LIST"]
    numeric_ids = numeric_ids[~pd.isnull(numeric_ids["STONEWATER UPRN"])]
    # NOTE(review): numeric_ids is a filtered selection of eco_rolling_master, so this assignment may emit a
    # SettingWithCopyWarning depending on the pandas version / copy-on-write setting
    numeric_ids["STONEWATER UPRN"] = numeric_ids["STONEWATER UPRN"].astype(int)

    # The master's "STONEWATER UPRN" is compared against the asset list's "Org. ref."
    stonewater_cavity_properties["Installed under ECO3"] = stonewater_cavity_properties["Org. ref."].isin(
        numeric_ids['STONEWATER UPRN'].values
    )

    # Which postcodes were installed under ECO3
    priority_list_eco3 = stonewater_cavity_properties[
        stonewater_cavity_properties["Installed under ECO3"]
    ]["Postcode"].unique()

    # These are properties that were not installed under ECO3, that have the same postcodes as properties
    # installed under ECO3

    # These are 66 properties we might want to start with as an immediate priority
    stonewater_cavity_properties["Same Postcode as Installed under ECO3"] = (
        ~stonewater_cavity_properties["Installed under ECO3"] & (
        stonewater_cavity_properties["Postcode"].isin(priority_list_eco3)
    )
    )

    # Cast via the nullable integer type so that missing UPRNs survive the conversion to string
    stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
    # Find the postcodes where an Osmosis survey revealed a need for CWI
    postcodes_found_needing_cwi = stonewater_cavity_properties[
        stonewater_cavity_properties["Reason Included"].isin(
            [
                "Survey revealed potential need for CWI or extract and re-fill",
                "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
                "Survey showed this property needs CWI",
                "Survey showed this property could need extract and re-fill"
            ]
        )
    ]["Postcode"].unique()

    # Suspected = shares a postcode with a survey finding, but carries none of the survey-based reasons itself
    # (the four strings below are the same survey-based reasons as in the list above)
    stonewater_cavity_properties["Suspected Needs CWI - not surveyed"] = (
        (
            stonewater_cavity_properties[
                "Postcode"].isin(
                postcodes_found_needing_cwi)
        ) & (
            ~stonewater_cavity_properties[
                "Reason Included"].isin(
                [
                    "Survey revealed potential need "
                    "for CWI or extract and re-fill",
                    "Surveyed revealed potential "
                    "need for CWI or extract and "
                    "re-fill and is an as built "
                    "cavity property",
                    "Survey showed this property "
                    "needs CWI",
                    "Survey showed this property "
                    "could need extract and re-fill"
                ]
            )
        )
    )

    # Prefix the Parity feature columns so that their source is clear in the output
    stonewater_cavity_properties = stonewater_cavity_properties.rename(
        columns={
            "Age": "Parity - Build Age",
            "Property Type": "Parity - Property Type",
            "Walls": "Parity - Wall Construction",
            "Roofs": "Parity - Roof Construction",
            "Glazing": "Parity - Glazing Type",
            "Heating": "Parity - Heating Type",
            "Main Fuel": "Parity - Main Fuel",
            "Hot Water": "Parity - Hot Water",
            "Renewables": "Parity - Renewables",
            "Total Floor Area": "Parity - Total Floor Area"
        }
    )

    # We now flag the additional properties in the as built list

    # Parity properties that are not part of the archetyped list
    additional_properties = features[
        ~features["Address ID"].isin(archetyped_properties["Address ID"].values)
    ]

    # Filter on as built cavity properties
    additional_properties = additional_properties[
        additional_properties["Walls"].isin(cavity_descriptions)
    ]
    additional_properties["Full Address"] = additional_properties["Address"].copy()
    # Derive a house number per property from the first comma-separated part of the address
    house_numbers = []
    for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
        house_no = SearchEpc.get_house_number(x["Address"].split(",")[0], x["Postcode"])
        if house_no is None:
            # Fall back to the whole first part of the address
            house_no = x["Address"].split(",")[0]
        # If we end up with a number like "01" we need to remove the leading zero
        house_no = house_no.lstrip("0")
        house_numbers.append(
            {
                "Address ID": x["Address ID"],
                "Number": house_no
            }
        )

    house_numbers = pd.DataFrame(house_numbers)
    additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
    # "Number", "Full Address" and "row_id" are the per-row values read by get_data (its call is commented out below)
    additional_properties["row_id"] = additional_properties["Address ID"].copy()

    # Flag any units in this list that were installed under ECO3
    additional_properties["Installed under ECO3"] = additional_properties["Organisation Reference"].isin(
        numeric_ids['STONEWATER UPRN'].values
    )

    # Additional list ECO3
    additional_list_eco3 = additional_properties[additional_properties["Installed under ECO3"]]["Postcode"].unique()

    # These are properties that were not installed under ECO3, that have the same postcodes as properties
    # installed under ECO3
    # These are 297 properties we might want to start with as an immediate priority
    additional_properties["Same Postcode as Installed under ECO3"] = (
        ~additional_properties["Installed under ECO3"] & (
        additional_properties["Postcode"].isin(additional_list_eco3)
    )
    )

    # We do some additional manual checks, for ECO3 properties that were installed that didn't get matched to either
    # dataset
    # NOTE(review): "In asset list" is only consumed by the commented-out checks below. Both astype(int) casts
    # will raise if the reference columns contain missing values - confirm they are fully populated
    numeric_ids["In asset list"] = numeric_ids["STONEWATER UPRN"].isin(
        stonewater_cavity_properties['Org. ref.'].astype(int).values
    )
    numeric_ids["In asset list"] = numeric_ids["In asset list"] | (
        numeric_ids["STONEWATER UPRN"].isin(
            additional_properties['Organisation Reference'].astype(int).values
        )
    )

    # eco3_installs_not_in_asset_list = numeric_ids[~numeric_ids["In asset list"]]
    # # We now take samples of properties randomly and manually check the ID against the asset list
    # print(eco3_installs_not_in_asset_list.sample(1)[["STONEWATER UPRN", "Post Code", "NO ", "Street / Block Name", ]])
    # # Checked STONEWATER UPRN
    # # 9862, BH15 1NR, 33, THE QUAY FOYER [x]
    # # 12785, S01 66PN, 57, SEACOLE GARDENS [x]
    # # 26071,  MK42 0TE,  51,  De Havilland Avenue, Shortstown [x]
    # # 18213,  HR6 9UW, 20 Ford Street [x]
    # # 24344, LU4 9FF, 6 SEAL CLOSE [x]
    # # 31222,  SN14 0QZ, 7 HARDBROOK COURT [x]
    # # 9343, SP4 7XL, 10 OAK PLACE [x]
    # # 34730, LU5 5TN, 4 TUDOR DRIVE [x]
    # # 7021,  BN27 2BZ, 32 BUTTS FIELD []
    #
    # stonewater_cavity_properties[stonewater_cavity_properties['Org. ref.'] == 7021]
    # stonewater_cavity_properties[stonewater_cavity_properties['Postcode'] == "BN27 2BZ"]["Name"]
    #
    # additional_properties[additional_properties['Organisation Reference'] == 7021]
    # additional_properties[additional_properties['Postcode'] == "BN27 2BZ"][["Address"]]

    # Pull the EPCs for these properties
    # additional_properties_epcs, errors = get_data(additional_properties)

    # Save this data as a pickle
    # import pickle
    # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
    # "wb") as f:
    #     pickle.dump(additional_properties_epcs, f)

    # Suspected = shares a postcode with a survey finding and was not itself installed under ECO3
    additional_properties["Suspected Needs CWI - not surveyed"] = (
        (
            additional_properties["Postcode"].isin(postcodes_found_needing_cwi) &
            ~additional_properties["Installed under ECO3"]
        )
    )

    # We drop Full Address
    additional_properties = additional_properties.drop(columns=["Full Address"])
    # NOTE(review): this selection leaves out "Suspected Needs CWI - not surveyed", so after the concat below the
    # additional rows hold NaN in that column and never satisfy the `== True` filter - confirm this is intended.
    # "Age" is not selected either, so its entry in the rename mapping has no effect here
    additional_properties2 = additional_properties[[
        "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
        "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
        'Same Postcode as Installed under ECO3', "Organisation Reference",
    ]].rename(
        columns={
            "Organisation Reference": "Org. ref.",
            "SAP": "Parity - Predicted SAP",
            "SAP Band": "Parity - Predicted SAP Band",
            "Age": "Parity - Build Age",
            "Property Type": "Parity - Property Type",
            "Walls": "Parity - Wall Construction",
            "Roofs": "Parity - Roof Construction",
            "Glazing": "Parity - Glazing Type",
            "Heating": "Parity - Heating Type",
            "Main Fuel": "Parity - Main Fuel",
            "Hot Water": "Parity - Hot Water",
            "Renewables": "Parity - Renewables",
            "Total Floor Area": "Parity - Total Floor Area"
        }
    )

    # Combine the data:

    # Bring the Parity "Address" onto the archetyped list, so that both lists carry an address column
    # NOTE(review): a repeated "Organisation Reference" in features would duplicate rows in this left join - verify
    stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
        features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
    )
    full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
    full_dataset = full_dataset.drop(columns=['Osm. ID'])

    # We now define the priority list for non-intrusives
    # "Postal Region 2" is the part of the postcode before the space; "Postal Region" is its first two characters
    full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
    full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0]

    # Strip out anything we definitely don't want
    full_dataset = full_dataset[~full_dataset["Installed under ECO3"]]

    # The regions that contain at least one suspected-CWI property
    areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique()

    priorities = full_dataset[
        full_dataset["Postal Region 2"].isin(areas)
    ]

    # Keep only the regions with more than 100 properties in the priority set
    # NOTE(review): the "count" column name presumably relies on the value_counts naming of pandas >= 2.0 - confirm
    # the pandas version in use
    region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index()
    region_prevalance = region_prevalance[region_prevalance["count"] > 100]
    df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)]

    # The results of these two expressions are discarded; they only show output in an interactive session
    df["Postal Region"].value_counts()
    df["Postal Region 2"].value_counts()

    # Sanity check: ECO3 installs were filtered out above, so none should remain
    if df["Installed under ECO3"].sum():
        raise ValueError("There are properties in the priority list that were installed under ECO3")

    df.to_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
        "revised list.csv",
        index=False
    )

    # We save the data locally
    # stonewater_cavity_properties.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
    #     "postcodes.csv",
    #     index=False
    # )
    # additional_properties2.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
    #     "non-priority postcodes.csv",
    #     index=False
    # )
    # # Save the survey findings
    # needs_cwi.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI -
    #     WIP.csv",
    #     index=False
    # )


def cross_reference_epc_programme():
    """
    Cross-reference the ECO3 "not completed" address list against the Parity master sheet.

    Work in progress: for every ECO3 fallout address the function looks up candidate rows in the master sheet,
    first by derived house number and then by fuzzy address similarity, but the matches are not yet collected,
    returned or saved.

    Both inputs are read from hard-coded local paths.
    """
    # Imported locally because the module does not import fuzzywuzzy at the top of the file; done once here
    # rather than on every iteration of the matching loop
    from fuzzywuzzy import fuzz

    eco3_fallout = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
    )

    def _fallout_house_number(address):
        # Derive the house number, falling back to the first comma-separated part of the address
        house_no = SearchEpc.get_house_number(address, "")
        if house_no is None:
            house_no = address.split(",")[0]
        return house_no

    # Assign the whole column in one go: values written onto the rows yielded by iterrows() are not written
    # back to the DataFrame, which is how the fallback used to be lost
    eco3_fallout["house_number"] = eco3_fallout["ADDRESS"].apply(_fallout_house_number)

    stonewater_modelled_above_c = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )

    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
    )

    # TODO: neither set of candidates is stored yet - this list is the intended collection point
    eco3_fallout_matched_to_above_c = []
    for _, fallout_row in eco3_fallout.iterrows():
        # Match on house number
        house_number_matches = stonewater_modelled_above_c[
            stonewater_modelled_above_c["house_number"] == fallout_row["house_number"]
        ]

        # We do a fuzzy match on the address, with Levenshtein distance; 90 is the similarity threshold
        fallout_address = fallout_row["ADDRESS"]
        fuzzy_matches = stonewater_modelled_above_c[
            stonewater_modelled_above_c["Address"].apply(
                lambda candidate: fuzz.ratio(candidate, fallout_address) > 90
            )
        ]


def finalise_list_for_non_intrusives():
    """
    Produce the final non-intrusives workbook from the reviewed non-intrusives list.

    Rows installed under ECO3, and rows whose Address ID appears in the Osmosis modelled packages, are removed.
    The Organisation Reference is then attached from the Parity master sheet, and only rows whose postcode has at
    least one row marked "Include in non-intrusives" are kept. The result is written to an Excel file.

    All inputs and the output use hard-coded local paths.
    """
    reviewed = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater "
        "Non-Intrusives.xlsx"
    )

    # Discard everything that was installed under ECO3
    not_installed_under_eco3 = ~reviewed["Installed under ECO3"]
    reviewed = reviewed[not_installed_under_eco3]

    # Mark the properties that were surveyed by Osmosis
    osmosis_packages = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 "
        "(1).xlsx",
        sheet_name="Modelled Packages",
        header=13
    )
    surveyed_address_ids = osmosis_packages["Address ID"].values
    reviewed["Surveyed by Osmosis"] = reviewed["Address ID"].isin(surveyed_address_ids)

    # Removed 54 addresses
    not_surveyed = ~reviewed["Surveyed by Osmosis"]
    shortlist = reviewed[not_surveyed]

    parity_master = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )

    # Attach the organisation reference
    organisation_refs = parity_master[["Organisation Reference", "Address ID"]]
    shortlist = shortlist.merge(organisation_refs, on="Address ID", how="left")

    # First two characters of the part of the postcode before the space
    outward_codes = shortlist["Postcode"].str.split(" ").str[0]
    shortlist["Postal Region"] = outward_codes.str[0:2]

    # Postcodes with at least one row marked for inclusion
    marked_rows = shortlist[shortlist["Include in non-intrusives"]]
    selected_regions = marked_rows["Postcode"].unique()

    shortlist["Is in region"] = shortlist["Postcode"].isin(selected_regions)

    # Keep only the rows in a selected postcode
    shortlist = shortlist[shortlist["Is in region"]]

    shortlist.to_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives "
        "List - final.xlsx")