# Model/etl/route_march_data_pull/app.py

import os
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes

from recommendations.recommendation_utils import (
    estimate_perimeter,
    estimate_external_wall_area,
    estimate_number_of_floors
)

# Load variables from backend/.env, then read the EPC API token from the
# process environment. EPC_AUTH_TOKEN is None when the variable is not set.
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")


def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
    """
    Look up the newest EPC, its recommendations and the FindMyEPC data for each row of asset_list.

    Each property is processed exactly once and entirely inside a try block, so a failure on one
    property is recorded in the returned error list rather than aborting the whole run.

    :param asset_list: DataFrame of properties. Must contain a "row_id" column plus the three
        columns named by the remaining parameters.
    :param fulladdress_column: Name of the column holding the full address.
    :param address1_column: Name of the column holding the first address line (converted to str).
    :param postcode_column: Name of the column holding the postcode.
    :return: Tuple (epc_data, errors). epc_data is a list of dicts, one per property for which an
        EPC was found, holding "row_id", every field of the newest EPC, "recommendations" and
        "find_my_epc_data". errors is the list of row_id values whose lookup raised an exception.
        Properties for which no EPC is found appear in neither list.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            searcher = SearchEpc(
                address1=str(home[address1_column]),
                postcode=home[postcode_column],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=home[fulladdress_column],
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                # No EPC for this property: nothing to record, and not an error either
                continue

            # Look for EPC recommendations. A failure here is tolerated: the property is kept
            # with an empty recommendation list.
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                property_recommendations = {"rows": []}

            # Retrieve data from FindMyEPC
            find_epc_searcher = RetrieveFindMyEpc(
                address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
            )
            find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
            # Random short pause between properties
            time.sleep(np.random.uniform(0.1, 1))

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **searcher.newest_epc.copy(),
                    "recommendations": property_recommendations["rows"],
                    "find_my_epc_data": find_epc_data,
                }
            )
        except Exception:
            # Remember the row so the caller can retry it, then pause before carrying on
            errors.append(home["row_id"])
            time.sleep(5)

    return epc_data, errors


def extract_address1(asset_list, full_address_col, method="first_two_words"):
    """
    Add an "address1_extracted" column derived from the full address column.

    The only supported method, "first_two_words", keeps the first two space-separated tokens of
    the full address, joined back together with a single space. The column is written onto the
    DataFrame that was passed in, and that same DataFrame is returned.

    :param asset_list: DataFrame holding the addresses; modified in place.
    :param full_address_col: Name of the column containing the full address strings.
    :param method: Extraction strategy; only "first_two_words" is recognised.
    :return: asset_list, with the "address1_extracted" column added.
    :raises ValueError: If method is not a recognised strategy.
    """
    if method != "first_two_words":
        raise ValueError(f"Method {method} not recognized")

    address_tokens = asset_list[full_address_col].str.split(" ")
    leading_pair = address_tokens.str[:2]
    asset_list["address1_extracted"] = leading_pair.str.join(" ")
    return asset_list


def app():
    """
    Pull EPC data for a spreadsheet of properties and write the enriched list to a new Excel file.

    The original data request was for some properties owned by Livewest.

    Data request contents:
    Date of last EPC
    Reason for EPC
    SAP score on register
    Property Type
    Property Area
    Property Age
    Any Dimensions (HLP,PW,RH)
    Property Wall Construction
    Heating Type
    Secondary Heating
    Loft Insulation Depth

    Additional if possible:
    Heat loss calculations
    EPC recommendations
    Property UPRN

    Steps:
    1. Read the asset list from DATA_FOLDER/DATA_FILENAME and tag each row with a "row_id".
    2. Clean the address/postcode text and, when no address1 column is configured, derive one.
    3. Fetch EPC data for every row, then retry once the rows that raised an error.
    4. Expand the EPC recommendations into one boolean column per recommendation text.
    5. Merge the EPC fields, FindMyEPC fields and recommendation flags onto the asset list and
       add the estimated dimensions.
    6. Save the result next to the input file as "<input name> EPC Data Pull.xlsx".
    """
    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
    DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
    POSTCODE_COLUMN = "Postcode"
    FULLADDRESS_COLUMN = "Address"
    ADDRESS1_COLUMN = None
    ADDRESS1_METHOD = "first_two_words"

    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
    asset_list["row_id"] = asset_list.index

    # Clean up potential non-breaking spaces, then collapse any run of spaces into one so that
    # the space-based address splitting below never sees empty tokens
    for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
        asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
        asset_list[col] = asset_list[col].str.replace(r' {2,}', ' ', regex=True)

    if ADDRESS1_COLUMN is None:
        ADDRESS1_COLUMN = "address1_extracted"
        asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)

    epc_data, errors = get_data(
        asset_list=asset_list,
        fulladdress_column=FULLADDRESS_COLUMN,
        address1_column=ADDRESS1_COLUMN,
        postcode_column=POSTCODE_COLUMN
    )

    # Retry, once, the properties whose first lookup raised an error
    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
    epc_data_failed, _ = get_data(
        asset_list=asset_list_failed,
        fulladdress_column=FULLADDRESS_COLUMN,
        address1_column=ADDRESS1_COLUMN,
        postcode_column=POSTCODE_COLUMN
    )

    # Append the retried data to the main data
    epc_data.extend(epc_data_failed)

    epc_df = pd.DataFrame(epc_data)

    # Expand the recommendations into one boolean column per distinct recommendation text
    recommendations_df = epc_df[["row_id", "recommendations"]]

    unique_recommendations = set()
    for _, row in recommendations_df.iterrows():
        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])

    columns = ["row_id"] + list(unique_recommendations)
    transformed_data = []
    for _, row in recommendations_df.iterrows():
        # Start with False for every recommendation
        row_data = {col: False for col in columns}
        row_data["row_id"] = row["row_id"]

        # Set True for each recommendation present in this row
        for rec in row["recommendations"]:
            recommendation_text = rec["improvement-summary-text"]
            row_data[recommendation_text] = True

        transformed_data.append(row_data)

    transformed_df = pd.DataFrame(transformed_data)
    # Drop the column for recommendations with an empty summary text. The column only exists
    # when at least one such recommendation was returned, so a missing column is not an error.
    transformed_df = transformed_df.drop(columns=[""], errors="ignore")

    # Flatten the FindMyEPC dictionaries into columns alongside row_id
    find_my_epc_data = epc_df[["row_id"]].join(
        pd.json_normalize(epc_df["find_my_epc_data"])
    )
    # The solar PV column is not always present; default it to False when missing
    if "Solar photovoltaics" not in find_my_epc_data.columns:
        find_my_epc_data["Solar photovoltaics"] = False

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "row_id",
            "uprn",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
            "photo-supply",
        ]
    ]

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        on="row_id"
    ).merge(
        find_my_epc_data[
            [
                "row_id", "heating_text", "hot_water_text", 'Assessor’s name',
                "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
                "Assessor’s ID", "Solar photovoltaics"
            ]
        ].rename(
            columns={
                "Solar photovoltaics": "Has Solar PV",
                "heating_text": "Heating Estimated kWh",
                "hot_water_text": "Hot Water Estimated kWh",
            }
        ),
        how="left",
        on="row_id"
    )

    # A property has solar PV if FindMyEPC says so or the EPC photo-supply value is non-zero.
    # NOTE(review): rows with no EPC match hold NaN in both columns after the left merges -
    # confirm how those rows should be flagged.
    asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
    asset_list = asset_list.drop(columns=["photo-supply"])

    # Rename the columns to the headings used in the delivered spreadsheet
    asset_list = asset_list.rename(columns={
        "inspection-date": "Date of last EPC",
        "current-energy-efficiency": "SAP score on register",
        "current-energy-rating": "EPC rating on register",
        "property-type": "Property Type",
        "built-form": "Archetype",
        "total-floor-area": "Property Floor Area",
        "construction-age-band": "Property Age Band",
        "floor-height": "Property Floor Height",
        "number-habitable-rooms": "Number of Habitable Rooms",
        "walls-description": "Wall Construction",
        "roof-description": "Roof Construction",
        "mainheat-description": "Heating Type",
        "secondheat-description": "Secondary Heating",
        "transaction-type": "Reason for last EPC",
        "energy-consumption-current": "Heat Demand (kWh/m2)",
    })

    # Only estimate the number of floors where a property type is known
    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
            x["Property Type"]) else None, axis=1
    )

    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
    # Replace "" value with None so the column can be cast to float
    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)

    # Perimeter is estimated from the per-floor area and per-floor room count
    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ), axis=1
    )

    # Falls back to a floor height of 2.5 when the EPC value is empty/falsy
    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["Archetype"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
            x["Roof Construction"]) else None,
        axis=1
    )

    # Prefix every recommendation column (everything except row_id) with "Recommendation: "
    transformed_df = transformed_df.rename(
        columns={col: f"Recommendation: {col}" for col in transformed_df.columns if col != "row_id"}
    )

    asset_list = asset_list.merge(
        transformed_df,
        how="left",
        on="row_id"
    )
    asset_list = asset_list.drop(columns=["row_id"])

    # Store as an excel file next to the input, reusing the input name without its extension
    filename = os.path.join(DATA_FOLDER, os.path.splitext(DATA_FILENAME)[0]) + " EPC Data Pull.xlsx"
    asset_list.to_excel(filename, index=False)