Model/etl/customers/urban_splash/asset_list.py

import os

import pandas as pd
from tqdm import tqdm

from dotenv import load_dotenv
from utils.s3 import read_excel_from_s3
from backend.SearchEpc import SearchEpc
from epc_api.client import EpcClient
from utils.s3 import save_csv_to_s3

# Load the backend's .env file into the process environment, then pick up the
# token used to authenticate against the EPC API (None if it is not set).
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Identifiers used to build the S3 keys and request bodies for the two plans.
USER_ID = 8
PORTFOLIO_ID = 66
SECOND_SCENARIO_PORTFOLIO_ID = 65

# UPRNs of the properties that do not meet the install requirements. They are
# excluded from the first portfolio's inputs and written to a second portfolio
# of their own.
second_portfolio_uprns = [
    10070056840,
    10070056846,
    10070056847,
    10070056843,
    10070056848,
    10070056844,
    10070056849,
    10070056829,
    10070056920,
    10023345463,
]


def _find_newest_epc(address1, postcodes):
    """
    Search the EPC database for ``address1`` under each candidate postcode in turn.

    :param address1: first address line to search for (e.g. "Apartment 1")
    :param postcodes: candidate postcodes, tried in order
    :return: the ``newest_epc`` of the first search that found one, or None if none did
    """
    for postcode in postcodes:
        searcher = SearchEpc(
            address1=address1, postcode=postcode, auth_token=EPC_AUTH_TOKEN, os_api_key=""
        )
        searcher.find_property(skip_os=True)
        if searcher.newest_epc is not None:
            return searcher.newest_epc
    return None


def _room_counts(beds):
    """
    Derive the (heated, habitable) room counts from the "Beds" column value.

    :param beds: either the string "Studio" or a number of bedrooms
    :return: tuple of (number_heated_rooms, number_habitable_rooms)
    """
    if beds == "Studio":
        return 2, 2
    # Assume one room for communal space, one room for bathroom
    rooms = beds + 2
    return rooms, rooms


def _plan_request_body(portfolio_id, trigger_file_path):
    """
    Build the request body for a portfolio whose inputs live at ``trigger_file_path``.

    :param portfolio_id: id of the portfolio the plan is for
    :param trigger_file_path: S3 key of the inputs file written for that portfolio
    :return: dict describing the plan request
    """
    return {
        "portfolio_id": str(portfolio_id),
        "housing_type": "Private",
        "goal": "Increase EPC",
        "goal_value": "C",
        "trigger_file_path": trigger_file_path,
        "budget": None,
    }


def app():
    """
    This application will read in the Urban Splash data, in the dev AWS account, and pre-process it. There are a
    few issues with the file, including incorrect postcodes.

    The customer is interested in the following:
    - Getting properties to an EPC C
    - Doing so within a budget of £5,000

    The processed asset list is split in two and written to S3: properties whose uprn is in
    ``second_portfolio_uprns`` go to the second scenario portfolio, everything else goes to the main portfolio.
    The request body for each portfolio is printed, followed by some basic analysis of the heating and hot water
    systems found in the EPC data.

    NOTE(review): the request bodies are built with ``budget=None``; confirm whether the £5,000 budget is meant to
    be applied elsewhere.

    :raises LookupError: if no EPC can be found for a property under any of the candidate postcodes
    :return:
    """

    potential_postcodes = ["BD9 5BQ", "BD9 5BR", "BD9 5BN"]

    raw_asset_list = read_excel_from_s3(
        bucket_name="retrofit-datalake-dev",
        file_key="customers/urban_splash/raw_asset_list/USRF - Velvet Mill EPC.xlsx",
        header_row=2
    )

    # We have a series of apartment numbers that are "Apartment 001", "Apartment 002", etc. We need to convert these
    # to "Apartment 1", "Apartment 2", etc. The two-zero prefix must be stripped before the one-zero prefix.
    raw_asset_list["address1"] = (
        raw_asset_list["Unit Number"]
        .str.replace("Apartment 00", "Apartment ", regex=True)
        .str.replace("Apartment 0", "Apartment ", regex=True)
    )

    # For each entry in the asset list, we make an api call to the EPC database to get the EPC data. We'll retrieve the
    # uprn for the property, as well as a nice address and postcode that we can use. We'll also try and deduce the
    # likely wall construction, since many of the homes are new builds, based on their newest EPC

    epc_data = []
    processed_asset_list = []
    for _, row in tqdm(raw_asset_list.iterrows(), total=len(raw_asset_list)):

        # The postcodes in the file are unreliable, so try each candidate postcode until one matches
        newest_epc = _find_newest_epc(row["address1"], potential_postcodes)
        if newest_epc is None:
            raise LookupError(
                f"No EPC found for address '{row['address1']}' in any of the postcodes {potential_postcodes}"
            )

        number_heated_rooms, number_habitable_rooms = _room_counts(row["Beds"])

        to_append = {
            **row.to_dict(),
            "uprn": newest_epc["uprn"],
            "address": newest_epc["address1"],
            "postcode": newest_epc["postcode"],
            # "walls-description": newest_epc["walls-description"],
            # "roof-description": newest_epc["roof-description"],
            # "floor-description": newest_epc["floor-description"],
            # "total-floor-area": newest_epc["total-floor-area"],
            "full-address": newest_epc["address"],
            "number-heated-rooms": number_heated_rooms,
            "number-habitable-rooms": number_habitable_rooms,
        }

        processed_asset_list.append(to_append)
        epc_data.append(newest_epc)

    processed_asset_list_df = pd.DataFrame(processed_asset_list)

    epc_data_df = pd.DataFrame(epc_data)

    # True for the rows that belong to the second scenario portfolio
    in_second_portfolio = processed_asset_list_df["uprn"].astype(int).isin(second_portfolio_uprns)

    # We store this data
    # Store the data in s3
    filename = f"{USER_ID}/{PORTFOLIO_ID}/test_inputs.csv"
    save_csv_to_s3(
        dataframe=processed_asset_list_df[~in_second_portfolio],
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    body = _plan_request_body(PORTFOLIO_ID, filename)
    print(body)

    subset = processed_asset_list_df[in_second_portfolio]

    filename2 = f"{USER_ID}/{SECOND_SCENARIO_PORTFOLIO_ID}/test_inputs.csv"
    save_csv_to_s3(
        dataframe=subset,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename2
    )

    # The second portfolio must be triggered from its own inputs file, not the first portfolio's
    body = _plan_request_body(SECOND_SCENARIO_PORTFOLIO_ID, filename2)
    print(body)

    # Some basic analysis on the heating, heating controls and hot water systems

    # All of the heating systems are rated very poor, poor or average. When it's average, they are all also
    # "Room heaters, electric", but the house has "Programmer and appliance thermostats" for the heating controls.
    # which is more efficient
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    # Heating
    print(epc_data_df[["mainheat-description", "mainheatcont-description", "mainheat-energy-eff"]].drop_duplicates())
    #                    mainheat-description              mainheatcont-description mainheat-energy-eff
    # 0                Room heaters, electric        Programmer and room thermostat           Very Poor
    # 12               Room heaters, electric  Programmer and appliance thermostats             Average
    # 20  Electric storage heaters, radiators                  Celect-type controls                Poor

    # Hot water
    print(epc_data_df[["hotwater-description", "hot-water-energy-eff"]].drop_duplicates())
    #                    hotwater-description hot-water-energy-eff
    # 0   Electric immersion, standard tariff            Very Poor
    # 12         Electric immersion, off-peak              Average

    # We now retrieve EPCS for all of the properties that are in these postcodes very obviously for the velvet mill
    # We'll use this information to get a sense of the likely wall/roof/floor construction for the properties

    # client = EpcClient(auth_token=EPC_AUTH_TOKEN)
    #
    # neighbouring_epcs = []
    # for pc in potential_postcodes:
    #     response = client.domestic.search(params={"postcode": pc}, size=1000)
    #     data = response["rows"]
    #
    #     # keep just rows that are clearly for the velvet mill
    #     data = [x for x in data if "velvet" in x["address1"].lower()]
    #
    #     neighbouring_epcs.extend(data)
    #
    # neighbouring_epcs_df = pd.DataFrame(neighbouring_epcs)
    # neighbouring_epcs_df["walls-description"].value_counts()
    # neighbouring_epcs_df["roof-description"].value_counts()
    # neighbouring_epcs_df["floor-description"].value_counts()