# Model/etl/customers/newhaven/newhaven_study.py

import inspect
import pandas as pd
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
from utils.s3 import save_csv_to_s3

# Location of this source file, taken from the code object of a throwaway lambda
src_file_path = inspect.getfile(lambda: None)

# EPC certificate extracts are read from a "local_data" folder next to this file
EPC_DIRECTORY = Path(src_file_path).parent.joinpath("local_data", "all-domestic-certificates")
# NOTE(review): absolute path on a single developer machine - anyone else running
# this study has to point it at their own copy of the customer data
CUSTOMER_DATA_DIRECTORY = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Newhaven/Data"

# Used as the "<USER_ID>/<PORTFOLIO_ID>/..." prefix of every S3 key written by the study
USER_ID = 8
PORTFOLIO_ID = 90


def _read_customer_dataset(dataset_name):
    """
    Read one of the customer's CSV extracts into a DataFrame.

    Each extract is stored as "<CUSTOMER_DATA_DIRECTORY>/<dataset_name>/<dataset_name>.csv".
    """
    return pd.read_csv(
        f"{CUSTOMER_DATA_DIRECTORY}/{dataset_name}/{dataset_name}.csv",
        low_memory=False,
    )


def _with_string_uprn(frame, column="UPRN"):
    """
    Return a new frame whose UPRN column is an integer-formatted string.

    Going through int first strips any ".0" a float-typed column would otherwise carry,
    so that UPRNs from the different sources compare equal. A new frame is returned
    (rather than assigning in place) so filtered slices can be passed in safely.
    """
    return frame.assign(**{column: frame[column].astype(int).astype(str)})


def _load_latest_epcs():
    """
    Load the Lewes EPC certificates and keep the newest certificate per UPRN.

    :return: DataFrame with hyphenated, lower-case column names (the format the API returns),
        one row per UPRN
    """
    # Read in EPC data for Lewes
    epc_data = pd.read_csv(
        EPC_DIRECTORY / "domestic-E07000063-Lewes/certificates.csv", low_memory=False
    )
    # Rename the columns to the same format as the api returns
    epc_data.columns = [c.replace("_", "-").lower() for c in epc_data.columns]

    # Keep only certificates lodged on or after the earliest date we accept
    # NOTE(review): assumes EARLIEST_EPC_DATE is comparable with the values of this column - confirm
    epc_data = epc_data[epc_data["lodgement-date"] >= EARLIEST_EPC_DATE]

    # Certificates without a UPRN can't be joined to the other data sources
    epc_data = epc_data[~pd.isnull(epc_data["uprn"])]
    epc_data = _with_string_uprn(epc_data, column="uprn")

    # Take the newest EPC per uprn. We keep the whole last row rather than using
    # groupby(...).last(), because that takes the last non-null value of each column
    # separately and can therefore blend values from several certificates.
    return (
        epc_data.sort_values("lodgement-date")
        .drop_duplicates(subset="uprn", keep="last")
        .reset_index(drop=True)
    )


def _estimate_n_floors(
    building_height,
    floor_height,
    address_base_property_description,
    epc_property_type,
    avg_floor_height,
):
    """
    Estimate the number of floors of a property.

    :param building_height: the "Relative Height - Eaves" value from address base
    :param floor_height: the EPC "floor-height" value, may be null
    :param address_base_property_description: the address base "Class Description"
    :param epc_property_type: the EPC "property-type"
    :param avg_floor_height: floor height to fall back on when the EPC has none
    :return: 1 for a flat, 2 for a maisonette, otherwise the building height divided by the
        floor height, rounded. None when address base says the unit is a self contained flat
        but the EPC property type is neither "Flat" nor "Maisonette".
    """
    if address_base_property_description == "Self Contained Flat (Includes Maisonette / Apartment)":
        # For flats the building height describes the whole block, so use the EPC type instead
        if epc_property_type in ["Flat"]:
            return 1
        if epc_property_type == "Maisonette":
            return 2
        return None

    if pd.isnull(floor_height):
        return np.round(building_height / avg_floor_height)

    return np.round(building_height / floor_height)


def _mean_cost_of_similar(
    potential, suitability_column, size_column, size, renewables_cost, cost_column
):
    """
    Average a cost over the suitable properties that have the same recommended system size.

    Used when a property has no row of its own in the costs data.

    :param potential: potential DataFrame (one of the ASHP / PV data sources)
    :param suitability_column: column of ``potential`` flagging suitable properties
    :param size_column: column of ``potential`` holding the recommended system size
    :param size: recommended system size of the property we are costing
    :param renewables_cost: the low carbon technology costs DataFrame
    :param cost_column: column of ``renewables_cost`` to average
    :return: the mean cost, NaN when no similar property has a cost
    """
    similar_properties = potential[
        potential[suitability_column] & (potential[size_column] == size)
    ].merge(renewables_cost, how="inner", on="UPRN")
    # Going through a one-column frame gives NaN (rather than raising) when nothing matches
    return similar_properties[[cost_column]].mean().values[0]


def _build_non_invasive_recommendations(uprns, ashp_potential, pv_potential, renewables_cost):
    """
    Create the ASHP and solar PV recommendations for each property.

    Every property gets one "air_source_heat_pump" and one "solar_pv" entry. An entry is
    marked suitable when the property has a row in the matching potential data whose
    suitability column is set. Costs come from the property's own row in the costs data; when
    it has none, the mean cost of suitable properties with the same recommended size is used.

    :param uprns: iterable of UPRN strings to create recommendations for
    :param ashp_potential: air source heat pump potential DataFrame
    :param pv_potential: domestic rooftop PV potential DataFrame
    :param renewables_cost: low carbon technology costs DataFrame
    :return: list of {"uprn": ..., "recommendations": [...]} dictionaries, in input order
    """
    # NOTE(review): the suitability columns are used directly as boolean masks, so they are
    # assumed to hold booleans - confirm against the data sources
    non_invasive_recommendations = []
    for uprn in uprns:
        property_ashp_potential = ashp_potential[
            (ashp_potential["UPRN"] == uprn) & ashp_potential["Overall Suitability Rating"]
        ]
        property_pv_potential = pv_potential[
            (pv_potential["UPRN"] == uprn) & pv_potential["Overall Suitability"]
        ]
        property_costs = renewables_cost[renewables_cost["UPRN"] == uprn]

        property_non_invasive_recs = []

        if property_ashp_potential.empty:
            property_non_invasive_recs.append({"type": "air_source_heat_pump", "suitable": False})
        else:
            heat_pump_size = property_ashp_potential["Recommended Heat Pump Size [kW]"].values[0]
            if property_costs.empty:
                ashp_cost = _mean_cost_of_similar(
                    potential=ashp_potential,
                    suitability_column="Overall Suitability Rating",
                    size_column="Recommended Heat Pump Size [kW]",
                    size=heat_pump_size,
                    renewables_cost=renewables_cost,
                    cost_column="Air Source Heat Pump - Total",
                )
            else:
                ashp_cost = property_costs["Air Source Heat Pump - Total"].values[0]

            property_non_invasive_recs.append(
                {
                    "type": "air_source_heat_pump",
                    "suitable": True,
                    "size": heat_pump_size,
                    "cost": ashp_cost,
                    "ashp_only_heating_recommendation": True,
                }
            )

        if property_pv_potential.empty:
            property_non_invasive_recs.append({"type": "solar_pv", "suitable": False})
        else:
            array_size = property_pv_potential["Recommended Array Size [kW]"].values[0]
            if property_costs.empty:
                # Previously this case raised, because the PV cost was always read from the
                # property's own cost row. We now estimate it the same way as the ASHP cost.
                pv_cost = _mean_cost_of_similar(
                    potential=pv_potential,
                    suitability_column="Overall Suitability",
                    size_column="Recommended Array Size [kW]",
                    size=array_size,
                    renewables_cost=renewables_cost,
                    cost_column="Rooftop PV - Total",
                )
            else:
                pv_cost = property_costs["Rooftop PV - Total"].values[0]

            property_non_invasive_recs.append(
                {
                    "type": "solar_pv",
                    "suitable": True,
                    # The recommended array size is multiplied by 1000 to give the wattage
                    "array_wattage": array_size * 1000,
                    "initial_ac_kwh_per_year": property_pv_potential["Annual Generation [kWh]"].values[0],
                    "panneled_roof_area": property_pv_potential["Roof area suitable for PV [m^2]"].values[0],
                    "cost": pv_cost,
                }
            )

        non_invasive_recommendations.append(
            {
                "uprn": uprn,
                "recommendations": property_non_invasive_recs,
            }
        )

    return non_invasive_recommendations


def _scenario_body(
    scenario_name,
    exclusions,
    trigger_file_path,
    patches_file_path,
    non_invasive_recommendations_file_path,
):
    """
    Build the request body for one scenario of the study.

    :param scenario_name: name of the scenario
    :param exclusions: list of measures to exclude, or None to leave the "exclusions" key out
    :param trigger_file_path: S3 key of the asset list
    :param patches_file_path: S3 key of the patches
    :param non_invasive_recommendations_file_path: S3 key of the non-invasive recommendations
    :return: the request body as a dictionary
    """
    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Private",
        "goal": "Increasing EPC",
        "goal_value": "A",
        "trigger_file_path": trigger_file_path,
        "already_installed_file_path": "",
        "patches_file_path": patches_file_path,
        "non_invasive_recommendations_file_path": non_invasive_recommendations_file_path,
        "scenario_name": scenario_name,
        "multi_plan": True,
    }
    if exclusions is not None:
        body["exclusions"] = exclusions
    body["budget"] = None
    return body


def make_asset_list() -> None:
    """
    Set up a small asset list for the study

    We do the following:
    1) Create the final asset list from address base, the newest EPC per UPRN and the
       insulation potential data, and store it in S3
    2) Create non-intrusive (ASHP and solar PV) recommendations and store them in S3
    3) Store the manual patches in S3
    4) Print the request body of each scenario

    Everything is read from EPC_DIRECTORY / CUSTOMER_DATA_DIRECTORY and written to the
    "retrofit-plan-inputs-dev" bucket under "<USER_ID>/<PORTFOLIO_ID>/". Nothing is returned.
    """
    epc_data = _load_latest_epcs()

    # We read in the multiple data sources
    address_base = _read_customer_dataset("OS AddressBase Premium")
    # Filter on resi. This is done before the UPRN conversion, as in the original study
    address_base = address_base[address_base["Primary Code Description"] == "Residential"]
    address_base = _with_string_uprn(address_base)

    pv_potential = _with_string_uprn(_read_customer_dataset("Domestic Rooftop PV Potential"))
    ashp_potential = _with_string_uprn(_read_customer_dataset("Air Source Heat Pump Potential"))
    insulation_potential = _with_string_uprn(_read_customer_dataset("Insulation Potential"))
    renewables_cost = _with_string_uprn(_read_customer_dataset("Low Carbon Technology Costs"))

    # Merge the EPC data onto address base, then the insulation potential
    epc_columns = [
        "uprn", "current-energy-efficiency", "current-energy-rating", "address1", "postcode",
        "floor-height", "property-type", "built-form", "co2-emissions-current",
    ]
    asset_list = (
        address_base[["UPRN", "Class Description", "Relative Height - Eaves"]]
        .merge(epc_data[epc_columns], how="left", left_on="UPRN", right_on="uprn")
        .drop(columns=["uprn"])
        .merge(
            insulation_potential[["UPRN", "EPC Rating", "Wall Area [m^2]", "Building Area [m^2]"]],
            how="left",
            on="UPRN",
        )
        .rename(
            columns={"Wall Area [m^2]": "insulation_wall_area", "Building Area [m^2]": "floor_area"}
        )
    )

    # Take properties below a B - there are 2844 units.
    # Properties without an EPC and caravans are dropped as well.
    # The copy lets us add columns below without writing to a slice of the merged frame.
    energy_efficiency = asset_list["current-energy-efficiency"].astype(float)
    asset_list = asset_list[
        energy_efficiency.notna()
        & (energy_efficiency <= 80)
        & (asset_list["Class Description"] != "Caravan")
    ].copy()

    # Take a sample, for properties that have an EPC, with a seed
    # asset_list = asset_list.sample(frac=0.5, random_state=42)

    avg_floor_height = asset_list["floor-height"].median()

    # Estimate the number of floors
    asset_list["number_of_floors"] = asset_list.apply(
        lambda x: _estimate_n_floors(
            building_height=x["Relative Height - Eaves"],
            floor_height=x["floor-height"],
            address_base_property_description=x["Class Description"],
            epc_property_type=x["property-type"],
            avg_floor_height=avg_floor_height,
        ),
        axis=1,
    )
    # Drop any entries with null floors because that means the ordnance survey data doesn't align with the epc data
    asset_list = asset_list[~pd.isnull(asset_list["number_of_floors"])]
    # Drop any entries with null insulation wall area
    asset_list = asset_list[~pd.isnull(asset_list["insulation_wall_area"])]

    # EPC rating shares noted during the original analysis
    # (the first block is unlabelled in the original notes - presumably the sampled list)
    # D    0.419929
    # C    0.391459
    # E    0.160142
    # F    0.017794
    # G    0.010676

    # Total asset list:
    # D    0.450409
    # C    0.412016
    # E    0.110203
    # F    0.020263
    # G    0.007110

    # Finding from the (removed) exploratory cost-per-m^2 checks on the third party costs:
    # their cavity wall insulation is £8 per m^2

    final_asset_list = asset_list.rename(
        columns={"UPRN": "uprn", "address1": "address", "floor_area": "insulation_floor_area"}
    )[["uprn", "address", "postcode", "insulation_wall_area", "insulation_floor_area", "number_of_floors"]]

    # Create non-invasive recommendations, which come from the solar potential and ASHP potential data sources
    non_invasive_recommendations = _build_non_invasive_recommendations(
        uprns=final_asset_list["uprn"],
        ashp_potential=ashp_potential,
        pv_potential=pv_potential,
        renewables_cost=renewables_cost,
    )

    # Store the asset list in s3
    filename = f"{USER_ID}/{PORTFOLIO_ID}/pilot.csv"
    save_csv_to_s3(
        dataframe=final_asset_list,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    # Store non-invasive recommendations in S3
    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(non_invasive_recommendations),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=non_invasive_recommendations_filename
    )

    # We add a patch to two of the units because there's no data for the built form
    # We would be able to handle this automatically in the future, when using OS API
    patches = [
        {"uprn": "10033266220", "built-form": "Semi-Detached"},
        {"uprn": "10033266219", "built-form": "Semi-Detached"},
    ]

    # Store patches in s3
    # NOTE(review): the key ends in .json but the patches are written with the csv helper -
    # confirm which format the consumer of this file expects
    patches_filename = f"{USER_ID}/{PORTFOLIO_ID}/patches.json"
    save_csv_to_s3(
        dataframe=pd.DataFrame(patches),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=patches_filename
    )

    # Create the scenarios, as (scenario name, exclusions) pairs
    scenarios = [
        (
            "Demand Reduction - no solid wall, windows, LEDs",
            [
                "internal_wall_insulation", "external_wall_insulation", "floor_insulation", "heating", "solar_pv",
                "lighting", "windows", "secondary_heating"
            ],
        ),
        (
            "Demand Reduction - no solid wall, floors or heating",
            ["internal_wall_insulation", "external_wall_insulation", "floor_insulation", "heating", "solar_pv"],
        ),
        # 2.5 - full fabric, no decant
        # NOTE(review): this scenario has the same name as the previous one although it no
        # longer excludes external wall insulation - looks like a copy/paste, confirm the name
        (
            "Demand Reduction - no solid wall, floors or heating",
            ["internal_wall_insulation", "floor_insulation", "heating", "solar_pv"],
        ),
        # Scenario B
        (
            "Demand Reduction, Heating Systems, Solar PV - no solid wall or floors",
            ["internal_wall_insulation", "external_wall_insulation", "floor_insulation"],
        ),
        # Scenario 4 - whole house. The body is sent without an "exclusions" key.
        # NOTE(review): the original note described this as "deep fabric, no IWI, floor",
        # but no exclusions were ever set - confirm which is intended
        ("Whole House", None),
    ]
    for scenario_name, exclusions in scenarios:
        print(
            _scenario_body(
                scenario_name=scenario_name,
                exclusions=exclusions,
                trigger_file_path=filename,
                patches_file_path=patches_filename,
                non_invasive_recommendations_file_path=non_invasive_recommendations_filename,
            )
        )