# Model/etl/customers/urban_splash/slides.py

"""
This script contains the code to generate the data required to populate the slides.
We connect to the database and extract the data for the portfolio needed, so it is recommended to use
an environment akin to the backend to run this script.
"""
import pandas as pd
import numpy as np
from backend.app.db.connection import db_engine
from sqlalchemy.orm import sessionmaker
from etl.customers.slide_utils import (
    plot_epc_distribution,
    get_property_details_by_portfolio_id,
    get_plan_by_portfolio_id,
    get_properties_with_default_recommendations,
    create_powerpoint,
    create_recommendations_summary
)

# Portfolio whose properties, property details and plans are loaded first and
# which feeds the "achieves target" figures (slide 2).
PORTFOLIO_ID = 66
# Portfolio loaded for the second scenario; its recommendation summary drives slide 3.
SECOND_SCENARIO_PORTFOLIO_ID = 65
# EPC band that `expected_epc_rating` is compared against to decide whether a unit hits the target.
EPC_TARGET = "C"
# SAP threshold: properties with `current_sap_points` below it are treated as needing work, and it is
# passed to `create_recommendations_summary`.
# NOTE(review): presumably the lowest SAP score that still falls in band EPC_TARGET - confirm.
SAP_TARGET = 69
# Customer identifier passed to the EPC plot helper and used to build the output .pptx path.
CUSTOMER_KEY = "urban_splash"


def _attach_uprn(frame, properties_df):
    """Inner-join ``frame`` with the ``uprn`` of each property, keyed on ``property_id``.

    ``properties_df`` supplies the lookup: its ``id`` column is renamed to ``property_id`` so that it
    lines up with the key used by ``frame``.
    """
    uprn_lookup = properties_df[["uprn", "id"]].rename(columns={"id": "property_id"})
    return frame.merge(uprn_lookup, on="property_id")


def _unnest_recommendations(properties_df):
    """Flatten the per-property ``recommendations`` lists into one DataFrame, skipping null entries."""
    exploded = properties_df["recommendations"].explode().tolist()
    return pd.DataFrame([r for r in exploded if not pd.isnull(r)])


def _load_scenario(session, portfolio_id):
    """Load everything the slides need for one portfolio.

    Returns a dict with the keys ``properties``, ``property_details`` (already joined to ``uprn``),
    ``plans`` (as returned by the loader, not joined) and ``recommendations_summary``.
    """
    properties_df = pd.DataFrame(get_properties_with_default_recommendations(session, portfolio_id))
    property_details_df = _attach_uprn(
        pd.DataFrame(get_property_details_by_portfolio_id(session, portfolio_id)),
        properties_df,
    )
    plans_df = pd.DataFrame(get_plan_by_portfolio_id(session, portfolio_id))
    recommendations_df = _unnest_recommendations(properties_df)
    recommendations_summary = create_recommendations_summary(recommendations_df, properties_df, SAP_TARGET)
    return {
        "properties": properties_df,
        "property_details": property_details_df,
        "plans": plans_df,
        "recommendations_summary": recommendations_summary,
    }


def app():
    """Build the slide commentary for both portfolios and hand it to ``create_powerpoint``.

    Steps:
      1. Load properties, property details, plans and recommendation summaries for ``PORTFOLIO_ID``
         and ``SECOND_SCENARIO_PORTFOLIO_ID`` from the database.
      2. Derive the figures quoted on each of the three slides.
      3. Pass the slide dictionary and the output path to ``create_powerpoint``.

    Takes no arguments and returns ``None``. The database session is always closed once the data has
    been loaded into DataFrames, even if loading fails.
    """
    ########################################################################
    # Get the data we need
    ########################################################################
    session = sessionmaker(bind=db_engine)()
    try:
        first_scenario = _load_scenario(session, PORTFOLIO_ID)
        second_scenario = _load_scenario(session, SECOND_SCENARIO_PORTFOLIO_ID)
    finally:
        # Everything below works on in-memory DataFrames, so release the connection now.
        session.close()

    properties_df = first_scenario["properties"]
    property_details_df = first_scenario["property_details"]
    plans_df = first_scenario["plans"]
    recommendations_summary = first_scenario["recommendations_summary"]

    properties_second_scenario_df = second_scenario["properties"]
    property_details_second_scenario_df = second_scenario["property_details"]
    recommendations_summary_second_scenario = second_scenario["recommendations_summary"]
    # Join on uprn so the second-scenario plans can be filtered by uprn below
    plans_second_scenario_df = _attach_uprn(second_scenario["plans"], properties_second_scenario_df)

    # Combine the data for both scenarios
    full_property_details = pd.concat([property_details_df, property_details_second_scenario_df])
    full_properties = pd.concat([properties_df, properties_second_scenario_df])

    epc_rating_summary = full_properties.groupby("current_epc_rating").size().reset_index(name="count")
    epc_rating_summary["percentage"] = epc_rating_summary["count"] / epc_rating_summary["count"].sum() * 100

    ########################################################################
    # We pull out the data for the slides
    ########################################################################

    ############
    # Slide 1:
    ############
    # visual - only the saved figure path is needed for the deck
    _, figure_path = plot_epc_distribution(
        epc_rating_summary, CUSTOMER_KEY, title="", background_color="white", bar_height=0.75, font_size=15
    )

    # Take just properties that are below the SAP target
    properties_needing_work = full_properties[full_properties["current_sap_points"] < SAP_TARGET]
    property_details_needing_work = full_property_details[
        full_property_details["uprn"].isin(properties_needing_work["uprn"])
    ]

    # Floor area - bounds and average over every property in both portfolios
    floor_area = full_property_details["total_floor_area"]
    min_area, max_area, average_area = floor_area.min(), floor_area.max(), floor_area.mean()

    # Annual energy consumption - bounds and average for properties needing work
    energy_consumption = property_details_needing_work["adjusted_energy_consumption"]
    min_energy_consumption, max_energy_consumption, average_consumption = (
        energy_consumption.min(),
        energy_consumption.max(),
        energy_consumption.mean(),
    )

    # Co2 emissions - bounds and average for properties needing work
    co2_emissions = property_details_needing_work["co2_emissions"]
    min_co2, max_co2, average_co2 = co2_emissions.min(), co2_emissions.max(), co2_emissions.mean()

    # Valuation: bounds and median - take positive values in case we have just a sample
    valuation_df = properties_df[properties_df["current_valuation"] > 0]
    min_valuation, max_valuation, average_valuation = (
        valuation_df["current_valuation"].min(),
        valuation_df["current_valuation"].max(),
        valuation_df["current_valuation"].median(),
    )

    slide_1_commentary = (
        f"Floor areas range from {min_area} to {max_area} square meters, with an average of {average_area} square "
        f"meters. \n"
        f"Annual energy consumption ranges from {min_energy_consumption} to {max_energy_consumption} kWh, with an "
        f"average of {average_consumption} kWh. \n"
        f"CO2 emissions range from {min_co2} to {max_co2} tonnes, with an average of {average_co2} tonnes. \n"
        f"Valuations range from £{min_valuation} to £{max_valuation} £, with an average of £"
        f"{average_valuation}.\n"
    )

    ############
    # Slide 2:
    ############
    # What it would take to hit the EPC target: units whose expected rating equals it
    units_hitting_target = recommendations_summary[
        recommendations_summary["expected_epc_rating"] == EPC_TARGET
    ]
    n_units_to_target = units_hitting_target.shape[0]

    measures = "Electrical heating system upgrades & heating controls and Hot water system improvements"

    # Costs
    expected_cost_per_unit_lower = units_hitting_target["total_cost"].min()
    expected_cost_per_unit_upper = units_hitting_target["total_cost"].max()
    expected_project_cost = units_hitting_target["total_cost"].sum()

    # Valuation impact per property - take positive entries just in case we have a sample
    valuation_impact_df = plans_df[plans_df["property_id"].isin(units_hitting_target["property_id"])]
    valuation_impact_df = valuation_impact_df[valuation_impact_df["valuation_increase_lower_bound"] > 0]
    min_valuation_impact, max_valuation_impact, average_valuation_impact = (
        valuation_impact_df["valuation_increase_lower_bound"].median(),
        valuation_impact_df["valuation_increase_upper_bound"].median(),
        valuation_impact_df["valuation_increase_average"].median(),
    )

    # Bill savings per property
    min_bill_savings, max_bill_savings, average_bill_savings = (
        units_hitting_target["total_bill_savings"].min(),
        units_hitting_target["total_bill_savings"].max(),
        units_hitting_target["total_bill_savings"].mean(),
    )

    # CO2 reduction for the units hitting the target
    min_co2_reduction, max_co2_reduction, average_co2_reduction, total_co2_reduction = (
        units_hitting_target["total_carbon"].min(),
        units_hitting_target["total_carbon"].max(),
        units_hitting_target["total_carbon"].mean(),
        units_hitting_target["total_carbon"].sum(),
    )

    slide_2_commentary = (
        f"{n_units_to_target} units expected to achieve EPC {EPC_TARGET} \n"
        f"Expected cost: {expected_cost_per_unit_lower} - {expected_cost_per_unit_upper}, total project: £"
        f"{expected_project_cost}\n"
        f"Measures include: {measures}\n"
        f"Valuation increase per property: £{min_valuation_impact}-{max_valuation_impact}, average: £"
        f"{average_valuation_impact}\n"
        f"Bill savings per property: £{min_bill_savings}-{max_bill_savings}, average: £{average_bill_savings}\n"
        f"Total CO2 reduction: {min_co2_reduction}-{max_co2_reduction} tonnes, average: {average_co2_reduction}\n"
        f"tonnes, total for the {n_units_to_target} properties: {total_co2_reduction} tonnes\n"
    )

    ############
    # Slide 3:
    ############
    # Every unit in the second-scenario portfolio is treated as having missed the target
    units_missed_target = recommendations_summary_second_scenario.copy()
    n_units_missed_target = units_missed_target.shape[0]

    # How close were the properties that missed the target. These two values are interpolated into the
    # slide text below; their definitions had been commented out, which made the f-string raise NameError.
    # NOTE(review): `sap_difference` here comes from the second-scenario summary - confirm that this is
    # the gap the first sentence of the slide is meant to quote.
    min_difference = np.ceil(units_missed_target["sap_difference"].min())
    max_difference = np.ceil(units_missed_target["sap_difference"].max())

    second_scenario_measures = ("Electrical heating system upgrades & heating controls, Hot water system improvements "
                                "and internal wall insulation")

    # Just take all of the units in the second scenario, since they're borderline
    units_hitting_target_second_scenario = recommendations_summary_second_scenario[
        recommendations_summary_second_scenario["uprn"].isin(units_missed_target["uprn"].values)
    ]

    # Units whose expected rating actually reaches the target under the second scenario
    n_units_hitting_second_scenario = units_hitting_target_second_scenario[
        units_hitting_target_second_scenario["expected_epc_rating"] == EPC_TARGET
    ].shape[0]

    # Impact on second scenario
    # Costs
    expected_cost_per_unit_lower_second_scenario = recommendations_summary_second_scenario["total_cost"].min()
    expected_cost_per_unit_upper_second_scenario = recommendations_summary_second_scenario["total_cost"].max()
    expected_project_cost_second_scenario = recommendations_summary_second_scenario["total_cost"].sum()

    valuation_impact_df_second_scenario = plans_second_scenario_df[
        plans_second_scenario_df["uprn"].isin(units_hitting_target_second_scenario["uprn"])
    ]
    valuation_impact_df_second_scenario = valuation_impact_df_second_scenario[
        valuation_impact_df_second_scenario["valuation_increase_lower_bound"] > 0
    ]
    (
        min_valuation_impact_second_scenario,
        max_valuation_impact_second_scenario,
        average_valuation_impact_second_scenario,
    ) = (
        valuation_impact_df_second_scenario["valuation_increase_lower_bound"].median(),
        valuation_impact_df_second_scenario["valuation_increase_upper_bound"].median(),
        valuation_impact_df_second_scenario["valuation_increase_average"].median(),
    )

    # Bill savings per property
    min_bill_savings_second_scenario, max_bill_savings_second_scenario, average_bill_savings_second_scenario = (
        units_hitting_target_second_scenario["total_bill_savings"].min(),
        units_hitting_target_second_scenario["total_bill_savings"].max(),
        units_hitting_target_second_scenario["total_bill_savings"].mean(),
    )

    # CO2 reduction across the second-scenario units
    (
        min_co2_reduction_second_scenario,
        max_co2_reduction_second_scenario,
        average_co2_reduction_second_scenario,
        total_co2_reduction_second_scenario,
    ) = (
        units_hitting_target_second_scenario["total_carbon"].min(),
        units_hitting_target_second_scenario["total_carbon"].max(),
        units_hitting_target_second_scenario["total_carbon"].mean(),
        units_hitting_target_second_scenario["total_carbon"].sum(),
    )

    # Values for the leftovers: units that still miss the target in the second scenario
    units_missing_second_scenario = recommendations_summary_second_scenario[
        (recommendations_summary_second_scenario["expected_epc_rating"] != EPC_TARGET) &
        (recommendations_summary_second_scenario["uprn"].isin(units_missed_target["uprn"].values))
    ]
    average_difference_second_scenario = np.ceil(units_missing_second_scenario["sap_difference"].mean())

    slide_3_text = (
        f"{n_units_missed_target} units look like they would miss the EPC {EPC_TARGET} by {min_difference}-"
        f"{max_difference} points \n"
        "When on site, an assessor may be able to identify further improvements to bring the properties up to an EPC "
        f"{EPC_TARGET}.\n"
        f"We have looked at a more extensive package for these properties, including: {second_scenario_measures}\n"
        # Quote the units that actually reach the target, not the whole (unfiltered) second-scenario set
        f"Of the {n_units_missed_target} properties, a further {n_units_hitting_second_scenario} are "
        f"expected to achieve EPC {EPC_TARGET} with these measures.\n"
        f"Expected cost: {expected_cost_per_unit_lower_second_scenario} - "
        f"{expected_cost_per_unit_upper_second_scenario}, "
        f"total project: £"
        f"{expected_project_cost_second_scenario}\n"
        f"Valuation increase per property: £{min_valuation_impact_second_scenario}-"
        f"{max_valuation_impact_second_scenario}, average: £"
        f"{average_valuation_impact_second_scenario}\n"
        f"Bill savings per property: £{min_bill_savings_second_scenario}-{max_bill_savings_second_scenario}, "
        f"average: £{average_bill_savings_second_scenario}\n"
        f"Total CO2 reduction: {min_co2_reduction_second_scenario}-{max_co2_reduction_second_scenario} tonnes, "
        f"average: "
        f"{average_co2_reduction_second_scenario}\n"
        f"tonnes, total for the {n_units_hitting_second_scenario} properties: {total_co2_reduction_second_scenario} "
        f"tonnes\n"
        # The sentence says "on average", so quote the mean gap rather than the minimum
        f"Even in the second scenario, the remaining {units_missing_second_scenario.shape[0]} properties are expected "
        f"to miss EPC {EPC_TARGET} by {average_difference_second_scenario} point on average - they should be visited by "
        f"an assessor"
    )

    slide_data = {
        'slide_1': {
            "title": "EPC Rating Distribution",
            'image_path': figure_path,  # Pass the path to the saved image
            "text": slide_1_commentary
        },
        "slide_2": {
            "title": f"Properties that achieve EPC {EPC_TARGET}",
            "text": slide_2_commentary,
        },
        # NOTE(review): this key uses a space while the others use an underscore; left unchanged because
        # how create_powerpoint consumes the keys is not visible here - confirm whether it matters.
        "slide 3": {
            "title": f"Properties that miss EPC {EPC_TARGET}",
            "text": slide_3_text
        }
    }

    save_location = f"etl/customers/{CUSTOMER_KEY}/{CUSTOMER_KEY}_tech_slides.pptx"
    create_powerpoint(slide_data, save_location)