Model/etl/customers/mod/pilot/2. Create Excel Model.py

from pprint import pprint
import pandas as pd
import numpy as np
from backend.app.utils import sap_to_epc
from sqlalchemy.orm import sessionmaker
from backend.app.db.connection import db_engine
from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations
from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel


def get_data(portfolio_id, scenario_ids):
    session = sessionmaker(bind=db_engine)()
    session.begin()

    # Get properties and their details for a specific portfolio
    properties_query = session.query(
        PropertyModel,
        PropertyDetailsEpcModel
    ).join(
        PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id
    ).filter(
        PropertyModel.portfolio_id == portfolio_id  # Filter by portfolio ID
    ).all()

    # Transform properties data to include all fields dynamically
    properties_data = [
        {**{col.name: getattr(prop.PropertyModel, col.name) for col in PropertyModel.__table__.columns},
         **{col.name: getattr(prop.PropertyDetailsEpcModel, col.name) for col in
            PropertyDetailsEpcModel.__table__.columns}}
        for prop in properties_query
    ]

    # Get property IDs from fetched properties

    # Get plans linked to the fetched properties
    plans_query = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all()

    # Transform plans data to include all fields dynamically
    plans_data = [
        {col.name: getattr(plan, col.name) for col in Plan.__table__.columns}
        for plan in plans_query
    ]

    # Extract plan IDs for filtering recommendations through PlanRecommendations
    plan_ids = [plan['id'] for plan in plans_data]

    # Get recommendations through PlanRecommendations for those plans and that are default
    recommendations_query = session.query(
        Recommendation,
        Plan.scenario_id
    ).join(
        PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id
    ).join(
        Plan, Plan.id == PlanRecommendations.plan_id  # Join with Plan to access scenario_id
    ).filter(
        PlanRecommendations.plan_id.in_(plan_ids),
        Recommendation.default == True  # Filtering for default recommendations
    ).all()

    # Transform recommendations data to include all fields dynamically and include scenario_id
    recommendations_data = [
        {**{col.name: getattr(rec.Recommendation, col.name) if hasattr(rec, 'Recommendation')
        else getattr(rec, col.name) for
            col in Recommendation.__table__.columns},
         "Scenario ID": rec.scenario_id}
        for rec in recommendations_query
    ]

    session.close()

    return properties_data, plans_data, recommendations_data


def app():
    """
    Given a portfolio and a scenario, this function prepares an excel model to present the data
    """

    # Set the inputs:
    portfolio_id = 139
    scenario_ids = [237, 238]

    properties_data, plans_data, recommendations_data = get_data(
        portfolio_id=portfolio_id, scenario_ids=scenario_ids
    )

    properties_df = pd.DataFrame(properties_data)
    plans_df = pd.DataFrame(plans_data)
    recommendations_df = pd.DataFrame(recommendations_data)

    # Merge on the orignal data
    mod_property_data = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD property data.csv"
    )

    property_asset_data = properties_df.merge(
        mod_property_data.drop(columns=["address", "postcode", "tenure"]), how="left", on="uprn"
    )

    property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains("pitched", case=False)
    property_asset_data["pre_1970"] = property_asset_data["BUILD_YEAR"] < 1970
    property_asset_data["wall_type"] = property_asset_data["walls"].str.split(" ").str[0].str.strip()
    property_asset_data["is_insulated"] = (
        property_asset_data["walls"].str.split(",").str[1].str.strip().isin(
            ["filled cavity", "with external insulation", "filled cavity and external insulation"]
        ) | property_asset_data["walls"].str.split(",").str[2].str.strip().isin(["insulated"])
    )
    property_asset_data["is_insulated"] = np.where(
        property_asset_data["is_insulated"], "Insulated", "Uninsulated"
    )
    property_asset_data["is_pitched"] = np.where(
        property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof"
    )
    property_asset_data["pre_1970"] = np.where(
        property_asset_data["pre_1970"], "Pre 1970", "Post 1970"
    )

    archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_1970"]

    assigned_archetypes = (
        property_asset_data.groupby(
            archetype_variables
        ).size().reset_index().rename(columns={0: "n_properties"}).sort_values("n_properties", ascending=False)
    )

    # Make the archetype ID a concatenation of the variables
    assigned_archetypes["archetype_id"] = assigned_archetypes[archetype_variables].apply(
        lambda x: "_".join(x.astype(str)), axis=1
    )

    # Most prominent archetypes
    prominent_archetypes = assigned_archetypes.head(6)
    other_archetypes = assigned_archetypes.tail(-6)
    # 2 or fewer properties in the other archetypes

    property_asset_data = property_asset_data.merge(
        assigned_archetypes[archetype_variables + ["archetype_id"]],
        how="left",
        on=archetype_variables
    )

    # Create age bands:
    # 1960-1969
    # 1970-1979
    # 1980-1989
    # 1990-1999
    # 2000+
    property_asset_data["age_band"] = pd.cut(
        property_asset_data["BUILD_YEAR"],
        bins=[1959, 1969, 1979, 1989, 1999, 2022],
        labels=["1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000+"]
    )

    # Create floor area bands
    # 0-73
    # 74-97
    # 98-199
    # 200+
    property_asset_data["floor_area_band"] = pd.cut(
        property_asset_data["total_floor_area"],
        bins=[0, 73, 97, 199, 10000],
        labels=["0-73", "74-97", "98-199", "200+"]
    )

    property_asset_data["archetype_group"] = property_asset_data["archetype_id"].copy()
    property_asset_data["archetype_group"] = np.where(
        property_asset_data["archetype_id"].isin(other_archetypes["archetype_id"].values),
        "other",
        property_asset_data["archetype_group"]
    )

    # For colour
    wall_types = (
        property_asset_data[["wall_type"]].value_counts().to_frame().reset_index().rename(
            columns={"wall_type": "Wall Type"}
        )
    )
    # Group into age bands
    ages = (
        property_asset_data[["age_band"]].value_counts()
        .to_frame()
        .reset_index().sort_values("age_band", ascending=True)
        .rename(columns={"age_band": "Age Band"})
    )
    floor_area_bands = (
        property_asset_data[["floor_area_band"]].value_counts()
        .to_frame()
        .reset_index().sort_values("floor_area_band", ascending=True)
        .rename(columns={"floor_area_band": "Floor Area Band"})
    )
    archetype_counts = (
        property_asset_data[["archetype_group"]].
        value_counts().
        to_frame().
        reset_index()
        .rename(columns={"archetype_group": "Archetype"})
    )
    property_types = (
        (property_asset_data["property_type"] + ": " + property_asset_data["built_form"]).
        value_counts().
        to_frame().
        reset_index()
        .rename(columns={"index": "Property Type", 0: "Count"})
    )

    # epc breakdown
    epc_breakdown = (
        property_asset_data["current_epc_rating"]
        .apply(lambda x: x.value)
        .value_counts()
        .to_frame()
        .reset_index()
    )

    # Figures for the deck
    # Carbon per property
    totals = property_asset_data[
        [
            "Total_household_members",
            "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater",
            "heating_cost_current", "hot_water_cost_current", "lighting_cost_current",
            "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge"
        ]
    ].copy()
    totals["total_cost"] = (
        totals["heating_cost_current"] +
        totals["hot_water_cost_current"] +
        totals["lighting_cost_current"] +
        totals["appliances_cost_current"] +
        totals["gas_standing_charge"] +
        totals["electricity_standing_charge"]
    )
    print(
        totals[
            [
                "Total_household_members",
                "co2_emissions",
                "current_energy_demand",
                "total_cost",
            ]
        ].mean()
    )

    # Store these to an excel
    # with pd.ExcelWriter(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD archetype breakdowns.xlsx"
    # ) as writer:
    #     wall_types.to_excel(writer, sheet_name="Wall Types", index=False)
    #     ages.to_excel(writer, sheet_name="Ages", index=False)
    #     floor_area_bands.to_excel(writer, sheet_name="Floor Area Bands", index=False)
    #     archetype_counts.to_excel(writer, sheet_name="Archetype Counts", index=False)
    #     epc_breakdown.to_excel(writer, sheet_name="EPC Rating", index=False)

    contingency = 0.26

    # We prepare the outputs, by scenario
    scenario_data = {}
    for scenario in scenario_ids:

        scenario_recommendations_df = recommendations_df[
            recommendations_df["Scenario ID"] == scenario
            ].copy()

        scenario_recommendations_df["contingency"] = contingency * scenario_recommendations_df["estimated_cost"]
        scenario_recommendations_df["total_cost"] = (
            scenario_recommendations_df["estimated_cost"] + scenario_recommendations_df["contingency"]
        )

        recommended_measures_df = scenario_recommendations_df[
            ["property_id", "measure_type", "estimated_cost", "default"]
        ]

        recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]]
        recommended_measures_df = recommended_measures_df.drop(columns=["default"])

        # Metrics by property ID
        aggregated_metrics = scenario_recommendations_df[
            [
                "property_id", "type", "default", "sap_points",
                "energy_cost_savings", "kwh_savings", "co2_equivalent_savings", "estimated_cost", "contingency",
                "total_cost"
            ]
        ]
        aggregated_metrics = aggregated_metrics[aggregated_metrics["default"]]
        aggregated_metrics = aggregated_metrics.groupby("property_id")[
            ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost",
             "total_cost", "contingency"]
        ].sum().reset_index()

        recommendations_measures_pivot = recommended_measures_df.pivot(
            index='property_id',
            columns='measure_type',
            values='estimated_cost'
        )
        recommendations_measures_pivot = recommendations_measures_pivot.reset_index()
        recommendations_measures_pivot = recommendations_measures_pivot.fillna(0)

        # We flag with boolean if the measure is recommended
        for c in recommendations_measures_pivot.columns:
            if c == "property_id":
                continue
            recommendations_measures_pivot["Recommendation: " + c] = recommendations_measures_pivot[c] > 0

        # We now create a final output
        df = properties_df[
            [
                "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows",
                "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms",
                "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater",
                "heating_cost_current", "hot_water_cost_current", "lighting_cost_current",
                "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge"
            ]
        ].merge(
            recommendations_measures_pivot, how="left", on="property_id"
        ).merge(
            aggregated_metrics, how="left", on="property_id"
        )

        df["bills_total_cost"] = (
            df["heating_cost_current"] + df["hot_water_cost_current"] + df["lighting_cost_current"] +
            df["appliances_cost_current"] + df["gas_standing_charge"] + df["electricity_standing_charge"]
        )

        df = df.drop(columns=["property_id"])
        for c in ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings"]:
            df[c] = df[c].fillna(0)

        df = df.rename(
            columns={
                "uprn": "UPRN",
                "address": "Address",
                "postcode": "Postcode",
                "walls": "Walls",
                "roof": "Roof",
                "heating": "Heating",
                "windows": "Windows",
                "current_epc_rating": "Current EPC Rating",
                "current_sap_points": "Current SAP Points",
                "total_floor_area": "Total Floor Area",
                "number_of_rooms": "Number of Habitable Rooms",
                "floor_height": "Floor Height",
            }
        )

        # Calculate post SAP
        df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
        df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
        df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x))

        # Calculate the relative savings on carbon, kwh, and bills
        df["relative_carbon_savings"] = df["co2_equivalent_savings"] / df["co2_emissions"]
        df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"]
        df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"]

        # Add on the archetype
        df = df.merge(
            property_asset_data[["uprn", "archetype_group"]], how="left", left_on="UPRN", right_on="uprn"
        )

        # For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it
        # the bills go up recommending HHRSH, so it doesn't make it to EPC B
        # For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump?
        # DO it manually???

        # Doesn't make it
        # misses = df[df["Predicted Post Works EPC"] == "C"]
        # # 5 of them are flats and so are difficult to get to EPC B without renewables. Possibly not worth it from an
        # # ROI perspective
        #
        # misses[["UPRN", "Address", "Postcode", "property_type"]]

        #              UPRN                               Address  Postcode property_type
        # 2    100120988937              13 Sidbury Circular Road   SP9 7HX          Flat  No further action
        # 3    100120988998              74 Sidbury Circular Road   SP9 7JA          Flat  No further action
        # 4    100120989416                       47 Zouch Avenue   SP9 7LR          Flat  No further action
        # 6    100060585002  42, Muscott Close, Shipton Bellinger   SP9 7TX         House  Can probably take a heat pump
        # 37    10000801072        34 Luffenham Place, Chicksands  SG17 5XH         House  Already surveyed as having
        # an ASHP - should be looked at
        # 121  100120988259                      8, Karachi Close   SP9 7LW          Flat
        # 122  100121101217                     599, Pepper Place  BA12 0DW          Flat
        # 140  100021455241         33 Blenheim Crescent, Ruislip   HA4 7HA         House - Solar isnt recommended
        # due to bug
        # 149  100120915656            10 Bower Green, Shrivenham   SN6 8TU         House - Solar isn't recommended
        # due to bug

        scenario_data[scenario] = df

    printing_scenario_id = scenario_ids[0]
    # EPC breakdown
    print(scenario_data[printing_scenario_id]['Predicted Post Works EPC'].value_counts())
    # Cost
    # Total cost
    print(scenario_data[printing_scenario_id]["total_cost"].sum())
    # Base cost
    print(scenario_data[printing_scenario_id]["estimated_cost"].sum())
    # Contingency
    print(scenario_data[printing_scenario_id]["contingency"].sum())
    # Costs averaged per unit
    print(scenario_data[printing_scenario_id]["total_cost"].mean())
    print(scenario_data[printing_scenario_id]["estimated_cost"].mean())
    print(scenario_data[printing_scenario_id]["contingency"].mean())

    # Average relative savings
    print(scenario_data[printing_scenario_id]["relative_carbon_savings"].mean())
    print(scenario_data[printing_scenario_id]["relative_kwh_savings"].mean())
    print(scenario_data[printing_scenario_id]["relative_bill_savings"].mean())

    measure_details = {}
    for scenario in scenario_ids:
        measure_details[scenario] = {}
        recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c]
        measure_details[scenario]["count"] = scenario_data[scenario][recommendation_cols].sum().to_dict()
        # Get average cost per measure
        measure_columns = [
            c.split("Recommendation: ")[1] for c in scenario_data[scenario].columns if "Recommendation:" in c
        ]
        # Take the mean, drop zero columns
        measure_costs = {}
        for m in measure_columns:
            measure_costs[m] = float(scenario_data[scenario][scenario_data[scenario][m] > 0][m].mean())
        measure_details[scenario]["cost_per_measure"] = measure_costs

    pprint(measure_details[scenario_ids[0]]["count"])
    pprint(measure_details[scenario_ids[1]]["count"])

    # Cost per measures
    pprint(measure_details[scenario_ids[0]]["cost_per_measure"])
    pprint(measure_details[scenario_ids[1]]["cost_per_measure"])

    # Do not get to EPC B:
    # 5 are flats
    # 1) 34 Luffenham Place, Chicksands SG17 5XH, has been surveyed as having a low performing heat pump -
    # should be looked at but several surrounding properties have been surveyed in a similar fashion
    # 2) 42, Muscott Close, Shipton Bellinger SP9 7TX, has an oil boiler and the bills go up recommending HHRSH.
    #    we could non-intrusively recommend a heat pump.
    # 3) 33 Blenheim Crescent, Ruislip, HA4 7HA, 100021455241 Solar potential modelling returned nothing -
    # manual review indicates that there are multiple trees surrouding the south facing side of the property
    # 4) 10 Bower Green, Shrivenham, SN6 8TU - Solar isn't recommended without further survey due to the local
    # area being surrounded by trees

    # Scenario adjustments:
    # Exclude: boiler_upgrade
    # Make ASHP COP 3.5

    # Metrics we need by scenario:
    # Cost
    # contingency
    # Carbon
    # kwh
    # bill savings
    scenario_metrics = {}
    for scenario in scenario_ids:
        df = scenario_data[scenario].copy()

        avg_savings = df[
            ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost",
             "total_cost", "contingency"]
        ].mean().to_dict()
        avg_savings["cost_per_sap_point"] = avg_savings["total_cost"] / avg_savings["sap_points"]
        avg_savings["cost_per_carbon"] = avg_savings["total_cost"] / avg_savings["co2_equivalent_savings"]
        scenario_metrics[scenario] = avg_savings

    pprint(scenario_metrics[scenario_ids[0]])
    pprint(scenario_metrics[scenario_ids[1]])

    scenario_data[scenario_ids[0]]["loft_insulation"][
        scenario_data[scenario_ids[0]]["loft_insulation"] > 0
        ].mean()

    scenario_data[scenario_ids[0]]["cavity_wall_insulation"][
        scenario_data[scenario_ids[0]]["cavity_wall_insulation"] > 0
        ].mean()

    # Testing checking floor risk

    import requests

    def get_flood_risk(lat, lon, radius_km=1):
        url = "https://environment.data.gov.uk/flood-monitoring/id/floods"
        params = {
            'lat': lat,
            'long': lon,
            'dist': radius_km  # search radius in km
        }

        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        flood_warnings = data.get("items", [])

        if not flood_warnings:
            print("No active flood warnings near this location.")
        else:
            print(f"{len(flood_warnings)} warning(s) found near the location:")
            for warning in flood_warnings:
                print(f"- Area: {warning.get('description')}")
                print(f"  Severity: {warning.get('severity')} (Level {warning.get('severityLevel')})")
                print(f"  Message changed at: {warning.get('timeMessageChanged')}")
                print()

        return flood_warnings

    from shapely.geometry import shape, Point
    def get_flood_areas_near_point(lat, lon, radius_km=2):
        url = "https://environment.data.gov.uk/flood-monitoring/id/floodAreas"
        params = {
            'lat': lat,
            'long': lon,
            'dist': radius_km
        }

        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json().get("items", [])

    def point_in_flood_area(lat, lon):
        flood_areas = get_flood_areas_near_point(lat, lon, radius_km=1)
        point = Point(lon, lat)  # GeoJSON uses (lon, lat) format

        for area in flood_areas:
            polygon_url = area.get("polygon")
            if not polygon_url:
                continue

            polygon_response = requests.get(polygon_url)
            polygon_response.raise_for_status()
            polygon_geojson = polygon_response.json()

            features = polygon_geojson.get("features", [])
            if not features:
                continue

            flood_polygon = shape(features[0]['geometry'])

            try:
                is_inside = flood_polygon.contains(point)
            except:
                is_inside = False

            if is_inside:
                print(f"📍 Point is inside flood area: {area['label']} ({area['notation']})")
                return area

    from tqdm import tqdm
    floor_warnings_data = []
    for _, property in tqdm(property_asset_data.iterrows(), total=len(property_asset_data)):
        # warnings = floor_warnings_data.extend(
        #     get_flood_risk(lat=property["LATITUDE"], lon=property["LONGITUDE"], radius_km=1)
        # )

        resp = point_in_flood_area(lat=property["LATITUDE"], lon=property["LONGITUDE"])
        if resp:
            floor_warnings_data.append(
                {
                    "uprn": property["uprn"],
                    "address": property["address"],
                    "postcode": property["postcode"],
                    "area": resp
                }
            )
            continue

    import plotly.graph_objects as go

    labels = [
        "House_Cavity_Insulated_Pitched roof_Pre 1970",
        "House_Cavity_Insulated_Pitched roof_Post 1970",
        "House_Cavity_Uninsulated_Pitched roof_Pre 1970",
        "House_Cavity_Uninsulated_Pitched roof_Post 1970",
        "other",
        "House_System_Uninsulated_Pitched roof_Pre 1970",
        "House_Solid_Uninsulated_Not Pitched Roof_Pre 1970"
    ]

    values = [62, 36, 21, 16, 16, 4, 2]

    hovertext = [
        "Loft insulation, draft proofing",
        "Top-up loft insulation",
        "Cavity wall insulation, loft insulation",
        "Cavity wall insulation, ventilation",
        "Bespoke retrofit measures",
        "External wall insulation, roof insulation",
        "Flat roof insulation, internal wall insulation"
    ]

    fig = go.Figure(go.Treemap(
        labels=labels,
        parents=[""] * len(labels),  # No root
        values=values,
        hovertext=hovertext,
        hoverinfo="text",
        textinfo="none",
        marker=dict(
            line=dict(color="white", width=4),
            colors=values,
            colorscale="Blues"
        )
    ))

    fig.update_layout(
        margin=dict(t=10, l=10, r=10, b=10),
        plot_bgcolor="white",
        paper_bgcolor="white"
    )

    fig.show()

    # Get the recommended measures by scenario id
    recommendation_cols = [c for c in scenario_data[scenario_ids[1]].columns if "Recommendation:" in c]
    measure_counts_by_scenario = scenario_data[scenario_ids[1]].groupby("archetype_group")[
        recommendation_cols
    ].sum().reset_index()

    measure_counts_by_scenario.to_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/measure_counts_by_scenario.csv"
    )

    # Estimate average valuation improvment by scenarios
    valuation_data = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/property_valuation.csv"
    )

    from backend.ml_models.Valuation import PropertyValuation

    uplift = []
    for _, x in valuation_data.iterrows():
        uprn = x["uprn"]

        to_append = {"uprn": uprn}
        for _id in scenario_ids:
            scenario = scenario_data[_id][
                scenario_data[_id]["uprn"] == uprn
                ].squeeze()

            val = PropertyValuation.estimate_valuation_improvement(
                current_value=x["valuation"],
                current_epc=scenario["Current EPC Rating"].value,
                target_epc=scenario["Predicted Post Works EPC"],
                total_cost=None
            )

            to_append[_id] = val["average_increase"]

        uplift.append(to_append)

    uplift = pd.DataFrame(uplift)
    print(uplift[scenario_ids[0]].mean())
    # £8,161
    print(uplift[scenario_ids[1]].mean())
    # £16,938