Model/etl/customers/mod/pilot/2. Create Excel Model.py

805 lines
27 KiB
Python

from pprint import pprint
import pandas as pd
import numpy as np
from backend.app.utils import sap_to_epc
from sqlalchemy.orm import sessionmaker
from backend.app.db.connection import db_engine
from backend.app.db.models.recommendations import (
Recommendation,
PlanModel,
PlanRecommendations,
)
from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel
def get_data(portfolio_id, scenario_ids):
session = sessionmaker(bind=db_engine)()
session.begin()
# Get properties and their details for a specific portfolio
properties_query = (
session.query(PropertyModel, PropertyDetailsEpcModel)
.join(
PropertyDetailsEpcModel,
PropertyModel.id == PropertyDetailsEpcModel.property_id,
)
.filter(PropertyModel.portfolio_id == portfolio_id) # Filter by portfolio ID
.all()
)
# Transform properties data to include all fields dynamically
properties_data = [
{
**{
col.name: getattr(prop.PropertyModel, col.name)
for col in PropertyModel.__table__.columns
},
**{
col.name: getattr(prop.PropertyDetailsEpcModel, col.name)
for col in PropertyDetailsEpcModel.__table__.columns
},
}
for prop in properties_query
]
# Get property IDs from fetched properties
# Get plans linked to the fetched properties
plans_query = (
session.query(PlanModel).filter(PlanModel.scenario_id.in_(scenario_ids)).all()
)
# Transform plans data to include all fields dynamically
plans_data = [
{col.name: getattr(plan, col.name) for col in PlanModel.__table__.columns}
for plan in plans_query
]
# Extract plan IDs for filtering recommendations through PlanRecommendations
plan_ids = [plan["id"] for plan in plans_data]
# Get recommendations through PlanRecommendations for those plans and that are default
recommendations_query = (
session.query(Recommendation, PlanModel.scenario_id)
.join(
PlanRecommendations,
Recommendation.id == PlanRecommendations.recommendation_id,
)
.join(
PlanModel,
PlanModel.id
== PlanRecommendations.plan_id, # Join with Plan to access scenario_id
)
.filter(
PlanRecommendations.plan_id.in_(plan_ids),
Recommendation.default == True, # Filtering for default recommendations
)
.all()
)
# Transform recommendations data to include all fields dynamically and include scenario_id
recommendations_data = [
{
**{
col.name: (
getattr(rec.Recommendation, col.name)
if hasattr(rec, "Recommendation")
else getattr(rec, col.name)
)
for col in Recommendation.__table__.columns
},
"Scenario ID": rec.scenario_id,
}
for rec in recommendations_query
]
session.close()
return properties_data, plans_data, recommendations_data
def app():
"""
Given a portfolio and a scenario, this function prepares an excel model to present the data
"""
# Set the inputs:
portfolio_id = 139
scenario_ids = [237, 238]
properties_data, plans_data, recommendations_data = get_data(
portfolio_id=portfolio_id, scenario_ids=scenario_ids
)
properties_df = pd.DataFrame(properties_data)
plans_df = pd.DataFrame(plans_data)
recommendations_df = pd.DataFrame(recommendations_data)
# Merge on the orignal data
mod_property_data = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD property data.csv"
)
property_asset_data = properties_df.merge(
mod_property_data.drop(columns=["address", "postcode", "tenure"]),
how="left",
on="uprn",
)
property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains(
"pitched", case=False
)
property_asset_data["pre_1970"] = property_asset_data["BUILD_YEAR"] < 1970
property_asset_data["wall_type"] = (
property_asset_data["walls"].str.split(" ").str[0].str.strip()
)
property_asset_data["is_insulated"] = property_asset_data["walls"].str.split(
","
).str[1].str.strip().isin(
[
"filled cavity",
"with external insulation",
"filled cavity and external insulation",
]
) | property_asset_data[
"walls"
].str.split(
","
).str[
2
].str.strip().isin(
["insulated"]
)
property_asset_data["is_insulated"] = np.where(
property_asset_data["is_insulated"], "Insulated", "Uninsulated"
)
property_asset_data["is_pitched"] = np.where(
property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof"
)
property_asset_data["pre_1970"] = np.where(
property_asset_data["pre_1970"], "Pre 1970", "Post 1970"
)
archetype_variables = [
"property_type",
"wall_type",
"is_insulated",
"is_pitched",
"pre_1970",
]
assigned_archetypes = (
property_asset_data.groupby(archetype_variables)
.size()
.reset_index()
.rename(columns={0: "n_properties"})
.sort_values("n_properties", ascending=False)
)
# Make the archetype ID a concatenation of the variables
assigned_archetypes["archetype_id"] = assigned_archetypes[
archetype_variables
].apply(lambda x: "_".join(x.astype(str)), axis=1)
# Most prominent archetypes
prominent_archetypes = assigned_archetypes.head(6)
other_archetypes = assigned_archetypes.tail(-6)
# 2 or fewer properties in the other archetypes
property_asset_data = property_asset_data.merge(
assigned_archetypes[archetype_variables + ["archetype_id"]],
how="left",
on=archetype_variables,
)
# Create age bands:
# 1960-1969
# 1970-1979
# 1980-1989
# 1990-1999
# 2000+
property_asset_data["age_band"] = pd.cut(
property_asset_data["BUILD_YEAR"],
bins=[1959, 1969, 1979, 1989, 1999, 2022],
labels=["1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000+"],
)
# Create floor area bands
# 0-73
# 74-97
# 98-199
# 200+
property_asset_data["floor_area_band"] = pd.cut(
property_asset_data["total_floor_area"],
bins=[0, 73, 97, 199, 10000],
labels=["0-73", "74-97", "98-199", "200+"],
)
property_asset_data["archetype_group"] = property_asset_data["archetype_id"].copy()
property_asset_data["archetype_group"] = np.where(
property_asset_data["archetype_id"].isin(
other_archetypes["archetype_id"].values
),
"other",
property_asset_data["archetype_group"],
)
# For colour
wall_types = (
property_asset_data[["wall_type"]]
.value_counts()
.to_frame()
.reset_index()
.rename(columns={"wall_type": "Wall Type"})
)
# Group into age bands
ages = (
property_asset_data[["age_band"]]
.value_counts()
.to_frame()
.reset_index()
.sort_values("age_band", ascending=True)
.rename(columns={"age_band": "Age Band"})
)
floor_area_bands = (
property_asset_data[["floor_area_band"]]
.value_counts()
.to_frame()
.reset_index()
.sort_values("floor_area_band", ascending=True)
.rename(columns={"floor_area_band": "Floor Area Band"})
)
archetype_counts = (
property_asset_data[["archetype_group"]]
.value_counts()
.to_frame()
.reset_index()
.rename(columns={"archetype_group": "Archetype"})
)
property_types = (
(
property_asset_data["property_type"]
+ ": "
+ property_asset_data["built_form"]
)
.value_counts()
.to_frame()
.reset_index()
.rename(columns={"index": "Property Type", 0: "Count"})
)
# epc breakdown
epc_breakdown = (
property_asset_data["current_epc_rating"]
.apply(lambda x: x.value)
.value_counts()
.to_frame()
.reset_index()
)
# Figures for the deck
# Carbon per property
totals = property_asset_data[
[
"Total_household_members",
"co2_emissions",
"current_energy_demand",
"current_energy_demand_heating_hotwater",
"heating_cost_current",
"hot_water_cost_current",
"lighting_cost_current",
"appliances_cost_current",
"gas_standing_charge",
"electricity_standing_charge",
]
].copy()
totals["total_cost"] = (
totals["heating_cost_current"]
+ totals["hot_water_cost_current"]
+ totals["lighting_cost_current"]
+ totals["appliances_cost_current"]
+ totals["gas_standing_charge"]
+ totals["electricity_standing_charge"]
)
print(
totals[
[
"Total_household_members",
"co2_emissions",
"current_energy_demand",
"total_cost",
]
].mean()
)
# Store these to an excel
# with pd.ExcelWriter(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD archetype breakdowns.xlsx"
# ) as writer:
# wall_types.to_excel(writer, sheet_name="Wall Types", index=False)
# ages.to_excel(writer, sheet_name="Ages", index=False)
# floor_area_bands.to_excel(writer, sheet_name="Floor Area Bands", index=False)
# archetype_counts.to_excel(writer, sheet_name="Archetype Counts", index=False)
# epc_breakdown.to_excel(writer, sheet_name="EPC Rating", index=False)
contingency = 0.26
# We prepare the outputs, by scenario
scenario_data = {}
for scenario in scenario_ids:
scenario_recommendations_df = recommendations_df[
recommendations_df["Scenario ID"] == scenario
].copy()
scenario_recommendations_df["contingency"] = (
contingency * scenario_recommendations_df["estimated_cost"]
)
scenario_recommendations_df["total_cost"] = (
scenario_recommendations_df["estimated_cost"]
+ scenario_recommendations_df["contingency"]
)
recommended_measures_df = scenario_recommendations_df[
["property_id", "measure_type", "estimated_cost", "default"]
]
recommended_measures_df = recommended_measures_df[
recommended_measures_df["default"]
]
recommended_measures_df = recommended_measures_df.drop(columns=["default"])
# Metrics by property ID
aggregated_metrics = scenario_recommendations_df[
[
"property_id",
"type",
"default",
"sap_points",
"energy_cost_savings",
"kwh_savings",
"co2_equivalent_savings",
"estimated_cost",
"contingency",
"total_cost",
]
]
aggregated_metrics = aggregated_metrics[aggregated_metrics["default"]]
aggregated_metrics = (
aggregated_metrics.groupby("property_id")[
[
"sap_points",
"co2_equivalent_savings",
"energy_cost_savings",
"kwh_savings",
"estimated_cost",
"total_cost",
"contingency",
]
]
.sum()
.reset_index()
)
recommendations_measures_pivot = recommended_measures_df.pivot(
index="property_id", columns="measure_type", values="estimated_cost"
)
recommendations_measures_pivot = recommendations_measures_pivot.reset_index()
recommendations_measures_pivot = recommendations_measures_pivot.fillna(0)
# We flag with boolean if the measure is recommended
for c in recommendations_measures_pivot.columns:
if c == "property_id":
continue
recommendations_measures_pivot["Recommendation: " + c] = (
recommendations_measures_pivot[c] > 0
)
# We now create a final output
df = (
properties_df[
[
"property_id",
"uprn",
"address",
"postcode",
"property_type",
"walls",
"roof",
"heating",
"windows",
"current_epc_rating",
"current_sap_points",
"total_floor_area",
"number_of_rooms",
"co2_emissions",
"current_energy_demand",
"current_energy_demand_heating_hotwater",
"heating_cost_current",
"hot_water_cost_current",
"lighting_cost_current",
"appliances_cost_current",
"gas_standing_charge",
"electricity_standing_charge",
]
]
.merge(recommendations_measures_pivot, how="left", on="property_id")
.merge(aggregated_metrics, how="left", on="property_id")
)
df["bills_total_cost"] = (
df["heating_cost_current"]
+ df["hot_water_cost_current"]
+ df["lighting_cost_current"]
+ df["appliances_cost_current"]
+ df["gas_standing_charge"]
+ df["electricity_standing_charge"]
)
df = df.drop(columns=["property_id"])
for c in [
"sap_points",
"co2_equivalent_savings",
"energy_cost_savings",
"kwh_savings",
]:
df[c] = df[c].fillna(0)
df = df.rename(
columns={
"uprn": "UPRN",
"address": "Address",
"postcode": "Postcode",
"walls": "Walls",
"roof": "Roof",
"heating": "Heating",
"windows": "Windows",
"current_epc_rating": "Current EPC Rating",
"current_sap_points": "Current SAP Points",
"total_floor_area": "Total Floor Area",
"number_of_rooms": "Number of Habitable Rooms",
"floor_height": "Floor Height",
}
)
# Calculate post SAP
df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(
lambda x: sap_to_epc(x)
)
# Calculate the relative savings on carbon, kwh, and bills
df["relative_carbon_savings"] = (
df["co2_equivalent_savings"] / df["co2_emissions"]
)
df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"]
df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"]
# Add on the archetype
df = df.merge(
property_asset_data[["uprn", "archetype_group"]],
how="left",
left_on="UPRN",
right_on="uprn",
)
# For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it
# the bills go up recommending HHRSH, so it doesn't make it to EPC B
# For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump?
# DO it manually???
# Doesn't make it
# misses = df[df["Predicted Post Works EPC"] == "C"]
# # 5 of them are flats and so are difficult to get to EPC B without renewables. Possibly not worth it from an
# # ROI perspective
#
# misses[["UPRN", "Address", "Postcode", "property_type"]]
# UPRN Address Postcode property_type
# 2 100120988937 13 Sidbury Circular Road SP9 7HX Flat No further action
# 3 100120988998 74 Sidbury Circular Road SP9 7JA Flat No further action
# 4 100120989416 47 Zouch Avenue SP9 7LR Flat No further action
# 6 100060585002 42, Muscott Close, Shipton Bellinger SP9 7TX House Can probably take a heat pump
# 37 10000801072 34 Luffenham Place, Chicksands SG17 5XH House Already surveyed as having
# an ASHP - should be looked at
# 121 100120988259 8, Karachi Close SP9 7LW Flat
# 122 100121101217 599, Pepper Place BA12 0DW Flat
# 140 100021455241 33 Blenheim Crescent, Ruislip HA4 7HA House - Solar isnt recommended
# due to bug
# 149 100120915656 10 Bower Green, Shrivenham SN6 8TU House - Solar isn't recommended
# due to bug
scenario_data[scenario] = df
printing_scenario_id = scenario_ids[0]
# EPC breakdown
print(
scenario_data[printing_scenario_id]["Predicted Post Works EPC"].value_counts()
)
# Cost
# Total cost
print(scenario_data[printing_scenario_id]["total_cost"].sum())
# Base cost
print(scenario_data[printing_scenario_id]["estimated_cost"].sum())
# Contingency
print(scenario_data[printing_scenario_id]["contingency"].sum())
# Costs averaged per unit
print(scenario_data[printing_scenario_id]["total_cost"].mean())
print(scenario_data[printing_scenario_id]["estimated_cost"].mean())
print(scenario_data[printing_scenario_id]["contingency"].mean())
# Average relative savings
print(scenario_data[printing_scenario_id]["relative_carbon_savings"].mean())
print(scenario_data[printing_scenario_id]["relative_kwh_savings"].mean())
print(scenario_data[printing_scenario_id]["relative_bill_savings"].mean())
measure_details = {}
for scenario in scenario_ids:
measure_details[scenario] = {}
recommendation_cols = [
c for c in scenario_data[scenario].columns if "Recommendation:" in c
]
measure_details[scenario]["count"] = (
scenario_data[scenario][recommendation_cols].sum().to_dict()
)
# Get average cost per measure
measure_columns = [
c.split("Recommendation: ")[1]
for c in scenario_data[scenario].columns
if "Recommendation:" in c
]
# Take the mean, drop zero columns
measure_costs = {}
for m in measure_columns:
measure_costs[m] = float(
scenario_data[scenario][scenario_data[scenario][m] > 0][m].mean()
)
measure_details[scenario]["cost_per_measure"] = measure_costs
pprint(measure_details[scenario_ids[0]]["count"])
pprint(measure_details[scenario_ids[1]]["count"])
# Cost per measures
pprint(measure_details[scenario_ids[0]]["cost_per_measure"])
pprint(measure_details[scenario_ids[1]]["cost_per_measure"])
# Do not get to EPC B:
# 5 are flats
# 1) 34 Luffenham Place, Chicksands SG17 5XH, has been surveyed as having a low performing heat pump -
# should be looked at but several surrounding properties have been surveyed in a similar fashion
# 2) 42, Muscott Close, Shipton Bellinger SP9 7TX, has an oil boiler and the bills go up recommending HHRSH.
# we could non-intrusively recommend a heat pump.
# 3) 33 Blenheim Crescent, Ruislip, HA4 7HA, 100021455241 Solar potential modelling returned nothing -
# manual review indicates that there are multiple trees surrouding the south facing side of the property
# 4) 10 Bower Green, Shrivenham, SN6 8TU - Solar isn't recommended without further survey due to the local
# area being surrounded by trees
# Scenario adjustments:
# Exclude: boiler_upgrade
# Make ASHP COP 3.5
# Metrics we need by scenario:
# Cost
# contingency
# Carbon
# kwh
# bill savings
scenario_metrics = {}
for scenario in scenario_ids:
df = scenario_data[scenario].copy()
avg_savings = (
df[
[
"sap_points",
"co2_equivalent_savings",
"energy_cost_savings",
"kwh_savings",
"estimated_cost",
"total_cost",
"contingency",
]
]
.mean()
.to_dict()
)
avg_savings["cost_per_sap_point"] = (
avg_savings["total_cost"] / avg_savings["sap_points"]
)
avg_savings["cost_per_carbon"] = (
avg_savings["total_cost"] / avg_savings["co2_equivalent_savings"]
)
scenario_metrics[scenario] = avg_savings
pprint(scenario_metrics[scenario_ids[0]])
pprint(scenario_metrics[scenario_ids[1]])
scenario_data[scenario_ids[0]]["loft_insulation"][
scenario_data[scenario_ids[0]]["loft_insulation"] > 0
].mean()
scenario_data[scenario_ids[0]]["cavity_wall_insulation"][
scenario_data[scenario_ids[0]]["cavity_wall_insulation"] > 0
].mean()
# Testing checking floor risk
import requests
def get_flood_risk(lat, lon, radius_km=1):
url = "https://environment.data.gov.uk/flood-monitoring/id/floods"
params = {"lat": lat, "long": lon, "dist": radius_km} # search radius in km
response = requests.get(url, params=params)
response.raise_for_status()
data = response.json()
flood_warnings = data.get("items", [])
if not flood_warnings:
print("No active flood warnings near this location.")
else:
print(f"{len(flood_warnings)} warning(s) found near the location:")
for warning in flood_warnings:
print(f"- Area: {warning.get('description')}")
print(
f" Severity: {warning.get('severity')} (Level {warning.get('severityLevel')})"
)
print(f" Message changed at: {warning.get('timeMessageChanged')}")
print()
return flood_warnings
from shapely.geometry import shape, Point
def get_flood_areas_near_point(lat, lon, radius_km=2):
url = "https://environment.data.gov.uk/flood-monitoring/id/floodAreas"
params = {"lat": lat, "long": lon, "dist": radius_km}
response = requests.get(url, params=params)
response.raise_for_status()
return response.json().get("items", [])
def point_in_flood_area(lat, lon):
flood_areas = get_flood_areas_near_point(lat, lon, radius_km=1)
point = Point(lon, lat) # GeoJSON uses (lon, lat) format
for area in flood_areas:
polygon_url = area.get("polygon")
if not polygon_url:
continue
polygon_response = requests.get(polygon_url)
polygon_response.raise_for_status()
polygon_geojson = polygon_response.json()
features = polygon_geojson.get("features", [])
if not features:
continue
flood_polygon = shape(features[0]["geometry"])
try:
is_inside = flood_polygon.contains(point)
except:
is_inside = False
if is_inside:
print(
f"📍 Point is inside flood area: {area['label']} ({area['notation']})"
)
return area
from tqdm import tqdm
floor_warnings_data = []
for _, property in tqdm(
property_asset_data.iterrows(), total=len(property_asset_data)
):
# warnings = floor_warnings_data.extend(
# get_flood_risk(lat=property["LATITUDE"], lon=property["LONGITUDE"], radius_km=1)
# )
resp = point_in_flood_area(lat=property["LATITUDE"], lon=property["LONGITUDE"])
if resp:
floor_warnings_data.append(
{
"uprn": property["uprn"],
"address": property["address"],
"postcode": property["postcode"],
"area": resp,
}
)
continue
import plotly.graph_objects as go
labels = [
"House_Cavity_Insulated_Pitched roof_Pre 1970",
"House_Cavity_Insulated_Pitched roof_Post 1970",
"House_Cavity_Uninsulated_Pitched roof_Pre 1970",
"House_Cavity_Uninsulated_Pitched roof_Post 1970",
"other",
"House_System_Uninsulated_Pitched roof_Pre 1970",
"House_Solid_Uninsulated_Not Pitched Roof_Pre 1970",
]
values = [62, 36, 21, 16, 16, 4, 2]
hovertext = [
"Loft insulation, draft proofing",
"Top-up loft insulation",
"Cavity wall insulation, loft insulation",
"Cavity wall insulation, ventilation",
"Bespoke retrofit measures",
"External wall insulation, roof insulation",
"Flat roof insulation, internal wall insulation",
]
fig = go.Figure(
go.Treemap(
labels=labels,
parents=[""] * len(labels), # No root
values=values,
hovertext=hovertext,
hoverinfo="text",
textinfo="none",
marker=dict(
line=dict(color="white", width=4), colors=values, colorscale="Blues"
),
)
)
fig.update_layout(
margin=dict(t=10, l=10, r=10, b=10), plot_bgcolor="white", paper_bgcolor="white"
)
fig.show()
# Get the recommended measures by scenario id
recommendation_cols = [
c for c in scenario_data[scenario_ids[1]].columns if "Recommendation:" in c
]
measure_counts_by_scenario = (
scenario_data[scenario_ids[1]]
.groupby("archetype_group")[recommendation_cols]
.sum()
.reset_index()
)
measure_counts_by_scenario.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/measure_counts_by_scenario.csv"
)
# Estimate average valuation improvment by scenarios
valuation_data = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/property_valuation.csv"
)
from backend.ml_models.Valuation import PropertyValuation
uplift = []
for _, x in valuation_data.iterrows():
uprn = x["uprn"]
to_append = {"uprn": uprn}
for _id in scenario_ids:
scenario = scenario_data[_id][scenario_data[_id]["uprn"] == uprn].squeeze()
val = PropertyValuation.estimate_valuation_improvement(
current_value=x["valuation"],
current_epc=scenario["Current EPC Rating"].value,
target_epc=scenario["Predicted Post Works EPC"],
total_cost=None,
)
to_append[_id] = val["average_increase"]
uplift.append(to_append)
uplift = pd.DataFrame(uplift)
print(uplift[scenario_ids[0]].mean())
# £8,161
print(uplift[scenario_ids[1]].mean())
# £16,938