Model/etl/customers/mod/pilot/2. Create Excel Model.py
2025-03-27 18:58:57 +00:00

652 lines
25 KiB
Python

from pprint import pprint
import pandas as pd
import numpy as np
from backend.app.utils import sap_to_epc
from sqlalchemy.orm import sessionmaker
from backend.app.db.connection import db_engine
from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations
from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel
def get_data(portfolio_id, scenario_ids):
session = sessionmaker(bind=db_engine)()
session.begin()
# Get properties and their details for a specific portfolio
properties_query = session.query(
PropertyModel,
PropertyDetailsEpcModel
).join(
PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id
).filter(
PropertyModel.portfolio_id == portfolio_id # Filter by portfolio ID
).all()
# Transform properties data to include all fields dynamically
properties_data = [
{**{col.name: getattr(prop.PropertyModel, col.name) for col in PropertyModel.__table__.columns},
**{col.name: getattr(prop.PropertyDetailsEpcModel, col.name) for col in
PropertyDetailsEpcModel.__table__.columns}}
for prop in properties_query
]
# Get property IDs from fetched properties
# Get plans linked to the fetched properties
plans_query = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all()
# Transform plans data to include all fields dynamically
plans_data = [
{col.name: getattr(plan, col.name) for col in Plan.__table__.columns}
for plan in plans_query
]
# Extract plan IDs for filtering recommendations through PlanRecommendations
plan_ids = [plan['id'] for plan in plans_data]
# Get recommendations through PlanRecommendations for those plans and that are default
recommendations_query = session.query(
Recommendation,
Plan.scenario_id
).join(
PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id
).join(
Plan, Plan.id == PlanRecommendations.plan_id # Join with Plan to access scenario_id
).filter(
PlanRecommendations.plan_id.in_(plan_ids),
Recommendation.default == True # Filtering for default recommendations
).all()
# Transform recommendations data to include all fields dynamically and include scenario_id
recommendations_data = [
{**{col.name: getattr(rec.Recommendation, col.name) if hasattr(rec, 'Recommendation')
else getattr(rec, col.name) for
col in Recommendation.__table__.columns},
"Scenario ID": rec.scenario_id}
for rec in recommendations_query
]
session.close()
return properties_data, plans_data, recommendations_data
def app():
"""
Given a portfolio and a scenario, this function prepares an excel model to present the data
"""
# Set the inputs:
portfolio_id = 139
scenario_ids = [237, 238]
properties_data, plans_data, recommendations_data = get_data(
portfolio_id=portfolio_id, scenario_ids=scenario_ids
)
properties_df = pd.DataFrame(properties_data)
plans_df = pd.DataFrame(plans_data)
recommendations_df = pd.DataFrame(recommendations_data)
# Merge on the orignal data
mod_property_data = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD property data.csv"
)
property_asset_data = properties_df.merge(
mod_property_data.drop(columns=["address", "postcode", "tenure"]), how="left", on="uprn"
)
property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains("pitched", case=False)
property_asset_data["pre_1970"] = property_asset_data["BUILD_YEAR"] < 1970
property_asset_data["wall_type"] = property_asset_data["walls"].str.split(" ").str[0].str.strip()
property_asset_data["is_insulated"] = (
property_asset_data["walls"].str.split(",").str[1].str.strip().isin(
["filled cavity", "with external insulation", "filled cavity and external insulation"]
) | property_asset_data["walls"].str.split(",").str[2].str.strip().isin(["insulated"])
)
property_asset_data["is_insulated"] = np.where(
property_asset_data["is_insulated"], "Insulated", "Uninsulated"
)
property_asset_data["is_pitched"] = np.where(
property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof"
)
property_asset_data["pre_1970"] = np.where(
property_asset_data["pre_1970"], "Pre 1970", "Post 1970"
)
archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_1970"]
assigned_archetypes = (
property_asset_data.groupby(
archetype_variables
).size().reset_index().rename(columns={0: "n_properties"}).sort_values("n_properties", ascending=False)
)
# Make the archetype ID a concatenation of the variables
assigned_archetypes["archetype_id"] = assigned_archetypes[archetype_variables].apply(
lambda x: "_".join(x.astype(str)), axis=1
)
# Most prominent archetypes
prominent_archetypes = assigned_archetypes.head(6)
other_archetypes = assigned_archetypes.tail(-6)
# 2 or fewer properties in the other archetypes
property_asset_data = property_asset_data.merge(
assigned_archetypes[archetype_variables + ["archetype_id"]],
how="left",
on=archetype_variables
)
# Create age bands:
# 1960-1969
# 1970-1979
# 1980-1989
# 1990-1999
# 2000+
property_asset_data["age_band"] = pd.cut(
property_asset_data["BUILD_YEAR"],
bins=[1959, 1969, 1979, 1989, 1999, 2022],
labels=["1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000+"]
)
# Create floor area bands
# 0-73
# 74-97
# 98-199
# 200+
property_asset_data["floor_area_band"] = pd.cut(
property_asset_data["total_floor_area"],
bins=[0, 73, 97, 199, 10000],
labels=["0-73", "74-97", "98-199", "200+"]
)
property_asset_data["archetype_group"] = property_asset_data["archetype_id"].copy()
property_asset_data["archetype_group"] = np.where(
property_asset_data["archetype_id"].isin(other_archetypes["archetype_id"].values),
"other",
property_asset_data["archetype_group"]
)
# For colour
wall_types = (
property_asset_data[["wall_type"]].value_counts().to_frame().reset_index().rename(
columns={"wall_type": "Wall Type"}
)
)
# Group into age bands
ages = (
property_asset_data[["age_band"]].value_counts()
.to_frame()
.reset_index().sort_values("age_band", ascending=True)
.rename(columns={"age_band": "Age Band"})
)
floor_area_bands = (
property_asset_data[["floor_area_band"]].value_counts()
.to_frame()
.reset_index().sort_values("floor_area_band", ascending=True)
.rename(columns={"floor_area_band": "Floor Area Band"})
)
archetype_counts = (
property_asset_data[["archetype_group"]].
value_counts().
to_frame().
reset_index()
.rename(columns={"archetype_group": "Archetype"})
)
property_types = (
(property_asset_data["property_type"] + ": " + property_asset_data["built_form"]).
value_counts().
to_frame().
reset_index()
.rename(columns={"index": "Property Type", 0: "Count"})
)
# epc breakdown
epc_breakdown = (
property_asset_data["current_epc_rating"]
.apply(lambda x: x.value)
.value_counts()
.to_frame()
.reset_index()
)
# Figures for the deck
# Carbon per property
totals = property_asset_data[
[
"Total_household_members",
"co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater",
"heating_cost_current", "hot_water_cost_current", "lighting_cost_current",
"appliances_cost_current", "gas_standing_charge", "electricity_standing_charge"
]
].copy()
totals["total_cost"] = (
totals["heating_cost_current"] +
totals["hot_water_cost_current"] +
totals["lighting_cost_current"] +
totals["appliances_cost_current"] +
totals["gas_standing_charge"] +
totals["electricity_standing_charge"]
)
print(
totals[
[
"Total_household_members",
"co2_emissions",
"current_energy_demand",
"total_cost",
]
].mean()
)
# Store these to an excel
# with pd.ExcelWriter(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD archetype breakdowns.xlsx"
# ) as writer:
# wall_types.to_excel(writer, sheet_name="Wall Types", index=False)
# ages.to_excel(writer, sheet_name="Ages", index=False)
# floor_area_bands.to_excel(writer, sheet_name="Floor Area Bands", index=False)
# archetype_counts.to_excel(writer, sheet_name="Archetype Counts", index=False)
# epc_breakdown.to_excel(writer, sheet_name="EPC Rating", index=False)
contingency = 0.26
# We prepare the outputs, by scenario
scenario_data = {}
for scenario in scenario_ids:
scenario_recommendations_df = recommendations_df[
recommendations_df["Scenario ID"] == scenario
].copy()
scenario_recommendations_df["contingency"] = contingency * scenario_recommendations_df["estimated_cost"]
scenario_recommendations_df["total_cost"] = (
scenario_recommendations_df["estimated_cost"] + scenario_recommendations_df["contingency"]
)
recommended_measures_df = scenario_recommendations_df[
["property_id", "measure_type", "estimated_cost", "default"]
]
recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]]
recommended_measures_df = recommended_measures_df.drop(columns=["default"])
# Metrics by property ID
aggregated_metrics = scenario_recommendations_df[
[
"property_id", "type", "default", "sap_points",
"energy_cost_savings", "kwh_savings", "co2_equivalent_savings", "estimated_cost", "contingency",
"total_cost"
]
]
aggregated_metrics = aggregated_metrics[aggregated_metrics["default"]]
aggregated_metrics = aggregated_metrics.groupby("property_id")[
["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost",
"total_cost", "contingency"]
].sum().reset_index()
recommendations_measures_pivot = recommended_measures_df.pivot(
index='property_id',
columns='measure_type',
values='estimated_cost'
)
recommendations_measures_pivot = recommendations_measures_pivot.reset_index()
recommendations_measures_pivot = recommendations_measures_pivot.fillna(0)
# We flag with boolean if the measure is recommended
for c in recommendations_measures_pivot.columns:
if c == "property_id":
continue
recommendations_measures_pivot["Recommendation: " + c] = recommendations_measures_pivot[c] > 0
# We now create a final output
df = properties_df[
[
"property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows",
"current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms",
"co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater",
"heating_cost_current", "hot_water_cost_current", "lighting_cost_current",
"appliances_cost_current", "gas_standing_charge", "electricity_standing_charge"
]
].merge(
recommendations_measures_pivot, how="left", on="property_id"
).merge(
aggregated_metrics, how="left", on="property_id"
)
df["bills_total_cost"] = (
df["heating_cost_current"] + df["hot_water_cost_current"] + df["lighting_cost_current"] +
df["appliances_cost_current"] + df["gas_standing_charge"] + df["electricity_standing_charge"]
)
df = df.drop(columns=["property_id"])
for c in ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings"]:
df[c] = df[c].fillna(0)
df = df.rename(
columns={
"uprn": "UPRN",
"address": "Address",
"postcode": "Postcode",
"walls": "Walls",
"roof": "Roof",
"heating": "Heating",
"windows": "Windows",
"current_epc_rating": "Current EPC Rating",
"current_sap_points": "Current SAP Points",
"total_floor_area": "Total Floor Area",
"number_of_rooms": "Number of Habitable Rooms",
"floor_height": "Floor Height",
}
)
# Calculate post SAP
df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x))
# Calculate the relative savings on carbon, kwh, and bills
df["relative_carbon_savings"] = df["co2_equivalent_savings"] / df["co2_emissions"]
df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"]
df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"]
# Add on the archetype
df = df.merge(
property_asset_data[["uprn", "archetype_group"]], how="left", left_on="UPRN", right_on="uprn"
)
# For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it
# the bills go up recommending HHRSH, so it doesn't make it to EPC B
# For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump?
# DO it manually???
# Doesn't make it
# misses = df[df["Predicted Post Works EPC"] == "C"]
# # 5 of them are flats and so are difficult to get to EPC B without renewables. Possibly not worth it from an
# # ROI perspective
#
# misses[["UPRN", "Address", "Postcode", "property_type"]]
# UPRN Address Postcode property_type
# 2 100120988937 13 Sidbury Circular Road SP9 7HX Flat No further action
# 3 100120988998 74 Sidbury Circular Road SP9 7JA Flat No further action
# 4 100120989416 47 Zouch Avenue SP9 7LR Flat No further action
# 6 100060585002 42, Muscott Close, Shipton Bellinger SP9 7TX House Can probably take a heat pump
# 37 10000801072 34 Luffenham Place, Chicksands SG17 5XH House Already surveyed as having
# an ASHP - should be looked at
# 121 100120988259 8, Karachi Close SP9 7LW Flat
# 122 100121101217 599, Pepper Place BA12 0DW Flat
# 140 100021455241 33 Blenheim Crescent, Ruislip HA4 7HA House - Solar isnt recommended
# due to bug
# 149 100120915656 10 Bower Green, Shrivenham SN6 8TU House - Solar isn't recommended
# due to bug
scenario_data[scenario] = df
printing_scenario_id = scenario_ids[0]
# EPC breakdown
print(scenario_data[printing_scenario_id]['Predicted Post Works EPC'].value_counts())
# Cost
# Total cost
print(scenario_data[printing_scenario_id]["total_cost"].sum())
# Base cost
print(scenario_data[printing_scenario_id]["estimated_cost"].sum())
# Contingency
print(scenario_data[printing_scenario_id]["contingency"].sum())
# Costs averaged per unit
print(scenario_data[printing_scenario_id]["total_cost"].mean())
print(scenario_data[printing_scenario_id]["estimated_cost"].mean())
print(scenario_data[printing_scenario_id]["contingency"].mean())
# Average relative savings
print(scenario_data[printing_scenario_id]["relative_carbon_savings"].mean())
print(scenario_data[printing_scenario_id]["relative_kwh_savings"].mean())
print(scenario_data[printing_scenario_id]["relative_bill_savings"].mean())
measure_details = {}
for scenario in scenario_ids:
measure_details[scenario] = {}
recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c]
measure_details[scenario]["count"] = scenario_data[scenario][recommendation_cols].sum().to_dict()
# Get average cost per measure
measure_columns = [
c.split("Recommendation: ")[1] for c in scenario_data[scenario].columns if "Recommendation:" in c
]
# Take the mean, drop zero columns
measure_costs = {}
for m in measure_columns:
measure_costs[m] = float(scenario_data[scenario][scenario_data[scenario][m] > 0][m].mean())
measure_details[scenario]["cost_per_measure"] = measure_costs
pprint(measure_details[scenario_ids[0]]["count"])
pprint(measure_details[scenario_ids[1]]["count"])
# Cost per measures
pprint(measure_details[scenario_ids[0]]["cost_per_measure"])
pprint(measure_details[scenario_ids[1]]["cost_per_measure"])
# Do not get to EPC B:
# 5 are flats
# 1) 34 Luffenham Place, Chicksands SG17 5XH, has been surveyed as having a low performing heat pump -
# should be looked at but several surrounding properties have been surveyed in a similar fashion
# 2) 42, Muscott Close, Shipton Bellinger SP9 7TX, has an oil boiler and the bills go up recommending HHRSH.
# we could non-intrusively recommend a heat pump.
# 3) 33 Blenheim Crescent, Ruislip, HA4 7HA, 100021455241 Solar potential modelling returned nothing -
# manual review indicates that there are multiple trees surrouding the south facing side of the property
# 4) 10 Bower Green, Shrivenham, SN6 8TU - Solar isn't recommended without further survey due to the local
# area being surrounded by trees
# Scenario adjustments:
# Exclude: boiler_upgrade
# Make ASHP COP 3.5
# Metrics we need by scenario:
# Cost
# contingency
# Carbon
# kwh
# bill savings
scenario_metrics = {}
for scenario in scenario_ids:
df = scenario_data[scenario].copy()
avg_savings = df[
["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost",
"total_cost", "contingency"]
].mean().to_dict()
avg_savings["cost_per_sap_point"] = avg_savings["total_cost"] / avg_savings["sap_points"]
avg_savings["cost_per_carbon"] = avg_savings["total_cost"] / avg_savings["co2_equivalent_savings"]
scenario_metrics[scenario] = avg_savings
pprint(scenario_metrics[scenario_ids[0]])
pprint(scenario_metrics[scenario_ids[1]])
scenario_data[scenario_ids[0]]["loft_insulation"][
scenario_data[scenario_ids[0]]["loft_insulation"] > 0
].mean()
scenario_data[scenario_ids[0]]["cavity_wall_insulation"][
scenario_data[scenario_ids[0]]["cavity_wall_insulation"] > 0
].mean()
# Testing checking floor risk
import requests
def get_flood_risk(lat, lon, radius_km=1):
url = "https://environment.data.gov.uk/flood-monitoring/id/floods"
params = {
'lat': lat,
'long': lon,
'dist': radius_km # search radius in km
}
response = requests.get(url, params=params)
response.raise_for_status()
data = response.json()
flood_warnings = data.get("items", [])
if not flood_warnings:
print("No active flood warnings near this location.")
else:
print(f"{len(flood_warnings)} warning(s) found near the location:")
for warning in flood_warnings:
print(f"- Area: {warning.get('description')}")
print(f" Severity: {warning.get('severity')} (Level {warning.get('severityLevel')})")
print(f" Message changed at: {warning.get('timeMessageChanged')}")
print()
return flood_warnings
from shapely.geometry import shape, Point
def get_flood_areas_near_point(lat, lon, radius_km=2):
url = "https://environment.data.gov.uk/flood-monitoring/id/floodAreas"
params = {
'lat': lat,
'long': lon,
'dist': radius_km
}
response = requests.get(url, params=params)
response.raise_for_status()
return response.json().get("items", [])
def point_in_flood_area(lat, lon):
flood_areas = get_flood_areas_near_point(lat, lon, radius_km=1)
point = Point(lon, lat) # GeoJSON uses (lon, lat) format
for area in flood_areas:
polygon_url = area.get("polygon")
if not polygon_url:
continue
polygon_response = requests.get(polygon_url)
polygon_response.raise_for_status()
polygon_geojson = polygon_response.json()
features = polygon_geojson.get("features", [])
if not features:
continue
flood_polygon = shape(features[0]['geometry'])
try:
is_inside = flood_polygon.contains(point)
except:
is_inside = False
if is_inside:
print(f"📍 Point is inside flood area: {area['label']} ({area['notation']})")
return area
from tqdm import tqdm
floor_warnings_data = []
for _, property in tqdm(property_asset_data.iterrows(), total=len(property_asset_data)):
# warnings = floor_warnings_data.extend(
# get_flood_risk(lat=property["LATITUDE"], lon=property["LONGITUDE"], radius_km=1)
# )
resp = point_in_flood_area(lat=property["LATITUDE"], lon=property["LONGITUDE"])
if resp:
floor_warnings_data.append(
{
"uprn": property["uprn"],
"address": property["address"],
"postcode": property["postcode"],
"area": resp
}
)
continue
import plotly.graph_objects as go
labels = [
"House_Cavity_Insulated_Pitched roof_Pre 1970",
"House_Cavity_Insulated_Pitched roof_Post 1970",
"House_Cavity_Uninsulated_Pitched roof_Pre 1970",
"House_Cavity_Uninsulated_Pitched roof_Post 1970",
"other",
"House_System_Uninsulated_Pitched roof_Pre 1970",
"House_Solid_Uninsulated_Not Pitched Roof_Pre 1970"
]
values = [62, 36, 21, 16, 16, 4, 2]
hovertext = [
"Loft insulation, draft proofing",
"Top-up loft insulation",
"Cavity wall insulation, loft insulation",
"Cavity wall insulation, ventilation",
"Bespoke retrofit measures",
"External wall insulation, roof insulation",
"Flat roof insulation, internal wall insulation"
]
fig = go.Figure(go.Treemap(
labels=labels,
parents=[""] * len(labels), # No root
values=values,
hovertext=hovertext,
hoverinfo="text",
textinfo="none",
marker=dict(
line=dict(color="white", width=4),
colors=values,
colorscale="Blues"
)
))
fig.update_layout(
margin=dict(t=10, l=10, r=10, b=10),
plot_bgcolor="white",
paper_bgcolor="white"
)
fig.show()
# Get the recommended measures by scenario id
recommendation_cols = [c for c in scenario_data[scenario_ids[1]].columns if "Recommendation:" in c]
measure_counts_by_scenario = scenario_data[scenario_ids[1]].groupby("archetype_group")[
recommendation_cols
].sum().reset_index()
measure_counts_by_scenario.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/measure_counts_by_scenario.csv"
)
# Estimate average valuation improvment by scenarios
valuation_data = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/property_valuation.csv"
)
from backend.ml_models.Valuation import PropertyValuation
uplift = []
for _, x in valuation_data.iterrows():
uprn = x["uprn"]
to_append = {"uprn": uprn}
for _id in scenario_ids:
scenario = scenario_data[_id][
scenario_data[_id]["uprn"] == uprn
].squeeze()
val = PropertyValuation.estimate_valuation_improvement(
current_value=x["valuation"],
current_epc=scenario["Current EPC Rating"].value,
target_epc=scenario["Predicted Post Works EPC"],
total_cost=None
)
to_append[_id] = val["average_increase"]
uplift.append(to_append)
uplift = pd.DataFrame(uplift)
print(uplift[scenario_ids[0]].mean())
# £8,161
print(uplift[scenario_ids[1]].mean())
# £16,938