Model/etl/customers/newhaven/newhaven_study.py
2024-08-16 09:50:37 +01:00

378 lines
14 KiB
Python

import inspect
import pandas as pd
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
from utils.s3 import save_csv_to_s3
# Locate this source file via the code object of a throwaway lambda, so that
# the EPC directory can be resolved relative to the module itself.
src_file_path = inspect.getfile(lambda: None)
EPC_DIRECTORY = Path(src_file_path).parent.joinpath(
    "local_data", "all-domestic-certificates"
)

# Absolute location of the customer-supplied Newhaven datasets.
# NOTE(review): this is a path on one developer's machine - confirm before
# running anywhere else.
CUSTOMER_DATA_DIRECTORY = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Newhaven/Data"

# Identifiers used to build the "<USER_ID>/<PORTFOLIO_ID>/..." S3 keys below.
USER_ID, PORTFOLIO_ID = 8, 90
# AddressBase class description shared by flats and maisonettes.
_SELF_CONTAINED_FLAT = "Self Contained Flat (Includes Maisonette / Apartment)"
# Bucket that receives every file produced by the study.
_INPUTS_BUCKET = "retrofit-plan-inputs-dev"


def _read_customer_csv(dataset_name):
    """Read ``<dataset_name>/<dataset_name>.csv`` from the customer data directory."""
    return pd.read_csv(
        f"{CUSTOMER_DATA_DIRECTORY}/{dataset_name}/{dataset_name}.csv",
        low_memory=False,
    )


def _with_string_uprn(frame):
    """Return a copy of ``frame`` whose ``UPRN`` column holds integer-formatted strings.

    Working on a copy avoids assigning into a filtered slice of another frame.
    """
    frame = frame.copy()
    frame["UPRN"] = frame["UPRN"].astype(int).astype(str)
    return frame


def _load_latest_epcs():
    """Load the Lewes EPC certificates and keep one record per UPRN.

    Returns:
        DataFrame with lower-case, hyphenated column names (the format the
        API returns) and ``uprn`` as an integer-formatted string.
    """
    lewes_certificates = EPC_DIRECTORY / "domestic-E07000063-Lewes/certificates.csv"
    epc_data = pd.read_csv(lewes_certificates, low_memory=False)
    # Rename the columns to the same format as the API returns.
    epc_data.columns = [c.replace("_", "-").lower() for c in epc_data.columns]
    # Keep certificates lodged on or after the earliest accepted date.
    epc_data = epc_data[epc_data["lodgement-date"] >= EARLIEST_EPC_DATE]
    epc_data = epc_data[epc_data["uprn"].notna()].copy()
    epc_data["uprn"] = epc_data["uprn"].astype(int).astype(str)
    # Take the newest EPC per UPRN.
    # NOTE(review): GroupBy.last() returns the last *non-null* value of each
    # column, so a gap in the newest certificate is filled from an older one.
    # Confirm this back-filling is intended rather than a strict "newest row".
    return epc_data.sort_values("lodgement-date").groupby("uprn").last().reset_index()


def _estimate_n_floors(
        building_height,
        floor_height,
        address_base_property_description,
        epc_property_type,
        fallback_floor_height,
):
    """Estimate the number of storeys of a property.

    Args:
        building_height: "Relative Height - Eaves" from AddressBase.
        floor_height: EPC "floor-height"; may be missing.
        address_base_property_description: AddressBase "Class Description".
        epc_property_type: EPC "property-type".
        fallback_floor_height: Floor height used when the EPC value is
            missing or not positive.

    Returns:
        1 for flats, 2 for maisonettes, the rounded height ratio for other
        properties, or None when no estimate is possible (including when
        AddressBase says "flat" but the EPC type is neither Flat nor
        Maisonette, i.e. the two sources disagree).
    """
    if address_base_property_description == _SELF_CONTAINED_FLAT:
        if epc_property_type == "Flat":
            return 1
        if epc_property_type == "Maisonette":
            return 2
        return None

    # A zero floor height would otherwise divide by zero, and a negative one
    # would give a negative storey count, so use the fallback for both.
    if pd.isnull(floor_height) or floor_height <= 0:
        floor_height = fallback_floor_height
    if pd.isnull(building_height) or pd.isnull(floor_height) or floor_height <= 0:
        return None
    return np.round(building_height / floor_height)


def _technology_cost(uprn, renewables_cost, cost_column, peers):
    """Return the cost of a technology for ``uprn``.

    Uses the property's own row in ``renewables_cost`` when there is one;
    otherwise the mean ``cost_column`` over the ``peers`` that do have costs
    (NaN when no peer has a cost).
    """
    own_costs = renewables_cost[renewables_cost["UPRN"] == uprn]
    if not own_costs.empty:
        return own_costs[cost_column].values[0]
    # Merge only the columns needed so unrelated columns cannot collide.
    peer_costs = peers[["UPRN"]].merge(
        renewables_cost[["UPRN", cost_column]], how="inner", on="UPRN"
    )
    return peer_costs[cost_column].mean()


def _ashp_recommendation(uprn, suitable_ashp, renewables_cost):
    """Build the air source heat pump recommendation for one property."""
    own = suitable_ashp[suitable_ashp["UPRN"] == uprn]
    if own.empty:
        return {"type": "air_source_heat_pump", "suitable": False}

    size_column = "Recommended Heat Pump Size [kW]"
    size = own[size_column].values[0]
    # Peers are suitable properties with the same recommended pump size.
    peers = suitable_ashp[suitable_ashp[size_column] == size]
    return {
        "type": "air_source_heat_pump",
        "suitable": True,
        "size": size,
        "cost": _technology_cost(uprn, renewables_cost, "Air Source Heat Pump - Total", peers),
        "ashp_only_heating_recommendation": True,
    }


def _solar_pv_recommendation(uprn, suitable_pv, renewables_cost):
    """Build the rooftop solar PV recommendation for one property."""
    own = suitable_pv[suitable_pv["UPRN"] == uprn]
    if own.empty:
        return {"type": "solar_pv", "suitable": False}

    size_column = "Recommended Array Size [kW]"
    array_size_kw = own[size_column].values[0]
    # Same fallback as for heat pumps: when the property has no cost row,
    # average over suitable properties with the same recommended array size.
    peers = suitable_pv[suitable_pv[size_column] == array_size_kw]
    return {
        "type": "solar_pv",
        "suitable": True,
        "array_wattage": array_size_kw * 1000,
        "initial_ac_kwh_per_year": own["Annual Generation [kWh]"].values[0],
        "panneled_roof_area": own["Roof area suitable for PV [m^2]"].values[0],
        "cost": _technology_cost(uprn, renewables_cost, "Rooftop PV - Total", peers),
    }


def make_asset_list():
    """
    Set up a small asset list for the study

    Steps:
        1. Build the asset list from AddressBase, the newest EPCs and the
           insulation potential data, keeping properties with an EPC score
           of 80 or less that are not caravans and have usable floor-count
           and wall-area data.
        2. Build non-invasive recommendations (air source heat pump and
           solar PV) for every property on the list.
        3. Upload the asset list, the recommendations and the patches via
           ``save_csv_to_s3`` and print one request body per scenario.

    Returns:
        None. The outputs are the uploaded files and the printed bodies.
    """
    epc_data = _load_latest_epcs()

    # Read in the customer data sources. Keep residential addresses only.
    address_base = _read_customer_csv("OS AddressBase Premium")
    address_base = _with_string_uprn(
        address_base[address_base["Primary Code Description"] == "Residential"]
    )
    pv_potential = _with_string_uprn(_read_customer_csv("Domestic Rooftop PV Potential"))
    ashp_potential = _with_string_uprn(_read_customer_csv("Air Source Heat Pump Potential"))
    insulation_potential = _with_string_uprn(_read_customer_csv("Insulation Potential"))
    renewables_cost = _with_string_uprn(_read_customer_csv("Low Carbon Technology Costs"))

    # Merge the EPC data and the insulation potential onto AddressBase.
    epc_columns = [
        "uprn", "current-energy-efficiency", "current-energy-rating", "address1", "postcode",
        "floor-height", "property-type", "built-form", "co2-emissions-current",
    ]
    asset_list = (
        address_base[["UPRN", "Class Description", "Relative Height - Eaves"]]
        .merge(epc_data[epc_columns], how="left", left_on="UPRN", right_on="uprn")
        .drop(columns=["uprn"])
        .merge(
            insulation_potential[["UPRN", "EPC Rating", "Wall Area [m^2]", "Building Area [m^2]"]],
            how="left",
            on="UPRN",
        )
        .rename(columns={"Wall Area [m^2]": "insulation_wall_area", "Building Area [m^2]": "floor_area"})
    )

    # Take properties below a B (the original note records 2844 units).
    asset_list = asset_list[asset_list["current-energy-efficiency"].astype(float) <= 80]
    # Drop caravans
    asset_list = asset_list[asset_list["Class Description"] != "Caravan"]
    # Only properties that have an EPC; copy so columns can be added safely.
    asset_list = asset_list[asset_list["current-energy-efficiency"].notna()].copy()
    # Sampling is disabled; to pilot on a seeded subset use
    # asset_list.sample(frac=0.5, random_state=42).

    # Estimate the number of floors, using the median EPC floor height when
    # a property has no usable value of its own.
    median_floor_height = asset_list["floor-height"].median()
    asset_list["number_of_floors"] = asset_list.apply(
        lambda prop: _estimate_n_floors(
            building_height=prop["Relative Height - Eaves"],
            floor_height=prop["floor-height"],
            address_base_property_description=prop["Class Description"],
            epc_property_type=prop["property-type"],
            fallback_floor_height=median_floor_height,
        ),
        axis=1,
    )
    # Drop any entries with null floors because that means the Ordnance
    # Survey data doesn't align with the EPC data.
    asset_list = asset_list[asset_list["number_of_floors"].notna()]
    # Drop any entries with null insulation wall area
    asset_list = asset_list[asset_list["insulation_wall_area"].notna()]

    # EPC rating shares recorded in the original notes.
    # First set (unlabelled in the notes - presumably the filtered list):
    # D 0.419929
    # C 0.391459
    # E 0.160142
    # F 0.017794
    # G 0.010676
    # Total asset list:
    # D 0.450409
    # C 0.412016
    # E 0.110203
    # F 0.020263
    # G 0.007110

    # Planned outputs:
    # 1) Create final asset list
    # 2) Create non-intrusive recommendations
    # 3) Create a third party costing object - TODO: not implemented yet.
    #    Per the original notes, "Insulation - Cavity Wall - Total" divided by
    #    "Wall Area [m^2]" puts their cavity wall insulation at £8 per m^2.
    final_asset_list = asset_list.rename(
        columns={"UPRN": "uprn", "address1": "address", "floor_area": "insulation_floor_area"}
    )[["uprn", "address", "postcode", "insulation_wall_area", "insulation_floor_area", "number_of_floors"]]

    # Create non-invasive recommendations, which come from the solar potential
    # and ASHP potential data sources. The suitability filters do not depend
    # on the property, so they are applied once up front.
    # Assumes both suitability columns hold booleans - TODO confirm.
    suitable_ashp = ashp_potential[ashp_potential["Overall Suitability Rating"].eq(True)]
    suitable_pv = pv_potential[pv_potential["Overall Suitability"].eq(True)]
    non_invasive_recommendations = [
        {
            "uprn": uprn,
            "recommendations": [
                _ashp_recommendation(uprn, suitable_ashp, renewables_cost),
                _solar_pv_recommendation(uprn, suitable_pv, renewables_cost),
            ],
        }
        for uprn in final_asset_list["uprn"]
    ]

    # Store the asset list in s3
    filename = f"{USER_ID}/{PORTFOLIO_ID}/pilot.csv"
    save_csv_to_s3(
        dataframe=final_asset_list,
        bucket_name=_INPUTS_BUCKET,
        file_name=filename,
    )

    # Store non-invasive recommendations in S3
    # NOTE(review): the "recommendations" column holds lists of dicts;
    # confirm how save_csv_to_s3 serialises them.
    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(non_invasive_recommendations),
        bucket_name=_INPUTS_BUCKET,
        file_name=non_invasive_recommendations_filename,
    )

    # We add a patch to some of the units because there's no data for the built form
    # We would be able to handle this automatically in the future, when using OS API
    patches = [
        {"uprn": "10033266220", "built-form": "Semi-Detached"},
        {"uprn": "10033266219", "built-form": "Semi-Detached"},
    ]
    # Store patches in s3
    # NOTE(review): the key ends in ".json" but is written with the CSV
    # helper - confirm the consumer expects CSV content here.
    patches_filename = f"{USER_ID}/{PORTFOLIO_ID}/patches.json"
    save_csv_to_s3(
        dataframe=pd.DataFrame(patches),
        bucket_name=_INPUTS_BUCKET,
        file_name=patches_filename,
    )

    # Fields shared by every scenario request body.
    shared_fields = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Private",
        "goal": "Increasing EPC",
        "goal_value": "A",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": patches_filename,
        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
    }
    # Five scenarios as (name, exclusions); None means the body carries no
    # "exclusions" key at all.
    scenarios = [
        # Scenario 1
        (
            "Demand Reduction - no solid wall, windows, LEDs",
            [
                "internal_wall_insulation", "external_wall_insulation", "floor_insulation", "heating",
                "solar_pv", "lighting", "windows", "secondary_heating",
            ],
        ),
        # Scenario 2
        (
            "Demand Reduction - no solid wall, floors or heating",
            ["internal_wall_insulation", "external_wall_insulation", "floor_insulation", "heating", "solar_pv"],
        ),
        # Scenario 2.5 - full fabric, no decant
        # NOTE(review): same name as scenario 2 although external wall
        # insulation is allowed here - confirm the intended scenario name.
        (
            "Demand Reduction - no solid wall, floors or heating",
            ["internal_wall_insulation", "floor_insulation", "heating", "solar_pv"],
        ),
        # Scenario B
        (
            "Demand Reduction, Heating Systems, Solar PV - no solid wall or floors",
            ["internal_wall_insulation", "external_wall_insulation", "floor_insulation"],
        ),
        # Scenario 4 - whole house, nothing excluded
        ("Whole House", None),
    ]
    for scenario_name, exclusions in scenarios:
        body = {**shared_fields, "scenario_name": scenario_name, "multi_plan": True}
        if exclusions is not None:
            body["exclusions"] = exclusions
        body["budget"] = None
        print(body)