mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
287 lines
11 KiB
Python
287 lines
11 KiB
Python
# We use some sample properties from Newhaven as a testing dataset for implementing the model fixes
|
|
|
|
|
|
import inspect
|
|
import pandas as pd
|
|
from etl.epc.settings import EARLIEST_EPC_DATE
|
|
from pathlib import Path
|
|
from utils.s3 import save_csv_to_s3
|
|
|
|
# Ask inspect which file the throwaway lambda was defined in, i.e. this file
src_file_path = inspect.getfile(lambda: None)

# The EPC certificates are expected in a "local_data" folder that sits next to this file
_source_directory = Path(src_file_path).parent
EPC_DIRECTORY = _source_directory.joinpath("local_data", "all-domestic-certificates")

# Identifiers used to build the S3 key of the uploaded asset list
USER_ID, PORTFOLIO_ID = 8, -1
|
|
|
|
|
|
def app():
    """
    Build a small pilot asset list from the local Lewes EPC certificates and upload it to S3.

    The certificates CSV under EPC_DIRECTORY is filtered to rows lodged on or after EARLIEST_EPC_DATE that have a
    UPRN and a current energy efficiency below 52. Up to 10 of the remaining rows are sampled at random, their
    uprn / address / postcode are saved to the "retrofit-plan-inputs-dev" bucket, and the request body that points
    at the uploaded file is printed.

    :raises ValueError: if no certificate is left after filtering
    :return: None
    """

    certificates_path = EPC_DIRECTORY / "domestic-E07000063-Lewes/certificates.csv"

    data = pd.read_csv(certificates_path, low_memory=False)
    # Rename the columns to the same format as the api returns
    data.columns = [c.replace("_", "-").lower() for c in data.columns]

    # Keep only certificates lodged on or after the date threshold
    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

    data = data[~pd.isnull(data["uprn"])]
    data = data[data["current-energy-efficiency"].astype(float) < 52]
    if data.empty:
        raise ValueError("No EPC certificates left to sample after filtering")
    # sample() raises when asked for more rows than exist, so cap the sample size at what is available
    data = data.sample(n=min(10, len(data)))

    # Create an asset list
    asset_list = data[["uprn", "address1", "postcode"]].copy().rename(columns={"address1": "address"})
    # A column that contained missing values is loaded as float, which would stringify as "123.0"; go through
    # int64 first so the UPRN is written as a plain integer string
    asset_list["uprn"] = asset_list["uprn"].astype("int64").astype(str)

    filename = f"{USER_ID}/{PORTFOLIO_ID}/pilot.csv"
    save_csv_to_s3(
        dataframe=asset_list,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    # Request body referencing the uploaded file; it is only printed here, not sent anywhere
    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Private",
        "goal": "Increasing EPC",
        "goal_value": "B",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": "",
        "non_invasive_recommendations_file_path": "",
        "budget": None,
    }
    print(body)
|
|
|
|
|
|
# This is some temp code, which is for diagnosing the issues with the bills models

# Columns that both the heating and the hot water model drop, kept in their original order
_shared_ending_columns = [
    "sap_ending",
    "heat_demand_change",
    "carbon_change",
    "rdsap_change",
    "heat_demand_ending",
    "carbon_ending",
    "lighting_cost_ending",
]
_room_count_columns = [
    "number_habitable_rooms_starting",
    "number_habitable_rooms_ending",
    "number_heated_rooms_starting",
    "number_heated_rooms_ending",
    "number_habitable_rooms",
    "number_heated_rooms",
]

# For the heating model:
heating_training_data_filepath = "sap_change_model/2024-08-06-11-19-49/dataset_rooms.parquet"
heating_drop_columns = [
    *_shared_ending_columns,
    "hot_water_cost_ending",
    # "days_to_ending", "days_to_starting",  # TODO This is in the live version
    *_room_count_columns,
]
heating_response = "heating_cost_ending"

# for the hot water model (older dataset)
hot_water_training_data_filepath = "sap_change_model/2024-07-10-20-28-54/dataset_rooms.parquet"
hot_water_drop_columns = [
    *_shared_ending_columns,
    "heating_cost_ending",
    "days_to_starting",
    "days_to_ending",
    *_room_count_columns,
]
|
|
|
|
# Diagnose heating
from utils.s3 import read_dataframe_from_s3_parquet

train = read_dataframe_from_s3_parquet(
    bucket_name="retrofit-data-dev",
    file_key=heating_training_data_filepath,
)

# Drop the columns that aren't used
train = train.drop(columns=heating_drop_columns)

# A positive value means the ending cost is bigger than the starting one (i.e. heating got more expensive)
train["cost_diference"] = train["heating_cost_ending"] - train["heating_cost_starting"]
change_direction = train["cost_diference"] > 0
change_direction.value_counts(normalize=True)

# Mean heating cost per lodgement year/month, for the starting and the ending certificate separately
_starting_period = ["lodgement_year_starting", "lodgement_month_starting"]
average_costs_by_time_starting = (
    train.groupby(_starting_period)["heating_cost_starting"]
    .mean()
    .reset_index()
    .sort_values(_starting_period)
)

_ending_period = ["lodgement_year_ending", "lodgement_month_ending"]
average_costs_by_time_ending = (
    train.groupby(_ending_period)["heating_cost_ending"]
    .mean()
    .reset_index()
    .sort_values(_ending_period)
)

# Check by photo supply values - if the property is gas, solar panels won't have an effect on the heating or hot
# water so let's look for electric homes
# Across the entire dataset, there is no correlation
# Even for electric properties, there is no correlation
_is_electric = train["fuel_type_ending"] == "electricity"
photo_supply_averages = (
    train[_is_electric]
    .groupby(["photo_supply_ending"])["heating_cost_ending"]
    .mean()
    .reset_index()
)

photo_supply_to_size = train.groupby("photo_supply_ending")["total_floor_area_ending"].mean().reset_index()
photo_supply_to_size[["photo_supply_ending", "total_floor_area_ending"]].corr()
train[["total_floor_area_ending", "heating_cost_ending"]].corr()
# Bigger properties end up with smaller photo_supply values. This will be because the array size likely remains fairly
# consistent but takes up a smaller proportion of the roof. Typically, the bigger the floor area, the higher the heating
# costs, but bigger units also have smaller photo_supply
_gained_solar = (train["photo_supply_ending"] > 0) & (train["photo_supply_starting"] == 0)
adding_solar = train[_gained_solar]
is_positive = adding_solar["cost_diference"] > 0
is_positive.value_counts(normalize=True)

# Mean ending heating cost of electric homes for every (year, photo supply) pair
_year_and_supply = ["lodgement_year_ending", "photo_supply_ending"]
photo_supply_by_time = (
    train[_is_electric]
    .groupby(_year_and_supply)["heating_cost_ending"]
    .mean()
    .reset_index()
    .sort_values(_year_and_supply, ascending=True)
)
# Plot
photo_supply_by_time[["photo_supply_ending", "heating_cost_ending"]].corr()
photo_supply_by_time.plot()
|
|
|
|
# Observations
# 1) We retain all of the potential columns, however they are just based on the starting EPC
# 2) 21% of the time, the ending heating cost is more than the starting, but this is clearly a minority
# 3) Let's get rid of estimated perimeter starting and ending
|
|
|
|
# Things I should check
# 1) Do we update lodgement_year_ending and lodgement_month_ending?
# 2) Should we adjust costs to now, as well as lodgement dates to today? Since 2023, costs have increased a lot so
# any savings should be benchmarked against what a customer is paying now
# 3) It might make sense to create a feature between floor area and photo supply, to give a more consistent estimate
# of a panel size for the property
|
|
|
|
# Get an example and score with the models: one random property that had no solar on the starting certificate,
# has some on the ending one, and whose heating cost went down
_solar_added = (train["photo_supply_starting"] == 0) & (train["photo_supply_ending"] > 0)
_heating_got_cheaper = train["heating_cost_starting"] > train["heating_cost_ending"]
example = train[_solar_added & _heating_got_cheaper].sample(1)

# example["lodgement_month_starting"]
# example["lodgement_year_starting"]
# example["lodgement_month_ending"]
# example["lodgement_year_ending"].values[0]
#
# example["lodgement_year_ending"] = 2023
# example["days_to_ending"] = 3500
# example["days_to_starting"]

# {'heating_cost_predictions':    predictions
# 0        378.5}
# NOTE(review): model_api and get_prediction_buckets are not imported in the part of the file visible here
resp = model_api.predict_all(
    df=example,
    bucket="retrofit-data-dev",
    prediction_buckets=get_prediction_buckets(),
    model_prefixes=["heating_cost_predictions"],
    extract_ids=False,
)
|
|
|
|
def _predict_heating_cost(df):
    """Score df with the heating cost model, using the same settings for every scenario below."""
    return model_api.predict_all(
        df=df,
        bucket="retrofit-data-dev",
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_cost_predictions"],
        extract_ids=False,
    )


# Step 1: get a cost for today
# NOTE(review): p, cleaned and property_recommendations are not defined in the part of the file visible here
p.create_base_difference_epc_record(cleaned)
cwi_impact = p.base_difference_record.df.copy()
# Overlay every entry of the first recommendation's simulation config onto the base record
_simulation_config = property_recommendations[0][0]["simulation_config"]
for _config_key in _simulation_config:
    cwi_impact[_config_key] = _simulation_config[_config_key]

# 2212.4 - Baseline
today = _predict_heating_cost(p.base_difference_record.df.copy())

# impact of CWI - 1908
cwi_response = _predict_heating_cost(cwi_impact)

# Same record as the CWI scenario, with the photo supply and starting heating cost overridden
pv_impact = cwi_impact.copy()
pv_impact["photo_supply_ending"] = 50
pv_impact["heating_cost_starting"] = 2212.4

pv_response = _predict_heating_cost(pv_impact)
|
|
|
|
# Testing kwh for vde
# NOTE(review): epcs_for_scoring, property_scoring_epcs, get_settings, add_features_from_code and
# add_estimate_annual_kwh are not defined in the part of the file visible here
base_prediction = model_api.predict_all(
    df=epcs_for_scoring,
    bucket=get_settings().DATA_BUCKET,
    prediction_buckets=get_prediction_buckets(),
    model_prefixes=["heating_kwh_predictions"],
    extract_ids=False,
)

# Build a one-row frame from a copy of the second scoring EPC and run it through both feature helpers
_cwi_record = property_scoring_epcs[1].copy()
cwi_epc = add_estimate_annual_kwh(add_features_from_code(pd.DataFrame([_cwi_record])))
# cwi_epc["walls-description"] = "Cavity wall, filled cavity"
# cwi_epc["walls-energy-eff"] = "Good"
# cwi_epc["heating-cost-current"] = 1650
# cwi_epc["current-energy-efficiency"] = 72
# cwi_epc["current-energy-rating"] = "C"
# cwi_epc["co2-emissions-current"] = 3.7
# cwi_epc["energy-consumption-current"] = 121
# cwi_epc["co2-emiss-curr-per-floor-area"] = 19
# cwi_epc["photo-supply"] = 0
# cwi_epc["energy-consumption-current"] =
# cwi_epc["roof-description"] = "Pitched, 300 mm loft insulation"
# cwi_epc["roof-energy-eff"] = "Very Good"
# cwi_epc["heating-cost-current"] = 1264

# "heating-cost-current": rec_impact["epc_heating_cost"],
# "hot-water-cost-current": rec_impact["epc_hot_water_cost"],
# # CO₂ emissions per square metre floor area per year in kg/m². Since CO₂ emissions are in tonnes
# # per year, we multiply by 1000 to get kg/m²
# "co2-emiss-curr-per-floor-area": round(
#     1000 * (rec_impact["carbon"] / self.data["total-floor-area"])
# ),
# "co2-emissions-current": rec_impact["carbon"],
# "current-energy-rating": sap_to_epc(rec_impact["sap"]),
# "current-energy-efficiency": int(np.floor(rec_impact["sap"])),
# "energy-consumption-current": rec_impact["heat_demand"],
# "lighting-cost-current": rec_impact["epc_lighting_cost"],
# "id": "+".join([str(self.id), rec_id])

cwi_prediction = model_api.predict_all(
    df=cwi_epc,
    bucket=get_settings().DATA_BUCKET,
    prediction_buckets=get_prediction_buckets(),
    model_prefixes=["heating_kwh_predictions", "hotwater_kwh_predictions"],
    extract_ids=False,
)
|
|
|
|
# 77 perryn
starting_heating, starting_hot_water = 19837.2, 2974.1
ending_heating, ending_hot_water = 17041.1, 2735.3

# 44 lindlings
starting_heating, starting_hot_water = 13327.1, 2349.5
ending_heating, ending_hot_water = 9672.3, 2030.2

ending_heating, ending_hot_water = 8695.1, 2437.0

# Each assignment above replaces the previous one, so only the latest starting/ending values feed these figures
heating_impact = starting_heating - ending_heating
hot_water_impact = starting_hot_water - ending_hot_water
total_impact = heating_impact + hot_water_impact
|