mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Implementing the scoring process in the testing pipeline
This commit is contained in:
parent
7c109ebf5d
commit
1028861a1b
1 changed file with 73 additions and 9 deletions
|
|
@ -2,9 +2,12 @@ import json
|
|||
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3
|
||||
from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3, save_dataframe_to_s3_parquet
|
||||
from backend.Property import Property
|
||||
|
||||
# This is the GitHub PR number
|
||||
MODEL_VERSION = "100"
|
||||
|
||||
|
||||
def app():
|
||||
dataset = read_dataframe_from_s3_parquet(
|
||||
|
|
@ -76,6 +79,20 @@ def app():
|
|||
~dataset["internal_insulation"])
|
||||
]["walls_thermal_transmittance"].min()
|
||||
|
||||
ending_colums = [col for col in dataset.columns if col.endswith("_ending")]
|
||||
# For the purpose of scoring, we want to simulate JUST the impact of the measure we're testing. We therefore
|
||||
# need to make sure that every "_ending" column is equal to its starting value
|
||||
column_config = {}
|
||||
for ending_col in ending_colums:
|
||||
base_col = ending_col.replace("_ending", "")
|
||||
# We check if the starting column ends with _starting or is just the base col
|
||||
if base_col + "_starting" in dataset.columns:
|
||||
column_config[ending_col] = base_col + "_starting"
|
||||
elif base_col in dataset.columns:
|
||||
column_config[ending_col] = base_col
|
||||
else:
|
||||
raise ValueError("something went wrong")
|
||||
|
||||
loft_insulation_testing_data = []
|
||||
solid_wall_testing_data = []
|
||||
cavity_wall_testing_data = []
|
||||
|
|
@ -93,7 +110,11 @@ def app():
|
|||
(dataset["built_form"] == property_config.built_form) &
|
||||
(dataset["floor_area_quantile"] == property_config.floor_area_quantile) &
|
||||
(dataset["construction_age_band"] == property_config.construction_age_band)
|
||||
]
|
||||
].copy()
|
||||
|
||||
# Re-set all of the ending columns
|
||||
for col in ending_colums:
|
||||
population[col] = population[column_config[col]]
|
||||
|
||||
# 1) Loft insulation
|
||||
|
||||
|
|
@ -185,9 +206,9 @@ def app():
|
|||
)
|
||||
|
||||
# Simulated EWI
|
||||
best_external_wall_uvalue_wall_insulation_simulation = Property.create_recommendation_scoring_data(
|
||||
external_wall_insulation_simulation = Property.create_recommendation_scoring_data(
|
||||
property_id=row["uprn"].values[0],
|
||||
recommendation_record=row.copy(),
|
||||
recommendation_record=row.copy().to_dict("records")[0],
|
||||
recommendation={
|
||||
"recommendation_id": "external_wall_insulation",
|
||||
"type": "external_wall_insulation",
|
||||
|
|
@ -198,7 +219,7 @@ def app():
|
|||
|
||||
# The IWI/EWI simulations will be next to each other, so we can see how they differ for the same property
|
||||
solid_wall_testing_data.append(internal_wall_insulation_simulation)
|
||||
solid_wall_testing_data.append(best_external_wall_uvalue_wall_insulation_simulation)
|
||||
solid_wall_testing_data.append(external_wall_insulation_simulation)
|
||||
|
||||
# 3) Cavity wall insulation
|
||||
cavity_wall_sample = population[
|
||||
|
|
@ -291,9 +312,7 @@ def app():
|
|||
]
|
||||
|
||||
if not single_glazing_sample.empty:
|
||||
row = single_glazing_sample[
|
||||
single_glazing_sample["multi_glaze_proportion_starting"] == value
|
||||
].sample(1)
|
||||
row = single_glazing_sample.sample(1)
|
||||
|
||||
# For single glazed windows, we can recommend double glazing or secondary glazing
|
||||
|
||||
|
|
@ -456,7 +475,7 @@ def app():
|
|||
# We store all of this data in s3, as it is
|
||||
save_data_to_s3(
|
||||
bucket_name="retrofit-datalake-dev",
|
||||
s3_file_name="sap_change_model/loft_insulation_testing_data.parquet",
|
||||
s3_file_name="sap_change_model/simulation-pipeline-data.json",
|
||||
data=json.dumps(
|
||||
{
|
||||
"loft_insulation_testing_data": loft_insulation_testing_data,
|
||||
|
|
@ -472,3 +491,48 @@ def app():
|
|||
}
|
||||
)
|
||||
)
|
||||
|
||||
# For each simulation type, we score against the model
|
||||
from backend.ml_models.api import ModelApi
|
||||
from datetime import datetime
|
||||
|
||||
created_at = datetime.now().isoformat()
|
||||
model_api = ModelApi(portfolio_id="simulation-testing-pipeline", timestamp=created_at)
|
||||
model_api.MODEL_PREFIXES = ["sap_change_predictions"]
|
||||
|
||||
# 1) Loft insulation
|
||||
# We chunk the data into batches of 200 rows
|
||||
loft_insulation_testing_df = pd.DataFrame(loft_insulation_testing_data)
|
||||
|
||||
loft_insulation_predictions = []
|
||||
loft_to_loop_over = range(0, loft_insulation_testing_df.shape[0], 200)
|
||||
for chunk in tqdm(loft_to_loop_over, total=len(loft_to_loop_over)):
|
||||
loft_insulation_predictions_dict = model_api.predict_all(
|
||||
df=loft_insulation_testing_df.iloc[chunk:chunk + 200],
|
||||
bucket="retrofit-data-dev",
|
||||
prediction_buckets={
|
||||
"sap_change_predictions": "retrofit-sap-predictions-dev",
|
||||
}
|
||||
)
|
||||
|
||||
loft_insulation_predictions.append(loft_insulation_predictions_dict["sap_change_predictions"])
|
||||
|
||||
loft_insulation_predictions = pd.concat(loft_insulation_predictions)
|
||||
# Store final parquet in s3
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=loft_insulation_predictions,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet"
|
||||
)
|
||||
|
||||
# We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation
|
||||
# is worth
|
||||
|
||||
loft_insulation_comparison_df = loft_insulation_testing_df[
|
||||
["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""]
|
||||
].merge(
|
||||
loft_insulation_predictions,
|
||||
left_on="id",
|
||||
right_on="id",
|
||||
how="left"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue