implementing the scoring process into testing pipeline

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-12 16:54:16 +00:00
parent 7c109ebf5d
commit 1028861a1b

View file

@ -2,9 +2,12 @@ import json
import pandas as pd
from tqdm import tqdm
from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3
from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3, save_dataframe_to_s3_parquet
from backend.Property import Property
# This is the github pr number
MODEL_VERSION = "100"
def app():
dataset = read_dataframe_from_s3_parquet(
@ -76,6 +79,20 @@ def app():
~dataset["internal_insulation"])
]["walls_thermal_transmittance"].min()
ending_colums = [col for col in dataset.columns if col.endswith("_ending")]
# For the purpose of scoring, we want to simulate JUST the impact of the measure we're testing. We therefore
# need to make sure that every "_ending" column is equal to its starting value
column_config = {}
for ending_col in ending_colums:
base_col = ending_col.replace("_ending", "")
# We check if the starting column ends with _starting or is just the base col
if base_col + "_starting" in dataset.columns:
column_config[ending_col] = base_col + "_starting"
elif base_col in dataset.columns:
column_config[ending_col] = base_col
else:
raise ValueError("something went wrong")
loft_insulation_testing_data = []
solid_wall_testing_data = []
cavity_wall_testing_data = []
@ -93,7 +110,11 @@ def app():
(dataset["built_form"] == property_config.built_form) &
(dataset["floor_area_quantile"] == property_config.floor_area_quantile) &
(dataset["construction_age_band"] == property_config.construction_age_band)
]
].copy()
# Re-set all of the ending columns
for col in ending_colums:
population[col] = population[column_config[col]]
# 1) Loft insulation
@ -185,9 +206,9 @@ def app():
)
# Simulated EWI
best_external_wall_uvalue_wall_insulation_simulation = Property.create_recommendation_scoring_data(
external_wall_insulation_simulation = Property.create_recommendation_scoring_data(
property_id=row["uprn"].values[0],
recommendation_record=row.copy(),
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "external_wall_insulation",
"type": "external_wall_insulation",
@ -198,7 +219,7 @@ def app():
# The iww/ewi simulations will be next to each other, so we can see how they differ for the same property
solid_wall_testing_data.append(internal_wall_insulation_simulation)
solid_wall_testing_data.append(best_external_wall_uvalue_wall_insulation_simulation)
solid_wall_testing_data.append(external_wall_insulation_simulation)
# 3) Cavity wall insulation
cavity_wall_sample = population[
@ -291,9 +312,7 @@ def app():
]
if not single_glazing_sample.empty:
row = single_glazing_sample[
single_glazing_sample["multi_glaze_proportion_starting"] == value
].sample(1)
row = single_glazing_sample.sample(1)
# For single glazed windows, we can recommend double glazing or secondary glazing
@ -456,7 +475,7 @@ def app():
# We store all of this data in s3, as it is
save_data_to_s3(
bucket_name="retrofit-datalake-dev",
s3_file_name="sap_change_model/loft_insulation_testing_data.parquet",
s3_file_name="sap_change_model/simulation-pipeline-data.json",
data=json.dumps(
{
"loft_insulation_testing_data": loft_insulation_testing_data,
@ -472,3 +491,48 @@ def app():
}
)
)
# For each simulation type, we score against the model
from backend.ml_models.api import ModelApi
from datetime import datetime
created_at = datetime.now().isoformat()
model_api = ModelApi(portfolio_id="simulation-testing-pipeline", timestamp=created_at)
model_api.MODEL_PREFIXES = ["sap_change_predictions"]
# 1) Loft insulation
# We chunk up the data into 200 rows
loft_insulation_testing_df = pd.DataFrame(loft_insulation_testing_data)
loft_insulation_predictions = []
loft_to_loop_over = range(0, loft_insulation_testing_df.shape[0], 200)
for chunk in tqdm(loft_to_loop_over, total=len(loft_to_loop_over)):
loft_insulation_predictions_dict = model_api.predict_all(
df=loft_insulation_testing_df.iloc[chunk:chunk + 200],
bucket="retrofit-data-dev",
prediction_buckets={
"sap_change_predictions": "retrofit-sap-predictions-dev",
}
)
loft_insulation_predictions.append(loft_insulation_predictions_dict["sap_change_predictions"])
loft_insulation_predictions = pd.concat(loft_insulation_predictions)
# Store final parquet in s3
save_dataframe_to_s3_parquet(
df=loft_insulation_predictions,
bucket_name="retrofit-data-dev",
file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet"
)
# We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation
# is worth
loft_insulation_comparison_df = loft_insulation_testing_df[
["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""]
].merge(
loft_insulation_predictions,
left_on="id",
right_on="id",
how="left"
)