created distributed scoring for prediction

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-27 11:02:12 +00:00
parent 0fbf004512
commit 7b080094fd

View file

@ -1166,10 +1166,9 @@ def get_epc_data(
]
# condition 1 - identified for gbis and not eligible
condition_1 = (
identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront[
"eligible"]
) & consider_penultimate_epc
condition_1 = (identified_for_gbis and not eligibility.gbis_warmfront
and not eligibility.eco4_warmfront["eligible"]
) & consider_penultimate_epc
# condition 2 - identified for eco4 and not eligible
condition_2 = (identified_for_eco4 and not eligibility.eco4_warmfront[
@ -1246,10 +1245,12 @@ def get_epc_data(
"uprn": eligibility.epc["uprn"],
"is_estimated": searcher.newest_epc.get("estimated") is not None,
"property_type": eligibility.epc["property-type"],
"gbis_eligible": eligibility.gbis_warmfront,
"eco4_eligible": eligibility.eco4_warmfront["eligible"],
"eco4_message": eligibility.eco4_warmfront["message"],
"eco4_strict": eligibility.eco4_warmfront["strict"],
"gbis_eligible": eligibility.gbis_warmfront["eligible"],
"gbis_message": eligibility.gbis_warmfront["message"],
"gbis_strict": eligibility.gbis_warmfront["strict"],
"sap": float(eligibility.epc["current-energy-efficiency"]),
# Property components
"roof": eligibility.roof["clean_description"],
@ -1279,24 +1280,32 @@ def get_epc_data(
)
model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
model_api.MODEL_PREFIXES = ["sap_change_predictions"]
all_predictions = model_api.predict_all(
df=scoring_df,
bucket="retrofit-data-dev",
prediction_buckets={
"sap_change_predictions": "retrofit-sap-predictions-dev",
"heat_demand_predictions": "retrofit-heat-predictions-dev",
"carbon_change_predictions": "retrofit-carbon-predictions-dev"
}
)
scoring_df["id"] = scoring_df["id"] + "phase=0"
# We split up the scoring_df and score
predictions = []
to_loop_over = range(0, scoring_df.shape[0], 400)
for chunk in tqdm(to_loop_over, total=len(to_loop_over)):
predictions_dict = model_api.predict_all(
df=scoring_df.iloc[chunk:chunk + 400],
bucket="retrofit-data-dev",
prediction_buckets={
"sap_change_predictions": "retrofit-sap-predictions-dev",
}
)
predictions = all_predictions["sap_change_predictions"].copy()
predictions.append(predictions_dict["sap_change_predictions"])
predictions = pd.concat(predictions)
predictions_size = predictions.shape[0]
predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
results_df[["row_id", "sap"]], how="left", on="row_id"
)
if predictions.shape[0] != predictions_size:
raise ValueError("Predictions size has changed")
predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
results_df = results_df.merge(
predictions[["sap_uplift", "row_id"]],
@ -1339,9 +1348,12 @@ def get_epc_data(
eligibility_assessment = pd.DataFrame(eligibility_assessment)
# Make sure the results haven't changed in size
results_df = results_df.merge(
eligibility_assessment, how="left", on="row_id"
)
if results_df.shape[0] != len(results):
raise ValueError("results has changed size")
# We store the results in S3 as a pickle
save_pickle_to_s3(
@ -1809,6 +1821,8 @@ def app():
loader.load()
loader.ha_facts_and_figures()
loader.facts_and_figures.to_csv("facts_and_figures.csv", index=False)
# We load in the additional data required to perform the analysis
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",