created distributed scoring for prediction

2026-07-27 23:35:01 +00:00 · 2024-02-27 11:02:12 +00:00 · 2024-02-27 11:02:12 +00:00 · 7b080094fd
commit 7b080094fd
parent 0fbf004512
1 changed files with 30 additions and 16 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -1166,10 +1166,9 @@ def get_epc_data(
            ]

            # condition 1 - identified for gbis and not eligible
-            condition_1 = (
-                              identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront[
-                              "eligible"]
-                          ) & consider_penultimate_epc
+            condition_1 = (identified_for_gbis and not eligibility.gbis_warmfront
+                           and not eligibility.eco4_warmfront["eligible"]
+                           ) & consider_penultimate_epc

            # condition 2 - identified for eco4 and not eligible
            condition_2 = (identified_for_eco4 and not eligibility.eco4_warmfront[
@ -1246,10 +1245,12 @@ def get_epc_data(
                    "uprn": eligibility.epc["uprn"],
                    "is_estimated": searcher.newest_epc.get("estimated") is not None,
                    "property_type": eligibility.epc["property-type"],
-                    "gbis_eligible": eligibility.gbis_warmfront,
                    "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                    "eco4_message": eligibility.eco4_warmfront["message"],
                    "eco4_strict": eligibility.eco4_warmfront["strict"],
+                    "gbis_eligible": eligibility.gbis_warmfront["eligible"],
+                    "gbis_message": eligibility.gbis_warmfront["message"],
+                    "gbis_strict": eligibility.gbis_warmfront["strict"],
                    "sap": float(eligibility.epc["current-energy-efficiency"]),
                    # Property components
                    "roof": eligibility.roof["clean_description"],
@ -1279,24 +1280,32 @@ def get_epc_data(
            )

            model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
+            model_api.MODEL_PREFIXES = ["sap_change_predictions"]

-            all_predictions = model_api.predict_all(
-                df=scoring_df,
-                bucket="retrofit-data-dev",
-                prediction_buckets={
-                    "sap_change_predictions": "retrofit-sap-predictions-dev",
-                    "heat_demand_predictions": "retrofit-heat-predictions-dev",
-                    "carbon_change_predictions": "retrofit-carbon-predictions-dev"
-                }
-            )
+            scoring_df["id"] = scoring_df["id"] + "phase=0"
+            # We split up the scoring_df and score
+            predictions = []
+            to_loop_over = range(0, scoring_df.shape[0], 400)
+            for chunk in tqdm(to_loop_over, total=len(to_loop_over)):
+                predictions_dict = model_api.predict_all(
+                    df=scoring_df.iloc[chunk:chunk + 400],
+                    bucket="retrofit-data-dev",
+                    prediction_buckets={
+                        "sap_change_predictions": "retrofit-sap-predictions-dev",
+                    }
+                )

-            predictions = all_predictions["sap_change_predictions"].copy()
+                predictions.append(predictions_dict["sap_change_predictions"])
+
+            predictions = pd.concat(predictions)
+            predictions_size = predictions.shape[0]

            predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
                results_df[["row_id", "sap"]], how="left", on="row_id"
            )
+            if predictions.shape[0] != predictions_size:
+                raise ValueError("Predictions size has changed")
            predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
-            predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()

            results_df = results_df.merge(
                predictions[["sap_uplift", "row_id"]],
@ -1339,9 +1348,12 @@ def get_epc_data(

            eligibility_assessment = pd.DataFrame(eligibility_assessment)

+            # Make sure the results haven't changed in size
            results_df = results_df.merge(
                eligibility_assessment, how="left", on="row_id"
            )
+            if results_df.shape[0] != len(results):
+                raise ValueError("results has changed size")

        # We store the results in S3 as a pickle
        save_pickle_to_s3(
@ -1809,6 +1821,8 @@ def app():
    loader.load()
    loader.ha_facts_and_figures()

+    loader.facts_and_figures.to_csv("facts_and_figures.csv", index=False)
+
    # We load in the additional data required to perform the analysis
    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",