setting up app

2026-07-27 23:35:01 +00:00 · 2023-12-28 12:16:29 +00:00 · 2023-12-28 12:16:29 +00:00 · 65b39750a3
commit 65b39750a3
parent d9a3ac37c9
2 changed files with 191 additions and 1 deletions
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@ -395,7 +395,7 @@ def analyse_results(results_df, data, survey_list):
 def app():
    data, survey_list = load_data()

-    data["row_id"] = ["ha16_" + str(i) for i in range(0, len(data))]
+    data["row_id"] = ["ha24_" + str(i) for i in range(0, len(data))]

    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@ -75,6 +75,7 @@ def load_data():

    # We could re-concatenate but we only care about HouseNo for the moment
    asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1)
+    asset_list["postcode"] = asset_list["postcode"].str.strip()

    # We analysis historical ECO3 survey list
    eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
@ -274,5 +275,194 @@ def load_data():
    return data, eco4_prospects_survey_list


+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    """Look up EPCs per property, assess eligibility and estimate post-install SAP.
+
+    For every row of ``data`` an EPC search is run on the row's ``HouseNo`` and
+    ``postcode``. Rows whose search yields no data are collected in ``nodata``
+    and skipped. For the remaining rows an ``Eligibility`` object is built on
+    the newest EPC (or on the penultimate EPC, see the fallback below), the
+    warmfront and full GBIS/ECO4 checks are run, and a summary dict is appended
+    to the results. Rows that pass the ECO4 warmfront check additionally
+    produce model-input rows, which are cleaned and sent to ``ModelApi`` to
+    obtain SAP predictions. The predictions are folded into a ``sap_uplift``
+    and ``post_install_sap`` per row, and ECO4-eligible rows receive an
+    ``eligibility_classification`` label.
+
+    Args:
+        data: Table of properties; it is iterated with ``iterrows`` and each
+            row is read for ``HouseNo``, ``postcode``, ``Address``,
+            ``Postcode``, ``warmfront_identified`` and ``row_id``.
+        cleaned: Passed through to ``Eligibility`` and
+            ``prepare_model_data_row``.
+        cleaning_data: Passed through to ``prepare_model_data_row`` and to the
+            ``DataProcessor.apply_averages_cleaning`` calls.
+        created_at: Passed to ``prepare_model_data_row`` and used as the
+            ``ModelApi`` timestamp.
+
+    Returns:
+        A tuple ``(results_df, scoring_data, nodata)``: the per-property
+        results frame with uplift and classification columns merged in, the
+        raw list of model-input rows (as collected, before the cleaning applied
+        to ``scoring_df``), and the list of rows for which the EPC search
+        returned no data.
+    """
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        # NOTE(review): the search reads the lower-case "postcode" column while the
+        # results dict below reads "Postcode" - confirm both columns exist on data.
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        # Nothing found for this property: record the row and move on
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        # Take the first value returned by filter_newest_epc over the older EPCs as the
+        # penultimate EPC; if that is falsy, fall back to the newest EPC itself
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        # Only when the newest EPC fails both warmfront checks AND the row is flagged
+        # warmfront_identified do we re-run the checks against the penultimate EPC
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+            # The penultimate EPC is now the modelling EPC, so drop both it and the
+            # newest EPC (matched on lmk-key) from the list of older EPCs
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full (non-warmfront) checks, run on whichever Eligibility object was kept
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        # Only properties passing the ECO4 warmfront check are prepared for scoring
+        if eligibility.eco4_warmfront["eligible"]:
+            # A blank uprn is replaced by the integer after the underscore in row_id,
+            # so the later astype(int) cast on UPRN has a value to work with
+            if eligibility.epc["uprn"] == "":
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
+            scoring_dictionary = prepare_model_data_row(
+                property_id=property_meta["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            # extend (not append): the helper's return value is added item by item
+            scoring_data.extend(scoring_dictionary)
+
+        # One summary entry per property that had EPC data, eligible or not
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["Address"],
+                "Postcode": property_meta["Postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - the room-count variables are
+    # cleaned first, merged on property type, built form, age band and local authority
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    # Second averages pass without explicit colnames; LOCAL_AUTHORITY is only needed
+    # as a merge key and is dropped afterwards
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    # Columns whose name contains thermal_transmittance, insulation_thickness or
+    # ENERGY_EFF are passed as ignore_cols to this cleaning step
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    # NOTE(review): portfolio_id says "ha33" although this module is ha25_app.py -
+    # confirm whether that identifier is intentional
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    # Only the SAP change predictions are used from here on
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    # Attach each property's current SAP to its prediction rows (property_id -> row_id)
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    # Uplift is computed per prediction row and then summed per property.
+    # NOTE(review): with several prediction rows per row_id the individual uplifts
+    # are added together - confirm that is the intended aggregation
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    # Left merge: properties without predictions keep a missing sap_uplift
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    # Classify only the rows that passed the ECO4 warmfront check
+    eligibility_assessment = []
+    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+        # The upgrade requirements are dependent on the current SAP
+
+        # If the property is an F or G (current SAP of 38 or below), it is judged
+        # against the lower targets 57/55/53 (presumably 55 is the band D
+        # boundary - TODO confirm against the scheme rules)
+        if row["sap"] <= 38:
+            if row["post_install_sap"] >= 57:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 55:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 53:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+        else:
+
+            # Any other property is judged against the higher targets 71/69/67
+            # (presumably 69 is the band C boundary - TODO confirm)
+            if row["post_install_sap"] >= 71:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 69:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 67:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+
+        eligibility_assessment.append(
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": eligibility_classification
+            }
+        )
+
+    # NOTE(review): if no row was ECO4-eligible this list is empty, the resulting
+    # frame has no row_id column and the merge below would presumably fail - confirm
+    eligibility_assessment = pd.DataFrame(eligibility_assessment)
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    # Note that the raw scoring_data list is returned, not the cleaned scoring_df
+    return results_df, scoring_data, nodata
+
+
 def app():
    data, eco4_prospects_survey_list = load_data()
+
+    data["row_id"] = ["ha25_" + str(i) for i in range(0, len(data))]
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()