From 1645f9ab9ed84bdb90fa2a732d697111b36bd17b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 Nov 2024 22:00:00 +0000
Subject: [PATCH] updating stonewater modelling code to use new data

---
 .../stonewater/Wave 3 Preparation.py          | 288 +++++++++++++++---
 1 file changed, 247 insertions(+), 41 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 426097e8..f4195592 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1071,10 +1071,13 @@ def main():
     ]
 
     # We now merge on the coordinator data so that against each property, we can map the measures
+    # TODO: Get the pre & post primary energy numbers
+    # TODO: Make sure the numbers are going down
+
     retrofit_packages_board = pd.read_excel(
         os.path.join(
             CUSTOMER_FOLDER_PATH,
-            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx"
+            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
         ),
         header=4
     )
@@ -1084,6 +1087,18 @@ def main():
         retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
     ]
 
+    # populated_primary_energy = retrofit_packages_board[
+    #     ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)'])
+    # ]
+    #
+    # z = populated_primary_energy[
+    #     populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[
+    #         'BASE Primary energy (13a-272)']
+    #     ]
+    #
+    # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[
+    #     'BASE Primary energy (13a-272)'])
+
     # Replace \n with ""
     extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "")
 
@@ -1192,7 +1207,7 @@ def main():
         # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv(
         #     CUSTOMER_FOLDER_PATH + "/missed_debugging.csv")
 
-        if len(missing_ids) != 6:
+        if len(missing_ids) != 1:
             raise Exception("Unacceptable number of missings")
 
     if matching_lookup["Address ID"].duplicated().sum():
@@ -1239,7 +1254,6 @@ def main():
 
     if stonewater_data["Address ID"].duplicated().sum():
         raise Exception("Duplicate Address IDs")
-
     # Create a section for costs
     for measure in measure_columns:
         stonewater_data[f"Cost of {measure}"] = None
@@ -1297,8 +1311,41 @@ def main():
     ]:
         stonewater_data[c] = stonewater_data[c].astype(str)
 
+    # Fill in the primary energy numbers from the Excel board
+    stonewater_data = stonewater_data.merge(
+        retrofit_packages_board[
+            [
+                "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)"
+            ]
+        ],
+        on=["Address ID", "Name"],
+        how="left"
+    )
+    stonewater_data["Primary Energy Use (kWh/yr)"] = np.where(
+        pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]),
+        stonewater_data["BASE Primary energy (13a-272)"],
+        stonewater_data["Primary Energy Use (kWh/yr)"]
+    )
+    stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"])
+
+    # Add on organisation reference
+    original_archetypes = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+        "- Archetyped V3.1.xlsx",
+        header=4
+    )
+    original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
+    original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
+    original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+
+    stonewater_data = stonewater_data.merge(
+        original_archetypes[["Address ID", 'Org. ref.']],
+        on="Address ID",
+        how="left"
+    )
+
     # Save this data to excel
-    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False)
+    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False)
 
     cost_sheet = [
         {
@@ -1677,6 +1724,12 @@ def propsed_wave_3_sample():
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
     asset_list["Address ID"] = asset_list["Address ID"].astype(int)
 
+    asset_list["Street name"] = np.where(
+        pd.isnull(asset_list["Street name"]),
+        asset_list["Postcode"],
+        asset_list["Street name"]
+    )
+
     # Create the postal region, taking the first part of the postcode
     asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
     asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"]
@@ -1684,43 +1737,16 @@ def propsed_wave_3_sample():
 
     # Keep just the columns we need
     asset_list = asset_list[
-        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region",
+        ["UPRN", "Address ID", 'Org. ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region",
          "Property Type", "Wall Type", "Roof Type", "Heating"]
     ]
 
-    # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False)
     survey_results = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"),
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
         header=13,
         sheet_name="Modelled Packages"
     )
 
-    additional_survey_data = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"),
-        header=0
-    )
-
-    survey_results = survey_results.drop(
-        columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]
-    ).merge(
-        additional_survey_data[
-            [
-                "Address ID",
-                "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness",
-                "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
-                "Main Building Alternative Wall Thickness",
-                "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"
-            ]
-        ].rename(
-            columns={
-                "Main Wall Insulation_x": "Main Wall Insulation Type",
-            }
-        ),
-        how="left",
-        on="Address ID"
-    )
-
-    # TOOD: We probably want the actual surveyed wall, roof, heating type
     survey_results = survey_results[
         [
             "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode",
@@ -1768,6 +1794,105 @@ def propsed_wave_3_sample():
     if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
         raise ValueError("Something went wrong")
 
+    # Against properties that have NO package ref, we assign a package ref
+    properties_with_packages = survey_results_with_original_features[
+        ~pd.isnull(survey_results_with_original_features["Package Ref"])
+    ]
+
+    properties_without_packages = survey_results_with_original_features[
+        (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull(
+            survey_results_with_original_features["Package Ref"]
+        )
+        ]
+
+    # Change this to a lookup
+    package_ratings = pd.DataFrame([
+        {
+            "1A": 1,
+            "1B": 2,
+            "2A": 3,
+            "2B": 4,
+            "3A": 5,
+            "3B": 6,
+            4: 7
+        }
+    ])
+    package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank")
+
+    mapped_package_refs = []
+    for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)):
+        # Same archetype?
+        matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]]
+
+        if matches.empty:
+            # Similar property
+            matches = properties_with_packages[
+                (properties_with_packages["Property Type"].str.split(":").str[0] ==
+                 property["Property Type"].split(":")[0]) &
+                (properties_with_packages["Wall Type"] == property["Wall Type"]) &
+                (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) &
+                (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0])
+                ]
+        if matches.empty:
+            matches = properties_with_packages[
+                (properties_with_packages["Property Type"].str.split(":").str[0] ==
+                 property["Property Type"].split(":")[0]) &
+                (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) &
+                (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) &
+                (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0])
+                ]
+        if matches.empty:
+            raise Exception("Implement me")
+        if matches.shape[0] > 1:
+            # Take the package with the highest rank
+            matches = matches.merge(
+                package_ratings,
+                on="Package Ref",
+                how="left"
+            ).sort_values("Rank", ascending=False).head(1)
+
+        mapped_package_refs.append(
+            {
+                "Address ID": property["Address ID"],
+                "Matched Package Ref": matches["Package Ref"].values[0]
+            }
+        )
+
+    mapped_package_refs = pd.DataFrame(mapped_package_refs)
+
+    survey_results = survey_results.merge(
+        mapped_package_refs,
+        on="Address ID",
+        how="left"
+    )
+    survey_results["Package Ref"] = np.where(
+        pd.notnull(survey_results["Matched Package Ref"]),
+        survey_results["Matched Package Ref"],
+        survey_results["Package Ref"]
+    )
+    survey_results = survey_results.drop(columns=["Matched Package Ref"])
+
+    # Do the same with survey_results_with_original_features
+    survey_results_with_original_features = survey_results_with_original_features.merge(
+        mapped_package_refs,
+        on="Address ID",
+        how="left"
+    )
+    survey_results_with_original_features["Package Ref"] = np.where(
+        pd.notnull(survey_results_with_original_features["Matched Package Ref"]),
+        survey_results_with_original_features["Matched Package Ref"],
+        survey_results_with_original_features["Package Ref"]
+    )
+    survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"])
+
+    # Save the data for reference
+    # mapped_package_refs = mapped_package_refs.merge(
+    #     asset_list[["Name", "Postcode", "Address ID", "Org. ref."]],
+    #     on="Address ID",
+    #     how="left"
+    # )
+    # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False)
+
     # We get longitude & Latitude
     archetyping_spatial_features = read_pickle_from_s3(
         bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
@@ -1911,7 +2036,8 @@ def propsed_wave_3_sample():
             'Current EPC Band', 'Current SAP Rating',
             'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
             'Survey: Main Roof Type', 'Survey: Primary Heating System',
-            'Survey: Matching Address ID', 'Distance to Closest Match (m)'
+            'Survey: Matching Address ID', 'Distance to Closest Match (m)',
+            "Package Ref"
         ]:
             region_assets[col] = np.where(
                 pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]),
@@ -2027,7 +2153,7 @@ def propsed_wave_3_sample():
                     "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
                     'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
                     'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)',
-                    "Match Type"
+                    "Match Type", "Package Ref"
                 ]
             )
 
@@ -2183,6 +2309,13 @@ def propsed_wave_3_sample():
 
             closest_match = surveyed.iloc[0]
 
+            # The closest property may be an EPC C, so we take the package ref from the nearest property
+            # with a non-NA package ref
+            if expected_epc in ["C", "B", "A"]:
+                package_ref = None
+            else:
+                package_ref = surveyed["Package Ref"].dropna().values[0]
+
             final_missed_matches.append(
                 {
                     "Address ID": a_id,
@@ -2195,7 +2328,7 @@ def propsed_wave_3_sample():
                     "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
                     "Survey: Matching Address ID": closest_match["Address ID"],
                     'Distance to Closest Match (m)': closest_match["distance_meters"],
-                    "Package Ref": closest_match["Package Ref"]
+                    "Package Ref": package_ref
                 }
             )
             continue
@@ -2225,6 +2358,11 @@ def propsed_wave_3_sample():
 
     results = pd.concat(results)
 
+    results[
+        pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D")
+        ]["Postal Region"]
+    results[resul]
+
     # Check if there are missings in current epc band, current sap rating or any of the survey attributes
     for c in (
         [
@@ -2269,8 +2407,6 @@ def propsed_wave_3_sample():
     street_summary["Gain"] = street_summary[gain_columns].sum(axis=1)
     street_summary["Loss"] = street_summary[loss_columns].sum(axis=1)
 
-    print(street_summary.sum())
-
     selected_rows, _ = optimise(
         gain=street_summary["Gain"].values,
         loss=street_summary["Loss"].values,
@@ -2334,9 +2470,6 @@ def propsed_wave_3_sample():
         package_summary, how="left", on="Street and Region"
     )
     street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False)
-    street_bid_structure.to_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
-    )
 
     individual_units_programme = results.copy()
     individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin(
@@ -2386,6 +2519,79 @@ def propsed_wave_3_sample():
             .str.strip()  # Strip leading/trailing spaces
         )
 
+    # Any EPC C-or-better (or "Needs Survey") properties in the programme are flagged as potential infill
+    selected_epc_c = individual_units_programme[
+        (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) &
+        (individual_units_programme["Unit in Programme"])
+        ]
+
+    flat_wall_map = {
+        "CA Cavity: F Filled Cavity": False,
+        "CA Cavity: A As Built": True,
+        "SO Solid Brick: A As Built": True,
+        "Not Surveyed": False
+    }
+
+    heating_map = {
+        "BGW Post 98 Combi condens. with auto ign.": False,
+        "BGB Post 98 Regular condens. with auto ign.": False,
+        "SEK High heat retention storage heaters": False,
+        "SEB Modern slimline storage heaters": True,
+        "Not Surveyed": False
+    }
+
+    infill_data = []
+    for _, epc_c_property in selected_epc_c.iterrows():
+        if epc_c_property["Property Type"].split(":")[0] == "Flat":
+            # Look for a wall insulation measure
+            infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]]
+            infill_data.append(
+                {
+                    "Address ID": epc_c_property["Address ID"],
+                    "Street and Region": epc_c_property["Street and Region"],
+                    "Possible Flat Infill?": infill
+                }
+            )
+            continue
+
+        infill = heating_map[epc_c_property["Survey: Primary Heating System"]]
+        infill_data.append(
+            {
+                "Address ID": epc_c_property["Address ID"],
+                "Street and Region": epc_c_property["Street and Region"],
+                "Low Carbon Heating Infill?": infill
+            }
+        )
+    infill_data = pd.DataFrame(infill_data)
+
+    individual_units_programme = individual_units_programme.merge(
+        infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']],
+        how="left", on="Address ID"
+    )
+
+    for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']:
+        individual_units_programme[c] = individual_units_programme[c].fillna(False)
+
+    infill_by_street = infill_data.pivot_table(
+        index='Street and Region',
+        values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'],
+        aggfunc='sum',
+        fill_value=0
+    ).reset_index()
+
+    street_bid_structure = street_bid_structure.merge(
+        infill_by_street, how="left", on="Street and Region"
+    )
+
+    for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']:
+        street_bid_structure[c] = street_bid_structure[c].fillna(0)
+
+    street_bid_structure.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
+    )
+
+    # TODO: Add the full Address!!!
+
     individual_units_programme.to_csv(
         os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False
     )