Fix filling of property survey attributes (roof data, Package Ref, match type)

2026-07-27 23:35:01 +00:00 · 2024-11-19 13:54:46 +00:00 · 2024-11-19 13:54:46 +00:00 · d163ca9931
commit d163ca9931
parent d65c99f62a
1 changed files with 98 additions and 90 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -1669,7 +1669,7 @@ def propsed_wave_3_sample():
        header=4
    )

-    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
+    # TODO: We drop 7 properties missing
    # UPRN
    asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])]
    # Clean address ids
@ -1699,15 +1699,23 @@ def propsed_wave_3_sample():
        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"),
        header=0
    )
-    survey_results = survey_results.merge(
+
+    survey_results = survey_results.drop(
+        columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]
+    ).merge(
        additional_survey_data[
            [
                "Address ID",
                "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness",
                "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
-                "Main Building Alternative Wall Thickness"
+                "Main Building Alternative Wall Thickness",
+                "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"
            ]
-        ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}),
+        ].rename(
+            columns={
+                "Main Wall Insulation_x": "Main Wall Insulation Type",
+            }
+        ),
        how="left",
        on="Address ID"
    )
@ -1718,6 +1726,7 @@ def propsed_wave_3_sample():
            "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode",
            "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness",
            "Existing Primary Heating System",
+            "Package Ref",
            "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness",
            "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
            "Main Building Alternative Wall Thickness"
@ -1727,6 +1736,7 @@ def propsed_wave_3_sample():
            "Existing Primary Heating System": "Survey: Primary Heating System"
        }
    )
+
    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
    # Concatenate from the wall information
    survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
@ -1929,7 +1939,7 @@ def propsed_wave_3_sample():
        region_assets = region_assets.merge(
            exact_surveyed[
                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
-                    "Survey: Matching Address ID"
+                    "Survey: Matching Address ID", "Package Ref"
                ]
                ],
            on="Address ID",
@ -2005,6 +2015,7 @@ def propsed_wave_3_sample():
                        'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
                        "Survey: Matching Address ID": closest_match["Address ID"],
                        'Distance to Closest Match (m)': closest_match["distance_meters"],
+                        "Package Ref": closest_match["Package Ref"],
                        "Match Type": match_type
                    }
                )
@ -2015,7 +2026,8 @@ def propsed_wave_3_sample():
                columns=[
                    "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
                    'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
-                    'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
+                    'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)',
+                    "Match Type"
                ]
            )

@ -2032,8 +2044,8 @@ def propsed_wave_3_sample():
        # Label the tier 1 properties
        region_assets["Confidence Tier"] = np.where(
            region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
-            pd.isnull(region_assets["Confidence Tier"]),
-            "1 - Archetype surveyed in region", region_assets["Confidence Tier"]
+            pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]),
+            region_assets["Match Type"], region_assets["Confidence Tier"]
        )

        # Handle EPC C
@ -2046,86 +2058,7 @@ def propsed_wave_3_sample():
        region_assets = fill_survey_columns(region_assets, suffix="_method1")

        method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")]
-        region_assets = region_assets.drop(columns=method_1_columns)
-
-        missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
-
-        # archetype_surveyed = []
-        for arch_id in missed_archetypes:
-            for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
-                archetype_data = survey_results_with_original_features[
-                    survey_results["Archetype ID"] == arch_id
-                    ].copy()
-                if archetype_data.empty:
-                    continue
-                raise Exception("IMPLEMENT ME")
-        #         archetype_data["distance_meters"] = haversine(
-        #             lat1=property.latitude, lon1=property.longitude,
-        #             lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
-        #         )
-        #         expected_sap = np.average(
-        #             archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
-        #         )
-        #         expected_epc = sap_to_epc(expected_sap)
-        #         archetype_surveyed.append(
-        #             {
-        #                 "Archetype ID": arch_id,
-        #                 "Address ID": property["Address ID"],
-        #                 "Current EPC Band": expected_epc
-        #             }
-        #         )
-        # archetype_surveyed = pd.DataFrame(archetype_surveyed)
-        # if archetype_surveyed.empty:
-        #     archetype_surveyed = pd.DataFrame(
-        #         columns=[
-        #             "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
-        #             'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
-        #             'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
-        #         ]
-        #     )
-        #
-        # region_assets = region_assets.merge(
-        #     archetype_surveyed,
-        #     on=["Archetype ID", "Address ID"],
-        #     how="left",
-        #     suffixes=("", "_method2")
-        # )
-        #
-        # region_assets["Confidence Tier"] = np.where(
-        #     region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
-        #         region_assets["Confidence Tier"]),
-        #     "2 - same archetype", region_assets["Confidence Tier"]
-        # )
-        #
-        # for col in [
-        #     'Current EPC Band', 'Current SAP Rating',
-        #     'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
-        #     'Survey: Main Roof Type', 'Survey: Primary Heating System',
-        #     'Survey: Matching Address ID', 'Distance to Closest Match (m)'
-        # ]:
-        #     region_assets[col] = np.where(
-        #         pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]),
-        #         region_assets[col + "_method2"], region_assets[col]
-        #     )
-        #
-        # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")]
-        # region_assets = region_assets.drop(columns=method_2_columns)
-
-        # We label EPC C properties
-        # region_assets["Confidence Tier"] = np.where(
-        #     region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
-        #     "5 - EPC C or above", region_assets["Confidence Tier"]
-        # )
-        #
-        # region_assets["Confidence Tier"] = np.where(
-        #     region_assets["Archetype ID"] == "EPC C OR ABOVE",
-        #     "5 - EPC C or above", region_assets["Confidence Tier"]
-        # )
-        #
-        # region_assets["Current EPC Band"] = np.where(
-        #     region_assets["Archetype ID"] == "EPC C OR ABOVE",
-        #     "C", region_assets["Current EPC Band"]
-        # )
+        region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"])

        missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()

@ -2217,6 +2150,7 @@ def propsed_wave_3_sample():
                        "Survey: Primary Heating System": "Not Surveyed",
                        "Survey: Matching Address ID": "Not Surveyed",
                        'Distance to Closest Match (m)': 9999999,
+                        "Package Ref": "Not Surveyed",
                    }
                )
                continue
@ -2261,6 +2195,7 @@ def propsed_wave_3_sample():
                    "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
                    "Survey: Matching Address ID": closest_match["Address ID"],
                    'Distance to Closest Match (m)': closest_match["distance_meters"],
+                    "Package Ref": closest_match["Package Ref"]
                }
            )
            continue
@ -2292,8 +2227,10 @@ def propsed_wave_3_sample():

    # Check if there are missings in current epc band, current sap rating or any of the survey attributes
    for c in (
-        ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
-        survey_attribute_columns):
+        [
+            "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
+        survey_attribute_columns
+    ):
        if pd.isnull(results[c]).sum():
            raise Exception("Something went wrong")

@ -2382,5 +2319,76 @@ def propsed_wave_3_sample():
    total_bid_size = bid_size + no_loss_postcodes["Gain"].sum()
    print(total_bid_size)

+    # Label final outputs
+    # We create a summary of packages by street
+    results["Package Ref"] = results["Package Ref"].fillna("Incomplete")
+    results["Package Ref"] = results["Package Ref"].astype(str)
+    package_summary = results.pivot_table(
+        index='Street and Region',
+        columns='Package Ref',
+        aggfunc='size',
+        fill_value=0
+    ).reset_index()
+
+    street_bid_structure = street_summary.merge(
+        package_summary, how="left", on="Street and Region"
+    )
+    street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False)
+    street_bid_structure.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
+    )
+
+    individual_units_programme = results.copy()
+    individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin(
+        street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values
+    )
+
+    # Merge on Stonewater's ID (Org. ref.)
+    asset_list_ids = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+        "- Archetyped V3.1.xlsx",
+        header=4
+    )[["Address ID", "Org. ref."]]
+    # Clean address ids
+    asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])]
+    asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"]
+    asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int)
+    individual_units_programme = individual_units_programme.merge(
+        asset_list_ids,
+        how="left",
+        on="Address ID",
+    )
+
+    individual_units_programme = individual_units_programme.merge(
+        asset_list_ids.rename(
+            columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"}
+        ),
+        how="left",
+        on="Survey: Matching Address ID"
+    )
+
+    individual_units_programme["Survey: Org. ref."] = np.where(
+        (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"),
+        "Not Surveyed",
+        individual_units_programme["Survey: Org. ref."]
+    )
+
+    if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull(
+        individual_units_programme["Org. ref."]).sum():
+        raise ValueError("something went wrong")
+
+    for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]:
+        individual_units_programme[col] = (
+            individual_units_programme[col]
+            .str.replace(r': nan(?=$|:)', '', regex=True)  # Remove ': nan' at the end or before another ':'
+            .str.replace(r':\s+:', ': ', regex=True)  # Replace occurrences of ': :' with ': '
+            .str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
+            .str.strip()  # Strip leading/trailing spaces
+        )
+
+    individual_units_programme.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False
+    )
+
 # if __name__ == "__main__":
 #     main()