working on stonewater matching algorithm

2026-07-27 23:35:01 +00:00 · 2024-11-16 15:49:08 +00:00 · 2024-11-16 15:49:08 +00:00 · dc1cf6d604
commit dc1cf6d604
parent 31c5935577
3 changed files with 171 additions and 9 deletions
--- a/etl/customers/southend/epc_data_pull_2024_11_14.py
+++ b/etl/customers/southend/epc_data_pull_2024_11_14.py
@ -229,7 +229,3 @@ def app():
    filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov "
                "2024.xlsx")
    asset_list.to_excel(filename, index=False)
-
-    asset_list["% of the Roof with PV"].value_counts()
-
-    asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", "Postcode"]]
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -117,7 +117,7 @@ def extract_summary_report(pdf_path):
    - Fuel Bill
    - Address
    """
-    
+
    data = {
        "Address": None,
        "Postcode": None,
@ -1618,5 +1618,136 @@ def append_stonewater_id():
        index=False
    )

+
+def propsed_wave_3_sample():
+    """
+    Stonewater want to ensure that, when selecting properties for wave 3, they choose properties
+    such that most of the properties within a geographical area are treatable within the bid.
+    Namely, if we take a geographical area (which could be a postal region), they want most, and ideally all, of the
+    properties within that geographical area to be included within the bid.
+    :return:
+    """
+
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+        "- Archetyped V3.1.xlsx",
+        header=4
+    )
+    # Clean address ids
+    asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
+    asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
+    asset_list["Address ID"] = asset_list["Address ID"].astype(int)
+
+    # Create the postal region, taking the first part of the postcode
+    asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
+    unique_postal_regions = asset_list["Postal Region"].unique()
+
+    # Keep just the columns we need
+    asset_list = asset_list[
+        ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
+         "Heating"]
+    ]
+
+    survey_results = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"),
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+
+    # TODO: We probably want the actual surveyed wall, roof, heating type
+    survey_results = survey_results[
+        ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"]
+    ]
+    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
+
+    survey_results_with_original_features = survey_results.merge(
+        asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+        on="Address ID",
+        how="left"
+    )
+
+    if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
+        raise ValueError("Something went wrong")
+
+    # Tier definitions
+    # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is EPC D or below
+    # Tier 2: We have a property in the same archetype that was surveyed and is EPC D or below
+    #
+
+    for region in unique_postal_regions:
+        # Take all of the properties in that region
+        region_assets = asset_list[asset_list["Postal Region"] == region].copy()
+        archetypes = region_assets["Archetype ID"].unique()
+        # We get the properties that have been surveyed
+        region_surveyed = survey_results[
+            survey_results["Archetype ID"].isin(archetypes) &
+            (survey_results["Postal Region"] == region)
+            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        if region_surveyed["Archetype ID"].duplicated().sum():
+            raise NotImplementedError("Fix me")
+
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on="Archetype ID",
+            how="left"
+        )
+
+        # Label the tier 1 properties
+        region_assets["Confidence Tier"] = None
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]),
+            "1", region_assets["Confidence Tier"]
+        )
+        # TODO: Turn into a function
+        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
+
+        region_surveyed = survey_results[
+            survey_results["Archetype ID"].isin(missed_archetypes)
+        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        if region_surveyed["Archetype ID"].duplicated().sum():
+            raise NotImplementedError("Fix me 2")
+
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on="Archetype ID",
+            how="left",
+            suffixes=("", "_method2")
+        )
+
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]),
+            "2 - same archetype", region_assets["Confidence Tier"]
+        )
+
+        region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna(
+            region_assets["Current EPC Band_method2"])
+
+        region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
+
+        missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
+
+        # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
+        for a_id in missed_addressids:
+            property = asset_list[asset_list["Address ID"] == a_id].squeeze()
+
+            surveyed_same_postcode = survey_results_with_original_features[
+                (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+                (survey_results_with_original_features["Property Type"] == property["Property Type"])
+                ]
+
+            surveyed_same_region = survey_results_with_original_features[
+                (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+                (survey_results_with_original_features["Property Type"] == property["Property Type"])
+                ]
+
+        same_postcode = survey_results[
+            survey_results["Archetype ID"].isin(missed_archetypes) &
+            (survey_results["Postal Region"] == region)
+            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        pd.isnull(region_assets["Current EPC Band"]).sum()
+
 # if __name__ == "__main__":
 #     main()
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -206,6 +206,14 @@ def app():
    # Drop the column that is ""
    transformed_df = transformed_df.drop(columns=[""])

+    # Get the find my epc data
+    find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
+        pd.json_normalize(epc_df["find_my_epc_data"])
+    )
+    # We check if we get the solar pv column:
+    if "Solar photovoltaics" not in find_my_epc_data.columns:
+        find_my_epc_data["Solar photovoltaics"] = False
+
    # Retrieve just the data we need
    epc_df = epc_df[
        [
@ -228,6 +236,7 @@ def app():
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
+            "photo-supply",
        ]
    ]

@ -236,12 +245,25 @@ def app():
        how="left",
        on="row_id"
    ).merge(
-        transformed_df,
+        find_my_epc_data[
+            [
+                "row_id", "heating_text", "hot_water_text", 'Assessor’s name',
+                "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
+                "Assessor’s ID", "Solar photovoltaics"
+            ]
+        ].rename(
+            columns={
+                "Solar photovoltaics": "Has Solar PV",
+                "heating_text": "Heating Estimated kWh",
+                "hot_water_text": "Hot Water Estimated kWh",
+            }
+        ),
        how="left",
        on="row_id"
    )

-    asset_list = asset_list.drop(columns=["row_id"])
+    asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
+    asset_list = asset_list.drop(columns=["photo-supply"])

    # Rename the columns
    asset_list = asset_list.rename(columns={
@ -259,7 +281,7 @@ def app():
        "mainheat-description": "Heating Type",
        "secondheat-description": "Secondary Heating",
        "transaction-type": "Reason for last EPC",
-        "energy-consumption-current": "Heat Demand (kWh/m2)"
+        "energy-consumption-current": "Heat Demand (kWh/m2)",
    })

    asset_list["Estimated Number of Floors"] = asset_list.apply(
@ -295,6 +317,19 @@ def app():
        axis=1
    )

+    # For all of the columns in transformed_df, prefix with "Recommendation: "
+    for col in transformed_df.columns:
+        if col == "row_id":
+            continue
+        transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})
+
+    asset_list = asset_list.merge(
+        transformed_df,
+        how="left",
+        on="row_id"
+    )
+    asset_list = asset_list.drop(columns=["row_id"])
+
    # Store as an excel
-    filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
+    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
    asset_list.to_excel(filename, index=False)