pulling out data from best match

2026-07-27 23:35:01 +00:00 · 2024-11-18 20:30:57 +00:00 · 2024-11-18 20:30:57 +00:00 · a7857c0375
commit a7857c0375
parent 377d9929e4
3 changed files with 83 additions and 94 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -1727,7 +1727,7 @@ def propsed_wave_3_sample():
            "Existing Primary Heating System": "Survey: Primary Heating System"
        }
    )
-
+    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
    # Concatenate from the wall information
    survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
        "Main Wall Insulation Type"].astype(str)
@ -1872,6 +1872,8 @@ def propsed_wave_3_sample():
        'Survey: Primary Heating System'
    ]

+    survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy()
+
    results = []
    for region in tqdm(unique_postal_regions):
        # Take all of the properties in that region
@ -1884,10 +1886,14 @@ def propsed_wave_3_sample():

        region_assets = region_assets.merge(
            exact_surveyed[
-                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns],
+                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
+                    "Survey: Matching Address ID"
+                ]
+                ],
            on="Address ID",
            how="left"
        )
+        region_assets['Distance to Closest Match (m)'] = 0

        # Label the tier 1 properties
        region_assets["Confidence Tier"] = None
@ -1901,61 +1907,62 @@ def propsed_wave_3_sample():
            "5 - property was surveyed", region_assets["Confidence Tier"]
        )

-        archetypes = region_assets[
+        archetype_ids = region_assets[
            pd.isnull(region_assets["Confidence Tier"])
        ]["Archetype ID"].unique()
        # We get the properties that have been surveyed
-        region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(archetypes) &
-            (survey_results["Postal Region"] == region)
-            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()

-        if region_surveyed["Archetype ID"].duplicated().sum():
+        region_surveyed = []
+        for arch_id in archetype_ids:
+            for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                archetype_data = survey_results_with_original_features[
+                    survey_results["Archetype ID"] == arch_id
+                    ].copy()
+                if archetype_data.empty:
+                    continue
+                if archetype_data.shape[0] > 1:
+                    # Look for an exact match, or as close as possible
+                    archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
+                    if not archetype_data_filtered.empty:
+                        archetype_data = archetype_data_filtered

-            region_surveyed = []
-            for arch_id in archetypes:
-                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
-                    archetype_data = survey_results_with_original_features[
-                        survey_results["Archetype ID"] == arch_id
-                        ].copy()
-                    if archetype_data.empty:
-                        continue
-                    if archetype_data.shape[0] > 1:
-                        # Look for an exact match, or as close as possible
-                        archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
-                        if not archetype_data_filtered.empty:
-                            archetype_data = archetype_data_filtered
+                archetype_data["distance_meters"] = haversine(
+                    lat1=property.latitude, lon1=property.longitude,
+                    lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                )
+                expected_sap = np.average(
+                    archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                )
+                expected_epc = sap_to_epc(expected_sap)

-                    archetype_data["distance_meters"] = haversine(
-                        lat1=property.latitude, lon1=property.longitude,
-                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
-                    )
-                    expected_sap = np.average(
-                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
-                    )
-                    expected_epc = sap_to_epc(expected_sap)
-                    region_surveyed.append(
-                        {
-                            "Archetype ID": arch_id,
-                            "Address ID": property["Address ID"],
-                            "Current EPC Band": expected_epc
-                        }
-                    )
+                # We take the features of the closest matching property
+                closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0]

-            region_surveyed = pd.DataFrame(region_surveyed)
-            region_assets = region_assets.merge(
-                region_surveyed,
-                on=["Archetype ID", "Address ID"],
-                how="left",
-                suffixes=("", "_method1")
-            )
-        else:
-            region_assets = region_assets.merge(
-                region_surveyed,
-                on="Archetype ID",
-                how="left",
-                suffixes=("", "_method1")
-            )
+                region_surveyed.append(
+                    {
+                        "Archetype ID": arch_id,
+                        "Address ID": property["Address ID"],
+                        "Current EPC Band": expected_epc,
+                        "Current SAP Rating": expected_sap,
+                        'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
+                        'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"],
+                        'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
+                        'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
+                        "Survey: Matching Address ID": closest_match["Address ID"],
+                        'Distance to Closest Match (m)': closest_match["distance_meters"]
+                    }
+                )
+
+        region_surveyed = pd.DataFrame(region_surveyed)
+        starting_shape = region_assets.shape[0]
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on=["Archetype ID", "Address ID"],
+            how="left",
+            suffixes=("", "_method1")
+        )
+        if region_assets.shape[0] != starting_shape:
+            raise ValueError("Something went wrong")

        # Label the tier 1 properties
        region_assets["Confidence Tier"] = np.where(
@ -2326,7 +2333,9 @@ def propsed_wave_3_sample():
    results = pd.concat(results)

    # Check if there are missings in current epc band, current sap rating or any of the survey attributes
-    for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns:
+    for c in (
+        ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
+        survey_attribute_columns):
        if pd.isnull(results[c]).sum():
            raise Exception("Something went wrong")

--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@ -269,6 +269,7 @@ class RetrieveFindMyEpc:
            "Loft insulation": ["loft_insulation"],
            "Solar photovoltaic (PV) panels": ["solar_pv"],
            "Party wall insulation": ["party_wall_insulation"],
+            'Draught proofing': ["draught_proofing"],
        }

        survey = True
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
    epc_data = []
    errors = []
+    no_epc = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
-        postcode = home[postcode_column]
-        house_number = home[address1_column]
-        full_address = home[fulladdress_column]
-
-        searcher = SearchEpc(
-            address1=str(house_number),
-            postcode=postcode,
-            auth_token=EPC_AUTH_TOKEN,
-            os_api_key="",
-            property_type=None,
-            fast=True,
-            full_address=full_address,
-            max_retries=5
-        )
-        # Force the skipping of estimating the EPC
-        searcher.ordnance_survey_client.property_type = None
-        searcher.ordnance_survey_client.built_form = None
-
-        searcher.find_property(skip_os=True)
-        if searcher.newest_epc is None:
-            continue
-
-        # Look for EPC recommendatons
-        try:
-            property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
-        except:
-            property_recommendations = {"rows": []}
-
-        # Retrieve data from FindMyEPC
-        find_epc_searcher = RetrieveFindMyEpc(
-            address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
-        )
-        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
-        time.sleep(np.random.uniform(0.1, 1))
        try:
            postcode = home[postcode_column]
            house_number = home[address1_column]
@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
+                no_epc.append(home["row_id"])
                continue

            # Look for EPC recommendations
@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
            errors.append(home["row_id"])
            time.sleep(5)

-    return epc_data, errors
+    return epc_data, errors, no_epc


 def extract_address1(asset_list, full_address_col, method="first_two_words"):
@ -140,26 +108,37 @@ def app():
    Property UPRN

    """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
-    DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
+    DATA_FILENAME = "Bromford programme review.xlsx"
+    SHEET_NAME = "Bromford"
    POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = "Address"
-    ADDRESS1_COLUMN = None
+    FULLADDRESS_COLUMN = None
+    ADDRESS1_COLUMN = "No."
    ADDRESS1_METHOD = "first_two_words"
+    ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]

-    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
+    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
+    asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
    asset_list["row_id"] = asset_list.index

    # We clean up potential non-breaking spaces, and double spaces
    for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
+        asset_list[col] = asset_list[col].astype(str)
        asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
        asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)

    if ADDRESS1_COLUMN is None:
        ADDRESS1_COLUMN = "address1_extracted"
-        asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)
+        asset_list = extract_address1(
+            asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
+        )

-    epc_data, errors = get_data(
+    if FULLADDRESS_COLUMN is None:
+        FULLADDRESS_COLUMN = "fulladdress_extracted"
+        # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
+        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
+
+    epc_data, errors, no_epc = get_data(
        asset_list=asset_list,
        fulladdress_column=FULLADDRESS_COLUMN,
        address1_column=ADDRESS1_COLUMN,
@ -168,7 +147,7 @@ def app():

    # We now retrieve any failed properties
    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
-    epc_data_failed, _ = get_data(
+    epc_data_failed, _, _ = get_data(
        asset_list=asset_list_failed,
        fulladdress_column=FULLADDRESS_COLUMN,
        address1_column=ADDRESS1_COLUMN,