From a7857c0375949f5d45d47afe41f59e07de883e71 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 20:30:57 +0000
Subject: [PATCH] pulling out data from best match

---
 .../stonewater/Wave 3 Preparation.py          | 111 ++++++++++--------
 etl/find_my_epc/RetrieveFindMyEpc.py          |   1 +
 etl/route_march_data_pull/app.py              |  65 ++++------
 3 files changed, 83 insertions(+), 94 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index aa9e4488..08236d5b 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1727,7 +1727,7 @@ def propsed_wave_3_sample():
             "Existing Primary Heating System": "Survey: Primary Heating System"
         }
     )
-
+    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
     # Concatenate from the wall information
     survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
         "Main Wall Insulation Type"].astype(str)
@@ -1872,6 +1872,8 @@ def propsed_wave_3_sample():
         'Survey: Primary Heating System'
     ]
 
+    survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy()
+
     results = []
     for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
@@ -1884,10 +1886,14 @@ def propsed_wave_3_sample():
 
         region_assets = region_assets.merge(
             exact_surveyed[
-                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns],
+                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
+                    "Survey: Matching Address ID"
+                ]
+                ],
             on="Address ID",
             how="left"
         )
+        region_assets['Distance to Closest Match (m)'] = 0
 
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = None
@@ -1901,61 +1907,62 @@ def propsed_wave_3_sample():
             "5 - property was surveyed", region_assets["Confidence Tier"]
         )
 
-        archetypes = region_assets[
+        archetype_ids = region_assets[
             pd.isnull(region_assets["Confidence Tier"])
         ]["Archetype ID"].unique()
         # We get the properties that have been surveyed
-        region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(archetypes) &
-            (survey_results["Postal Region"] == region)
-            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
-        if region_surveyed["Archetype ID"].duplicated().sum():
+        region_surveyed = []
+        for arch_id in archetype_ids:
+            for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                archetype_data = survey_results_with_original_features[
+                    survey_results["Archetype ID"] == arch_id
+                    ].copy()
+                if archetype_data.empty:
+                    continue
+                if archetype_data.shape[0] > 1:
+                    # Look for an exact match, or as close as possible
+                    archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
+                    if not archetype_data_filtered.empty:
+                        archetype_data = archetype_data_filtered
 
-            region_surveyed = []
-            for arch_id in archetypes:
-                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
-                    archetype_data = survey_results_with_original_features[
-                        survey_results["Archetype ID"] == arch_id
-                        ].copy()
-                    if archetype_data.empty:
-                        continue
-                    if archetype_data.shape[0] > 1:
-                        # Look for an exact match, or as close as possible
-                        archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
-                        if not archetype_data_filtered.empty:
-                            archetype_data = archetype_data_filtered
+                archetype_data["distance_meters"] = haversine(
+                    lat1=property.latitude, lon1=property.longitude,
+                    lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                )
+                expected_sap = np.average(
+                    archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                )
+                expected_epc = sap_to_epc(expected_sap)
 
-                    archetype_data["distance_meters"] = haversine(
-                        lat1=property.latitude, lon1=property.longitude,
-                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
-                    )
-                    expected_sap = np.average(
-                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
-                    )
-                    expected_epc = sap_to_epc(expected_sap)
-                    region_surveyed.append(
-                        {
-                            "Archetype ID": arch_id,
-                            "Address ID": property["Address ID"],
-                            "Current EPC Band": expected_epc
-                        }
-                    )
+                # We take the features of the closest matching property
+                closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0]
 
-            region_surveyed = pd.DataFrame(region_surveyed)
-            region_assets = region_assets.merge(
-                region_surveyed,
-                on=["Archetype ID", "Address ID"],
-                how="left",
-                suffixes=("", "_method1")
-            )
-        else:
-            region_assets = region_assets.merge(
-                region_surveyed,
-                on="Archetype ID",
-                how="left",
-                suffixes=("", "_method1")
-            )
+                region_surveyed.append(
+                    {
+                        "Archetype ID": arch_id,
+                        "Address ID": property["Address ID"],
+                        "Current EPC Band": expected_epc,
+                        "Current SAP Rating": expected_sap,
+                        'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
+                        'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"],
+                        'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
+                        'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
+                        "Survey: Matching Address ID": closest_match["Address ID"],
+                        'Distance to Closest Match (m)': closest_match["distance_meters"]
+                    }
+                )
+
+        region_surveyed = pd.DataFrame(region_surveyed)
+        starting_shape = region_assets.shape[0]
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on=["Archetype ID", "Address ID"],
+            how="left",
+            suffixes=("", "_method1")
+        )
+        if region_assets.shape[0] != starting_shape:
+            raise ValueError("Something went wrong")
 
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = np.where(
@@ -2326,7 +2333,9 @@ def propsed_wave_3_sample():
     results = pd.concat(results)
 
     # Check if there are missings in current epc band, current sap rating or any of the survey attributes
-    for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns:
+    for c in (
+        ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
+        survey_attribute_columns):
         if pd.isnull(results[c]).sum():
             raise Exception("Something went wrong")
 
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index 913a04b8..d5a5134f 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -269,6 +269,7 @@ class RetrieveFindMyEpc:
             "Loft insulation": ["loft_insulation"],
             "Solar photovoltaic (PV) panels": ["solar_pv"],
             "Party wall insulation": ["party_wall_insulation"],
+            'Draught proofing': ["draught_proofing"],
         }
 
         survey = True
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index f24c5bb2..1e478b0c 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
     epc_data = []
     errors = []
+    no_epc = []
     for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
-        postcode = home[postcode_column]
-        house_number = home[address1_column]
-        full_address = home[fulladdress_column]
-
-        searcher = SearchEpc(
-            address1=str(house_number),
-            postcode=postcode,
-            auth_token=EPC_AUTH_TOKEN,
-            os_api_key="",
-            property_type=None,
-            fast=True,
-            full_address=full_address,
-            max_retries=5
-        )
-        # Force the skipping of estimating the EPC
-        searcher.ordnance_survey_client.property_type = None
-        searcher.ordnance_survey_client.built_form = None
-
-        searcher.find_property(skip_os=True)
-        if searcher.newest_epc is None:
-            continue
-
-        # Look for EPC recommendatons
-        try:
-            property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
-        except:
-            property_recommendations = {"rows": []}
-
-        # Retrieve data from FindMyEPC
-        find_epc_searcher = RetrieveFindMyEpc(
-            address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
-        )
-        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
-        time.sleep(np.random.uniform(0.1, 1))
         try:
             postcode = home[postcode_column]
             house_number = home[address1_column]
@@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
 
             searcher.find_property(skip_os=True)
             if searcher.newest_epc is None:
+                no_epc.append(home["row_id"])
                 continue
 
             # Look for EPC recommendatons
@@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
             errors.append(home["row_id"])
             time.sleep(5)
 
-    return epc_data, errors
+    return epc_data, errors, no_epc
 
 
 def extract_address1(asset_list, full_address_col, method="first_two_words"):
@@ -140,26 +108,37 @@ def app():
     Property UPRN
 
     """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
-    DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
+    DATA_FILENAME = "Bromford programme review.xlsx"
+    SHEET_NAME = "Bromford"
     POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = "Address"
-    ADDRESS1_COLUMN = None
+    FULLADDRESS_COLUMN = None
+    ADDRESS1_COLUMN = "No."
     ADDRESS1_METHOD = "first_two_words"
+    ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
 
-    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
+    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
+    asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
     asset_list["row_id"] = asset_list.index
 
     # We clean up portential non-breaking spaces, and double spaces
     for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
+        asset_list[col] = asset_list[col].astype(str)
         asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
         asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)
 
     if ADDRESS1_COLUMN is None:
         ADDRESS1_COLUMN = "address1_extracted"
-        asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)
+        asset_list = extract_address1(
+            asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
+        )
 
-    epc_data, errors = get_data(
+    if FULLADDRESS_COLUMN is None:
+        FULLADDRESS_COLUMN = "fulladdress_extracted"
+        # Build the full address by joining the ADDRESS_COLS_TO_CONCAT columns with ", "
+        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
+
+    epc_data, errors, no_epc = get_data(
         asset_list=asset_list,
         fulladdress_column=FULLADDRESS_COLUMN,
         address1_column=ADDRESS1_COLUMN,
@@ -168,7 +147,7 @@ def app():
 
     # We now retrieve any failed properties
     asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
-    epc_data_failed, _ = get_data(
+    epc_data_failed, _, _ = get_data(
         asset_list=asset_list_failed,
         fulladdress_column=FULLADDRESS_COLUMN,
         address1_column=ADDRESS1_COLUMN,