From 9ac6b25b9fa1adf91926109d6a5610d50bee28b8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 23 Jan 2024 18:06:34 +0000
Subject: [PATCH] improving data read code to create standardised
 matching_address and house number

---
 backend/ml_models/Valuation.py                |   2 +
 .../ha_15_32/ha_analysis_batch_3.py           | 134 +++++++++++++++---
 etl/testing_data/livewest_pilot.py            |  38 +++++
 .../the_guiness_partnership_pilot.py          |  38 +++++
 4 files changed, 192 insertions(+), 20 deletions(-)
 create mode 100644 etl/testing_data/livewest_pilot.py
 create mode 100644 etl/testing_data/the_guiness_partnership_pilot.py

diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index dadef9a9..ff771252 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -22,6 +22,8 @@ class PropertyValuation:
         100021192109: 650000,  # Based on Zoopla
         766249482: 358000,  # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached
         100120703802: 277000,  # Based on Zoopla
+        10014469685: 286000,  # Based on Zoopla
+        10001328782: 196000,  # Based on Zoopla
     }
 
     # We base our valuation uplifts on a number of sources
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 85f8704d..54cd7c58 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -1,8 +1,7 @@
 import os
-import msgpack
 import openpyxl
 from pathlib import Path
-from tqdm import tqdm
+import msgpack
 from datetime import datetime
 import pandas as pd
 import numpy as np
@@ -48,6 +47,14 @@ class DataLoader:
         }
     }
 
+    MIN_ROWS = {
+        "ha_1": 2,
+        "ha_6": 2,
+        "ha_14": 3,  # The spreadsheet starts from the third row
+        "ha_39": 2,
+        "ha_107": 2,
+    }
+
     def __init__(self, files, use_cache):
         self.files = files
         self.use_cache = use_cache
@@ -60,11 +67,14 @@ class DataLoader:
             sheet = workbook[sheet_name]
         else:
             sheet = workbook.active
-        sheet_colnames = [cell.value for cell in sheet[1]]
+        sheet_colnames = [cell.value for cell in sheet[self.MIN_ROWS[ha_name] - 1]]
 
         rows_data = []
         rows_colors = []
-        for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)):  # Assuming the first row is headers
+        for row in tqdm(
+            sheet.iter_rows(min_row=self.MIN_ROWS[ha_name], values_only=False)
+        ):  # Data rows start at MIN_ROWS[ha_name]; the header is the row immediately above
+
             row_data = [cell.value for cell in row]  # This will get you the cell values
             row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
             # row_color = COLOR_INDEX[row_color]
@@ -73,8 +83,12 @@ class DataLoader:
 
         asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
         asset_list = asset_list.loc[:, asset_list.columns.notnull()]
+
         asset_list['row_color'] = rows_colors
 
+        # Remove entirely empty rows - consider all columns apart from row_color
+        asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)]
+
         asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
 
         asset_list["row_colour_name"] = np.where(
@@ -92,6 +106,54 @@ class DataLoader:
         # Add in asset_list_row_id
         asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]
 
+        # Prepare the asset list
+        # Depending on the HA, we need to rename some columns
+        if ha_name == "ha_1":
+            asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Address - Postcode"].str.lower().str.strip()
+        elif ha_name == "ha_6":
+            asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
+        elif ha_name == "ha_14":
+            # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
+            asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \
+                                             asset_list["Address 2"].str.lower().str.strip() + ", " + \
+                                             asset_list["Address 3"].str.lower().str.strip() + ", " + \
+                                             asset_list["Address 4"].str.lower().str.strip() + ", " + \
+                                             asset_list["Postcode"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
+        elif ha_name == "ha_39":
+            # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
+            asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["add_2"].str.lower().str.strip() + ", " + \
+                                             asset_list["add_3"].str.lower().str.strip() + ", " + \
+                                             asset_list["add_4"].str.lower().str.strip() + ", " + \
+                                             asset_list["add_5"].str.lower().str.strip() + ", " + \
+                                             asset_list["post_code"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip()
+        elif ha_name == "ha_107":
+            # Create matching_address by concatenating House No, Street, Town, District, Postcode
+            asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["Street"].str.lower().str.strip() + ", " + \
+                                             asset_list["Town"].str.lower().str.strip() + ", " + \
+                                             asset_list["District"].str.lower().str.strip() + ", " + \
+                                             asset_list["Postcode"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
+        else:
+            raise NotImplementedError("implement me")
+
+        if ha_name in ["ha_107"]:
+            asset_list["HouseNo"] = asset_list["House No"].copy()
+        else:
+            split_addresses = asset_list['matching_address'].str.split(',', expand=True)
+            house_numbers = split_addresses[0].str.split(' ', expand=True)
+            # The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
+            # many columns there might be
+            house_numbers = house_numbers.iloc[:, 0:1]
+            house_numbers.columns = ['HouseNo']
+
+            asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
+
         return asset_list
 
     def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
@@ -165,22 +227,10 @@ class DataLoader:
     def merge_ha_6(asset_list, survey_list):
 
         # Correct the asset list
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way")
-
-        # Prepare the asset list
-        asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
-        asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
-
-        split_addresses = asset_list['matching_address'].str.split(',', expand=True)
-        split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
-        house_numbers = split_addresses['temp'].str.split(' ', expand=True)
-        house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"]
-
-        asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
-        del split_addresses, house_numbers
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way")
 
         # Correct the survey list
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
@@ -403,6 +453,30 @@ class DataLoader:
         )
 
 
+def get_epc_data(loader):
+    if not loader.data:
+        raise ValueError("Data not found - please run loader.load() first")
+
+    property_type_lookup = {}
+
+    for ha_name, data_assets in loader.data.items():
+        # For each HA, we pull in the data required, and store it in S3
+        asset_list = data_assets["asset_list"]
+
+        # We iterate through the asset list and pull what we need
+        for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
+            searcher = SearchEpc(
+                address1=property_meta["No."],
+                postcode=property_meta["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key=None,
+                full_address=property_meta["Address"]
+            )
+            searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"]
+            searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"]
+            searcher.find_property(skip_os=True)
+
+
 def app():
     """
     This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107.
@@ -451,3 +525,23 @@ def app():
 
     loader = DataLoader(files, use_cache)
     loader.load()
+
+    # TODO: We probably need to make sure that we have all of the columns that we need
+
+    # We load in the additional data required to perform the analysis
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_dataframe_from_s3_parquet(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()
+
+    photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
+
+    get_epc_data(loader)
diff --git a/etl/testing_data/livewest_pilot.py b/etl/testing_data/livewest_pilot.py
new file mode 100644
index 00000000..580c16d0
--- /dev/null
+++ b/etl/testing_data/livewest_pilot.py
@@ -0,0 +1,38 @@
+"""
+This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
+testing
+"""
+import os
+
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
+USER_ID = 8
+PORTFOLIO_ID = 61
+
+
+def app():
+    pilot_file = pd.DataFrame(
+        [
+            {"address": "42, Foxes Field", "postcode": "TR18 3RJ", "Notes": None},
+            {"address": "11, Cranley Gardens", "postcode": "TQ13 8UT", "Notes": None},
+        ]
+    )
+
+    # Store the data in s3
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/livewest_pilot_file.csv"
+    save_csv_to_s3(
+        dataframe=pilot_file,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increase EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename
+    }
+    print(body)
diff --git a/etl/testing_data/the_guiness_partnership_pilot.py b/etl/testing_data/the_guiness_partnership_pilot.py
new file mode 100644
index 00000000..496ea7ea
--- /dev/null
+++ b/etl/testing_data/the_guiness_partnership_pilot.py
@@ -0,0 +1,38 @@
+"""
+This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
+testing
+"""
+import os
+
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
+USER_ID = 8
+PORTFOLIO_ID = 59
+
+
+def app():
+    pilot_file = pd.DataFrame(
+        [
+            {"address": "10 Elm Close", "postcode": "CV37 8XL", "Notes": None},
+            {"address": "21, Spring Lane", "postcode": "MK17 0QP", "Notes": None},
+        ]
+    )
+
+    # Store the data in s3
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/the_guiness_partnership_pilot_file.csv"
+    save_csv_to_s3(
+        dataframe=pilot_file,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increase EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename
+    }
+    print(body)