From 0c1ce64789938b97a0dfb687e3fda9dab0e5504d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 18 Jan 2024 14:32:24 +0000
Subject: [PATCH 01/48] removed temp code and fixed bug where cleaning data is
 lower case in newdata mode

---
 backend/app/plan/router.py | 22 ++++-------
 etl/epc/DataProcessor.py   | 81 ++++++++++++++++++++------------------
 2 files changed, 50 insertions(+), 53 deletions(-)

diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 8c199145..b3d1c623 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -28,8 +28,6 @@ from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, sap_to_e
 
 from backend.ml_models.api import ModelApi
 from backend.Property import Property
-from etl.epc.DataProcessor import EPCDataProcessor
-from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from etl.solar.SolarPhotoSupply import SolarPhotoSupply
 
 from recommendations.optimiser.CostOptimiser import CostOptimiser
@@ -68,7 +66,6 @@ async def trigger_plan(body: PlanTriggerRequest):
         )
 
         input_properties = []
-
         for config in plan_input:
             # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
 
@@ -96,13 +93,16 @@ async def trigger_plan(body: PlanTriggerRequest):
             )
 
             epc_records = {
-                'original_epc': epc_searcher.newest_epc,
-                'full_sap_epc': epc_searcher.full_sap_epc,
-                'old_data': epc_searcher.older_epcs,
+                'original_epc': epc_searcher.newest_epc.copy(),
+                'full_sap_epc': epc_searcher.full_sap_epc.copy(),
+                'old_data': epc_searcher.older_epcs.copy(),
             }
 
-            prepared_epc = EPCRecord(epc_records=epc_records, run_mode="newdata",
-                                     cleaning_data=cleaning_data)  # This uses all the epc records to clean the data
+            prepared_epc = EPCRecord(
+                epc_records=epc_records,
+                run_mode="newdata",
+                cleaning_data=cleaning_data
+            )
 
             input_properties.append(
                 Property(
@@ -173,8 +173,6 @@ async def trigger_plan(body: PlanTriggerRequest):
                 "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET
             }
         )
-        # all_predictions["heat_demand_predictions"]= all_predictions["sap_change_predictions"].copy()
-        # all_predictions["carbon_change_predictions"] = all_predictions["sap_change_predictions"].copy()
 
         # Insert the predictions into the recommendations and run the optimiser
         logger.info("Optimising recommendations")
@@ -310,10 +308,6 @@ async def trigger_plan(body: PlanTriggerRequest):
             }
         )
 
-        # all_combined_predictions["heat_demand_predictions"]= all_combined_predictions["sap_change_predictions"].copy()
-        # all_combined_predictions["carbon_change_predictions"] = all_combined_predictions[
-        # "sap_change_predictions"].copy()
-
         # We update the carbon and heat demand predictions
         for property_id, property_recommendations in recommendations.items():
             combined_heat_demand = all_combined_predictions["heat_demand_predictions"]
diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py
index 801a9456..5dfeea1a 100644
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@@ -33,7 +33,6 @@ NO_SUFFIX_COMPONENT_COLS = [x.lower() for x in NO_SUFFIX_COMPONENT_COLS]
 ENDING_SUFFIX_COMPONENT_COLS = [x.lower() for x in ENDING_SUFFIX_COMPONENT_COLS]
 POTENTIAL_COLUMNS = [x.lower() for x in POTENTIAL_COLUMNS]
 
-
 # These lookups are used to clean the construction age band
 construction_age_bounds_map = {
     "England and Wales: before 1900": {"l": 0, "u": 1899},
@@ -74,7 +73,8 @@ class EPCDataProcessor:
     Handle data loading and data preprocessing
     """
 
-    def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None, run_mode: str = "training", violation_mode: bool = False) -> None:
+    def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None,
+                 run_mode: str = "training", violation_mode: bool = False) -> None:
         """
         :param filepath: If specified, is the physical location of the data
         :param is_newdata: Indicates if we are processing new, testing data.
@@ -82,23 +82,23 @@ class EPCDataProcessor:
                         want to perform, such as confine_data()
         """
         is_data_a_dataframe = isinstance(data, pd.DataFrame)
-        self.data : pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame()
+        self.data: pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame()
 
         is_cleaning_averages_a_dataframe = isinstance(cleaning_averages, pd.DataFrame)
-        self.cleaning_averages : pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
+        self.cleaning_averages: pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
 
         # FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
         self.violation_mode = violation_mode
         if run_mode not in ["training", "newdata"]:
             raise ValueError("Run mode must be either training or newdata")
         self.run_mode = run_mode if not violation_mode else "newdata"
-    
+
     def prepare_data(self, filepath: Path | str | None = None) -> None:
         """
         Given the run mode, we apply the relevant pipeline steps
         Ignore step is used to highlight which steps are not needed in newdata
         """
-        
+
         ignore_step = True if self.run_mode == "newdata" else False
 
         if filepath is not None:
@@ -126,7 +126,7 @@ class EPCDataProcessor:
         self.fill_na_fields()
 
         self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)
-        
+
         # Final re-casting after data transformed and prepared
         self.recast_df_columns(column_mappings=COLUMNTYPES, auto_subset_columns=True)
         self.recast_all_data(column_mappings=COLUMNTYPES, auto_subset_columns=True)
@@ -137,32 +137,36 @@ class EPCDataProcessor:
         self.make_cleaning_averages(ignore_step=ignore_step)
 
         # TODO: check if this has impact on training dataset
-        cleaned_data = self.apply_averages_cleaning(
-            data_to_clean=self.data,
-            cleaning_data=self.cleaning_averages,
-            cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
-            colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
-        )
+        # cleaned_data = self.apply_averages_cleaning(
+        #     data_to_clean=self.data,
+        #     cleaning_data=self.cleaning_averages,
+        #     cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        #     colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+        # )
+
+        # In newdata mode cleaning_averages has lower-case column names, so we coerce them back to upper case
+        cleaning_averages = self.cleaning_averages.copy()
+        if self.run_mode == "newdata":
+            cleaning_averages.columns = cleaning_averages.columns.str.upper()
 
         cleaned_data = self.apply_averages_cleaning(
-                data_to_clean=self.data,
-                cleaning_data=self.cleaning_averages,
-                cols_to_merge_on=COLUMNS_TO_MERGE_ON,
-            )
-        
+            data_to_clean=self.data,
+            cleaning_data=cleaning_averages,
+            cols_to_merge_on=COLUMNS_TO_MERGE_ON,
+        )
+
         self.data = self.data if cleaned_data is None else cleaned_data
 
         self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
         self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
         self.cast_data_columns_to_lower()
 
-            
     def cast_data_columns_to_lower(self):
         """
         Convert all columns names to lower
         """
         self.data.columns = self.data.columns.str.lower()
-    
+
     def cast_cleaning_averages_columns_to_lower(self, ignore_step: bool = False):
         """
         Convert all column names to lower
@@ -171,9 +175,9 @@ class EPCDataProcessor:
 
         if ignore_step:
             return
-        
+
         self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower()
-    
+
     def add_local_authority_to_cleaning_average(self, ignore_step: bool = False):
         """
         Add the Local authority column to the cleaning averages
@@ -182,7 +186,7 @@ class EPCDataProcessor:
 
         if ignore_step:
             return
-        
+
         self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
 
     def fill_invalid_constituency_fields(self, ignore_step: bool = False):
@@ -195,7 +199,7 @@ class EPCDataProcessor:
 
         if ignore_step:
             return
-        
+
         self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]})
 
     def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False):
@@ -218,7 +222,6 @@ class EPCDataProcessor:
         for col in convert_to_lower:
             self.data[col] = self.data[col].str.lower()
 
-
     def remap_build_form(self):
         """
         Remap build form to standard values
@@ -226,7 +229,6 @@ class EPCDataProcessor:
         """
         self.data["BUILT_FORM"] = self.data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
 
-
     def remap_anomalies(self):
         """
         Remap anomalies to None
@@ -258,7 +260,7 @@ class EPCDataProcessor:
 
         if ignore_step:
             return
-        
+
         self.data["FLOOR_LEVEL"] = self.data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
 
     def load_data(self, filepath, low_memory=False) -> None:
@@ -404,7 +406,8 @@ class EPCDataProcessor:
     #         self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
 
     #     # Final re-casting after data transformed and prepared
-    #     coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else COLUMNTYPES
+    #     coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else
+    #     COLUMNTYPES
     #     for k, v in coltypes.items():
     #         self.data[k] = self.data[k].astype(v) 
     #     self.data = self.data.astype(coltypes)
@@ -423,7 +426,7 @@ class EPCDataProcessor:
     #             cleaning_data=self.cleaning_averages,
     #             cols_to_merge_on=COLUMNS_TO_MERGE_ON
     #         )
-            
+
     #         self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
     #         self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower()
 
@@ -431,7 +434,6 @@ class EPCDataProcessor:
 
     #     return self.data, self.cleaning_averages
 
-
     def na_remapping(self, auto_subset_columns: bool = False):
 
         fill_na_map_apply = {
@@ -578,7 +580,7 @@ class EPCDataProcessor:
         if self.violation_mode:
             # TODO: to fill in
             return
-        
+
         if ignore_step:
             return
 
@@ -604,15 +606,15 @@ class EPCDataProcessor:
                     self.data[key] = self.data[key].astype(value)
             else:
                 self.data[key] = self.data[key].astype(values)
-        
+
     def recast_all_data(self, column_mappings: dict, auto_subset_columns: bool = False) -> None:
         """
         Using a dictionary to recast all columns at once
-        """     
+        """
 
         if auto_subset_columns:
             column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns}
-        
+
         self.data = self.data.astype(column_mappings)
 
     def confine_data(self, ignore_step: bool = False):
@@ -642,7 +644,7 @@ class EPCDataProcessor:
                     violation_missing_hotwater_description,
                     violation_missing_roof_description,
                     violation_invalid_property_type,
-                ], axis=1, 
+                ], axis=1,
                 keys=[
                     "violation_uprn_missing",
                     "violation_old_lodgment_date",
@@ -654,8 +656,8 @@ class EPCDataProcessor:
                     "violation_missing_roof_description",
                     "violation_invalid_property_type",
                 ]
-                )
-            
+            )
+
             self.data = pd.concat([self.data, violation_df], axis=1)
 
         if ignore_step:
@@ -703,7 +705,7 @@ class EPCDataProcessor:
         if self.violation_mode:
             # TODO:
             return
-        
+
         if ignore_step:
             return
 
@@ -721,7 +723,8 @@ class EPCDataProcessor:
         self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].fillna(0)
 
     @staticmethod
-    def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False):
+    def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None,
+                                ignore_step: bool = False):
         """
         Clean the input DataFrame using averages from a cleaning DataFrame.
 

From 1699102cd9a6b5357e04390980899fb8e4b29178 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 18 Jan 2024 15:12:00 +0000
Subject: [PATCH 02/48] added tests for clean_ventilation

---
 backend/app/plan/router.py      | 26 +++++++++
 etl/epc/Record.py               |  7 ++-
 etl/epc/tests/test_epcrecord.py | 98 +++++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 etl/epc/tests/test_epcrecord.py

diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index b3d1c623..d869bcb5 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -164,6 +164,32 @@ async def trigger_plan(body: PlanTriggerRequest):
 
         model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at)
 
+        recommendations_scoring_data.head()
+        z = recommendations_scoring_data[recommendations_scoring_data["uprn"] == 100070505235].copy()
+        z = z[z["roof_thermal_transmittance"] != z["roof_thermal_transmittance_ending"]]
+        z["roof_thermal_transmittance_ending"] = 0.4
+        z["roof_energy_eff_ending"] = "Average"
+
+        now = model_api.predict_all(
+            df=z,
+            bucket=get_settings().DATA_BUCKET,
+            prediction_buckets={
+                "sap_change_predictions": get_settings().SAP_PREDICTIONS_BUCKET,
+                "heat_demand_predictions": get_settings().HEAT_PREDICTIONS_BUCKET,
+                "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET
+            }
+        )
+
+        now["sap_change_predictions"]
+        input_properties[1].data["mechanical-ventilation"]
+        #         id  predictions property_id recommendation_id
+        # 0   3696+9         56.3        3696                 9
+        # 1  3696+10         56.8        3696                10
+        # 2  3696+11         56.3        3696                11
+        # 3  3696+12         56.8        3696                12
+        # With good rather than very good
+        now["sap_change_predictions"]
+
         all_predictions = model_api.predict_all(
             df=recommendations_scoring_data,
             bucket=get_settings().DATA_BUCKET,
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 70586749..f1dde43e 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -635,8 +635,11 @@ class EPCRecord:
         This method will clean the ventilation, if empty or invalid
         """
         self.prepared_epc['mechanical-ventilation'] = None if (
-            self.mechanical_ventilation == "" or self.mechanical_ventilation in DATA_ANOMALY_MATCHES) else (
-            self.mechanical_ventilation)
+            (self.prepared_epc['mechanical-ventilation'] == "") or
+            (self.prepared_epc['mechanical-ventilation'] in DATA_ANOMALY_MATCHES)
+        ) else (
+            self.prepared_epc['mechanical-ventilation']
+        )
 
     def _field_validation(self):
         """
diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py
new file mode 100644
index 00000000..f55bd30a
--- /dev/null
+++ b/etl/epc/tests/test_epcrecord.py
@@ -0,0 +1,98 @@
+import pytest
+from utils.s3 import read_dataframe_from_s3_parquet
+from etl.epc.Record import EPCRecord
+from unittest.mock import Mock
+
+
+class TestEpcRecord:
+
+    @pytest.fixture()
+    def cleaning_data(self):
+        cleaning_data = read_dataframe_from_s3_parquet(
+            bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+        )
+
+        return cleaning_data
+
+    @pytest.fixture()
+    def epc_records_1(self):
+        epc_records_1 = {
+            'original_epc': {
+                'low-energy-fixed-light-count': '', 'address': '139 School Road, Hall Green',
+                'uprn-source': 'Energy Assessor', 'floor-height': '2.6', 'heating-cost-potential': '1138',
+                'unheated-corridor-length': '', 'hot-water-cost-potential': '175',
+                'construction-age-band': 'England and Wales: 1900-1929', 'potential-energy-rating': 'B',
+                'mainheat-energy-eff': 'Good', 'windows-env-eff': 'Average', 'lighting-energy-eff': 'Very Good',
+                'environment-impact-potential': '82', 'glazed-type': 'double glazing, unknown install date',
+                'heating-cost-current': '2711', 'address3': '',
+                'mainheatcont-description': 'Programmer, TRVs and bypass',
+                'sheating-energy-eff': 'N/A', 'property-type': 'House', 'local-authority-label': 'Birmingham',
+                'fixed-lighting-outlets-count': '11', 'energy-tariff': 'Single', 'mechanical-ventilation': 'natural',
+                'hot-water-cost-current': '310', 'county': '', 'postcode': 'B28 8JF', 'solar-water-heating-flag': 'N',
+                'constituency': 'E14000562', 'co2-emissions-potential': '2.0', 'number-heated-rooms': '4',
+                'floor-description': 'Suspended, no insulation (assumed)', 'energy-consumption-potential': '107',
+                'local-authority': 'E08000025', 'built-form': 'Semi-Detached', 'number-open-fireplaces': '0',
+                'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', 'inspection-date': '2023-07-05',
+                'mains-gas-flag': 'Y', 'co2-emiss-curr-per-floor-area': '65', 'address1': '139 School Road',
+                'heat-loss-corridor': '', 'flat-storey-count': '', 'constituency-label': 'Birmingham, Hall Green',
+                'roof-energy-eff': 'Average', 'total-floor-area': '103.0', 'building-reference-number': '10004697322',
+                'environment-impact-current': '43', 'co2-emissions-current': '6.7',
+                'roof-description': 'Pitched, 100 mm loft insulation', 'floor-energy-eff': 'N/A',
+                'number-habitable-rooms': '4', 'address2': 'Hall Green', 'hot-water-env-eff': 'Good',
+                'posttown': 'BIRMINGHAM', 'mainheatc-energy-eff': 'Average', 'main-fuel': 'mains gas (not community)',
+                'lighting-env-eff': 'Very Good', 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A',
+                'sheating-env-eff': 'N/A', 'lighting-description': 'Low energy lighting in 82% of fixed outlets',
+                'roof-env-eff': 'Average', 'walls-energy-eff': 'Very Poor', 'photo-supply': '0.0',
+                'lighting-cost-potential': '182', 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100',
+                'main-heating-controls': '', 'lodgement-datetime': '2023-07-13 08:23:07', 'flat-top-storey': '',
+                'current-energy-rating': 'E', 'secondheat-description': 'None', 'walls-env-eff': 'Very Poor',
+                'transaction-type': 'rental', 'uprn': '100070505235', 'current-energy-efficiency': '51',
+                'energy-consumption-current': '366', 'mainheat-description': 'Boiler and radiators, mains gas',
+                'lighting-cost-current': '182', 'lodgement-date': '2023-07-13', 'extension-count': '0',
+                'mainheatc-env-eff': 'Average',
+                'lmk-key': 'c1d137711da433fb3cced74b1a6848da8bbc1159d076455d26d7b4668982601e',
+                'wind-turbine-count': '0',
+                'tenure': 'Rented (social)', 'floor-level': '', 'potential-energy-efficiency': '84',
+                'hot-water-energy-eff': 'Good', 'low-energy-lighting': '82',
+                'walls-description': 'Solid brick, as built, no insulation (assumed)',
+                'hotwater-description': 'From main system'}, 'full_sap_epc': {}, 'old_data': []
+        }
+        return epc_records_1
+
+    def test_clean_mechanical_ventilation(self, cleaning_data, epc_records_1):
+        # We have an epc with Natural ventilation - the resulting epc should also have natural ventilation
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "mechanical-ventilation": "natural"
+        }
+        record._clean_ventilation()
+
+        assert record.prepared_epc["mechanical-ventilation"] == "natural"
+
+        record2 = EPCRecord(cleaning_data=cleaning_data)
+        record2.prepared_epc = {
+            "mechanical-ventilation": ""
+        }
+
+        record2._clean_ventilation()
+
+        assert record2.prepared_epc["mechanical-ventilation"] is None
+
+        record3 = EPCRecord(cleaning_data=cleaning_data)
+        record3.prepared_epc = {
+            "mechanical-ventilation": None
+        }
+
+        record3._clean_ventilation()
+
+        assert record3.prepared_epc["mechanical-ventilation"] is None
+
+        record4 = EPCRecord(cleaning_data=cleaning_data)
+        record4.prepared_epc = {
+            "mechanical-ventilation": "INVALID"
+        }
+
+        record4._clean_ventilation()
+
+        assert record4.prepared_epc["mechanical-ventilation"] is None

From dbe13586da99dbbd28a126eb02537c8987564faf Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 18 Jan 2024 18:10:24 +0000
Subject: [PATCH 03/48] creating unit tests, added test cases for router

---
 backend/Property.py             |   5 +-
 backend/app/plan/router.py      |  36 ++------
 backend/ml_models/Valuation.py  |  10 +-
 etl/epc/DataProcessor.py        |   5 +-
 etl/epc/Record.py               |  43 ++++++---
 etl/epc/tests/test_epcrecord.py | 158 +++++++++++++++++++++++++++++++-
 6 files changed, 207 insertions(+), 50 deletions(-)

diff --git a/backend/Property.py b/backend/Property.py
index 98325b15..c1055eb9 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -222,7 +222,10 @@ class Property(Definitions):
                 proposed_depth = min(valid_numeric_values, key=lambda x: abs(x - proposed_depth))
 
             recommendation_record["roof_insulation_thickness_ending"] = str(proposed_depth)
-            recommendation_record["roof_energy_eff_ending"] = "Very Good"
+            if recommendation["type"] == "loft_insulation":
+                recommendation_record["roof_energy_eff_ending"] = "Good"
+            else:
+                recommendation_record["roof_energy_eff_ending"] = "Very Good"
         else:
             # Fill missing roof u-values - this fill is not based on recommended upgrades
             if recommendation_record["roof_thermal_transmittance_ending"] is None:
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index d869bcb5..521ec615 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -65,6 +65,16 @@ async def trigger_plan(body: PlanTriggerRequest):
             bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
         )
 
+        # For testing:
+        # plan_input.extend(
+        #     [
+        #         {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''},
+        #         {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''},
+        #         {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''},
+        #         {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''},
+        #     ]
+        # )
+
         input_properties = []
         for config in plan_input:
             # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@@ -164,32 +174,6 @@ async def trigger_plan(body: PlanTriggerRequest):
 
         model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at)
 
-        recommendations_scoring_data.head()
-        z = recommendations_scoring_data[recommendations_scoring_data["uprn"] == 100070505235].copy()
-        z = z[z["roof_thermal_transmittance"] != z["roof_thermal_transmittance_ending"]]
-        z["roof_thermal_transmittance_ending"] = 0.4
-        z["roof_energy_eff_ending"] = "Average"
-
-        now = model_api.predict_all(
-            df=z,
-            bucket=get_settings().DATA_BUCKET,
-            prediction_buckets={
-                "sap_change_predictions": get_settings().SAP_PREDICTIONS_BUCKET,
-                "heat_demand_predictions": get_settings().HEAT_PREDICTIONS_BUCKET,
-                "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET
-            }
-        )
-
-        now["sap_change_predictions"]
-        input_properties[1].data["mechanical-ventilation"]
-        #         id  predictions property_id recommendation_id
-        # 0   3696+9         56.3        3696                 9
-        # 1  3696+10         56.8        3696                10
-        # 2  3696+11         56.3        3696                11
-        # 3  3696+12         56.8        3696                12
-        # With good rather than very good
-        now["sap_change_predictions"]
-
         all_predictions = model_api.predict_all(
             df=recommendations_scoring_data,
             bucket=get_settings().DATA_BUCKET,
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index 018b4678..dadef9a9 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -96,11 +96,11 @@ class PropertyValuation:
 
         if not value:
             return {
-                "current_value": None,
-                "lower_bound_increased_value": None,
-                "upper_bound_increased_value": None,
-                "average_increased_value": None,
-                "average_increase": None
+                "current_value": 0,
+                "lower_bound_increased_value": 0,
+                "upper_bound_increased_value": 0,
+                "average_increased_value": 0,
+                "average_increase": 0
             }
 
         current_epc = property_instance.data["current-energy-rating"]
diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py
index 5dfeea1a..4c4651a4 100644
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@@ -723,8 +723,9 @@ class EPCDataProcessor:
         self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].fillna(0)
 
     @staticmethod
-    def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None,
-                                ignore_step: bool = False):
+    def apply_averages_cleaning(
+        data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False
+    ):
         """
         Clean the input DataFrame using averages from a cleaning DataFrame.
 
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index f1dde43e..4474baf1 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -380,13 +380,21 @@ class EPCRecord:
             else:
                 # Use averages from the cleaning dataset, based on the property type, built form, construction age
                 # band and local authority
+
+                cleaning_data = self.cleaning_data.copy()
+                # When running in new-data mode, the columns will have been coerced to lower case so we push them
+                # back to upper case
+                if self.run_mode == "newdata":
+                    cleaning_data.columns = [x.upper() for x in cleaning_data.columns]
+
                 cleaned_property_data = EPCDataProcessor.apply_averages_cleaning(
                     data_to_clean=self.epc_record_as_dataframe("prepared_epc", replace_empty_string=True),
                     cleaning_data=self.cleaning_data,
                     cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
                 )
                 self.prepared_epc["fixed-lighting-outlets-count"] = round(
-                    cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0])
+                    cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0]
+                )
         else:
             self.prepared_epc["fixed-lighting-outlets-count"] = float(self.prepared_epc["fixed-lighting-outlets-count"])
 
@@ -460,14 +468,14 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        map = {
+        mains_gas_map = {
             "Y": True,
             "N": False,
         }
 
         self.prepared_epc["mains-gas-flag"] = None if (
             self.prepared_epc["mains-gas-flag"] == "" or self.prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES
-        ) else map[self.prepared_epc["mains-gas-flag"]]
+        ) else mains_gas_map[self.prepared_epc["mains-gas-flag"]]
 
     def _clean_heat_loss_corridor(self):
         """
@@ -476,15 +484,18 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        map = {
-            "no corridor": False,
-            "unheated corridor": True,
-            "heated corridor": False
-        }
+        valid_values = [
+            "no corridor",
+            "unheated corridor",
+            "heated corridor"
+        ]
 
-        self.prepared_epc["heat-loss-corridor"] = False if self.prepared_epc[
-                                                               "heat-loss-corridor"] in DATA_ANOMALY_MATCHES else map[
-            self.prepared_epc["heat-loss-corridor"]]
+        self.prepared_epc["heat-loss-corridor"] = (
+            "no corridor" if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES else
+            self.prepared_epc["heat-loss-corridor"]
+        )
+        if self.prepared_epc["heat-loss-corridor"] not in valid_values:
+            self.prepared_epc["heat-loss-corridor"] = "no corridor"
 
         self.prepared_epc["unheated-corridor-length"] = (
             float(self.prepared_epc["unheated-corridor-length"]) if
@@ -572,11 +583,13 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        self.prepared_epc['built-form'] = BUILT_FORM_REMAP.get(self.prepared_epc["built-form"],
-                                                               self.prepared_epc["built-form"])
+        self.prepared_epc['built-form'] = BUILT_FORM_REMAP.get(
+            self.prepared_epc["built-form"], self.prepared_epc["built-form"]
+        )
+
         if self.prepared_epc["built-form"] in DATA_ANOMALY_MATCHES:
-            if self.prepared_epc["property-type"] == "Flat":
-                self.prepared_epc["built-form"] = "Semi-Detached"
+            if self.prepared_epc["property-type"] in ["Flat", "Maisonette"]:
+                self.prepared_epc["built-form"] = "End-Terrace"
 
     def _clean_age_band(self):
         """
diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py
index f55bd30a..06b8ed06 100644
--- a/etl/epc/tests/test_epcrecord.py
+++ b/etl/epc/tests/test_epcrecord.py
@@ -1,7 +1,8 @@
 import pytest
 from utils.s3 import read_dataframe_from_s3_parquet
 from etl.epc.Record import EPCRecord
-from unittest.mock import Mock
+from etl.epc.settings import DATA_ANOMALY_MATCHES
+import random
 
 
 class TestEpcRecord:
@@ -96,3 +97,158 @@ class TestEpcRecord:
         record4._clean_ventilation()
 
         assert record4.prepared_epc["mechanical-ventilation"] is None
+
+    def test_clean_energy_valid_values(self, cleaning_data, epc_records_1):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "energy-consumption-current": "200",
+            "co2-emissions-current": "5.5"
+        }
+        record._clean_energy()
+
+        assert record.prepared_epc["energy-consumption-current"] == 200.0
+        assert record.prepared_epc["co2-emissions-current"] == 5.5
+
+    def test_clean_energy_empty_values(self, cleaning_data, epc_records_1):
+        # We cannot have invalid values so this should raise an exception
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "energy-consumption-current": "",
+            "co2-emissions-current": ""
+        }
+        record._clean_energy()
+
+        with pytest.raises(ValueError):
+            record._clean_energy()
+
+    def test_clean_built_form_valid_remap(self, cleaning_data, epc_records_1):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        # Assuming "Semi" should be remapped to "Semi-Detached"
+        record.prepared_epc = {
+            "built-form": "Semi-Detached",
+            "property-type": "Flat"  # Assuming this affects the remapping
+        }
+        record._clean_built_form()
+
+        assert record.prepared_epc["built-form"] == "Semi-Detached"
+
+    def test_clean_built_form_anomaly(self, cleaning_data, epc_records_1):
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "built-form": "",
+            "property-type": "Flat"
+        }
+        record._clean_built_form()
+
+        assert record.prepared_epc["built-form"] == "End-Terrace"
+
+    def test_clean_floor_area_valid(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "total-floor-area": "120.5"
+        }
+        record._clean_floor_area()
+
+        assert record.prepared_epc["total-floor-area"] == 120.5
+
+    def test_clean_floor_area_empty(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "total-floor-area": ""
+        }
+        # We have no known case of missing floor area
+        with pytest.raises(ValueError):
+            record._clean_floor_area()
+
+    def test_clean_heat_loss_corridor_valid(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "heat-loss-corridor": "unheated corridor",
+            "unheated-corridor-length": ""
+        }
+        record._clean_heat_loss_corridor()
+
+        assert record.prepared_epc["heat-loss-corridor"] == "unheated corridor"
+
+    def test_clean_heat_loss_corridor_anomaly(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        # Assuming "InvalidCorridor" is an anomaly
+        record.prepared_epc = {
+            "heat-loss-corridor": "InvalidCorridor",
+            "unheated-corridor-length": ""
+        }
+        record._clean_heat_loss_corridor()
+
+        assert record.prepared_epc["heat-loss-corridor"] == "no corridor"
+
+    def test_clean_mains_gas_valid(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "mains-gas-flag": "Y"
+        }
+        record._clean_mains_gas()
+
+        assert record.prepared_epc["mains-gas-flag"] is True
+
+    def test_clean_mains_gas_anomaly(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "mains-gas-flag": "InvalidValue"
+        }
+        # It should always be Y or N or an anomally value
+        with pytest.raises(ValueError):
+            record._clean_mains_gas()
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "mains-gas-flag": random.choice(list(DATA_ANOMALY_MATCHES))
+        }
+        record._clean_mains_gas()
+
+        assert record.prepared_epc["mains-gas-flag"] is None
+
+    def test_clean_solar_hot_water_valid(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "solar-water-heating-flag": "Y"
+        }
+        record._clean_solar_hot_water()
+
+        assert record.prepared_epc["solar-water-heating-flag"] is True
+
+    def test_clean_solar_hot_water_empty(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "solar-water-heating-flag": ""
+        }
+        record._clean_solar_hot_water()
+
+        assert record.prepared_epc["solar-water-heating-flag"] is None
+
+    def test_clean_number_lighting_outlets_valid(self, cleaning_data, epc_records_1):
+        record = EPCRecord(cleaning_data=cleaning_data, epc_records=epc_records_1)
+        record.prepared_epc = {
+            "fixed-lighting-outlets-count": "5"
+        }
+        record._clean_number_lighting_outlets()
+
+        assert record.prepared_epc["fixed-lighting-outlets-count"] == 5.0
+
+    def test_clean_number_lighting_outlets_empty(self, cleaning_data, epc_records_1):
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.run_mode = "newdata"
+        record.prepared_epc = {
+            "fixed-lighting-outlets-count": "",
+            "property-type": "Flat",
+            "built-form": "Semi-Detached",
+            "construction-age-band": "England and Wales: 1900-1929",
+            "local-authority": "E08000025",
+            "number-habitable-rooms": "4",
+            "number-heated-rooms": "4",
+        }
+        record.old_data = []
+        record.full_sap_epc = []
+        record._clean_number_lighting_outlets()
+
+        assert record.prepared_epc["fixed-lighting-outlets-count"] == 8.0

From 86dd6efdc387a2b6c67a9244db3382d6fe7896ab Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 11:27:51 +0000
Subject: [PATCH 04/48] fixed bug in lighting outlets cleaning

---
 backend/app/plan/router.py | 20 ++++++++++++--------
 etl/epc/Record.py          |  2 +-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 521ec615..d3471e8f 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -66,14 +66,18 @@ async def trigger_plan(body: PlanTriggerRequest):
         )
 
         # For testing:
-        # plan_input.extend(
-        #     [
-        #         {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''},
-        #         {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''},
-        #         {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''},
-        #         {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''},
-        #     ]
-        # )
+        plan_input.extend(
+            [
+                {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''},
+                {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''},
+                {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''},
+                {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''},
+                {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''},
+                {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''},
+                {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''},
+                {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''},
+            ]
+        )
 
         input_properties = []
         for config in plan_input:
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 4474baf1..cdbafd7e 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -389,7 +389,7 @@ class EPCRecord:
 
                 cleaned_property_data = EPCDataProcessor.apply_averages_cleaning(
                     data_to_clean=self.epc_record_as_dataframe("prepared_epc", replace_empty_string=True),
-                    cleaning_data=self.cleaning_data,
+                    cleaning_data=cleaning_data,
                     cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
                 )
                 self.prepared_epc["fixed-lighting-outlets-count"] = round(

From e7c0b9169cffafef4898131d4e3fc0e4e4421827 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 11:34:09 +0000
Subject: [PATCH 05/48] fixing datetime bug in SearchEpc

---
 backend/SearchEpc.py       |  2 +-
 backend/app/plan/router.py | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index d69d8d86..4f6fd33d 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -472,7 +472,7 @@ class SearchEpc:
 
                 if not epc_data.empty:
                     # Further processing of the EPC data
-                    epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed')
+                    epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce')
                     epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
                     epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
                     epc_data["numeric_house_number"] = epc_data["house_number"].apply(
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index d3471e8f..39944fe3 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -66,18 +66,18 @@ async def trigger_plan(body: PlanTriggerRequest):
         )
 
         # For testing:
-        plan_input.extend(
-            [
-                {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''},
-                {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''},
-                {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''},
-                {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''},
-                {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''},
-                {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''},
-                {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''},
-                {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''},
-            ]
-        )
+        # plan_input.extend(
+        #     [
+        #         {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''},
+        #         {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''},
+        #         {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''},
+        #         {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''},
+        #         {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''},
+        #         {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''},
+        #         {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''},
+        #         {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''},
+        #     ]
+        # )
 
         input_properties = []
         for config in plan_input:

From 804e8fb720e473b746e4491a0a5e0700fc486d90 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 12:00:47 +0000
Subject: [PATCH 06/48] handling Epc Record when the EPC has been interpolated

---
 etl/epc/Record.py               | 30 +++++-----
 etl/epc/tests/test_epcrecord.py | 99 +++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index cdbafd7e..2535f204 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -349,7 +349,7 @@ class EPCRecord:
 
         self.prepared_epc["floor-level"] = (
             FLOOR_LEVEL_MAP[self.prepared_epc["floor-level"]] if
-            self.prepared_epc["floor-level"] not in DATA_ANOMALY_MATCHES else None
+            self.prepared_epc["floor-level"] not in list(DATA_ANOMALY_MATCHES) + ["", None] else None
         )
 
     def _clean_number_lighting_outlets(self):
@@ -499,7 +499,7 @@ class EPCRecord:
 
         self.prepared_epc["unheated-corridor-length"] = (
             float(self.prepared_epc["unheated-corridor-length"]) if
-            self.prepared_epc["unheated-corridor-length"] != "" else None
+            self.prepared_epc["unheated-corridor-length"] not in ["", None] else None
         )
 
     def _clean_count_variables(self):
@@ -509,18 +509,18 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Record doesn not contain epc data")
 
-        fields = {
-            "number_of_open_fireplaces": "number-open-fireplaces",
-            "number_of_extensions": "extension-count",
-            "number_of_storeys": "flat-storey-count",
-            "number_of_rooms": "number-habitable-rooms",
-        }
+        fields = [
+            "number-open-fireplaces",
+            "extension-count",
+            "flat-storey-count",
+            "number-habitable-rooms"
+        ]
 
-        null_attributes = ["number_of_storeys", "number_of_rooms"]
+        null_attributes = ["flat-storey-count", "number-habitable-rooms"]
 
-        for attribute, epc_field in fields.items():
-            value = self.prepared_epc[epc_field]
-            if value == "" or value in DATA_ANOMALY_MATCHES:
+        for attribute in fields:
+            value = self.prepared_epc[attribute]
+            if value in ["", None] or value in DATA_ANOMALY_MATCHES:
                 if attribute in null_attributes:
                     value = None
                 else:
@@ -537,8 +537,9 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        self.prepared_epc['wind-turbine-count'] = int(self.prepared_epc['wind-turbine-count']) if self.prepared_epc[
-                                                                                                      'wind-turbine-count'] != "" else None
+        self.prepared_epc['wind-turbine-count'] = int(
+            self.prepared_epc['wind-turbine-count']
+        ) if self.prepared_epc['wind-turbine-count'] not in ["", None] else None
 
     def _clean_solar_hot_water(self):
         """
@@ -551,6 +552,7 @@ class EPCRecord:
             "Y": True,
             "N": False,
             "": None,
+            None: None
         }
 
         self.prepared_epc['solar-water-heating-flag'] = value_map[self.prepared_epc['solar-water-heating-flag']]
diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py
index 06b8ed06..48ad5148 100644
--- a/etl/epc/tests/test_epcrecord.py
+++ b/etl/epc/tests/test_epcrecord.py
@@ -171,6 +171,16 @@ class TestEpcRecord:
 
         assert record.prepared_epc["heat-loss-corridor"] == "unheated corridor"
 
+        record = EPCRecord(cleaning_data=cleaning_data)
+        record.prepared_epc = {
+            "heat-loss-corridor": "unheated corridor",
+            "unheated-corridor-length": None
+        }
+        record._clean_heat_loss_corridor()
+
+        assert record.prepared_epc["heat-loss-corridor"] == "unheated corridor"
+        assert record.prepared_epc["unheated-corridor-length"] is None
+
     def test_clean_heat_loss_corridor_anomaly(self, cleaning_data):
         record = EPCRecord(cleaning_data=cleaning_data)
         # Assuming "InvalidCorridor" is an anomaly
@@ -252,3 +262,92 @@ class TestEpcRecord:
         record._clean_number_lighting_outlets()
 
         assert record.prepared_epc["fixed-lighting-outlets-count"] == 8.0
+
+    def test_clean_count_variables(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "number-open-fireplaces": "1",
+            "extension-count": None,
+            "flat-storey-count": "",
+            "number-habitable-rooms": "INVALID!",
+        }
+
+        record._clean_count_variables()
+
+        assert record.prepared_epc["number-open-fireplaces"] == 1.0
+        assert record.prepared_epc["extension-count"] == 0
+        assert record.prepared_epc["flat-storey-count"] is None
+        assert record.prepared_epc["number-habitable-rooms"] is None
+
+    def test_clean_floor_level(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "floor-level": "1",
+        }
+
+        record._clean_floor_level()
+
+        assert record.prepared_epc["floor-level"] == 1.0
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "floor-level": "",
+        }
+
+        record._clean_floor_level()
+
+        assert record.prepared_epc["floor-level"] is None
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "floor-level": None,
+        }
+
+        record._clean_floor_level()
+
+        assert record.prepared_epc["floor-level"] is None
+
+    def test_clean_solar_hot_water(self, cleaning_data):
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "solar-water-heating-flag": "Y",
+        }
+
+        record._clean_solar_hot_water()
+
+        assert record.prepared_epc["solar-water-heating-flag"] is True
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "solar-water-heating-flag": "N",
+        }
+
+        record._clean_solar_hot_water()
+
+        assert record.prepared_epc["solar-water-heating-flag"] is False
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "solar-water-heating-flag": "",
+        }
+
+        record._clean_solar_hot_water()
+
+        assert record.prepared_epc["solar-water-heating-flag"] is None
+
+        record = EPCRecord(cleaning_data=cleaning_data)
+
+        record.prepared_epc = {
+            "solar-water-heating-flag": None,
+        }
+
+        record._clean_solar_hot_water()
+
+        assert record.prepared_epc["solar-water-heating-flag"] is None

From 3cf13c651cb1f16297df94a63a161663f20b490b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 13:36:36 +0000
Subject: [PATCH 07/48] changing some of the gets to [] gets in Property class

---
 backend/Property.py        | 22 +++++++++++-----------
 backend/app/plan/router.py |  2 ++
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/backend/Property.py b/backend/Property.py
index c1055eb9..736ab4f1 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -68,7 +68,7 @@ class Property(Definitions):
         self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
         self.restricted_measures = False
         self.year_built = epc_record.get("year_built")
-        self.number_of_rooms = epc_record.prepared_epc.get("number_of_rooms")
+        self.number_of_rooms = epc_record.prepared_epc["number_heated_rooms"]
         self.age_band = epc_record.get("age_band")
         self.construction_age_band = epc_record.get("construction_age_band")
         self.number_of_floors = epc_record.get("number_of_floors")
@@ -81,7 +81,7 @@ class Property(Definitions):
             "co2_emissions": epc_record.get("co2_emissions_current"),
         }
         self.ventilation = {
-            "ventilation": epc_record.get("mechanical_ventilation"),
+            "ventilation": epc_record.prepared_epc["mechanical_ventilation"],
         }
         self.solar_pv = {
             "solar_pv": epc_record.get("photo_supply"),
@@ -90,28 +90,28 @@ class Property(Definitions):
             "solar_hot_water": epc_record.get("solar_water_heating_flag"),
         }
         self.wind_turbine = {
-            "wind_turbine": epc_record.prepared_epc.get("wind_turbine_count"),
+            "wind_turbine": epc_record.prepared_epc["wind_turbine_count"],
         }
         self.number_of_open_fireplaces = {
-            "number_of_open_fireplaces": epc_record.prepared_epc.get("number_of_open_fireplaces"),
+            "number_of_open_fireplaces": epc_record.prepared_epc["number_open_fireplaces"],
         }
         self.number_of_extensions = {
-            "number_of_extensions": epc_record.prepared_epc.get("number_of_extensions"),
+            "number_of_extensions": epc_record.prepared_epc["extension_count"],
         }
         self.number_of_storeys = {
-            "number_of_storeys": epc_record.prepared_epc.get("number_of_storeys"),
+            "number_of_storeys": epc_record.prepared_epc["flat_storey_count"],
         }
         self.heat_loss_corridor = {
-            "heat_loss_corridor": epc_record.prepared_epc.get("heat_loss_corridor"),
-            "length": epc_record.prepared_epc.get("unheated_corridor_length"),
+            "heat_loss_corridor": epc_record.prepared_epc["heat_loss_corridor"],
+            "length": epc_record.prepared_epc["unheated_corridor_length"],
         }
-        self.mains_gas = epc_record.prepared_epc.get('mains_gas_flag')
-        self.floor_height = epc_record.prepared_epc.get('floor_height')
+        self.mains_gas = epc_record.prepared_epc['mains_gas_flag']
+        self.floor_height = epc_record.prepared_epc['floor_height']
         self.insulation_wall_area = None
         self.floor_area = epc_record.prepared_epc.get('total_floor_area')
         self.pitched_roof_area = None
         self.insulation_floor_area = None
-        self.number_lighting_outlets = epc_record.prepared_epc.get("fixed_lighting_outlets_count")
+        self.number_lighting_outlets = epc_record.prepared_epc["fixed_lighting_outlets_count"]
         self.floor_level = None
         self.number_of_windows = None
         self.solar_pv_roof_area = None
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 39944fe3..6e9c4f50 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -159,6 +159,8 @@ async def trigger_plan(body: PlanTriggerRequest):
             recommender = Recommendations(property_instance=p, materials=materials)
             property_recommendations = recommender.recommend()
 
+            recommender.wall_recomender.estimated_u_value
+
             if not property_recommendations:
                 continue
 

From 807e6d5047dcfce2bb1a2e4bf9f548ebc419b01a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 16:28:35 +0000
Subject: [PATCH 08/48] align processing of solar hot water flag between engine
 and model

---
 backend/app/plan/router.py              | 16 ----------------
 etl/epc/Record.py                       |  8 ++++----
 recommendations/FloorRecommendations.py |  1 +
 recommendations/RoofRecommendations.py  |  1 +
 4 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 6e9c4f50..b3d1c623 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -65,20 +65,6 @@ async def trigger_plan(body: PlanTriggerRequest):
             bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
         )
 
-        # For testing:
-        # plan_input.extend(
-        #     [
-        #         {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''},
-        #         {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''},
-        #         {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''},
-        #         {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''},
-        #         {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''},
-        #         {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''},
-        #         {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''},
-        #         {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''},
-        #     ]
-        # )
-
         input_properties = []
         for config in plan_input:
             # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@@ -159,8 +145,6 @@ async def trigger_plan(body: PlanTriggerRequest):
             recommender = Recommendations(property_instance=p, materials=materials)
             property_recommendations = recommender.recommend()
 
-            recommender.wall_recomender.estimated_u_value
-
             if not property_recommendations:
                 continue
 
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 2535f204..1c6d694d 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -549,10 +549,10 @@ class EPCRecord:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
         value_map = {
-            "Y": True,
-            "N": False,
-            "": None,
-            None: None
+            "Y": "Y",
+            "N": "N",
+            "": "N",
+            None: "N"
         }
 
         self.prepared_epc['solar-water-heating-flag'] = value_map[self.prepared_epc['solar-water-heating-flag']]
diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py
index a246c8cb..2f568264 100644
--- a/recommendations/FloorRecommendations.py
+++ b/recommendations/FloorRecommendations.py
@@ -109,6 +109,7 @@ class FloorRecommendations(Definitions):
             insulation_thickness=self.property.floor["insulation_thickness"],
             wall_type=self.property.wall_type
         )
+
         self.estimated_u_value = u_value
 
         if u_value < self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE:
diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py
index dc1aff3f..0bbfd69d 100644
--- a/recommendations/RoofRecommendations.py
+++ b/recommendations/RoofRecommendations.py
@@ -91,6 +91,7 @@ class RoofRecommendations:
             raise NotImplementedError("Implement me")
 
         u_value = get_roof_u_value(**{**self.property.roof, "age_band": self.property.age_band})
+
         self.estimated_u_value = u_value
         if u_value <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE:
             # The Roof is already compliant

From 24709b98d604fe93edfcbb959c728dbf9dac60f7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 16:51:49 +0000
Subject: [PATCH 09/48] Added more wall u-value tests

---
 .idea/Model.iml                               |  2 +-
 .idea/misc.xml                                |  2 +-
 backend/Property.py                           |  6 ++--
 etl/epc/Record.py                             | 19 +++++++++++++
 .../tests/test_data/wall_uvalue_test_cases.py | 28 +++++++++++++++++++
 5 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 4413bb06..b0f9c00d 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 6f308057..1122b380 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>
diff --git a/backend/Property.py b/backend/Property.py
index 736ab4f1..ee496552 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -88,6 +88,7 @@ class Property(Definitions):
         }
         self.solar_hot_water = {
             "solar_hot_water": epc_record.get("solar_water_heating_flag"),
+            "solar_hot_water_boolean": epc_record.get("solar_water_heating_flag_bool"),
         }
         self.wind_turbine = {
             "wind_turbine": epc_record.prepared_epc["wind_turbine_count"],
@@ -104,6 +105,7 @@ class Property(Definitions):
         self.heat_loss_corridor = {
             "heat_loss_corridor": epc_record.prepared_epc["heat_loss_corridor"],
             "length": epc_record.prepared_epc["unheated_corridor_length"],
+            "heat_loss_corridor_boolean": epc_record.get("heat_loss_corridor_bool"),
         }
         self.mains_gas = epc_record.prepared_epc['mains_gas_flag']
         self.floor_height = epc_record.prepared_epc['floor_height']
@@ -436,10 +438,10 @@ class Property(Definitions):
             "mainfuel": self.main_fuel["clean_description"],
             "ventilation": self.ventilation["ventilation"],
             "solar_pv": self.solar_pv["solar_pv"],
-            "solar_hot_water": self.solar_hot_water["solar_hot_water"],
+            "solar_hot_water": self.solar_hot_water["solar_hot_water_boolean"],
             "wind_turbine": self.wind_turbine["wind_turbine"],
             "floor_height": self.floor_height,
-            "heat_loss_corridor": self.heat_loss_corridor["heat_loss_corridor"],
+            "heat_loss_corridor": self.heat_loss_corridor["heat_loss_corridor_boolean"],
             "unheated_corridor_length": self.heat_loss_corridor["length"],
             "number_of_open_fireplaces": self.number_of_open_fireplaces["number_of_open_fireplaces"],
             "number_of_extensions": self.number_of_extensions["number_of_extensions"],
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 1c6d694d..6fb4d5d9 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -102,6 +102,8 @@ class EPCRecord:
     year_built: int = None
     number_of_floors: int = None
     number_of_open_fireplaces: int = None
+    heat_loss_corridor_bool: bool = None
+    solar_water_heating_flag_bool: bool = None
 
     def __post_init__(self):
         # We can have validation and cleaning steps for each of the fields
@@ -490,6 +492,12 @@ class EPCRecord:
             "heated corridor"
         ]
 
+        boolean_map = {
+            "no corridor": False,
+            "unheated corridor": True,
+            "heated corridor": False
+        }
+
         self.prepared_epc["heat-loss-corridor"] = (
             "no corridor" if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES else
             self.prepared_epc["heat-loss-corridor"]
@@ -502,6 +510,9 @@ class EPCRecord:
             self.prepared_epc["unheated-corridor-length"] not in ["", None] else None
         )
 
+        # We create boolean versions of heat-loss-corridor
+        self.heat_loss_corridor_bool = boolean_map[self.prepared_epc["heat-loss-corridor"]]
+
     def _clean_count_variables(self):
         """
         This method will clean the count variables, if empty or invalid
@@ -555,8 +566,16 @@ class EPCRecord:
             None: "N"
         }
 
+        boolean_map = {
+            "Y": True,
+            "N": False,
+        }
+
         self.prepared_epc['solar-water-heating-flag'] = value_map[self.prepared_epc['solar-water-heating-flag']]
 
+        # Create a boolean version for storage in the database
+        self.solar_water_heating_flag_bool = boolean_map[self.prepared_epc['solar-water-heating-flag']]
+
     def _clean_solar_pv(self):
         """
         This method will clean the solar pv, if empty or invalid
diff --git a/recommendations/tests/test_data/wall_uvalue_test_cases.py b/recommendations/tests/test_data/wall_uvalue_test_cases.py
index e0c6ebe3..87f1ad3f 100644
--- a/recommendations/tests/test_data/wall_uvalue_test_cases.py
+++ b/recommendations/tests/test_data/wall_uvalue_test_cases.py
@@ -76,5 +76,33 @@ wall_uvalue_test_cases = [
         "is_granite_or_whinstone": False,
         "is_sandstone_or_limestone": False,
         "uvalue": 0
+    },
+    {
+        "clean_description": "Cavity wall, as built, insulated",
+        "age_band": "F",
+        "is_granite_or_whinstone": False,
+        "is_sandstone_or_limestone": False,
+        "uvalue": 0.4
+    },
+    {
+        "clean_description": "Cavity wall, as built, insulated",
+        "age_band": "D",
+        "is_granite_or_whinstone": False,
+        "is_sandstone_or_limestone": False,
+        "uvalue": 0.7
+    },
+    {
+        "clean_description": "Cavity wall, filled cavity",
+        "age_band": "E",
+        "is_granite_or_whinstone": False,
+        "is_sandstone_or_limestone": False,
+        "uvalue": 0.7
+    },
+    {
+        "clean_description": "Cavity wall, as built, no insulation",
+        "age_band": "E",
+        "is_granite_or_whinstone": False,
+        "is_sandstone_or_limestone": False,
+        "uvalue": 1.5
     }
 ]

From 80c35d42df60e365f3a93218f6bb2affa0650dab Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 17:11:06 +0000
Subject: [PATCH 10/48] Added floor uvalue cases

---
 .../test_data/floor_uvalue_test_cases.py      | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/recommendations/tests/test_data/floor_uvalue_test_cases.py b/recommendations/tests/test_data/floor_uvalue_test_cases.py
index 91d3814f..7104fd9d 100644
--- a/recommendations/tests/test_data/floor_uvalue_test_cases.py
+++ b/recommendations/tests/test_data/floor_uvalue_test_cases.py
@@ -29,4 +29,34 @@ floor_uvalue_test_cases = [
         "insulation_thickness": None,
         "expected": ValueError,
     },
+    # 16 Glastonbury Road EPR - the EPR has 0.71 because the property has a 320mm wall thickness, whereas the default is 250mm
+    {
+        "floor_type": "suspended",
+        "area": 34.5,
+        "perimeter": 16.7,
+        "age_band": "D",
+        "wall_type": "cavity",
+        "insulation_thickness": None,
+        "expected": 0.72,
+    },
+    # 31 Loddon Way - the EPR has 0.5 because the property has a 320mm wall thickness, whereas the default is 250mm
+    {
+        "floor_type": "solid",
+        "area": 52.08,
+        "perimeter": 16.2,
+        "age_band": "E",
+        "wall_type": "cavity",
+        "insulation_thickness": None,
+        "expected": 0.52,
+    },
+    # 62 Pearmain Drive
+    {
+        "floor_type": "solid",
+        "area": 38.64,
+        "perimeter": 18.1,
+        "age_band": "E",
+        "wall_type": "cavity",
+        "insulation_thickness": None,
+        "expected": 0.69,
+    },
 ]

From 43f3169e0c77b15203800dfe15ea8747649fbad7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 17:25:53 +0000
Subject: [PATCH 11/48] corrected unit tests

---
 backend/tests/test_sap_model_prep.py | 1000 --------------------------
 etl/epc/tests/test_epcrecord.py      |   23 +-
 2 files changed, 14 insertions(+), 1009 deletions(-)
 delete mode 100644 backend/tests/test_sap_model_prep.py

diff --git a/backend/tests/test_sap_model_prep.py b/backend/tests/test_sap_model_prep.py
deleted file mode 100644
index 89c436ce..00000000
--- a/backend/tests/test_sap_model_prep.py
+++ /dev/null
@@ -1,1000 +0,0 @@
-from backend.Property import Property
-from etl.epc.DataProcessor import DataProcessor
-from backend.app.plan.utils import create_recommendation_scoring_data, get_cleaned
-from etl.epc.settings import COLUMNS_TO_MERGE_ON
-import pandas as pd
-import pytest
-import msgpack
-
-from utils.s3 import read_dataframe_from_s3_parquet, read_from_s3
-
-
-# Handy code for selecting testing data
-# import pickle
-#
-# with open("sap_dataset.pickle", "rb") as f:
-#     sap_change_dataset = pickle.load(f)
-#
-# search_from = sap_change_dataset[
-#     (sap_change_dataset["walls_thermal_transmittance_ENDING"] == sap_change_dataset["walls_thermal_transmittance"]) &
-#     sap_change_dataset["is_to_unheated_space"]
-#     ]
-# search_from = search_from[
-#     (search_from["roof_thermal_transmittance_ENDING"] == search_from["roof_thermal_transmittance"]) &
-#     (search_from["floor_thermal_transmittance_ENDING"] != search_from["floor_thermal_transmittance"]) &
-#     (search_from["MECHANICAL_VENTILATION_ENDING"] == search_from["MECHANICAL_VENTILATION_STARTING"]) &
-#     (search_from["SECONDHEAT_DESCRIPTION_ENDING"] == search_from["SECONDHEAT_DESCRIPTION_STARTING"]) &
-#     (search_from["GLAZED_TYPE_ENDING"] == search_from["GLAZED_TYPE_STARTING"])
-#     ]
-#
-# # Find a record where the only difference is cavity wall getting filled
-# ending_cols = [c for c in search_from.columns if "_ENDING" in c]
-#
-# ignore = [
-#     "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING", "TRANSACTION_TYPE_ENDING", "FLOOR_HEIGHT_ENDING",
-#     "DAYS_TO_ENDING", "TOTAL_FLOOR_AREA_ENDING"
-# ]
-#
-# ending_cols = [c for c in ending_cols if c not in ignore]
-#
-# for _, row in tqdm(search_from.iterrows(), total=search_from.shape[0]):
-#
-#     same = True
-#     starting_cols = []
-#     for c in ending_cols:
-#
-#         starting_col = c.replace("_ENDING", "")
-#         if starting_col not in search_from.columns:
-#             starting_col = c.replace("_ENDING", "_STARTING")
-#             if starting_col not in search_from.columns:
-#                 raise Exception("something went wrong")
-#
-#         starting_cols.append(starting_col)
-#
-#         # We want them to be different
-#         if c == "floor_thermal_transmittance_ENDING":
-#             if (row[c] == row[starting_col]) | (row[starting_col] != "natural"):
-#                 same = False
-#                 break
-#             else:
-#                 continue
-#
-#         # We now check if the starting and ending values are the same
-#         if row[c] != row[starting_col]:
-#             same = False
-#             break
-#
-#     if same:
-#         raise Exception("We found one!")
-#
-#     fixed_cols = [c for c in search_from.columns if c not in starting_cols + ending_cols]
-#
-#     import pandas as pd
-#
-#     start = row[["SAP_STARTING"] + starting_cols]
-#     start.index = [c.replace("_STARTING", "") for c in start.index]
-#     end = row[["SAP_ENDING"] + ending_cols]
-#     end.index = [c.replace("_ENDING", "") for c in end.index]
-#     start["type"] = "starting"
-#     end["type"] = "ending"
-#
-#     compare = pd.concat([start, end], axis=1)
-#
-# ending_lmk = "1252008839062019090910572351658131"
-# starting_lmk = "1252008819542014122308482236142128"
-#
-# client = EpcClient(auth_token=EPC_AUTH_TOKEN)
-# result = client.domestic.search(params={"address": "Flat 14 Charles House, Freemens Way", "postcode": "CT14 9DL"})
-# starting_epc = [x for x in result["rows"] if x["lmk-key"] == starting_lmk][0]
-# ending_epc = [x for x in result["rows"] if x["lmk-key"] == ending_lmk][0]
-
-
-# with open(
-#     os.path.abspath(os.path.dirname(__file__)) + "/backend/tests/test_data/cleaned.pickle", "rb"
-# ) as f:
-#     cleaned = pickle.load(f)
-
-# with open(
-#     os.path.abspath(os.path.dirname(__file__)) + "/backend/tests/test_data/cleaning_data.pickle", "rb"
-# ) as f:
-#     cleaning_data = pickle.load(f)
-
-# TODO: Need to do floors, suspended and solid and to unheated space
-
-
-class TestSapModelPrep:
-
-    @pytest.fixture
-    def cleaning_data(self):
-        return read_dataframe_from_s3_parquet(
-            bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
-        )
-
-    @pytest.fixture
-    def cleaned(self):
-        cleaned = read_from_s3(
-            s3_file_name="cleaned_epc_data/cleaned.bson",
-            bucket_name="retrofit-data-dev"
-        )
-
-        cleaned = msgpack.unpackb(cleaned, raw=False)
-        return cleaned
-
-    @pytest.fixture
-    def photo_supply_lookup(self):
-        photo_supply_lookup = read_dataframe_from_s3_parquet(
-            bucket_name="retrofit-data-dev", file_key="solar_pv_supply/photo_supply_lookup.parquet",
-        )
-        return photo_supply_lookup
-
-    @pytest.fixture
-    def floor_area_decile_thresholds(self):
-        floor_area_decile_thresholds = read_dataframe_from_s3_parquet(
-            bucket_name="retrofit-data-dev", file_key="solar_pv_supply/floor_area_decile_thresholds.parquet",
-        )
-        return floor_area_decile_thresholds
-
-    def test_fill_cavity_wall(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds):
-        """
-        We ensure that the process that prepares the data in the engine code results in the same data as
-        the model is trained on
-        """
-
-        # This is an actual starting EPC
-        starting_epc = {
-            'low-energy-fixed-light-count': '', 'address': '26, Vicarage Lane, Eaton',
-            'uprn-source': 'Address Matched', 'floor-height': '2.39', 'heating-cost-potential': '942',
-            'unheated-corridor-length': '', 'hot-water-cost-potential': '97',
-            'construction-age-band': 'England and Wales: 1967-1975', 'potential-energy-rating': 'D',
-            'mainheat-energy-eff': 'Average', 'windows-env-eff': 'Good', 'lighting-energy-eff': 'Average',
-            'environment-impact-potential': '53',
-            'glazed-type': 'double glazing installed during or after 2002', 'heating-cost-current': '1475',
-            'address3': '', 'mainheatcont-description': 'Programmer, room thermostat and TRVs',
-            'sheating-energy-eff': 'N/A', 'property-type': 'House', 'local-authority-label': 'Melton',
-            'fixed-lighting-outlets-count': '', 'energy-tariff': 'Single',
-            'mechanical-ventilation': 'natural', 'hot-water-cost-current': '96', 'county': 'Leicestershire',
-            'postcode': 'NG32 1SP', 'solar-water-heating-flag': 'Y', 'constituency': 'E14000909',
-            'co2-emissions-potential': '5.7', 'number-heated-rooms': '7',
-            'floor-description': 'Suspended, no insulation (assumed)',
-            'energy-consumption-potential': '177', 'local-authority': 'E07000133', 'built-form': 'Detached',
-            'number-open-fireplaces': '1', 'windows-description': 'Fully double glazed',
-            'glazed-area': 'Normal', 'inspection-date': '2016-09-22', 'mains-gas-flag': 'N',
-            'co2-emiss-curr-per-floor-area': '87', 'address1': '26, Vicarage Lane',
-            'heat-loss-corridor': 'NO DATA!', 'flat-storey-count': '',
-            'constituency-label': 'Rutland and Melton', 'roof-energy-eff': 'Very Poor',
-            'total-floor-area': '116.0', 'building-reference-number': '4940047478',
-            'environment-impact-current': '29', 'co2-emissions-current': '10.0',
-            'roof-description': 'Pitched, limited insulation (assumed)', 'floor-energy-eff': 'NO DATA!',
-            'number-habitable-rooms': '7', 'address2': 'Eaton', 'hot-water-env-eff': 'Good',
-            'posttown': 'GRANTHAM', 'mainheatc-energy-eff': 'Good', 'main-fuel': 'oil (not community)',
-            'lighting-env-eff': 'Average', 'windows-energy-eff': 'Good', 'floor-env-eff': 'N/A',
-            'sheating-env-eff': 'N/A',
-            'lighting-description': 'Low energy lighting in 31% of fixed outlets',
-            'roof-env-eff': 'Very Poor', 'walls-energy-eff': 'Poor', 'photo-supply': '',
-            'lighting-cost-potential': '69', 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100',
-            'main-heating-controls': '2106', 'lodgement-datetime': '2016-09-23 20:29:01',
-            'flat-top-storey': '', 'current-energy-rating': 'F',
-            'secondheat-description': 'Room heaters, dual fuel (mineral and wood)', 'walls-env-eff': 'Poor',
-            'transaction-type': 'marketed sale', 'uprn': '100030534042', 'current-energy-efficiency': '34',
-            'energy-consumption-current': '343', 'mainheat-description': 'Boiler and radiators, oil',
-            'lighting-cost-current': '117', 'lodgement-date': '2016-09-23', 'extension-count': '2',
-            'mainheatc-env-eff': 'Good', 'lmk-key': '1481856849902016092320290148762028',
-            'wind-turbine-count': '0', 'tenure': 'owner-occupied', 'floor-level': 'NODATA!',
-            'potential-energy-efficiency': '64', 'hot-water-energy-eff': 'Good',
-            'low-energy-lighting': '31',
-            'walls-description': 'Cavity wall, as built, no insulation (assumed)',
-            'hotwater-description': 'From main system, plus solar'
-        }
-
-        # This is the training data as we prepare it in the engine
-        # This is an actual record from the training data
-        row = {
-            'UPRN': '100030534042', 'RDSAP_CHANGE': 12, 'HEAT_DEMAND_CHANGE': -72,
-            'CARBON_CHANGE': -2.0999999999999996, 'SAP_STARTING': 34, 'SAP_ENDING': 46, 'HEAT_DEMAND_STARTING': 343,
-            'HEAT_DEMAND_ENDING': 271, 'CARBON_STARTING': 10.0, 'CARBON_ENDING': 7.9, 'PROPERTY_TYPE': 'House',
-            'BUILT_FORM': 'Detached', 'CONSTITUENCY': 'E14000909', 'NUMBER_HABITABLE_ROOMS': 7.0,
-            'NUMBER_HEATED_ROOMS': 7.0, 'FIXED_LIGHTING_OUTLETS_COUNT': 21.0,
-            'CONSTRUCTION_AGE_BAND': 'England and Wales: 1967-1975', 'TRANSACTION_TYPE_STARTING': 'marketed sale',
-            'MECHANICAL_VENTILATION_STARTING': 'natural',
-            'SECONDHEAT_DESCRIPTION_STARTING': 'Room heaters, dual fuel (mineral and wood)',
-            'ENERGY_TARIFF_STARTING': 'Single', 'SOLAR_WATER_HEATING_FLAG_STARTING': 'Y',
-            'PHOTO_SUPPLY_STARTING': 0.0, 'GLAZED_TYPE_STARTING': 'double glazing installed during or after 2002',
-            'MULTI_GLAZE_PROPORTION_STARTING': 100.0, 'LOW_ENERGY_LIGHTING_STARTING': 31.0,
-            'NUMBER_OPEN_FIREPLACES_STARTING': 1.0, 'EXTENSION_COUNT_STARTING': 2.0,
-            'TOTAL_FLOOR_AREA_STARTING': 116.0, 'FLOOR_HEIGHT_STARTING': 2.39,
-            'TRANSACTION_TYPE_ENDING': 'marketed sale', 'MECHANICAL_VENTILATION_ENDING': 'natural',
-            'SECONDHEAT_DESCRIPTION_ENDING': 'Room heaters, dual fuel (mineral and wood)',
-            'ENERGY_TARIFF_ENDING': 'Single', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'Y', 'PHOTO_SUPPLY_ENDING': 0.0,
-            'GLAZED_TYPE_ENDING': 'double glazing installed during or after 2002',
-            'MULTI_GLAZE_PROPORTION_ENDING': 100.0, 'LOW_ENERGY_LIGHTING_ENDING': 31.0,
-            'NUMBER_OPEN_FIREPLACES_ENDING': 1.0, 'EXTENSION_COUNT_ENDING': 2.0, 'TOTAL_FLOOR_AREA_ENDING': 116.0,
-            'FLOOR_HEIGHT_ENDING': 2.41, 'DAYS_TO_STARTING': 784, 'DAYS_TO_ENDING': 867,
-            'walls_thermal_transmittance': 1.5, 'is_cavity_wall': True, 'is_filled_cavity': False,
-            'is_solid_brick': False, 'is_system_built': False, 'is_timber_frame': False,
-            'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False,
-            'is_sandstone_or_limestone': False, 'is_park_home': False, 'walls_insulation_thickness': 'none',
-            'external_insulation': False, 'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 0.7,
-            'is_park_home_ENDING': False, 'walls_insulation_thickness_ENDING': 'average',
-            'external_insulation_ENDING': False, 'internal_insulation_ENDING': False,
-            'floor_thermal_transmittance': 0.52, 'is_to_unheated_space': False, 'is_to_external_air': False,
-            'is_suspended': True, 'is_solid': False, 'another_property_below': False,
-            'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.52,
-            'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 1.5, 'is_pitched': True,
-            'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False,
-            'has_dwelling_above': False, 'roof_insulation_thickness': 'below average',
-            'roof_thermal_transmittance_ENDING': 1.5, 'roof_insulation_thickness_ENDING': 'below average',
-            'heater_type': 'Unknown', 'system_type': 'from main system', 'thermostat_characteristics': 'Unknown',
-            'heating_scope': 'Unknown', 'energy_recovery': 'Unknown', 'hotwater_tariff_type': 'Unknown',
-            'extra_features': 'plus solar', 'chp_systems': 'Unknown', 'distribution_system': 'Unknown',
-            'no_system_present': 'Unknown', 'appliance': 'Unknown', 'heater_type_ENDING': 'Unknown',
-            'system_type_ENDING': 'from main system', 'thermostat_characteristics_ENDING': 'Unknown',
-            'heating_scope_ENDING': 'Unknown', 'energy_recovery_ENDING': 'Unknown',
-            'hotwater_tariff_type_ENDING': 'Unknown', 'extra_features_ENDING': 'plus solar',
-            'chp_systems_ENDING': 'Unknown', 'distribution_system_ENDING': 'Unknown',
-            'no_system_present_ENDING': 'Unknown', 'appliance_ENDING': 'Unknown', 'has_radiators': True,
-            'has_fan_coil_units': False, 'has_pipes_in_screed_above_insulation': False,
-            'has_pipes_in_insulated_timber_floor': False, 'has_pipes_in_concrete_slab': False, 'has_boiler': True,
-            'has_air_source_heat_pump': False, 'has_room_heaters': False, 'has_electric_storage_heaters': False,
-            'has_warm_air': False, 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False,
-            'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False,
-            'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False,
-            'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False,
-            'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': False,
-            'has_mains_gas': False, 'has_wood_logs': False, 'has_coal': False, 'has_oil': True,
-            'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False,
-            'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False,
-            'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': True,
-            'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False,
-            'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False,
-            'has_boiler_ENDING': True, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': False,
-            'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False,
-            'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False,
-            'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False,
-            'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False,
-            'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False,
-            'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False,
-            'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False,
-            'has_electric_ENDING': False, 'has_mains_gas_ENDING': False, 'has_wood_logs_ENDING': False,
-            'has_coal_ENDING': False, 'has_oil_ENDING': True, 'has_wood_pellets_ENDING': False,
-            'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False,
-            'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False,
-            'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False,
-            'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'room thermostat',
-            'charging_system': 'Unknown', 'switch_system': 'programmer', 'no_control': 'Unknown',
-            'dhw_control': 'Unknown', 'community_heating': 'Unknown', 'multiple_room_thermostats': False,
-            'auxiliary_systems': 'Unknown', 'trvs': 'trvs', 'rate_control': 'Unknown',
-            'thermostatic_control_ENDING': 'room thermostat', 'charging_system_ENDING': 'Unknown',
-            'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', 'dhw_control_ENDING': 'Unknown',
-            'community_heating_ENDING': 'Unknown', 'multiple_room_thermostats_ENDING': False,
-            'auxiliary_systems_ENDING': 'Unknown', 'trvs_ENDING': 'trvs', 'rate_control_ENDING': 'Unknown',
-            'glazing_type': 'double', 'glazing_type_ENDING': 'double', 'fuel_type': 'oil',
-            'main-fuel_tariff_type': 'Unknown', 'is_community': False,
-            'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown',
-            'fuel_type_ENDING': 'oil', 'main-fuel_tariff_type_ENDING': 'Unknown', 'is_community_ENDING': False,
-            'no_individual_heating_or_community_network_ENDING': False, 'complex_fuel_type_ENDING': 'Unknown',
-            'estimated_perimeter_STARTING': 30.531014675946444, 'estimated_perimeter_ENDING': 30.531014675946444,
-            'HOT_WATER_ENERGY_EFF_STARTING': "Good",
-            "FLOOR_ENERGY_EFF_STARTING": "Unknown",
-            "WINDOWS_ENERGY_EFF_STARTING": "Good",
-            "WALLS_ENERGY_EFF_STARTING": "Poor",
-            "SHEATING_ENERGY_EFF_STARTING": "Unknown",
-            "ROOF_ENERGY_EFF_STARTING": "Very Poor",
-            "MAINHEAT_ENERGY_EFF_STARTING": "Average",
-            "MAINHEATC_ENERGY_EFF_STARTING": "Good",
-            "LIGHTING_ENERGY_EFF_STARTING": "Average",
-            "POTENTIAL_ENERGY_EFFICIENCY": 64,
-            "ENVIRONMENT_IMPACT_POTENTIAL": 53,
-            "ENERGY_CONSUMPTION_POTENTIAL": 177.0,
-            "CO2_EMISSIONS_POTENTIAL": 5.7,
-            "HOT_WATER_ENERGY_EFF_ENDING": "Good",
-            "FLOOR_ENERGY_EFF_ENDING": "Unknown",
-            "WINDOWS_ENERGY_EFF_ENDING": "Good",
-            "WALLS_ENERGY_EFF_ENDING": "Good",
-            "SHEATING_ENERGY_EFF_ENDING": "Unknown",
-            "ROOF_ENERGY_EFF_ENDING": "Very Poor",
-            "MAINHEAT_ENERGY_EFF_ENDING": "Average",
-            "MAINHEATC_ENERGY_EFF_ENDING": "Good",
-            "LIGHTING_ENERGY_EFF_ENDING": "Average",
-        }
-
-        home = Property(
-            id=0,
-            postcode=starting_epc["postcode"],
-            address=starting_epc["address1"],
-            data=starting_epc
-        )
-        home.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds)
-
-        data_processor = DataProcessor(None, newdata=True)
-        data_processor.insert_data(pd.DataFrame([home.get_model_data()]))
-
-        data_processor.pre_process()
-
-        starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
-        ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
-        fixed_data = data_processor.get_fixed_features()
-
-        ending_lodgement_date = '2016-12-15'
-
-        ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(ending_lodgement_date)
-
-        recommendation = {
-            "recommendation_id": 0,
-            "new_u_value": 0.7,
-            "type": "cavity_wall_insulation"
-        }
-
-        test_record = create_recommendation_scoring_data(
-            property=home,
-            recommendation=recommendation,
-            starting_epc_data=starting_epc_data,
-            ending_epc_data=ending_epc_data,
-            fixed_data=fixed_data,
-        )
-        test_record = pd.DataFrame([test_record])
-
-        # Test the final cleaning:
-        test_record = DataProcessor.apply_averages_cleaning(
-            data_to_clean=test_record,
-            cleaning_data=cleaning_data,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
-        ).drop(columns=["LOCAL_AUTHORITY"])
-
-        test_record = DataProcessor.clean_missings_after_description_process(
-            test_record, [
-                c for c in test_record.columns if
-                ("thermal_transmittance" in c) or ("insulation_thickness" in c)
-            ]
-        )
-
-        # Test that the data has been set up correctly
-
-        # Things to fix:
-        # [] Filled cavity should have an average insulation thickness in the cleaned data
-
-        for c in test_record.columns:
-            if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]:
-                continue
-
-            if c == "FLOOR_HEIGHT_ENDING":
-                assert (row[c] - test_record[c].values[0]) <= 0.020001
-                continue
-
-            if c == "walls_insulation_thickness_ENDING":
-                assert row[c] == "average"
-                assert test_record[c].values[0] == "above average"
-                continue
-
-            assert test_record[c].values[0] == row[c]
-
-    def test_internal_wall_insulation(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds):
-
-        starting_epc2 = {
-            'low-energy-fixed-light-count': '2', 'address': 'FLAT 12, WAREHOUSE W, 3 WESTERN GATEWAY',
-            'uprn-source': 'Energy Assessor', 'floor-height': '3.64', 'heating-cost-potential': '465',
-            'unheated-corridor-length': '', 'hot-water-cost-potential': '185',
-            'construction-age-band': 'England and Wales: 1900-1929', 'potential-energy-rating': 'C',
-            'mainheat-energy-eff': 'Very Poor', 'windows-env-eff': 'Average', 'lighting-energy-eff': 'Poor',
-            'environment-impact-potential': '51', 'glazed-type': 'double glazing installed during or after 2002',
-            'heating-cost-current': '1223', 'address3': '3 WESTERN GATEWAY',
-            'mainheatcont-description': 'Programmer and appliance thermostats', 'sheating-energy-eff': 'N/A',
-            'property-type': 'Flat', 'local-authority-label': 'Newham', 'fixed-lighting-outlets-count': '12',
-            'energy-tariff': 'off-peak 7 hour', 'mechanical-ventilation': 'natural', 'hot-water-cost-current': '342',
-            'county': '', 'postcode': 'E16 1BD', 'solar-water-heating-flag': 'N', 'constituency': 'E14001032',
-            'co2-emissions-potential': '3.6', 'number-heated-rooms': '2', 'floor-description': '(other premises below)',
-            'energy-consumption-potential': '307', 'local-authority': 'E09000025', 'built-form': 'Mid-Terrace',
-            'number-open-fireplaces': '0', 'windows-description': 'Partial double glazing', 'glazed-area': 'Normal',
-            'inspection-date': '2020-10-14', 'mains-gas-flag': 'N', 'co2-emiss-curr-per-floor-area': '66',
-            'address1': 'FLAT 12', 'heat-loss-corridor': 'heated corridor', 'flat-storey-count': '',
-            'constituency-label': 'West Ham', 'roof-energy-eff': 'N/A', 'total-floor-area': '70.0',
-            'building-reference-number': '10000539740', 'environment-impact-current': '42',
-            'co2-emissions-current': '4.6', 'roof-description': '(another dwelling above)', 'floor-energy-eff': 'N/A',
-            'number-habitable-rooms': '2', 'address2': 'WAREHOUSE W', 'hot-water-env-eff': 'Poor', 'posttown': 'LONDON',
-            'mainheatc-energy-eff': 'Good', 'main-fuel': 'electricity (not community)', 'lighting-env-eff': 'Poor',
-            'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', 'sheating-env-eff': 'N/A',
-            'lighting-description': 'Low energy lighting in 17% of fixed outlets', 'roof-env-eff': 'N/A',
-            'walls-energy-eff': 'Very Poor', 'photo-supply': '0.0', 'lighting-cost-potential': '67',
-            'mainheat-env-eff': 'Poor', 'multi-glaze-proportion': '61', 'main-heating-controls': '',
-            'lodgement-datetime': '2020-10-14 00:00:00', 'flat-top-storey': 'N', 'current-energy-rating': 'F',
-            'secondheat-description': 'None', 'walls-env-eff': 'Very Poor', 'transaction-type': 'marketed sale',
-            'uprn': '10012839482', 'current-energy-efficiency': '33', 'energy-consumption-current': '393',
-            'mainheat-description': 'Room heaters, electric', 'lighting-cost-current': '110',
-            'lodgement-date': '2020-10-14', 'extension-count': '0', 'mainheatc-env-eff': 'Good',
-            'lmk-key': 'b0d82f468273bec55ec5676a809b8e36b55db940ffa92f482a482f6aaa38eb1d', 'wind-turbine-count': '0',
-            'tenure': 'Owner-occupied', 'floor-level': '01', 'potential-energy-efficiency': '71',
-            'hot-water-energy-eff': 'Very Poor', 'low-energy-lighting': '17',
-            'walls-description': 'Solid brick, as built, no insulation (assumed)',
-            'hotwater-description': 'Electric immersion, standard tariff'
-        }
-
-        row2 = {
-            'UPRN': '10012839482', 'RDSAP_CHANGE': 8, 'HEAT_DEMAND_CHANGE': -59,
-            'CARBON_CHANGE': -0.5999999999999996, 'SAP_STARTING': 33, 'SAP_ENDING': 41, 'HEAT_DEMAND_STARTING': 393,
-            'HEAT_DEMAND_ENDING': 334, 'CARBON_STARTING': 4.6, 'CARBON_ENDING': 4.0, 'PROPERTY_TYPE': 'Flat',
-            'BUILT_FORM': 'Mid-Terrace', 'CONSTITUENCY': 'E14001032', 'NUMBER_HABITABLE_ROOMS': 2.0,
-            'NUMBER_HEATED_ROOMS': 2.0, 'FIXED_LIGHTING_OUTLETS_COUNT': 12.0,
-            'CONSTRUCTION_AGE_BAND': 'England and Wales: 1996-2002', 'TRANSACTION_TYPE_STARTING': 'marketed sale',
-            'MECHANICAL_VENTILATION_STARTING': 'natural', 'SECONDHEAT_DESCRIPTION_STARTING': 'None',
-            'ENERGY_TARIFF_STARTING': 'off-peak 7 hour', 'SOLAR_WATER_HEATING_FLAG_STARTING': 'N',
-            'PHOTO_SUPPLY_STARTING': 0.0, 'GLAZED_TYPE_STARTING': 'double glazing installed during or after 2002',
-            'MULTI_GLAZE_PROPORTION_STARTING': 61.0, 'LOW_ENERGY_LIGHTING_STARTING': 17.0,
-            'NUMBER_OPEN_FIREPLACES_STARTING': 0.0, 'EXTENSION_COUNT_STARTING': 0.0,
-            'TOTAL_FLOOR_AREA_STARTING': 70.0, 'FLOOR_HEIGHT_STARTING': 3.64,
-            'TRANSACTION_TYPE_ENDING': 'marketed sale', 'MECHANICAL_VENTILATION_ENDING': 'natural',
-            'SECONDHEAT_DESCRIPTION_ENDING': 'None', 'ENERGY_TARIFF_ENDING': 'off-peak 7 hour',
-            'SOLAR_WATER_HEATING_FLAG_ENDING': 'N', 'PHOTO_SUPPLY_ENDING': 0.0,
-            'GLAZED_TYPE_ENDING': 'double glazing installed during or after 2002',
-            'MULTI_GLAZE_PROPORTION_ENDING': 61.0, 'LOW_ENERGY_LIGHTING_ENDING': 17.0,
-            'NUMBER_OPEN_FIREPLACES_ENDING': 0.0, 'EXTENSION_COUNT_ENDING': 0.0, 'TOTAL_FLOOR_AREA_ENDING': 70.0,
-            'FLOOR_HEIGHT_ENDING': 3.64, 'DAYS_TO_STARTING': 2266, 'DAYS_TO_ENDING': 2307,
-            'walls_thermal_transmittance': 1.7, 'is_cavity_wall': False, 'is_filled_cavity': False,
-            'is_solid_brick': True, 'is_system_built': False, 'is_timber_frame': False,
-            'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False,
-            'is_sandstone_or_limestone': False, 'is_park_home': False, 'walls_insulation_thickness': 'none',
-            'external_insulation': False, 'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 0.21,
-            'is_park_home_ENDING': False, 'walls_insulation_thickness_ENDING': 'average',
-            'external_insulation_ENDING': False, 'internal_insulation_ENDING': False,
-            'floor_thermal_transmittance': 0.0, 'is_to_unheated_space': False, 'is_to_external_air': False,
-            'is_suspended': False, 'is_solid': False, 'another_property_below': True,
-            'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.0,
-            'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 0.0, 'is_pitched': False,
-            'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False,
-            'has_dwelling_above': True, 'roof_insulation_thickness': 'none',
-            'roof_thermal_transmittance_ENDING': 0.0, 'roof_insulation_thickness_ENDING': 'none',
-            'heater_type': 'electric immersion', 'system_type': 'Unknown', 'thermostat_characteristics': 'Unknown',
-            'heating_scope': 'Unknown', 'energy_recovery': 'Unknown', 'hotwater_tariff_type': 'standard tariff',
-            'extra_features': 'Unknown', 'chp_systems': 'Unknown', 'distribution_system': 'Unknown',
-            'no_system_present': 'Unknown', 'appliance': 'Unknown', 'heater_type_ENDING': 'electric immersion',
-            'system_type_ENDING': 'Unknown', 'thermostat_characteristics_ENDING': 'Unknown',
-            'heating_scope_ENDING': 'Unknown', 'energy_recovery_ENDING': 'Unknown',
-            'hotwater_tariff_type_ENDING': 'standard tariff', 'extra_features_ENDING': 'Unknown',
-            'chp_systems_ENDING': 'Unknown', 'distribution_system_ENDING': 'Unknown',
-            'no_system_present_ENDING': 'Unknown', 'appliance_ENDING': 'Unknown', 'has_radiators': False,
-            'has_fan_coil_units': False, 'has_pipes_in_screed_above_insulation': False,
-            'has_pipes_in_insulated_timber_floor': False, 'has_pipes_in_concrete_slab': False, 'has_boiler': False,
-            'has_air_source_heat_pump': False, 'has_room_heaters': True, 'has_electric_storage_heaters': False,
-            'has_warm_air': False, 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False,
-            'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False,
-            'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False,
-            'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False,
-            'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': True,
-            'has_mains_gas': False, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False,
-            'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False,
-            'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False,
-            'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': False,
-            'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False,
-            'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False,
-            'has_boiler_ENDING': False, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': True,
-            'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False,
-            'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False,
-            'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False,
-            'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False,
-            'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False,
-            'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False,
-            'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False,
-            'has_electric_ENDING': True, 'has_mains_gas_ENDING': False, 'has_wood_logs_ENDING': False,
-            'has_coal_ENDING': False, 'has_oil_ENDING': False, 'has_wood_pellets_ENDING': False,
-            'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False,
-            'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False,
-            'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False,
-            'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'appliance thermostats',
-            'charging_system': 'Unknown', 'switch_system': 'programmer', 'no_control': 'Unknown',
-            'dhw_control': 'Unknown', 'community_heating': 'Unknown', 'multiple_room_thermostats': False,
-            'auxiliary_systems': 'Unknown', 'trvs': 'Unknown', 'rate_control': 'Unknown',
-            'thermostatic_control_ENDING': 'appliance thermostats', 'charging_system_ENDING': 'Unknown',
-            'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', 'dhw_control_ENDING': 'Unknown',
-            'community_heating_ENDING': 'Unknown', 'multiple_room_thermostats_ENDING': False,
-            'auxiliary_systems_ENDING': 'Unknown', 'trvs_ENDING': 'Unknown', 'rate_control_ENDING': 'Unknown',
-            'glazing_type': 'double', 'glazing_type_ENDING': 'double', 'fuel_type': 'electricity',
-            'main-fuel_tariff_type': 'Unknown', 'is_community': False,
-            'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown',
-            'fuel_type_ENDING': 'electricity', 'main-fuel_tariff_type_ENDING': 'Unknown',
-            'is_community_ENDING': False, 'no_individual_heating_or_community_network_ENDING': False,
-            'complex_fuel_type_ENDING': 'Unknown', 'estimated_perimeter_STARTING': 35.4964786985977,
-            'estimated_perimeter_ENDING': 35.4964786985977,
-            'HOT_WATER_ENERGY_EFF_STARTING': "Very Poor",
-            "FLOOR_ENERGY_EFF_STARTING": "Unknown",
-            "WINDOWS_ENERGY_EFF_STARTING": "Average",
-            "WALLS_ENERGY_EFF_STARTING": "Very Poor",
-            "SHEATING_ENERGY_EFF_STARTING": "Unknown",
-            "ROOF_ENERGY_EFF_STARTING": "Unknown",
-            "MAINHEAT_ENERGY_EFF_STARTING": "Very Poor",
-            "MAINHEATC_ENERGY_EFF_STARTING": "Good",
-            "LIGHTING_ENERGY_EFF_STARTING": "Poor",
-            "POTENTIAL_ENERGY_EFFICIENCY": 71,
-            "ENVIRONMENT_IMPACT_POTENTIAL": 51,
-            "ENERGY_CONSUMPTION_POTENTIAL": 307,
-            "CO2_EMISSIONS_POTENTIAL": 3.6,
-            'HOT_WATER_ENERGY_EFF_ENDING': "Very Poor",
-            "FLOOR_ENERGY_EFF_ENDING": "Unknown",
-            "WINDOWS_ENERGY_EFF_ENDING": "Average",
-            "WALLS_ENERGY_EFF_ENDING": "Good",
-            "SHEATING_ENERGY_EFF_ENDING": "Unknown",
-            "ROOF_ENERGY_EFF_ENDING": "Unknown",
-            "MAINHEAT_ENERGY_EFF_ENDING": "Very Poor",
-            "MAINHEATC_ENERGY_EFF_ENDING": "Good",
-            "LIGHTING_ENERGY_EFF_ENDING": "Poor",
-        }
-
-        home2 = Property(
-            id=0,
-            postcode=starting_epc2["postcode"],
-            address=starting_epc2["address1"],
-            data=starting_epc2
-        )
-        home2.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds)
-        home2.set_number_lighting_outlets(None)
-
-        data_processor2 = DataProcessor(None, newdata=True)
-        data_processor2.insert_data(pd.DataFrame([home2.get_model_data()]))
-
-        data_processor2.pre_process()
-
-        starting_epc_data2 = data_processor2.get_component_features(suffix="_STARTING")
-        ending_epc_data2 = data_processor2.get_component_features(suffix="_ENDING")
-        fixed_data2 = data_processor2.get_fixed_features()
-
-        ending_lodgement_date2 = '2020-11-24'
-
-        ending_epc_data2["DAYS_TO_ENDING"] = data_processor2.calculate_days_to(ending_lodgement_date2)
-
-        recommendation2 = {
-            "recommendation_id": 0,
-            "new_u_value": 0.21,
-            "type": "internal_wall_insulation"
-        }
-
-        test_record2 = create_recommendation_scoring_data(
-            property=home2,
-            recommendation=recommendation2,
-            starting_epc_data=starting_epc_data2,
-            ending_epc_data=ending_epc_data2,
-            fixed_data=fixed_data2,
-        )
-        test_record2 = pd.DataFrame([test_record2])
-
-        # Test the final cleaning:
-        test_record2 = DataProcessor.apply_averages_cleaning(
-            data_to_clean=test_record2,
-            cleaning_data=cleaning_data,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
-        ).drop(columns=["LOCAL_AUTHORITY"])
-
-        test_record2 = DataProcessor.clean_missings_after_description_process(
-            test_record2, [
-                c for c in test_record2.columns if
-                ("thermal_transmittance" in c) or ("insulation_thickness" in c)
-            ]
-        )
-
-        for c in test_record2.columns:
-            if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]:
-                continue
-
-            if c == "FLOOR_HEIGHT_ENDING":
-                assert (row2[c] - test_record2[c].values[0]) <= 0.020001
-                continue
-
-            if c == "walls_insulation_thickness_ENDING":
-                assert row2[c] == "average"
-                assert test_record2[c].values[0] == "above average"
-                continue
-
-            if c == "CONSTRUCTION_AGE_BAND":
-                # For this, we have different values in the original data
-                assert row2[c] == "England and Wales: 1996-2002"
-                assert test_record2[c].values[0] == "England and Wales: 1900-1929"
-                continue
-
-            assert test_record2[c].values[0] == row2[c]
-
-    def test_ventilation(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds):
-
-        starting_epc3 = {
-            'low-energy-fixed-light-count': '', 'address': '45 Shepperson Road', 'uprn-source': 'Energy Assessor',
-            'floor-height': '1.87', 'heating-cost-potential': '645', 'unheated-corridor-length': '',
-            'hot-water-cost-potential': '69', 'construction-age-band': 'England and Wales: 1900-1929',
-            'potential-energy-rating': 'C', 'mainheat-energy-eff': 'Good', 'windows-env-eff': 'Average',
-            'lighting-energy-eff': 'Average', 'environment-impact-potential': '75',
-            'glazed-type': 'double glazing, unknown install date', 'heating-cost-current': '1028', 'address3': '',
-            'mainheatcont-description': 'Programmer, TRVs and bypass', 'sheating-energy-eff': 'N/A',
-            'property-type': 'House', 'local-authority-label': 'Sheffield', 'fixed-lighting-outlets-count': '21',
-            'energy-tariff': 'Single', 'mechanical-ventilation': 'natural', 'hot-water-cost-current': '96',
-            'county': '', 'postcode': 'S6 4FG', 'solar-water-heating-flag': 'N', 'constituency': 'E14000921',
-            'co2-emissions-potential': '2.9', 'number-heated-rooms': '5',
-            'floor-description': 'Suspended, no insulation (assumed)', 'energy-consumption-potential': '152',
-            'local-authority': 'E08000019', 'built-form': 'Enclosed Mid-Terrace', 'number-open-fireplaces': '0',
-            'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', 'inspection-date': '2022-06-13',
-            'mains-gas-flag': 'Y', 'co2-emiss-curr-per-floor-area': '59', 'address1': '45 Shepperson Road',
-            'heat-loss-corridor': '', 'flat-storey-count': '',
-            'constituency-label': 'Sheffield, Brightside and Hillsborough', 'roof-energy-eff': 'Very Poor',
-            'total-floor-area': '107.0', 'building-reference-number': '10002892085', 'environment-impact-current': '46',
-            'co2-emissions-current': '6.3', 'roof-description': 'Pitched, no insulation (assumed)',
-            'floor-energy-eff': 'N/A', 'number-habitable-rooms': '5', 'address2': '', 'hot-water-env-eff': 'Good',
-            'posttown': 'SHEFFIELD', 'mainheatc-energy-eff': 'Average', 'main-fuel': 'mains gas (not community)',
-            'lighting-env-eff': 'Average', 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A',
-            'sheating-env-eff': 'N/A', 'lighting-description': 'Low energy lighting in 43% of fixed outlets',
-            'roof-env-eff': 'Very Poor', 'walls-energy-eff': 'Very Poor', 'photo-supply': '0.0',
-            'lighting-cost-potential': '83', 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100',
-            'main-heating-controls': '', 'lodgement-datetime': '2023-05-27 12:15:21', 'flat-top-storey': '',
-            'current-energy-rating': 'E', 'secondheat-description': 'None', 'walls-env-eff': 'Very Poor',
-            'transaction-type': 'marketed sale', 'uprn': '100051073214', 'current-energy-efficiency': '54',
-            'energy-consumption-current': '335', 'mainheat-description': 'Boiler and radiators, mains gas',
-            'lighting-cost-current': '131', 'lodgement-date': '2023-05-27', 'extension-count': '1',
-            'mainheatc-env-eff': 'Average',
-            'lmk-key': 'dc1a4da246562656132b8e36e0534cd90b09fa40fc584e25e644e2d9ab86a247', 'wind-turbine-count': '0',
-            'tenure': 'Not defined - use in the case of a new dwelling for which the intended tenure in not known. It '
-                      'is not to be used for an existing dwelling',
-            'floor-level': '', 'potential-energy-efficiency': '80', 'hot-water-energy-eff': 'Good',
-            'low-energy-lighting': '43',
-            'walls-description': 'Sandstone or limestone, as built, no insulation (assumed)',
-            'hotwater-description': 'From main system'
-        }
-
-        row3 = {
-            'UPRN': '100051073214', 'RDSAP_CHANGE': 2, 'HEAT_DEMAND_CHANGE': -22, 'CARBON_CHANGE': -0.39999999999999947,
-            'SAP_STARTING': 54, 'SAP_ENDING': 56, 'HEAT_DEMAND_STARTING': 335, 'HEAT_DEMAND_ENDING': 313,
-            'CARBON_STARTING': 6.3, 'CARBON_ENDING': 5.9, 'PROPERTY_TYPE': 'House', 'BUILT_FORM': 'Mid-Terrace',
-            'CONSTITUENCY': 'E14000921', 'NUMBER_HABITABLE_ROOMS': 5.0, 'NUMBER_HEATED_ROOMS': 5.0,
-            'FIXED_LIGHTING_OUTLETS_COUNT': 21.0, 'CONSTRUCTION_AGE_BAND': 'England and Wales: 1900-1929',
-            'TRANSACTION_TYPE_STARTING': 'marketed sale', 'MECHANICAL_VENTILATION_STARTING': 'natural',
-            'SECONDHEAT_DESCRIPTION_STARTING': 'None', 'ENERGY_TARIFF_STARTING': 'Single',
-            'SOLAR_WATER_HEATING_FLAG_STARTING': 'N', 'PHOTO_SUPPLY_STARTING': 0.0,
-            'GLAZED_TYPE_STARTING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_STARTING': 100.0,
-            'LOW_ENERGY_LIGHTING_STARTING': 43.0, 'NUMBER_OPEN_FIREPLACES_STARTING': 0.0,
-            'EXTENSION_COUNT_STARTING': 1.0, 'TOTAL_FLOOR_AREA_STARTING': 107.0, 'FLOOR_HEIGHT_STARTING': 1.87,
-            'TRANSACTION_TYPE_ENDING': 'marketed sale', 'MECHANICAL_VENTILATION_ENDING': 'mechanical, extract only',
-            'SECONDHEAT_DESCRIPTION_ENDING': 'None', 'ENERGY_TARIFF_ENDING': 'Single',
-            'SOLAR_WATER_HEATING_FLAG_ENDING': 'N', 'PHOTO_SUPPLY_ENDING': 0.0,
-            'GLAZED_TYPE_ENDING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_ENDING': 100.0,
-            'LOW_ENERGY_LIGHTING_ENDING': 43.0, 'NUMBER_OPEN_FIREPLACES_ENDING': 0.0, 'EXTENSION_COUNT_ENDING': 1.0,
-            'TOTAL_FLOOR_AREA_ENDING': 107.0, 'FLOOR_HEIGHT_ENDING': 1.87, 'DAYS_TO_STARTING': 3221,
-            'DAYS_TO_ENDING': 2874, 'walls_thermal_transmittance': 2.0, 'is_cavity_wall': False,
-            'is_filled_cavity': False, 'is_solid_brick': False, 'is_system_built': False, 'is_timber_frame': False,
-            'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, 'is_sandstone_or_limestone': True,
-            'is_park_home': False, 'walls_insulation_thickness': 'none', 'external_insulation': False,
-            'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 2.0, 'is_park_home_ENDING': False,
-            'walls_insulation_thickness_ENDING': 'none', 'external_insulation_ENDING': False,
-            'internal_insulation_ENDING': False, 'floor_thermal_transmittance': 0.51, 'is_to_unheated_space': False,
-            'is_to_external_air': False, 'is_suspended': True, 'is_solid': False, 'another_property_below': False,
-            'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.51,
-            'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 2.3, 'is_pitched': True,
-            'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False,
-            'has_dwelling_above': False, 'roof_insulation_thickness': 'none', 'roof_thermal_transmittance_ENDING': 2.3,
-            'roof_insulation_thickness_ENDING': 'none', 'heater_type': 'Unknown', 'system_type': 'from main system',
-            'thermostat_characteristics': 'Unknown', 'heating_scope': 'Unknown', 'energy_recovery': 'Unknown',
-            'hotwater_tariff_type': 'Unknown', 'extra_features': 'Unknown', 'chp_systems': 'Unknown',
-            'distribution_system': 'Unknown', 'no_system_present': 'Unknown', 'appliance': 'Unknown',
-            'heater_type_ENDING': 'Unknown', 'system_type_ENDING': 'from main system',
-            'thermostat_characteristics_ENDING': 'Unknown', 'heating_scope_ENDING': 'Unknown',
-            'energy_recovery_ENDING': 'Unknown', 'hotwater_tariff_type_ENDING': 'Unknown',
-            'extra_features_ENDING': 'Unknown', 'chp_systems_ENDING': 'Unknown',
-            'distribution_system_ENDING': 'Unknown', 'no_system_present_ENDING': 'Unknown',
-            'appliance_ENDING': 'Unknown', 'has_radiators': True, 'has_fan_coil_units': False,
-            'has_pipes_in_screed_above_insulation': False, 'has_pipes_in_insulated_timber_floor': False,
-            'has_pipes_in_concrete_slab': False, 'has_boiler': True, 'has_air_source_heat_pump': False,
-            'has_room_heaters': False, 'has_electric_storage_heaters': False, 'has_warm_air': False,
-            'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False,
-            'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False,
-            'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False,
-            'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False,
-            'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': False,
-            'has_mains_gas': True, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False,
-            'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False,
-            'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False,
-            'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': True,
-            'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False,
-            'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False,
-            'has_boiler_ENDING': True, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': False,
-            'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False,
-            'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False,
-            'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False,
-            'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False,
-            'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False,
-            'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False,
-            'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False,
-            'has_electric_ENDING': False, 'has_mains_gas_ENDING': True, 'has_wood_logs_ENDING': False,
-            'has_coal_ENDING': False, 'has_oil_ENDING': False, 'has_wood_pellets_ENDING': False,
-            'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False,
-            'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False,
-            'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False,
-            'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'Unknown', 'charging_system': 'Unknown',
-            'switch_system': 'programmer', 'no_control': 'Unknown', 'dhw_control': 'Unknown',
-            'community_heating': 'Unknown', 'multiple_room_thermostats': False, 'auxiliary_systems': 'bypass',
-            'trvs': 'trvs', 'rate_control': 'Unknown', 'thermostatic_control_ENDING': 'Unknown',
-            'charging_system_ENDING': 'Unknown', 'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown',
-            'dhw_control_ENDING': 'Unknown', 'community_heating_ENDING': 'Unknown',
-            'multiple_room_thermostats_ENDING': False, 'auxiliary_systems_ENDING': 'bypass', 'trvs_ENDING': 'trvs',
-            'rate_control_ENDING': 'Unknown', 'glazing_type': 'double', 'glazing_type_ENDING': 'double',
-            'fuel_type': 'mains gas', 'main-fuel_tariff_type': 'Unknown', 'is_community': False,
-            'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown',
-            'fuel_type_ENDING': 'mains gas', 'main-fuel_tariff_type_ENDING': 'Unknown', 'is_community_ENDING': False,
-            'no_individual_heating_or_community_network_ENDING': False, 'complex_fuel_type_ENDING': 'Unknown',
-            'estimated_perimeter_STARTING': 30.06908711617298, 'estimated_perimeter_ENDING': 30.06908711617298,
-            'HOT_WATER_ENERGY_EFF_STARTING': "Good",
-            "FLOOR_ENERGY_EFF_STARTING": "Unknown",
-            "WINDOWS_ENERGY_EFF_STARTING": "Average",
-            "WALLS_ENERGY_EFF_STARTING": "Very Poor",
-            "SHEATING_ENERGY_EFF_STARTING": "Unknown",
-            "ROOF_ENERGY_EFF_STARTING": "Very Poor",
-            "MAINHEAT_ENERGY_EFF_STARTING": "Good",
-            "MAINHEATC_ENERGY_EFF_STARTING": "Average",
-            "LIGHTING_ENERGY_EFF_STARTING": "Average",
-            "POTENTIAL_ENERGY_EFFICIENCY": 80,
-            "ENVIRONMENT_IMPACT_POTENTIAL": 75,
-            "ENERGY_CONSUMPTION_POTENTIAL": 152,
-            "CO2_EMISSIONS_POTENTIAL": 2.9,
-            'HOT_WATER_ENERGY_EFF_ENDING': "Good",
-            "FLOOR_ENERGY_EFF_ENDING": "Unknown",
-            "WINDOWS_ENERGY_EFF_ENDING": "Average",
-            "WALLS_ENERGY_EFF_ENDING": "Very Poor",
-            "SHEATING_ENERGY_EFF_ENDING": "Unknown",
-            "ROOF_ENERGY_EFF_ENDING": "Very Poor",
-            "MAINHEAT_ENERGY_EFF_ENDING": "Good",
-            "MAINHEATC_ENERGY_EFF_ENDING": "Average",
-            "LIGHTING_ENERGY_EFF_ENDING": "Average",
-        }
-
-        home3 = Property(
-            id=0,
-            postcode=starting_epc3["postcode"],
-            address=starting_epc3["address1"],
-            data=starting_epc3
-        )
-        home3.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds)
-        home3.set_number_lighting_outlets(None)
-
-        data_processor3 = DataProcessor(None, newdata=True)
-        data_processor3.insert_data(pd.DataFrame([home3.get_model_data()]))
-
-        data_processor3.pre_process()
-
-        starting_epc_data3 = data_processor3.get_component_features(suffix="_STARTING")
-        ending_epc_data3 = data_processor3.get_component_features(suffix="_ENDING")
-        fixed_data3 = data_processor3.get_fixed_features()
-
-        ending_lodgement_date3 = '2022-06-14'
-
-        ending_epc_data3["DAYS_TO_ENDING"] = data_processor3.calculate_days_to(ending_lodgement_date3)
-
-        recommendation3 = {
-            "recommendation_id": 0,
-            "type": "mechanical_ventilation"
-        }
-
-        test_record3 = create_recommendation_scoring_data(
-            property=home3,
-            recommendation=recommendation3,
-            starting_epc_data=starting_epc_data3,
-            ending_epc_data=ending_epc_data3,
-            fixed_data=fixed_data3,
-        )
-        test_record3 = pd.DataFrame([test_record3])
-
-        # Test the final cleaning:
-        test_record3 = DataProcessor.apply_averages_cleaning(
-            data_to_clean=test_record3,
-            cleaning_data=cleaning_data,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
-        ).drop(columns=["LOCAL_AUTHORITY"])
-
-        test_record3 = DataProcessor.clean_missings_after_description_process(
-            test_record3, [
-                c for c in test_record3.columns if
-                ("thermal_transmittance" in c) or ("insulation_thickness" in c)
-            ]
-        )
-
-        for c in test_record3.columns:
-            if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]:
-                continue
-
-            assert test_record3[c].values[0] == row3[c]
-
-    def test_fireplaces(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds):
-
-        starting_epc4 = {
-            'low-energy-fixed-light-count': '', 'address': '9 Glebe Road, Asfordby Hill',
-            'uprn-source': 'Energy Assessor', 'floor-height': '2.4', 'heating-cost-potential': '501',
-            'unheated-corridor-length': '', 'hot-water-cost-potential': '70',
-            'construction-age-band': 'England and Wales: 1930-1949', 'potential-energy-rating': 'C',
-            'mainheat-energy-eff': 'Good', 'windows-env-eff': 'Average', 'lighting-energy-eff': 'Average',
-            'environment-impact-potential': '76', 'glazed-type': 'double glazing, unknown install date',
-            'heating-cost-current': '723', 'address3': '',
-            'mainheatcont-description': 'Programmer and room thermostat', 'sheating-energy-eff': 'N/A',
-            'property-type': 'House', 'local-authority-label': 'Melton',
-            'fixed-lighting-outlets-count': '14', 'energy-tariff': 'dual',
-            'mechanical-ventilation': 'natural', 'hot-water-cost-current': '98',
-            'county': 'Leicestershire', 'postcode': 'LE14 3QT', 'solar-water-heating-flag': 'N',
-            'constituency': 'E14000909', 'co2-emissions-potential': '2.4', 'number-heated-rooms': '5',
-            'floor-description': 'Solid, no insulation (assumed)', 'energy-consumption-potential': '153',
-            'local-authority': 'E07000133', 'built-form': 'Semi-Detached', 'number-open-fireplaces': '1',
-            'windows-description': 'Fully double glazed', 'glazed-area': 'Normal',
-            'inspection-date': '2022-06-27', 'mains-gas-flag': 'Y', 'co2-emiss-curr-per-floor-area': '46',
-            'address1': '9 Glebe Road', 'heat-loss-corridor': '', 'flat-storey-count': '',
-            'constituency-label': 'Rutland and Melton', 'roof-energy-eff': 'Good',
-            'total-floor-area': '87.0', 'building-reference-number': '10002396876',
-            'environment-impact-current': '60', 'co2-emissions-current': '4.0',
-            'roof-description': 'Pitched, 200 mm loft insulation', 'floor-energy-eff': 'N/A',
-            'number-habitable-rooms': '5', 'address2': 'Asfordby Hill', 'hot-water-env-eff': 'Good',
-            'posttown': 'MELTON MOWBRAY', 'mainheatc-energy-eff': 'Average',
-            'main-fuel': 'mains gas (not community)', 'lighting-env-eff': 'Average',
-            'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', 'sheating-env-eff': 'N/A',
-            'lighting-description': 'Low energy lighting in 29% of fixed outlets', 'roof-env-eff': 'Good',
-            'walls-energy-eff': 'Very Poor', 'photo-supply': '15.0', 'lighting-cost-potential': '79',
-            'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100', 'main-heating-controls': '',
-            'lodgement-datetime': '2022-06-27 15:28:18', 'flat-top-storey': '',
-            'current-energy-rating': 'D',
-            'secondheat-description': 'Room heaters, dual fuel (mineral and wood)',
-            'walls-env-eff': 'Very Poor', 'transaction-type': 'ECO assessment', 'uprn': '100030539619',
-            'current-energy-efficiency': '66', 'energy-consumption-current': '256',
-            'mainheat-description': 'Boiler and radiators, mains gas', 'lighting-cost-current': '135',
-            'lodgement-date': '2022-06-27', 'extension-count': '1', 'mainheatc-env-eff': 'Average',
-            'lmk-key': '736b6f4803a11d9e45b49bf98f36eb8a7f357b0dd24f3e7cddef5295518e5bef',
-            'wind-turbine-count': '0', 'tenure': 'Owner-occupied', 'floor-level': '',
-            'potential-energy-efficiency': '78', 'hot-water-energy-eff': 'Good',
-            'low-energy-lighting': '29',
-            'walls-description': 'Solid brick, as built, no insulation (assumed)',
-            'hotwater-description': 'From main system'
-        }
-
-        row4 = {
-            'UPRN': '100030539619', 'RDSAP_CHANGE': 7, 'HEAT_DEMAND_CHANGE': -41, 'CARBON_CHANGE': -0.5,
-            'SAP_STARTING': 66, 'SAP_ENDING': 73, 'HEAT_DEMAND_STARTING': 256, 'HEAT_DEMAND_ENDING': 215,
-            'CARBON_STARTING': 4.0, 'CARBON_ENDING': 3.5, 'PROPERTY_TYPE': 'House', 'BUILT_FORM': 'Semi-Detached',
-            'CONSTITUENCY': 'E14000909', 'NUMBER_HABITABLE_ROOMS': 5.0, 'NUMBER_HEATED_ROOMS': 5.0,
-            'FIXED_LIGHTING_OUTLETS_COUNT': 14.0, 'CONSTRUCTION_AGE_BAND': 'England and Wales: 1930-1949',
-            'TRANSACTION_TYPE_STARTING': 'eco assessment', 'MECHANICAL_VENTILATION_STARTING': 'natural',
-            'SECONDHEAT_DESCRIPTION_STARTING': 'Room heaters, dual fuel (mineral and wood)',
-            'ENERGY_TARIFF_STARTING': 'dual', 'SOLAR_WATER_HEATING_FLAG_STARTING': 'N', 'PHOTO_SUPPLY_STARTING': 15.0,
-            'GLAZED_TYPE_STARTING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_STARTING': 100.0,
-            'LOW_ENERGY_LIGHTING_STARTING': 29.0, 'NUMBER_OPEN_FIREPLACES_STARTING': 1.0,
-            'EXTENSION_COUNT_STARTING': 1.0, 'TOTAL_FLOOR_AREA_STARTING': 87.0, 'FLOOR_HEIGHT_STARTING': 2.4,
-            'TRANSACTION_TYPE_ENDING': 'eco assessment', 'MECHANICAL_VENTILATION_ENDING': 'natural',
-            'SECONDHEAT_DESCRIPTION_ENDING': 'Room heaters, dual fuel (mineral and wood)',
-            'ENERGY_TARIFF_ENDING': 'dual', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'N', 'PHOTO_SUPPLY_ENDING': 15.0,
-            'GLAZED_TYPE_ENDING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_ENDING': 100.0,
-            'LOW_ENERGY_LIGHTING_ENDING': 29.0, 'NUMBER_OPEN_FIREPLACES_ENDING': 0, 'EXTENSION_COUNT_ENDING': 1.0,
-            'TOTAL_FLOOR_AREA_ENDING': 87.0, 'FLOOR_HEIGHT_ENDING': 2.4, 'DAYS_TO_STARTING': 2887,
-            'DAYS_TO_ENDING': 2960, 'walls_thermal_transmittance': 1.7, 'is_cavity_wall': False,
-            'is_filled_cavity': False, 'is_solid_brick': True, 'is_system_built': False, 'is_timber_frame': False,
-            'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, 'is_sandstone_or_limestone': False,
-            'is_park_home': False, 'walls_insulation_thickness': 'none', 'external_insulation': False,
-            'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 1.7, 'is_park_home_ENDING': False,
-            'walls_insulation_thickness_ENDING': 'none', 'external_insulation_ENDING': False,
-            'internal_insulation_ENDING': False, 'floor_thermal_transmittance': 0.53, 'is_to_unheated_space': False,
-            'is_to_external_air': False, 'is_suspended': False, 'is_solid': True, 'another_property_below': False,
-            'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.53,
-            'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 0.21, 'is_pitched': True,
-            'is_roof_room': False, 'is_loft': True, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False,
-            'has_dwelling_above': False, 'roof_insulation_thickness': '200', 'roof_thermal_transmittance_ENDING': 0.21,
-            'roof_insulation_thickness_ENDING': '200', 'heater_type': 'Unknown', 'system_type': 'from main system',
-            'thermostat_characteristics': 'Unknown', 'heating_scope': 'Unknown', 'energy_recovery': 'Unknown',
-            'hotwater_tariff_type': 'Unknown', 'extra_features': 'Unknown', 'chp_systems': 'Unknown',
-            'distribution_system': 'Unknown', 'no_system_present': 'Unknown', 'appliance': 'Unknown',
-            'heater_type_ENDING': 'Unknown', 'system_type_ENDING': 'from main system',
-            'thermostat_characteristics_ENDING': 'Unknown', 'heating_scope_ENDING': 'Unknown',
-            'energy_recovery_ENDING': 'Unknown', 'hotwater_tariff_type_ENDING': 'Unknown',
-            'extra_features_ENDING': 'Unknown', 'chp_systems_ENDING': 'Unknown',
-            'distribution_system_ENDING': 'Unknown', 'no_system_present_ENDING': 'Unknown',
-            'appliance_ENDING': 'Unknown', 'has_radiators': True, 'has_fan_coil_units': False,
-            'has_pipes_in_screed_above_insulation': False, 'has_pipes_in_insulated_timber_floor': False,
-            'has_pipes_in_concrete_slab': False, 'has_boiler': True, 'has_air_source_heat_pump': False,
-            'has_room_heaters': False, 'has_electric_storage_heaters': False, 'has_warm_air': False,
-            'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False,
-            'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False,
-            'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False,
-            'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False,
-            'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': False,
-            'has_mains_gas': True, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False,
-            'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False,
-            'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False,
-            'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': True,
-            'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False,
-            'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False,
-            'has_boiler_ENDING': True, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': False,
-            'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False,
-            'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False,
-            'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False,
-            'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False,
-            'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False,
-            'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False,
-            'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False,
-            'has_electric_ENDING': False, 'has_mains_gas_ENDING': True, 'has_wood_logs_ENDING': False,
-            'has_coal_ENDING': False, 'has_oil_ENDING': False, 'has_wood_pellets_ENDING': False,
-            'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False,
-            'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False,
-            'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False,
-            'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'room thermostat',
-            'charging_system': 'Unknown', 'switch_system': 'programmer', 'no_control': 'Unknown',
-            'dhw_control': 'Unknown', 'community_heating': 'Unknown', 'multiple_room_thermostats': False,
-            'auxiliary_systems': 'Unknown', 'trvs': 'Unknown', 'rate_control': 'Unknown',
-            'thermostatic_control_ENDING': 'room thermostat', 'charging_system_ENDING': 'Unknown',
-            'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', 'dhw_control_ENDING': 'Unknown',
-            'community_heating_ENDING': 'Unknown', 'multiple_room_thermostats_ENDING': False,
-            'auxiliary_systems_ENDING': 'Unknown', 'trvs_ENDING': 'Unknown', 'rate_control_ENDING': 'Unknown',
-            'glazing_type': 'double', 'glazing_type_ENDING': 'double', 'fuel_type': 'mains gas',
-            'main-fuel_tariff_type': 'Unknown', 'is_community': False,
-            'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown',
-            'fuel_type_ENDING': 'mains gas', 'main-fuel_tariff_type_ENDING': 'Unknown', 'is_community_ENDING': False,
-            'no_individual_heating_or_community_network_ENDING': False, 'complex_fuel_type_ENDING': 'Unknown',
-            'estimated_perimeter_STARTING': 27.113649698998472, 'estimated_perimeter_ENDING': 27.113649698998472,
-            'HOT_WATER_ENERGY_EFF_STARTING': "Good",
-            "FLOOR_ENERGY_EFF_STARTING": "Unknown",
-            "WINDOWS_ENERGY_EFF_STARTING": "Average",
-            "WALLS_ENERGY_EFF_STARTING": "Very Poor",
-            "SHEATING_ENERGY_EFF_STARTING": "Unknown",
-            "ROOF_ENERGY_EFF_STARTING": "Good",
-            "MAINHEAT_ENERGY_EFF_STARTING": "Good",
-            "MAINHEATC_ENERGY_EFF_STARTING": "Average",
-            "LIGHTING_ENERGY_EFF_STARTING": "Average",
-            "POTENTIAL_ENERGY_EFFICIENCY": 78,
-            "ENVIRONMENT_IMPACT_POTENTIAL": 76,
-            "ENERGY_CONSUMPTION_POTENTIAL": 153,
-            "CO2_EMISSIONS_POTENTIAL": 2.4,
-            'HOT_WATER_ENERGY_EFF_ENDING': "Good",
-            "FLOOR_ENERGY_EFF_ENDING": "Unknown",
-            "WINDOWS_ENERGY_EFF_ENDING": "Average",
-            "WALLS_ENERGY_EFF_ENDING": "Very Poor",
-            "SHEATING_ENERGY_EFF_ENDING": "Unknown",
-            "ROOF_ENERGY_EFF_ENDING": "Good",
-            "MAINHEAT_ENERGY_EFF_ENDING": "Good",
-            "MAINHEATC_ENERGY_EFF_ENDING": "Average",
-            "LIGHTING_ENERGY_EFF_ENDING": "Average",
-        }
-
-        home4 = Property(
-            id=0,
-            postcode=starting_epc4["postcode"],
-            address=starting_epc4["address1"],
-            data=starting_epc4
-        )
-        home4.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds)
-        home4.set_number_lighting_outlets(None)
-
-        data_processor4 = DataProcessor(None, newdata=True)
-        data_processor4.insert_data(pd.DataFrame([home4.get_model_data()]))
-
-        data_processor4.pre_process()
-
-        starting_epc_data4 = data_processor4.get_component_features(suffix="_STARTING")
-        ending_epc_data4 = data_processor4.get_component_features(suffix="_ENDING")
-        fixed_data4 = data_processor4.get_fixed_features()
-
-        ending_lodgement_date4 = '2022-09-08'
-
-        ending_epc_data4["DAYS_TO_ENDING"] = data_processor4.calculate_days_to(ending_lodgement_date4)
-
-        recommendation4 = {
-            "recommendation_id": 0,
-            "type": "sealing_open_fireplace"
-        }
-
-        test_record4 = create_recommendation_scoring_data(
-            property=home4,
-            recommendation=recommendation4,
-            starting_epc_data=starting_epc_data4,
-            ending_epc_data=ending_epc_data4,
-            fixed_data=fixed_data4,
-        )
-        test_record4 = pd.DataFrame([test_record4])
-
-        # Test the final cleaning:
-        test_record4 = DataProcessor.apply_averages_cleaning(
-            data_to_clean=test_record4,
-            cleaning_data=cleaning_data,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
-        ).drop(columns=["LOCAL_AUTHORITY"])
-
-        test_record4 = DataProcessor.clean_missings_after_description_process(
-            test_record4, [
-                c for c in test_record4.columns if
-                ("thermal_transmittance" in c) or ("insulation_thickness" in c)
-            ]
-        )
-
-        for c in test_record4.columns:
-            if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]:
-                continue
-
-            assert test_record4[c].values[0] == row4[c]
diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py
index 48ad5148..cf0361b1 100644
--- a/etl/epc/tests/test_epcrecord.py
+++ b/etl/epc/tests/test_epcrecord.py
@@ -109,14 +109,13 @@ class TestEpcRecord:
         assert record.prepared_epc["energy-consumption-current"] == 200.0
         assert record.prepared_epc["co2-emissions-current"] == 5.5
 
-    def test_clean_energy_empty_values(self, cleaning_data, epc_records_1):
+    def test_clean_energy_empty_values(self, cleaning_data):
         # We cannot have invalid values so this should raise an exception
         record = EPCRecord(cleaning_data=cleaning_data)
         record.prepared_epc = {
             "energy-consumption-current": "",
             "co2-emissions-current": ""
         }
-        record._clean_energy()
 
         with pytest.raises(ValueError):
             record._clean_energy()
@@ -207,7 +206,7 @@ class TestEpcRecord:
             "mains-gas-flag": "InvalidValue"
         }
         # It should always be Y or N or an anomally value
-        with pytest.raises(ValueError):
+        with pytest.raises(KeyError):
             record._clean_mains_gas()
 
         record = EPCRecord(cleaning_data=cleaning_data)
@@ -225,7 +224,8 @@ class TestEpcRecord:
         }
         record._clean_solar_hot_water()
 
-        assert record.prepared_epc["solar-water-heating-flag"] is True
+        assert record.prepared_epc["solar-water-heating-flag"] == "Y"
+        assert record.solar_water_heating_flag_bool is True
 
     def test_clean_solar_hot_water_empty(self, cleaning_data):
         record = EPCRecord(cleaning_data=cleaning_data)
@@ -234,7 +234,8 @@ class TestEpcRecord:
         }
         record._clean_solar_hot_water()
 
-        assert record.prepared_epc["solar-water-heating-flag"] is None
+        assert record.prepared_epc["solar-water-heating-flag"] == "N"
+        assert record.solar_water_heating_flag_bool is False
 
     def test_clean_number_lighting_outlets_valid(self, cleaning_data, epc_records_1):
         record = EPCRecord(cleaning_data=cleaning_data, epc_records=epc_records_1)
@@ -320,7 +321,8 @@ class TestEpcRecord:
 
         record._clean_solar_hot_water()
 
-        assert record.prepared_epc["solar-water-heating-flag"] is True
+        assert record.prepared_epc["solar-water-heating-flag"] == "Y"
+        assert record.solar_water_heating_flag_bool is True
 
         record = EPCRecord(cleaning_data=cleaning_data)
 
@@ -330,7 +332,8 @@ class TestEpcRecord:
 
         record._clean_solar_hot_water()
 
-        assert record.prepared_epc["solar-water-heating-flag"] is False
+        assert record.prepared_epc["solar-water-heating-flag"] == "N"
+        assert record.solar_water_heating_flag_bool is False
 
         record = EPCRecord(cleaning_data=cleaning_data)
 
@@ -340,7 +343,8 @@ class TestEpcRecord:
 
         record._clean_solar_hot_water()
 
-        assert record.prepared_epc["solar-water-heating-flag"] is None
+        assert record.prepared_epc["solar-water-heating-flag"] == "N"
+        assert record.solar_water_heating_flag_bool is False
 
         record = EPCRecord(cleaning_data=cleaning_data)
 
@@ -350,4 +354,5 @@ class TestEpcRecord:
 
         record._clean_solar_hot_water()
 
-        assert record.prepared_epc["solar-water-heating-flag"] is None
+        assert record.prepared_epc["solar-water-heating-flag"] == "N"
+        assert record.solar_water_heating_flag_bool is False

From 4608ac89a5dd00ec04dca170a964499abd663691 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 17:42:03 +0000
Subject: [PATCH 12/48] fixed roof, fireplace and lighting tests; Property
 reads prepared_epc with .get

---
 backend/Property.py                           | 22 ++++----
 .../tests/test_fireplace_recommendations.py   | 24 ++++----
 .../tests/test_lighting_recommendations.py    | 18 +++---
 .../tests/test_roof_recommendations.py        | 55 +++++++++++--------
 4 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/backend/Property.py b/backend/Property.py
index ee496552..e6ae8bbe 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -68,7 +68,7 @@ class Property(Definitions):
         self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
         self.restricted_measures = False
         self.year_built = epc_record.get("year_built")
-        self.number_of_rooms = epc_record.prepared_epc["number_heated_rooms"]
+        self.number_of_rooms = epc_record.prepared_epc.get("number_heated_rooms")
         self.age_band = epc_record.get("age_band")
         self.construction_age_band = epc_record.get("construction_age_band")
         self.number_of_floors = epc_record.get("number_of_floors")
@@ -81,7 +81,7 @@ class Property(Definitions):
             "co2_emissions": epc_record.get("co2_emissions_current"),
         }
         self.ventilation = {
-            "ventilation": epc_record.prepared_epc["mechanical_ventilation"],
+            "ventilation": epc_record.prepared_epc.get("mechanical_ventilation"),
         }
         self.solar_pv = {
             "solar_pv": epc_record.get("photo_supply"),
@@ -91,29 +91,29 @@ class Property(Definitions):
             "solar_hot_water_boolean": epc_record.get("solar_water_heating_flag_bool"),
         }
         self.wind_turbine = {
-            "wind_turbine": epc_record.prepared_epc["wind_turbine_count"],
+            "wind_turbine": epc_record.prepared_epc.get("wind_turbine_count"),
         }
         self.number_of_open_fireplaces = {
-            "number_of_open_fireplaces": epc_record.prepared_epc["number_open_fireplaces"],
+            "number_of_open_fireplaces": epc_record.prepared_epc.get("number_open_fireplaces"),
         }
         self.number_of_extensions = {
-            "number_of_extensions": epc_record.prepared_epc["extension_count"],
+            "number_of_extensions": epc_record.prepared_epc.get("extension_count"),
         }
         self.number_of_storeys = {
-            "number_of_storeys": epc_record.prepared_epc["flat_storey_count"],
+            "number_of_storeys": epc_record.prepared_epc.get("flat_storey_count"),
         }
         self.heat_loss_corridor = {
-            "heat_loss_corridor": epc_record.prepared_epc["heat_loss_corridor"],
-            "length": epc_record.prepared_epc["unheated_corridor_length"],
+            "heat_loss_corridor": epc_record.prepared_epc.get("heat_loss_corridor"),
+            "length": epc_record.prepared_epc.get("unheated_corridor_length"),
             "heat_loss_corridor_boolean": epc_record.get("heat_loss_corridor_bool"),
         }
-        self.mains_gas = epc_record.prepared_epc['mains_gas_flag']
-        self.floor_height = epc_record.prepared_epc['floor_height']
+        self.mains_gas = epc_record.prepared_epc.get('mains_gas_flag')
+        self.floor_height = epc_record.prepared_epc.get('floor_height')
         self.insulation_wall_area = None
         self.floor_area = epc_record.prepared_epc.get('total_floor_area')
         self.pitched_roof_area = None
         self.insulation_floor_area = None
-        self.number_lighting_outlets = epc_record.prepared_epc["fixed_lighting_outlets_count"]
+        self.number_lighting_outlets = epc_record.prepared_epc.get("fixed_lighting_outlets_count")
         self.floor_level = None
         self.number_of_windows = None
         self.solar_pv_roof_area = None
diff --git a/recommendations/tests/test_fireplace_recommendations.py b/recommendations/tests/test_fireplace_recommendations.py
index a91d6697..f21d6bc3 100644
--- a/recommendations/tests/test_fireplace_recommendations.py
+++ b/recommendations/tests/test_fireplace_recommendations.py
@@ -1,16 +1,18 @@
 from backend.Property import Property
-from unittest.mock import Mock
 from recommendations.FireplaceRecommendations import FireplaceRecommendations
+from etl.epc.Record import EPCRecord
 
 
 class TestFirepaceRecommendations:
 
     def test_no_fireplaces(self):
-        property_instance = Property(id=0, address="fake", postcode="fake")
-        property_instance.data = {
-            "number-open-fireplaces": 0
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "number-open-fireplaces": 0,
         }
 
+        property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
+
         recommender = FireplaceRecommendations(
             property_instance=property_instance
         )
@@ -22,10 +24,11 @@ class TestFirepaceRecommendations:
         assert recommender.recommendation is None
 
     def test_one_fireplace(self):
-        property_instance = Property(id=0, address="fake", postcode="fake")
-        property_instance.data = {
-            "number-open-fireplaces": 1
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "number-open-fireplaces": 1,
         }
+        property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
 
         recommender = FireplaceRecommendations(
             property_instance=property_instance
@@ -40,10 +43,11 @@ class TestFirepaceRecommendations:
         assert recommender.recommendation[0]["total"] == 300
 
     def test_multiple_fireplaces(self):
-        property_instance = Property(id=0, address="fake", postcode="fake")
-        property_instance.data = {
-            "number-open-fireplaces": 3
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "number-open-fireplaces": 3,
         }
+        property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
 
         recommender = FireplaceRecommendations(
             property_instance=property_instance
diff --git a/recommendations/tests/test_lighting_recommendations.py b/recommendations/tests/test_lighting_recommendations.py
index 964f1da0..45213d70 100644
--- a/recommendations/tests/test_lighting_recommendations.py
+++ b/recommendations/tests/test_lighting_recommendations.py
@@ -1,5 +1,5 @@
 import pytest
-from unittest.mock import Mock
+from etl.epc.Record import EPCRecord
 from backend.Property import Property
 from recommendations.LightingRecommendations import LightingRecommendations
 
@@ -9,18 +9,20 @@ from recommendations.tests.test_data.materials import materials
 class TestLightingRecommendations:
 
     def test_init_invalid_materials(self):
-        input_property0 = Property(id=1, postcode="F4k3 6", address="623 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Greater London Authority"}
+        input_property0 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
         input_property0.lighting = {"low_energy_proportion": 0}
-        input_property0.data = {"county": "Greater London Authority"}
         # Test for invalid materials
         with pytest.raises(ValueError):
             LightingRecommendations(input_property0, [])
 
     def test_recommend_no_action_needed(self):
         # Case where no recommendation is needed
-        input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Greater London Authority"}
+        input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
         input_property1.lighting = {"low_energy_proportion": 100}
-        input_property1.data = {"county": "Greater London Authority"}
 
         lr = LightingRecommendations(input_property1, materials)
         lr.recommend()
@@ -28,9 +30,9 @@ class TestLightingRecommendations:
 
     def test_recommend_action_needed(self):
         # Case where recommendation is needed
-        input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street")
-        input_property1.lighting = {"low_energy_proportion": 100}
-        input_property1.data = {"county": "Greater London Authority"}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Greater London Authority"}
+        input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
         input_property1.lighting = {"low_energy_proportion": 0.80}
         input_property1.number_lighting_outlets = 20
 
diff --git a/recommendations/tests/test_roof_recommendations.py b/recommendations/tests/test_roof_recommendations.py
index 75b7ddb2..3d555a4f 100644
--- a/recommendations/tests/test_roof_recommendations.py
+++ b/recommendations/tests/test_roof_recommendations.py
@@ -1,12 +1,17 @@
 from backend.Property import Property
 from recommendations.RoofRecommendations import RoofRecommendations
 from recommendations.tests.test_data.materials import materials
+from etl.epc.Record import EPCRecord
 
 
 class TestRoofRecommendations:
 
     def test_loft_insulation_recommendation_no_insulation(self):
-        property_instance = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Cambridgeshire",
+        }
+        property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance.age_band = "F"
         property_instance.insulation_floor_area = 100
         property_instance.roof = {
@@ -18,9 +23,6 @@ class TestRoofRecommendations:
             'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
             'insulation_thickness': 'none', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none'
         }
-        property_instance.data = {
-            "county": "Cambridgeshire",
-        }
 
         roof_recommender = RoofRecommendations(property_instance=property_instance, materials=materials)
 
@@ -31,7 +33,9 @@ class TestRoofRecommendations:
         assert len(roof_recommender.recommendations)
 
     def test_loft_insulation_recommendation_50mm_insulation(self):
-        property_instance2 = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Kent"}
+        property_instance2 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance2.age_band = "F"
         property_instance2.insulation_floor_area = 100
         property_instance2.roof = {
@@ -43,7 +47,6 @@ class TestRoofRecommendations:
             'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
             'insulation_thickness': '50', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none'
         }
-        property_instance2.data = {"county": "Kent"}
 
         roof_recommender2 = RoofRecommendations(property_instance=property_instance2, materials=materials)
 
@@ -57,7 +60,9 @@ class TestRoofRecommendations:
         assert roof_recommender2.recommendations[0]["new_u_value"] == 0.14
         assert roof_recommender2.recommendations[0]["starting_u_value"] == 0.68
 
-        property_instance3 = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Greater London Authority"}
+        property_instance3 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance3.age_band = "F"
         property_instance3.insulation_floor_area = 100
         property_instance3.roof = {
@@ -69,7 +74,6 @@ class TestRoofRecommendations:
             'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
             'insulation_thickness': '50', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none'
         }
-        property_instance3.data = {"county": "Greater London Authority"}
 
         roof_recommender3 = RoofRecommendations(property_instance=property_instance3, materials=materials)
 
@@ -82,7 +86,9 @@ class TestRoofRecommendations:
         assert roof_recommender3.recommendations[0]["parts"][0]["depth"] == 270
 
     def test_loft_insulation_recommendation_150mm_insulation(self):
-        property_instance4 = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "North East Lincolnshire"}
+        property_instance4 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance4.age_band = "F"
         property_instance4.insulation_floor_area = 100
         property_instance4.roof = {
@@ -94,7 +100,6 @@ class TestRoofRecommendations:
             'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
             'insulation_thickness': '150', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none'
         }
-        property_instance4.data = {"county": "North East Lincolnshire"}
 
         roof_recommender4 = RoofRecommendations(property_instance=property_instance4, materials=materials)
 
@@ -109,7 +114,9 @@ class TestRoofRecommendations:
         assert roof_recommender4.recommendations[0]["starting_u_value"] == 0.3
         assert roof_recommender4.recommendations[0]["parts"][0]["depth"] == 150
 
-        property_instance5 = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Somerset"}
+        property_instance5 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance5.age_band = "F"
         property_instance5.insulation_floor_area = 100
         property_instance5.roof = {
@@ -121,7 +128,6 @@ class TestRoofRecommendations:
             'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
             'insulation_thickness': '150', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none'
         }
-        property_instance5.data = {"county": "Somerset"}
 
         roof_recommender5 = RoofRecommendations(property_instance=property_instance5, materials=materials)
 
@@ -136,7 +142,9 @@ class TestRoofRecommendations:
 
     def test_loft_insulation_recommendation_270mm_insulation(self):
         # We shouldn't recommend anything in this case
-        property_instance6 = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Portsmouth"}
+        property_instance6 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance6.age_band = "F"
         property_instance6.insulation_floor_area = 100
         property_instance6.roof = {
@@ -148,7 +156,6 @@ class TestRoofRecommendations:
             'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
             'insulation_thickness': '270', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none'
         }
-        property_instance6.data = {"county": "Portsmouth"}
 
         roof_recommender6 = RoofRecommendations(property_instance=property_instance6, materials=materials)
 
@@ -277,7 +284,9 @@ class TestRoofRecommendations:
     #            "Insulate your room roof with 270mm of Example room roof insulation"
 
     def test_flat_no_insulation(self):
-        property_instance11 = Property(id=11, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Swindon"}
+        property_instance11 = Property(id=11, address="fake", postcode="fake", epc_record=epc_record)
         property_instance11.age_band = "D"
         property_instance11.insulation_floor_area = 33.5
         property_instance11.perimeter = 24
@@ -288,7 +297,6 @@ class TestRoofRecommendations:
             'is_roof_room': False, 'is_loft': False, 'is_flat': True, 'is_thatched': False, 'is_at_rafters': False,
             'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'
         }
-        property_instance11.data = {"county": "Swindon"}
 
         roof_recommender11 = RoofRecommendations(property_instance=property_instance11, materials=materials)
 
@@ -306,7 +314,9 @@ class TestRoofRecommendations:
                "Insulate the home's flat roof with 150mm of Ecotherm Eco-Versal General Purpose Insulation Board"
 
     def test_flat_insulated(self):
-        property_instance12 = Property(id=12, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Thurrock"}
+        property_instance12 = Property(id=12, address="fake", postcode="fake", epc_record=epc_record)
         property_instance12.age_band = "D"
         property_instance12.insulation_floor_area = 40
         property_instance12.perimeter = 30
@@ -319,7 +329,6 @@ class TestRoofRecommendations:
             'is_loft': False, 'is_flat': True, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'average'
         }
-        property_instance12.data = {"county": "Thurrock"}
 
         roof_recommender12 = RoofRecommendations(property_instance=property_instance12, materials=materials)
 
@@ -330,7 +339,9 @@ class TestRoofRecommendations:
         assert not roof_recommender12.recommendations
 
     def test_flat_limited_insulation(self):
-        property_instance13 = Property(id=12, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Tyne and Wear"}
+        property_instance13 = Property(id=12, address="fake", postcode="fake", epc_record=epc_record)
         property_instance13.age_band = "D"
         property_instance13.insulation_floor_area = 40
         property_instance13.perimeter = 40
@@ -342,7 +353,6 @@ class TestRoofRecommendations:
             'is_loft': False, 'is_flat': True, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'below average'
         }
-        property_instance13.data = {"county": "Tyne and Wear"}
 
         roof_recommender13 = RoofRecommendations(property_instance=property_instance13, materials=materials)
 
@@ -362,7 +372,9 @@ class TestRoofRecommendations:
                "Insulate the home's flat roof with 150mm of Ecotherm Eco-Versal General Purpose Insulation Board"
 
     def test_property_above(self):
-        property_instance14 = Property(id=0, address="fake", postcode="fake")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Suffolk"}
+        property_instance14 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record)
         property_instance14.age_band = "F"
         property_instance14.insulation_floor_area = 100
         property_instance14.roof = {
@@ -373,7 +385,6 @@ class TestRoofRecommendations:
             'is_assumed': False, 'has_dwelling_above': True, 'is_valid': True,
             'insulation_thickness': None
         }
-        property_instance14.data = {"county": "Suffolk"}
 
         roof_recommender14 = RoofRecommendations(property_instance=property_instance14, materials=materials)
 

From 74c36b5456602bde4698603f7bbe3de8c160df6d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 17:44:46 +0000
Subject: [PATCH 13/48] fixed solar tests

---
 .../tests/test_solar_pv_recommendations.py    | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/recommendations/tests/test_solar_pv_recommendations.py b/recommendations/tests/test_solar_pv_recommendations.py
index f2436cb1..5481cb17 100644
--- a/recommendations/tests/test_solar_pv_recommendations.py
+++ b/recommendations/tests/test_solar_pv_recommendations.py
@@ -1,45 +1,50 @@
 import pytest
 from recommendations.SolarPvRecommendations import SolarPvRecommendations
 from backend.Property import Property
+from etl.epc.Record import EPCRecord
 
 
 class TestSolarPvRecommendations:
     @pytest.fixture
     def property_instance_invalid_type(self):
         # Setup the property_instance with an invalid property type
-        property_instance_invalid_type = Property(id=1, address="", postcode="")
-        property_instance_invalid_type.data = {
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
             "property-type": "InvalidType", "county": "Broxbourne", "photo-supply": None
         }
+        property_instance_invalid_type = Property(id=1, address="", postcode="", epc_record=epc_record)
         property_instance_invalid_type.roof = {"is_flat": False, "is_pitched": False, "is_roof_room": False}
         return property_instance_invalid_type
 
     @pytest.fixture
     def property_instance_invalid_roof(self):
         # Setup the property_instance with invalid roof type
-        property_instance_invalid_roof = Property(id=1, address="", postcode="")
-        property_instance_invalid_roof.data = {
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
             "county": "Huntingdonshire", "property-type": "House", "photo-supply": None
         }
+        property_instance_invalid_roof = Property(id=1, address="", postcode="", epc_record=epc_record)
         property_instance_invalid_roof.roof = {"is_flat": False, "is_pitched": False, "is_roof_room": False}
         return property_instance_invalid_roof
 
     @pytest.fixture
     def property_instance_has_solar_pv(self):
         # Setup the property_instance without existing solar pv
-        property_instance_has_solar_pv = Property(id=1, address="", postcode="")
-        property_instance_has_solar_pv.data = {"photo-supply": "40", "county": "Huntingdonshire",
-                                               "property-type": "House"}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"photo-supply": "40", "county": "Huntingdonshire",
+                                   "property-type": "House"}
+        property_instance_has_solar_pv = Property(id=1, address="", postcode="", epc_record=epc_record)
         property_instance_has_solar_pv.roof = {"is_flat": True}
         return property_instance_has_solar_pv
 
     @pytest.fixture
     def property_instance_valid_all(self):
         # Setup a valid property_instance that passes all conditions
-        property_instance_valid_all = Property(id=1, address="", postcode="")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"property-type": "House", "photo-supply": None, "county": "Huntingdonshire"}
+        property_instance_valid_all = Property(id=1, address="", postcode="", epc_record=epc_record)
         property_instance_valid_all.solar_pv_roof_area = 20
         property_instance_valid_all.solar_pv_percentage = 40
-        property_instance_valid_all.data = {"property-type": "House", "photo-supply": None, "county": "Huntingdonshire"}
         property_instance_valid_all.roof = {"is_flat": True}
         return property_instance_valid_all
 

From 40976fd395f06a63ff07a030347d5a8ce218b891 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 17:46:44 +0000
Subject: [PATCH 14/48] fixed ventilation recs

---
 .../tests/test_ventilation_recommendations.py | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/recommendations/tests/test_ventilation_recommendations.py b/recommendations/tests/test_ventilation_recommendations.py
index 3242b1d1..aa992253 100644
--- a/recommendations/tests/test_ventilation_recommendations.py
+++ b/recommendations/tests/test_ventilation_recommendations.py
@@ -1,13 +1,15 @@
 from backend.Property import Property
 from recommendations.VentilationRecommendations import VentilationRecommendations
 from recommendations.tests.test_data.materials import materials
+from etl.epc.Record import EPCRecord
 
 
 class TestVentilationRecommendations:
 
     def test_natural_ventilation(self):
-        input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street")
-        input_property1.data = {"mechanical-ventilation": "natural"}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"mechanical-ventilation": "natural"}
+        input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
 
         recommender = VentilationRecommendations(
             property_instance=input_property1,
@@ -27,8 +29,9 @@ class TestVentilationRecommendations:
         assert recommender.recommendation[0]["parts"][0]["quantity"] == 2
 
     def test_missing_ventilation(self):
-        input_property2 = Property(id=1, postcode="F4k3 6", address="623 fake street")
-        input_property2.data = {"mechanical-ventilation": None}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"mechanical-ventilation": None}
+        input_property2 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
 
         recommender2 = VentilationRecommendations(
             property_instance=input_property2,
@@ -48,8 +51,9 @@ class TestVentilationRecommendations:
         assert recommender2.recommendation[0]["parts"][0]["quantity"] == 2
 
     def test_nodata_ventilation(self):
-        input_property3 = Property(id=1, postcode="F4k3 6", address="623 fake street")
-        input_property3.data = {"mechanical-ventilation": "NO DATA!!"}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"mechanical-ventilation": "NO DATA!!"}
+        input_property3 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
 
         recommender3 = VentilationRecommendations(
             property_instance=input_property3,
@@ -69,8 +73,9 @@ class TestVentilationRecommendations:
         assert recommender3.recommendation[0]["parts"][0]["quantity"] == 2
 
     def test_existing_ventilation_1(self):
-        input_property4 = Property(id=1, postcode="F4k3 6", address="623 fake street")
-        input_property4.data = {"mechanical-ventilation": 'mechanical, extract only'}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"mechanical-ventilation": "mechanical, extract only"}
+        input_property4 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
 
         recommender4 = VentilationRecommendations(
             property_instance=input_property4,
@@ -85,8 +90,9 @@ class TestVentilationRecommendations:
         assert recommender4.has_ventilaion
 
     def test_existing_ventilation_2(self):
-        input_property5 = Property(id=1, postcode="F4k3 6", address="623 fake street")
-        input_property5.data = {"mechanical-ventilation": 'mechanical, supply and extract'}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"mechanical-ventilation": "mechanical, supply and extract"}
+        input_property5 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
 
         recommender5 = VentilationRecommendations(
             property_instance=input_property5,

From bbb4892437f5e41f23ac38213e39c7f0bb3f55b6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 18:28:26 +0000
Subject: [PATCH 15/48] fixed recommendation unit tests

---
 .../tests/test_wall_recommendations.py        |  36 ++++---
 .../tests/test_window_recommendations.py      | 102 ++++++++++--------
 2 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/recommendations/tests/test_wall_recommendations.py b/recommendations/tests/test_wall_recommendations.py
index bfc681f5..580ebb91 100644
--- a/recommendations/tests/test_wall_recommendations.py
+++ b/recommendations/tests/test_wall_recommendations.py
@@ -7,6 +7,7 @@ from recommendations.WallRecommendations import WallRecommendations
 from backend.Property import Property
 from recommendations.recommendation_utils import is_diminishing_returns
 from recommendations.tests.test_data.materials import materials
+from etl.epc.Record import EPCRecord
 
 
 # with open(
@@ -231,7 +232,9 @@ class TestWallRecommendationsBase:
 class TestCavityWallRecommensations:
 
     def test_fill_empty_cavity(self):
-        input_property = Property(id=1, postcode="F4k3", address="123 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "Derbyshire"}
+        input_property = Property(id=1, postcode="F4k3", address="123 fake street", epc_record=epc_record)
         input_property.walls = {
             'original_description': 'Cavity wall, as built, no insulation (assumed)',
             'clean_description': 'Cavity wall, as built, no insulation',
@@ -245,7 +248,6 @@ class TestCavityWallRecommensations:
         }
         input_property.age_band = "C"
         input_property.insulation_wall_area = 50
-        input_property.data = {"county": "Derbyshire"}
 
         recommender = WallRecommendations(
             property_instance=input_property,
@@ -265,7 +267,9 @@ class TestCavityWallRecommensations:
         assert np.isclose(recommender.recommendations[1]["total"], 2004.6600000000003)
 
     def test_fill_partial_filled_cavity(self):
-        input_property = Property(id=1, postcode="F4k3", address="123 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"county": "County Durham"}
+        input_property = Property(id=1, postcode="F4k3", address="123 fake street", epc_record=epc_record)
         input_property.walls = {
             'original_description': 'Cavity wall, as built, partial insulation (assumed)',
             'clean_description': 'Cavity wall, as built, partial insulation',
@@ -279,7 +283,6 @@ class TestCavityWallRecommensations:
         }
         input_property.age_band = "C"
         input_property.insulation_wall_area = 50
-        input_property.data = {"county": "County Durham"}
 
         recommender = WallRecommendations(
             property_instance=input_property,
@@ -299,7 +302,9 @@ class TestCavityWallRecommensations:
         assert np.isclose(recommender.recommendations[1]["total"], 1999.9350000000002)
 
     def test_system_built_wall(self):
-        input_property2 = Property(id=1, postcode="F4k3 2", address="223 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"property-type": "House", "county": "Derbyshire", "built-form": "Detached"}
+        input_property2 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record)
         input_property2.walls = {
             'original_description': 'System built, as built, no insulation (assumed)',
             'clean_description': 'System built, as built, no insulation',
@@ -314,7 +319,6 @@ class TestCavityWallRecommensations:
         input_property2.age_band = "F"
         input_property2.insulation_wall_area = 120
         input_property2.restricted_measures = False
-        input_property2.data = {"property-type": "House", "county": "Derbyshire", "built-form": "Detached"}
 
         assert input_property2.walls["is_system_built"]
 
@@ -346,7 +350,9 @@ class TestCavityWallRecommensations:
         assert recommender2.recommendations[6]["parts"][0]["depth"] == 52.5
 
     def test_timber_frame_wall(self):
-        input_property3 = Property(id=1, postcode="F4k3 2", address="223 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"property-type": "House", "county": "Derbyshire", "built-form": "Semi-Detached"}
+        input_property3 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record)
         input_property3.walls = {
             'original_description': 'Timber frame, as built, no insulation (assumed)',
             'clean_description': 'Timber frame, as built, no insulation',
@@ -361,7 +367,6 @@ class TestCavityWallRecommensations:
         input_property3.age_band = "B"
         input_property3.insulation_wall_area = 99
         input_property3.restricted_measures = False
-        input_property3.data = {"property-type": "House", "county": "Derbyshire", "built-form": "Semi-Detached"}
 
         assert input_property3.walls["is_timber_frame"]
 
@@ -388,7 +393,9 @@ class TestCavityWallRecommensations:
         assert recommender3.recommendations[1]["parts"][0]["depth"] == 150.0
 
     def test_granite_or_whinstone_wall(self):
-        input_property4 = Property(id=1, postcode="F4k3 2", address="223 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"}
+        input_property4 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record)
         input_property4.walls = {
             'original_description': 'Granite or whinstone, as built, no insulation (assumed)',
             'clean_description': 'Granite or whinstone, as built, no insulation',
@@ -403,7 +410,6 @@ class TestCavityWallRecommensations:
         input_property4.age_band = "A"
         input_property4.insulation_wall_area = 223
         input_property4.restricted_measures = False
-        input_property4.data = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"}
 
         assert input_property4.walls["is_granite_or_whinstone"]
 
@@ -430,7 +436,9 @@ class TestCavityWallRecommensations:
         assert recommender4.recommendations[1]["parts"][0]["depth"] == 150
 
     def test_cob_wall(self):
-        input_property5 = Property(id=1, postcode="F4k3 2", address="223 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"}
+        input_property5 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record)
         input_property5.walls = {
             'original_description': 'Cob, as built',
             'clean_description': 'Cob, as built',
@@ -445,7 +453,6 @@ class TestCavityWallRecommensations:
         input_property5.age_band = "E"
         input_property5.insulation_wall_area = 77
         input_property5.restricted_measures = False
-        input_property5.data = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"}
 
         assert input_property5.walls["is_cob"]
 
@@ -472,7 +479,9 @@ class TestCavityWallRecommensations:
         assert recommender5.recommendations[3]["parts"][0]["depth"] == 100
 
     def test_sandstone_or_limestone_wall(self):
-        input_property6 = Property(id=1, postcode="F4k3 6", address="623 fake street")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"property-type": "House", "county": "Derbyshire", "built-form": "Mid-Terrace"}
+        input_property6 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record)
         input_property6.walls = {
             'original_description': 'Sandstone or limestone, as built, no insulation (assumed)',
             'clean_description': 'Sandstone or limestone, as built, no insulation',
@@ -487,7 +496,6 @@ class TestCavityWallRecommensations:
         input_property6.age_band = "F"
         input_property6.insulation_wall_area = 350
         input_property6.restricted_measures = False
-        input_property6.data = {"property-type": "House", "county": "Derbyshire", "built-form": "Mid-Terrace"}
 
         assert input_property6.walls["is_sandstone_or_limestone"]
 
diff --git a/recommendations/tests/test_window_recommendations.py b/recommendations/tests/test_window_recommendations.py
index 664a1e39..36e70834 100644
--- a/recommendations/tests/test_window_recommendations.py
+++ b/recommendations/tests/test_window_recommendations.py
@@ -1,6 +1,7 @@
 from recommendations.WindowsRecommendations import WindowsRecommendations
 from backend.Property import Property
 from recommendations.tests.test_data.materials import materials
+from etl.epc.Record import EPCRecord
 
 
 class TestWindowRecommendations:
@@ -10,16 +11,17 @@ class TestWindowRecommendations:
         For this property, we expect all windows to be single glazed and should recommend full double glazing
         :return:
         """
-
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 0,
+            "uprn": 0
+        }
         property_1 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 0,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_1.windows = {
             'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': 'full',
@@ -47,16 +49,17 @@ class TestWindowRecommendations:
         double glazing
         :return:
         """
-
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 33,
+            "uprn": 0
+        }
         property_2 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 33,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_2.windows = {'original_description': 'Mostly double glazing', 'has_glazing': True,
                               'glazing_coverage': 'most',
@@ -81,16 +84,17 @@ class TestWindowRecommendations:
         This property has full double glazing so we shouldn't recommend anything
         :return:
         """
-
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 100,
+            "uprn": 0
+        }
         property_3 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 80,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_3.windows = {'original_description': 'Fully double glazed', 'has_glazing': True,
                               'glazing_coverage': 'full',
@@ -106,15 +110,17 @@ class TestWindowRecommendations:
         assert not recommender3.recommendation
 
     def test_fully_secondary_glazed(self):
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 100,
+            "uprn": 0
+        }
         property_4 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 100,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_4.windows = {'original_description': 'Full secondary glazing', 'has_glazing': True,
                               'glazing_coverage': 'full',
@@ -130,15 +136,17 @@ class TestWindowRecommendations:
         assert not recommender4.recommendation
 
     def test_partial_secondary_glazing(self):
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 50,
+            "uprn": 0
+        }
         property_5 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 50,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_5.windows = {'original_description': 'Partial secondary glazing', 'has_glazing': True,
                               'glazing_coverage': 'partial',
@@ -160,15 +168,18 @@ class TestWindowRecommendations:
              'labour_days': 0.8125, 'is_secondary_glazing': True}]
 
     def test_single_glazed_restricted_measures(self):
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 0,
+            "uprn": 0
+        }
+
         property_6 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 0,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_6.windows = {'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': None,
                               'glazing_type': 'single',
@@ -195,15 +206,17 @@ class TestWindowRecommendations:
         ]
 
     def test_full_triple_glazed(self):
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 100,
+            "uprn": 0
+        }
         property_7 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 100,
-                "uprn": 0
-            }
+            epc_record=epc_record
         )
         property_7.windows = {'original_description': 'Fully triple glazed', 'has_glazing': True,
                               'glazing_coverage': 'full',
@@ -222,16 +235,17 @@ class TestWindowRecommendations:
         """
         We should just recommend double glazing to the remaining windows, since it's a cheaper option
         """
-
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {
+            "county": "Wychavon",
+            "multi-glaze-proportion": 80,
+            "uprn": 1
+        }
         property_8 = Property(
             id=1,
             postcode='1',
             address='1',
-            data={
-                "county": "Wychavon",
-                "multi-glaze-proportion": 80,
-                "uprn": 1
-            }
+            epc_record=epc_record
         )
         property_8.windows = {'original_description': 'Mostly triple glazing', 'has_glazing': True,
                               'glazing_coverage': 'most',

From 4adfa0bb6228278b1c3162b551ac8960f60cf48b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 19 Jan 2024 19:44:38 +0000
Subject: [PATCH 16/48] fixed all tests

---
 backend/tests/test_property.py | 53 ++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/backend/tests/test_property.py b/backend/tests/test_property.py
index 09594a40..43149791 100644
--- a/backend/tests/test_property.py
+++ b/backend/tests/test_property.py
@@ -1,9 +1,9 @@
 import pandas as pd
 import pytest
 from unittest.mock import Mock
-from epc_api.client import EpcClient
 from backend.Property import Property
 from etl.epc_clean.EpcClean import EpcClean
+from etl.epc.Record import EPCRecord
 
 # Define some test data
 mock_epc_response = {
@@ -196,12 +196,21 @@ class TestProperty:
 
     @pytest.fixture(autouse=True)
     def property_instance(self, mock_cleaner):
-        property_instance = Property(id=1, postcode="AB12CD", address="Test Address", data=mock_epc_response["rows"][0])
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = mock_epc_response["rows"][0]
+
+        property_instance = Property(id=1, postcode="AB12CD", address="Test Address", epc_record=epc_record)
+        property_instance.number_of_floors = 2
+        property_instance.number_of_rooms = 5
+        property_instance.floor_area = 100
+        property_instance.floor_height = 2.5
         return property_instance
 
     @pytest.fixture(autouse=True)
     def property_instance_dupe_data(self):
-        property_instance_dupe_data = Property(id=2, postcode="AB12CD", address="Test Address")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = mock_epc_response_dupe["rows"][0]
+        property_instance_dupe_data = Property(id=2, postcode="AB12CD", address="Test Address", epc_record=epc_record)
         return property_instance_dupe_data
 
     # @pytest.fixture
@@ -271,15 +280,17 @@ class TestProperty:
         return mock_cleaner
 
     def test_init(self):
-        inst1 = Property(0, postcode="AB12CD", address="Test Address")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {"uprn": 1}
+        inst1 = Property(0, postcode="AB12CD", address="Test Address", epc_record=epc_record)
 
-        assert inst1.data is None
+        assert inst1.data is not None
 
-        inst2 = Property(3, "AB12CD", "Test Address")
+        inst2 = Property(3, "AB12CD", "Test Address", epc_record=epc_record)
         assert inst2.id == 3
 
-        inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data", "uprn": 123})
-        assert inst3.data == {"some": "data", "uprn": 123}
+        inst3 = Property(4, "AB12CD", "Test Address", epc_record=epc_record)
+        assert inst3.data == {"uprn": 1}
 
     def test_get_components(
         self, property_instance, mock_cleaner, mock_photo_supply_lookup, mock_floor_area_decile_thresholds
@@ -372,7 +383,9 @@ class TestProperty:
             property_instance.get_components(cleaned, mock_photo_supply_lookup, mock_floor_area_decile_thresholds)
 
     def test_set_spatial(self):
-        prop = Property(1, postcode="AB12CD", address="Test Address")
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = mock_epc_response["rows"][0]
+        prop = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record)
 
         spatial1 = pd.DataFrame([{
             'X_COORDINATE': 411143.0, 'Y_COORDINATE': 281701.0, 'LATITUDE': 52.4331896, 'LONGITUDE': -1.8375238,
@@ -386,7 +399,7 @@ class TestProperty:
         assert prop.is_heritage
         assert prop.restricted_measures
 
-        prop2 = Property(1, "AB12CD", "Test Address")
+        prop2 = Property(1, "AB12CD", "Test Address", epc_record=epc_record)
 
         spatial2 = pd.DataFrame([{
             'X_COORDINATE': 411143.0, 'Y_COORDINATE': 281701.0, 'LATITUDE': 52.4331896, 'LONGITUDE': -1.8375238,
@@ -403,8 +416,9 @@ class TestProperty:
     def test_set_floor_level(self):
         # In this case, we have a flat which looks looks it's on the first floor, but it's actually on the ground
         # floor, so we should set floor_level to 0
-        prop = Property(1, postcode="AB12CD", address="Test Address")
-        prop.data = {'floor-level': '01', 'property-type': 'Flat'}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {'floor-level': '01', 'property-type': 'Flat'}
+        prop = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record)
         prop.floor = {
             'original_description': 'Solid, no insulation (assumed)', 'clean_description': 'Solid, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': True,
@@ -419,8 +433,9 @@ class TestProperty:
 
         # This property is labelled as being on the ground floor but actually has another property below
         # so we set floor level to 1
-        prop2 = Property(1, postcode="AB12CD", address="Test Address")
-        prop2.data = {'floor-level': 'Ground', 'property-type': 'Flat'}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {'floor-level': 'Ground', 'property-type': 'Flat'}
+        prop2 = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record)
         prop2.floor = {
             'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': False,
@@ -434,8 +449,9 @@ class TestProperty:
         assert prop2.floor_level == 1
 
         # this property is correctly labelled as being on the 2nd floor
-        prop3 = Property(1, postcode="AB12CD", address="Test Address")
-        prop3.data = {'floor-level': '02', 'property-type': 'Flat'}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {'floor-level': '02', 'property-type': 'Flat'}
+        prop3 = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record)
         prop3.floor = {
             'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': False,
@@ -449,8 +465,9 @@ class TestProperty:
         assert prop3.floor_level == 2
 
         # Example of a house
-        prop4 = Property(1, postcode="AB12CD", address="Test Address")
-        prop4.data = {'floor-level': '', 'property-type': 'House'}
+        epc_record = EPCRecord()
+        epc_record.prepared_epc = {'floor-level': '', 'property-type': 'House'}
+        prop4 = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record)
         prop4.floor = {
             'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': False,

From 01a4628d206be30ed88c195fa9b7b04909a53637 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:03:45 +0000
Subject: [PATCH 17/48] read in asset list for ha 1, working on ha 6

---
 etl/eligibility/ha_15_32/app.py               |   2 -
 .../ha_15_32/ha_analysis_batch_3.py           | 182 ++++++++++++++++++
 2 files changed, 182 insertions(+), 2 deletions(-)
 create mode 100644 etl/eligibility/ha_15_32/ha_analysis_batch_3.py

diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py
index 76aadcc4..ce216364 100644
--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@@ -16,8 +16,6 @@ from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
 from backend.Property import Property
 from etl.eligibility.Eligibility import Eligibility
-from etl.epc.DataProcessor import DataProcessor
-from backend.app.plan.utils import create_recommendation_scoring_data
 from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from backend.ml_models.api import ModelApi
 
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
new file mode 100644
index 00000000..7c28d481
--- /dev/null
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -0,0 +1,182 @@
+import os
+import msgpack
+import openpyxl
+from pathlib import Path
+from tqdm import tqdm
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+from etl.solar.SolarPhotoSupply import SolarPhotoSupply
+from recommendations.recommendation_utils import calculate_cavity_age
+from recommendation_utils import convert_thickness_to_numeric
+
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+class DataLoader:
+    COLOUR_CONFIG = {
+        "ha_1": {
+            "asset_list": {"red": "FFFF0000", "green": "FF00B050"},
+        },
+        "ha_6": {
+            "asset_list": {"red": "FFFF0000", "green": "FF00B050"},
+        },
+    }
+
+    def __init__(self, files):
+        self.files = files
+
+    def load_asset_list(self, file_path, ha_name, sheet_name=None):
+        workbook = openpyxl.load_workbook(file_path)
+        if sheet_name is not None:
+            sheet = workbook[sheet_name]
+        else:
+            sheet = workbook.active
+        sheet_colnames = [cell.value for cell in sheet[1]]
+
+        rows_data = []
+        rows_colors = []
+        for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+            row_data = [cell.value for cell in row]  # This will get you the cell values
+            row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+            # row_color = COLOR_INDEX[row_color]
+            rows_data.append(row_data)
+            rows_colors.append(row_color)
+
+        asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+        asset_list['row_color'] = rows_colors
+
+        asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
+
+        asset_list["row_colour_name"] = np.where(
+            asset_list["row_color"] == asset_list_colours["red"], "red",
+            np.where(asset_list["row_color"] == asset_list_colours["green"], "green", "yellow")
+        )
+
+        asset_list["row_meaning"] = np.where(
+            asset_list["row_colour_name"] == "red", "does not meet criteria",
+            np.where(
+                asset_list["row_colour_name"] == "green", "identified potential eco works (CWI)", "maybe in the future"
+            )
+        )
+
+        return asset_list
+
+    def load_survey_list(self, file_path, ha_name, sheet_name=None):
+        survey_workbook = openpyxl.load_workbook(file_path)
+        if sheet_name is not None:
+            survey_sheet = survey_workbook[sheet_name]
+        else:
+            survey_sheet = survey_workbook.active
+
+        survey_rows = []
+        survey_colors = []
+
+        for row in tqdm(survey_sheet.iter_rows(min_row=2, values_only=False)):  # Assuming the first row is headers
+            row_data = [cell.value for cell in row]  # This will get you the cell values
+            row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+            survey_rows.append(row_data)
+            survey_colors.append(row_color)
+
+        survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+
+        survey_list["row_colour"] = survey_colors
+        survey_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
+
+        # The survey list has 4 possible colours:
+        # PURPLE - Installer advised install complete and a complimentary post works EPC has been completed.
+        # GREEN - Installer advised install complete.
+        # RED - Cancelled
+        # NO FILL - No official update from installer (could be installed or cancelled)
+
+        survey_list["row_colour_name"] = np.where(
+            survey_list["row_colour"] == survey_list_colours["red"], "red",
+            np.where(survey_list["row_colour"] == survey_list_colours["green"], "green",
+                     np.where(survey_list["row_colour"] == survey_list_colours["purple"], "purple", "yellow"))
+        )
+
+        survey_list["row_meaning"] = np.where(
+            survey_list["row_colour_name"] == "red", "Cancelled",
+            np.where(
+                survey_list["row_colour_name"] == "green",
+                "Installer advised install complete",
+                np.where(
+                    survey_list["row_colour_name"] == "purple",
+                    "Installer advised install complete and a complimentary post works EPC has been completed",
+                    "No official update from installer (could be installed or cancelled)"
+                )
+            )
+        )
+
+        return survey_list
+
+    def load(self):
+
+        data = {}
+        for ha_name, file_config in self.files.items():
+            # Load asset list
+            # logger.info("LOading asset list for {}".format(ha_name))
+            asset_list = self.load_asset_list(
+                file_path=file_config["asset_list"]["filepath"],
+                ha_name=ha_name,
+                sheet_name=file_config["asset_list"]["sheetname"]
+            )
+
+            if file_config.get("survey_list"):
+                survey_list = self.load_survey_list(
+                    file_path=file_config["survey_list"]["filepath"],
+                    ha_name=ha_name,
+                    sheet_name=file_config["survey_list"]["sheetname"]
+                )
+            else:
+                survey_list = None
+
+            data[ha_name] = {
+                "asset_list": asset_list,
+                "survey_list": survey_list
+            }
+
+
+def app():
+    """
+    This app contains the housing association analysis for HAs 1, 6, 14, 39 and 107.
+    Only HA 6 has surveys
+    :return:
+    """
+
+    files = {
+        "ha_1": {
+            "asset_list": {
+                "filepath": "etl/eligibility/ha_15_32/HA 1 - ASSET LIST.xlsx",
+                "sheetname": "HA 1"
+            }
+        },
+        "ha_6": {
+            "asset_list": {
+                "filepath": "etl/eligibility/ha_15_32/HA 6 - ASSET LIST.xlsx",
+                "sheetname": "HA 6"
+            },
+            "survey_list": {
+                "filepath": "etl/eligibility/ha_15_32/HA 6 - SURVEY LIST.xlsx",
+                "sheetname": "HA 6"
+            }
+        },
+        "ha_14": {"asset_list": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx"},
+        "ha_39": {"asset_list": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx"},
+        "ha_107": {"asset_list": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx"}
+    }
+
+    loader = DataLoader(files)

From b22003d2066b4de6b9d3c1aba9091cc5bf98b09b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:12:06 +0000
Subject: [PATCH 18/48] Read in survey list for HA 6

---
 .../ha_15_32/ha_analysis_batch_3.py            | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 7c28d481..9a95cd21 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -33,6 +33,9 @@ class DataLoader:
         },
         "ha_6": {
             "asset_list": {"red": "FFFF0000", "green": "FF00B050"},
+            "survey_list": {
+                "green": "FF92D050", "purple": "FF7030A0", "red": "FFFF0000", "blue": "FF00B0F0"
+            }
         },
     }
 
@@ -57,6 +60,7 @@ class DataLoader:
             rows_colors.append(row_color)
 
         asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+        asset_list = asset_list.loc[:, asset_list.columns.notnull()]
         asset_list['row_color'] = rows_colors
 
         asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
@@ -92,20 +96,24 @@ class DataLoader:
             survey_colors.append(row_color)
 
         survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+        # Remove columns that are None
+        survey_list = survey_list.loc[:, survey_list.columns.notnull()]
 
         survey_list["row_colour"] = survey_colors
-        survey_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
+        survey_list_colours = self.COLOUR_CONFIG[ha_name]["survey_list"]
 
         # The survey list has 4 possible colours:
         # PURPLE - Installer advised install complete and a complimentary post works EPC has been completed.
         # GREEN - Installer advised install complete.
         # RED - Cancelled
+        # BLUE - Loft Only Installed
         # NO FILL - No official update from installer (could be installed or cancelled)
 
         survey_list["row_colour_name"] = np.where(
             survey_list["row_colour"] == survey_list_colours["red"], "red",
             np.where(survey_list["row_colour"] == survey_list_colours["green"], "green",
-                     np.where(survey_list["row_colour"] == survey_list_colours["purple"], "purple", "yellow"))
+                     np.where(survey_list["row_colour"] == survey_list_colours["purple"], "purple",
+                              np.where(survey_list["row_colour"] == survey_list_colours["blue"], "blue", "no fill")))
         )
 
         survey_list["row_meaning"] = np.where(
@@ -116,7 +124,11 @@ class DataLoader:
                 np.where(
                     survey_list["row_colour_name"] == "purple",
                     "Installer advised install complete and a complimentary post works EPC has been completed",
-                    "No official update from installer (could be installed or cancelled)"
+                    np.where(
+                        survey_list["row_colour_name"] == "blue",
+                        "Loft Only Installed",
+                        "No official update from installer (could be installed or cancelled)"
+                    )
                 )
             )
         )

From f1670498d1fcc55473f63ca76287fe3309a648d7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:15:56 +0000
Subject: [PATCH 19/48] Setting up to merge HA6

---
 .../ha_15_32/ha_analysis_batch_3.py           | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 9a95cd21..bd2c6c99 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -77,9 +77,12 @@ class DataLoader:
             )
         )
 
+        # Add in asset_list_row_id
+        asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]
+
         return asset_list
 
-    def load_survey_list(self, file_path, ha_name, sheet_name=None):
+    def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
         survey_workbook = openpyxl.load_workbook(file_path)
         if sheet_name is not None:
             survey_sheet = survey_workbook[sheet_name]
@@ -133,8 +136,22 @@ class DataLoader:
             )
         )
 
+        # Add in asset_list_row_id
+        survey_list["survey_list_row_id"] = [ha_name + str(i) for i in range(0, len(survey_list))]
+
+        # We now do the matching between the asset list and the survey list.
+        # What we'll get from this is a lookup table from the asset list to the survey list
+
+        if ha_name == "ha_6":
+            self.merge_ha_6(asset_list, survey_list)
+        else:
+            raise NotImplementedError("Only HA 6 has surveys")
+
         return survey_list
 
+    def merge_ha_6(self, asset_list, survey_list):
+        pass
+
     def load(self):
 
         data = {}

From cf9253d06201bbadca263eef269de973957b9556 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:41:52 +0000
Subject: [PATCH 20/48] working on matching code for HA6 asset and survey lists

---
 .../ha_15_32/ha_analysis_batch_3.py           | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index bd2c6c99..7fbddd54 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -52,7 +52,7 @@ class DataLoader:
 
         rows_data = []
         rows_colors = []
-        for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)):  # Assuming the first row is headers
             row_data = [cell.value for cell in row]  # This will get you the cell values
             row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
             # row_color = COLOR_INDEX[row_color]
@@ -137,7 +137,7 @@ class DataLoader:
         )
 
         # Add in asset_list_row_id
-        survey_list["survey_list_row_id"] = [ha_name + str(i) for i in range(0, len(survey_list))]
+        survey_list["survey_list_row_id"] = [ha_name + "_surveys_" + str(i) for i in range(0, len(survey_list))]
 
         # We now do the matching between the asset list and the survey list.
         # What we'll get from this is a lookup table from the asset list to the survey list
@@ -150,14 +150,53 @@ class DataLoader:
         return survey_list
 
     def merge_ha_6(self, asset_list, survey_list):
-        pass
+
+        # Prepare the asset list
+        asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().strip()
+        asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().strip()
+
+        split_addresses = asset_list['matching_address'].str.split(',', expand=True)
+        split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
+        house_numbers = split_addresses['temp'].str.split(' ', expand=True)
+        house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"]
+
+        asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
+        del split_addresses, house_numbers
+
+        matching_lookup = []
+        for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+            house_number = row["NO."]
+            if isinstance(house_number, str):
+                house_number = house_number.lower().strip()
+
+            # Filter on the first line of the address
+            df = asset_list[
+                asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip())
+            ].copy()
+            df = df[df["matching_address"].str.contains(str(house_number))]
+            if df.shape[0] != 1:
+                df = df[df["HouseNo"] == str(house_number)]
+                if df.shape[0] != 1:
+                    df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                    if df.shape[0] != 1:
+                        print(row["Street / Block Name"])
+                        print(house_number)
+                        print(row["Post Code"].lower())
+                        raise ValueError("Investigate")
+
+            matching_lookup.append(
+                {
+                    "survey_list_row_id": row["survey_list_row_id"],
+                    "asset_list_row_id": df["asset_list_row_id"].values[0],
+                }
+            )
 
     def load(self):
 
         data = {}
         for ha_name, file_config in self.files.items():
             # Load asset list
-            # logger.info("LOading asset list for {}".format(ha_name))
+            logger.info("Loading asset list for {}".format(ha_name))
             asset_list = self.load_asset_list(
                 file_path=file_config["asset_list"]["filepath"],
                 ha_name=ha_name,
@@ -165,6 +204,7 @@ class DataLoader:
             )
 
             if file_config.get("survey_list"):
+                logger.info("Loading survey list for {}".format(ha_name))
                 survey_list = self.load_survey_list(
                     file_path=file_config["survey_list"]["filepath"],
                     ha_name=ha_name,
@@ -209,3 +249,4 @@ def app():
     }
 
     loader = DataLoader(files)
+    loader.load()

From 8c61cca85d821b5fa7c2901bfa164249f8c1dce6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:47:08 +0000
Subject: [PATCH 21/48] matching 7% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 7fbddd54..257e71d2 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -151,9 +151,12 @@ class DataLoader:
 
     def merge_ha_6(self, asset_list, survey_list):
 
+        # Correct the asset list
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
+
         # Prepare the asset list
-        asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().strip()
-        asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().strip()
+        asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
+        asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
 
         split_addresses = asset_list['matching_address'].str.split(',', expand=True)
         split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']

From 1e52fe7fb97061d64194456e6d56bd814b660cf7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:51:28 +0000
Subject: [PATCH 22/48] 11% complete matching

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 257e71d2..3bfea948 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -166,6 +166,14 @@ class DataLoader:
         asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
         del split_addresses, house_numbers
 
+        # Correct the survey list
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Seabridge Road", "Seabridge Lane"
+        )
+
+        # Strip out /KNUTTON from the street name
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/KNUTTON", "")
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From 7a2c90cbf36c7e1d73452527683d52e8719be382 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:54:20 +0000
Subject: [PATCH 23/48] matching 23% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 3bfea948..dfbd4fa4 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -174,6 +174,18 @@ class DataLoader:
         # Strip out /KNUTTON from the street name
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/KNUTTON", "")
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Clevend Road", "Cleveland Road"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "TURNERS AVENUE", "Turner Avenue"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "WEDGEWWOD AVENUE", "Wedgwood Avenue"
+        )
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From ed0bbf44c76a1303fac20814875f3e99798ed9bd Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 12:02:43 +0000
Subject: [PATCH 24/48] matching 34% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index dfbd4fa4..9cae6e37 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -153,6 +153,8 @@ class DataLoader:
 
         # Correct the asset list
         asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
 
         # Prepare the asset list
         asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
@@ -185,6 +187,8 @@ class DataLoader:
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
             "WEDGEWWOD AVENUE", "Wedgwood Avenue"
         )
+        # The cherrytree record has wrong postcode
+        survey_list.loc[survey_list["Street / Block Name"] == "Cherrytree road", "Post Code"] = "ST5 7BP"
 
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):

From 702de41d464e27cf4dad6db118dd1ac367e99c27 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 12:36:03 +0000
Subject: [PATCH 25/48] matching 42% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 9cae6e37..5b2cefcd 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -190,6 +190,20 @@ class DataLoader:
         # The cherrytree record has wrong postcode
         survey_list.loc[survey_list["Street / Block Name"] == "Cherrytree road", "Post Code"] = "ST5 7BP"
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "MONUMENT RD", "Monument Road"
+        )
+
+        # Generally replace " RD" with " Road"
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(" RD", " Road")
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "HILARY Road", "Hillary Road"
+        )
+
+        # Remove full stops from the street name
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(".", "")
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From 4601edbf27f3dd4de5d6df05495d3cfdd1f4e74d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 12:42:01 +0000
Subject: [PATCH 26/48] matching 51% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 5b2cefcd..ff717f86 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -204,6 +204,21 @@ class DataLoader:
         # Remove full stops from the street name
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(".", "")
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Chatworth road", "Chatsworth Place"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Wood Croft", "Woodcroft"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Milstone Avenue", "Millstone Avenue"
+        )
+
+        # Strip out /TALKE from the street name
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/TALKE", "")
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From 96893aae14c3896a62fbb8c76a58e41ea567a3e3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 16:51:43 +0000
Subject: [PATCH 27/48] matching 61% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index ff717f86..261c0fd4 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -219,6 +219,10 @@ class DataLoader:
         # Strip out /TALKE from the street name
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/TALKE", "")
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Woodcutts Street", "Woodshutts Street"
+        )
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From a2a8bc012e51b1f2d9977e865e216c10f14b9ba0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 17:13:35 +0000
Subject: [PATCH 28/48] matching 82% complete

---
 .../ha_15_32/ha_analysis_batch_3.py           | 41 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 261c0fd4..3a5b4ab4 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -142,12 +142,13 @@ class DataLoader:
         # We now do the matching between the asset list and the survey list.
         # What we'll get from this is a lookup table from the asset list to the survey list
 
+        matched_lookup = pd.DataFrame()
         if ha_name == "ha_6":
-            self.merge_ha_6(asset_list, survey_list)
+            matched_lookup = self.merge_ha_6(asset_list, survey_list)
         else:
             raise NotImplementedError("Only HA 6 has surveys")
 
-        return survey_list
+        return survey_list, matched_lookup
 
     def merge_ha_6(self, asset_list, survey_list):
 
@@ -223,6 +224,42 @@ class DataLoader:
             "Woodcutts Street", "Woodshutts Street"
         )
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "HILLARY AVENUE", "Hillary Road"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "HILLARY AVENUE", "Hillary Road"
+        )
+
+        # Replace " Rd" with " Road"
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(" Rd", " Road")
+
+        # We have a record listed as 19, MAPLE AVENUE ST7 1JX, when it should be 19, Hollins Crescent ST7 1JX
+        survey_list.loc[
+            (survey_list["Street / Block Name"] == "MAPLE AVENUE") &
+            (survey_list["NO."].isin([19])) &
+            (survey_list["Post Code"] == "ST7 1JX"),
+            "Street / Block Name"
+        ] = "Hollins Crescent"
+
+        # However, some of the Maple Avenue records are indeed Maple Avenue but are listed with the wrong postcode.
+        # E.g. number 26
+        survey_list.loc[
+            (survey_list["Street / Block Name"] == "MAPLE AVENUE") &
+            (survey_list["NO."].isin([26])) &
+            (survey_list["Post Code"] == "ST7 1JX"),
+            "Post Code"
+        ] = "ST7 1JW"
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "BURSLEY Road", "Bursley Way"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Brittania Avenue", "Brittain Avenue"
+        )
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From 48ec641675a34b4c7c8b6c1cdbae62ecf2cc45d3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 17:17:57 +0000
Subject: [PATCH 29/48] 82% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 3a5b4ab4..61a90d14 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -156,6 +156,7 @@ class DataLoader:
         asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
         asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
         asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way")
 
         # Prepare the asset list
         asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
@@ -260,6 +261,9 @@ class DataLoader:
             "Brittania Avenue", "Brittain Avenue"
         )
 
+        # Moffat Way
+        # Moffatt Way
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From 90a47d765b315f78c47ce1f4db4851b5cfa9e633 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 17:46:20 +0000
Subject: [PATCH 30/48] matching 89% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 61a90d14..05c0299c 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -142,7 +142,6 @@ class DataLoader:
         # We now do the matching between the asset list and the survey list.
         # What we'll get from this is a lookup table from the asset list to the survey list
 
-        matched_lookup = pd.DataFrame()
         if ha_name == "ha_6":
             matched_lookup = self.merge_ha_6(asset_list, survey_list)
         else:
@@ -261,8 +260,9 @@ class DataLoader:
             "Brittania Avenue", "Brittain Avenue"
         )
 
-        # Moffat Way
-        # Moffatt Way
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Hawthorn Road", "Hawthorne Road"
+        )
 
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
@@ -274,6 +274,7 @@ class DataLoader:
             df = asset_list[
                 asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip())
             ].copy()
+
             df = df[df["matching_address"].str.contains(str(house_number))]
             if df.shape[0] != 1:
                 df = df[df["HouseNo"] == str(house_number)]

From 4ed4c154805eb2f907b07200bd96b4bec8ed0566 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 17:47:40 +0000
Subject: [PATCH 31/48] matching 95% complete

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 05c0299c..52117d17 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -264,6 +264,10 @@ class DataLoader:
             "Hawthorn Road", "Hawthorne Road"
         )
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Eastdale Place", "Easdale Place"
+        )
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
             house_number = row["NO."]

From 709a50f02ef5b263d6c83b46aac8a1839ed98511 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 18:46:43 +0000
Subject: [PATCH 32/48] setting up cache

---
 .../ha_15_32/ha_analysis_batch_3.py           | 82 +++++++++++++++++--
 utils/s3.py                                   | 54 +++++++++++-
 2 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 52117d17..bf91d8b6 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -6,7 +6,7 @@ from tqdm import tqdm
 from datetime import datetime
 import pandas as pd
 import numpy as np
-from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
+from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
 from utils.logger import setup_logger
 from dotenv import load_dotenv
 from tqdm import tqdm
@@ -39,8 +39,11 @@ class DataLoader:
         },
     }
 
-    def __init__(self, files):
+    def __init__(self, files, use_cache):
         self.files = files
+        self.use_cache = use_cache
+
+        self.data = {}
 
     def load_asset_list(self, file_path, ha_name, sheet_name=None):
         workbook = openpyxl.load_workbook(file_path)
@@ -149,7 +152,8 @@ class DataLoader:
 
         return survey_list, matched_lookup
 
-    def merge_ha_6(self, asset_list, survey_list):
+    @staticmethod
+    def merge_ha_6(asset_list, survey_list):
 
         # Correct the asset list
         asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
@@ -268,8 +272,39 @@ class DataLoader:
             "Eastdale Place", "Easdale Place"
         )
 
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Wedgewood Road", "Wedgwood Road"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Droitwich Drive", "Droitwich Close"
+        )
+
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Longdale Road", "Langdale Road"
+        )
+
+        # We have 2 addresses in the survey list that don't have postcodes. We'll manually add them in
+        survey_list.loc[
+            (survey_list["Street / Block Name"] == "Rogers Avenue") &
+            pd.isnull(survey_list["Post Code"]),
+            "Post Code"
+        ] = "ST5 9AT"
+
+        survey_list.loc[
+            (survey_list["Street / Block Name"] == "Cedar Road") &
+            pd.isnull(survey_list["Post Code"]),
+            "Post Code"
+        ] = "ST5 7BY"
+
+        missed_postcodes = [
+            postcode.lower() for postcode in survey_list["Post Code"] if
+            postcode.lower() not in asset_list["matching_postcode"].values
+        ]
+
         matching_lookup = []
         for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+
             house_number = row["NO."]
             if isinstance(house_number, str):
                 house_number = house_number.lower().strip()
@@ -285,6 +320,16 @@ class DataLoader:
                 if df.shape[0] != 1:
                     df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())]
                     if df.shape[0] != 1:
+                        postcode_lower = row["Post Code"].lower()
+                        if postcode_lower in missed_postcodes:
+                            matching_lookup.append(
+                                {
+                                    "survey_list_row_id": row["survey_list_row_id"],
+                                    "asset_list_row_id": None,
+                                }
+                            )
+                            continue
+
                         print(row["Street / Block Name"])
                         print(house_number)
                         print(row["Post Code"].lower())
@@ -297,8 +342,19 @@ class DataLoader:
                 }
             )
 
+        matching_lookup = pd.DataFrame(matching_lookup)
+
+        return matching_lookup
+
     def load(self):
 
+        if self.use_cache:
+            self.data = read_pickle_from_s3(
+                bucket_name="retrofit-datalake-dev",
+                s3_file_name="ha-analysis/batch3-inputs.pickle",
+            )
+            return
+
         data = {}
         for ha_name, file_config in self.files.items():
             # Load asset list
@@ -311,19 +367,31 @@ class DataLoader:
 
             if file_config.get("survey_list"):
                 logger.info("Loading survey list for {}".format(ha_name))
-                survey_list = self.load_survey_list(
+                survey_list, matched_lookup = self.load_survey_list(
                     file_path=file_config["survey_list"]["filepath"],
                     ha_name=ha_name,
                     sheet_name=file_config["survey_list"]["sheetname"]
                 )
             else:
                 survey_list = None
+                matched_lookup = None
 
             data[ha_name] = {
                 "asset_list": asset_list,
-                "survey_list": survey_list
+                "survey_list": survey_list,
+                "matched_lookup": matched_lookup
             }
 
+        self.data = data
+
+        # Cache the data in s3
+        # We need to pickle the data and store in s3
+        save_pickle_to_s3(
+            data=self.data,
+            bucket_name="retrofit-datalake-dev",
+            s3_file_name="ha-analysis/batch3-inputs.pickle",
+        )
+
 
 def app():
     """
@@ -332,6 +400,8 @@ def app():
     :return:
     """
 
+    use_cache = False
+
     files = {
         "ha_1": {
             "asset_list": {
@@ -354,5 +424,5 @@ def app():
         "ha_107": {"asset_list": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx"}
     }
 
-    loader = DataLoader(files)
+    loader = DataLoader(files, use_cache)
     loader.load()
diff --git a/utils/s3.py b/utils/s3.py
index e63b7192..3d6cf038 100644
--- a/utils/s3.py
+++ b/utils/s3.py
@@ -1,3 +1,4 @@
+import pickle
 import boto3
 from io import BytesIO, StringIO
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
@@ -141,5 +142,56 @@ def save_csv_to_s3(dataframe, bucket_name, file_name):
         s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=file_name)
         return True
     except Exception as e:
-        print(f"An error occurred: {e}")
+        logger.error(f"An error occurred: {e}")
         return False
+
+
+def save_pickle_to_s3(data, bucket_name, s3_file_name):
+    """
+    Save an object to an S3 bucket as a pickle file.
+
+    :param data: The data to save (must be picklable)
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The key to use for the saved data in S3 (e.g. ending in .pkl or .pickle)
+    """
+    # Serialize data to a pickle format; on failure we log and return without uploading anything
+    try:
+        serialized_data = pickle.dumps(data)
+    except Exception as e:
+        logger.error(f'Failed to serialize data: {str(e)}')
+        return
+
+    # Use save_data_to_s3 function to upload the serialized data to S3
+    save_data_to_s3(serialized_data, bucket_name, s3_file_name)
+
+
+def read_pickle_from_s3(bucket_name, s3_file_name):
+    """
+    Read a pickle file from an S3 bucket and return the data.
+
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The file name of the pickle file in S3
+    :return: The data read from the pickle file, or None if the download or deserialization fails
+    """
+    try:
+        s3 = boto3.client('s3')
+        s3_response = s3.get_object(Bucket=bucket_name, Key=s3_file_name)
+        serialized_data = s3_response['Body'].read()
+    except NoCredentialsError:
+        logger.error("Credentials not available.")
+        return None
+    except PartialCredentialsError:
+        logger.error("Incomplete credentials provided.")
+        return None
+    except Exception as e:
+        logger.error(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}')
+        return None
+
+    # NOTE: pickle.loads can execute arbitrary code - only read objects from buckets we control and trust
+    try:
+        data = pickle.loads(serialized_data)
+    except Exception as e:
+        logger.error(f'Failed to deserialize data: {str(e)}')
+        return None
+
+    return data

From 0620c45a223e542218caa84211c695e1461b385c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 18:59:53 +0000
Subject: [PATCH 33/48] Added read for other ha files

---
 .../ha_15_32/ha_analysis_batch_3.py           | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index bf91d8b6..85f8704d 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -37,6 +37,15 @@ class DataLoader:
                 "green": "FF92D050", "purple": "FF7030A0", "red": "FFFF0000", "blue": "FF00B0F0"
             }
         },
+        "ha_14": {
+            "asset_list": {"red": "FFFF0000", "green": "FF00B050"},
+        },
+        "ha_39": {
+            "asset_list": {"red": "FFFF0000", "green": "FF00B050"},
+        },
+        "ha_107": {
+            "asset_list": {"red": "FFFF0000", "green": "FF00B050"},
+        }
     }
 
     def __init__(self, files, use_cache):
@@ -368,6 +377,7 @@ class DataLoader:
             if file_config.get("survey_list"):
                 logger.info("Loading survey list for {}".format(ha_name))
                 survey_list, matched_lookup = self.load_survey_list(
+                    asset_list=asset_list,
                     file_path=file_config["survey_list"]["filepath"],
                     ha_name=ha_name,
                     sheet_name=file_config["survey_list"]["sheetname"]
@@ -419,9 +429,24 @@ def app():
                 "sheetname": "HA 6"
             }
         },
-        "ha_14": {"asset_list": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx"},
-        "ha_39": {"asset_list": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx"},
-        "ha_107": {"asset_list": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx"}
+        "ha_14": {
+            "asset_list": {
+                "filepath": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx",
+                "sheetname": "HA 14"
+            }
+        },
+        "ha_39": {
+            "asset_list": {
+                "filepath": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx",
+                "sheetname": "Sheet1"
+            }
+        },
+        "ha_107": {
+            "asset_list": {
+                "filepath": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx",
+                "sheetname": "HA 107"
+            }
+        }
     }
 
     loader = DataLoader(files, use_cache)

From 9ac6b25b9fa1adf91926109d6a5610d50bee28b8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 23 Jan 2024 18:06:34 +0000
Subject: [PATCH 34/48] improving data read code to create standardised
 matching_address and house number

---
 backend/ml_models/Valuation.py                |   2 +
 .../ha_15_32/ha_analysis_batch_3.py           | 134 +++++++++++++++---
 etl/testing_data/livewest_pilot.py            |  38 +++++
 .../the_guiness_partnership_pilot.py          |  38 +++++
 4 files changed, 192 insertions(+), 20 deletions(-)
 create mode 100644 etl/testing_data/livewest_pilot.py
 create mode 100644 etl/testing_data/the_guiness_partnership_pilot.py

diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index dadef9a9..ff771252 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -22,6 +22,8 @@ class PropertyValuation:
         100021192109: 650000,  # Based on Zoopla
         766249482: 358000,  # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached
         100120703802: 277000,  # Based on Zoopla
+        10014469685: 286000,  # Based on Zoopla
+        10001328782: 196000,  # Based on Zoopla
     }
 
     # We base our valuation uplifts on a number of sources
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 85f8704d..54cd7c58 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -1,8 +1,7 @@
 import os
-import msgpack
 import openpyxl
 from pathlib import Path
-from tqdm import tqdm
+import msgpack
 from datetime import datetime
 import pandas as pd
 import numpy as np
@@ -48,6 +47,14 @@ class DataLoader:
         }
     }
 
+    MIN_ROWS = {
+        "ha_1": 2,
+        "ha_6": 2,
+        "ha_14": 3,  # The spreadsheet starts from the third row
+        "ha_39": 2,
+        "ha_107": 2,
+    }
+
     def __init__(self, files, use_cache):
         self.files = files
         self.use_cache = use_cache
@@ -60,11 +67,14 @@ class DataLoader:
             sheet = workbook[sheet_name]
         else:
             sheet = workbook.active
-        sheet_colnames = [cell.value for cell in sheet[1]]
+        sheet_colnames = [cell.value for cell in sheet[self.MIN_ROWS[ha_name] - 1]]
 
         rows_data = []
         rows_colors = []
-        for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)):  # Assuming the first row is headers
+        for row in tqdm(
+            sheet.iter_rows(min_row=self.MIN_ROWS[ha_name], values_only=False)
+        ):  # Assuming the first row is headers
+
             row_data = [cell.value for cell in row]  # This will get you the cell values
             row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
             # row_color = COLOR_INDEX[row_color]
@@ -73,8 +83,12 @@ class DataLoader:
 
         asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
         asset_list = asset_list.loc[:, asset_list.columns.notnull()]
+
         asset_list['row_color'] = rows_colors
 
+        # Remove entirely empty rows - consider all columns apart from row_color
+        asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)]
+
         asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"]
 
         asset_list["row_colour_name"] = np.where(
@@ -92,6 +106,54 @@ class DataLoader:
         # Add in asset_list_row_id
         asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]
 
+        # Prepare the asset list
+        # Depending on the HA, we need to rename some columns
+        if ha_name == "ha_1":
+            asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Address - Postcode"].str.lower().str.strip()
+        elif ha_name == "ha_6":
+            asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
+        elif ha_name == "ha_14":
+            # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
+            asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \
+                                             asset_list["Address 2"].str.lower().str.strip() + ", " + \
+                                             asset_list["Address 3"].str.lower().str.strip() + ", " + \
+                                             asset_list["Address 4"].str.lower().str.strip() + ", " + \
+                                             asset_list["Postcode"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
+        elif ha_name == "ha_39":
+            # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
+            asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["add_2"].str.lower().str.strip() + ", " + \
+                                             asset_list["add_3"].str.lower().str.strip() + ", " + \
+                                             asset_list["add_4"].str.lower().str.strip() + ", " + \
+                                             asset_list["add_5"].str.lower().str.strip() + ", " + \
+                                             asset_list["post_code"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip()
+        elif ha_name == "ha_107":
+            # Create matching_address by concatenating House No, Street, Town, District, Postcode
+            asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["Street"].str.lower().str.strip() + ", " + \
+                                             asset_list["Town"].str.lower().str.strip() + ", " + \
+                                             asset_list["District"].str.lower().str.strip() + ", " + \
+                                             asset_list["Postcode"].str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
+        else:
+            raise NotImplementedError("implement me")
+
+        if ha_name in ["ha_107"]:
+            asset_list["HouseNo"] = asset_list["House No"].copy()
+        else:
+            split_addresses = asset_list['matching_address'].str.split(',', expand=True)
+            house_numbers = split_addresses[0].str.split(' ', expand=True)
+            # The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
+            # many columns there might be
+            house_numbers = house_numbers.iloc[:, 0:1]
+            house_numbers.columns = ['HouseNo']
+
+            asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
+
         return asset_list
 
     def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
@@ -165,22 +227,10 @@ class DataLoader:
     def merge_ha_6(asset_list, survey_list):
 
         # Correct the asset list
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way")
-
-        # Prepare the asset list
-        asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip()
-        asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip()
-
-        split_addresses = asset_list['matching_address'].str.split(',', expand=True)
-        split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
-        house_numbers = split_addresses['temp'].str.split(' ', expand=True)
-        house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"]
-
-        asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
-        del split_addresses, house_numbers
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close")
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way")
 
         # Correct the survey list
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
@@ -403,6 +453,30 @@ class DataLoader:
         )
 
 
+def get_epc_data(loader):
+    if not loader.data:
+        raise ValueError("Data not found - please run loader.load() first")
+
+    property_type_lookup = {}
+
+    for ha_name, data_assets in loader.data.items():
+        # For each HA, we read pull in the data required, and store in S3
+        asset_list = data_assets["asset_list"]
+
+        # We iterate through the asset list and pull what we need
+        for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
+            searcher = SearchEpc(
+                address1=property_meta["No."],
+                postcode=property_meta["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key=None,
+                full_address=property_meta["Address"]
+            )
+            searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"]
+            searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"]
+            searcher.find_property(skip_os=True)
+
+
 def app():
     """
     This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107.
@@ -451,3 +525,23 @@ def app():
 
     loader = DataLoader(files, use_cache)
     loader.load()
+
+    # TODO: We probably need to make sure that we have all of the columns that we need
+
+    # We load in the additional data required to perform the analysis
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_dataframe_from_s3_parquet(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()
+
+    photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
+
+    get_epc_data(loader)
diff --git a/etl/testing_data/livewest_pilot.py b/etl/testing_data/livewest_pilot.py
new file mode 100644
index 00000000..580c16d0
--- /dev/null
+++ b/etl/testing_data/livewest_pilot.py
@@ -0,0 +1,38 @@
+"""
+This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
+testing
+"""
+import os
+
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
+USER_ID = 8
+PORTFOLIO_ID = 61
+
+
+def app():
+    pilot_file = pd.DataFrame(
+        [
+            {"address": "42, Foxes Field", "postcode": "TR18 3RJ", "Notes": None},
+            {"address": "11, Cranley Gardens", "postcode": "TQ13 8UT", "Notes": None},
+        ]
+    )
+
+    # Store the data in s3
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/livewest_pilot_file.csv"
+    save_csv_to_s3(
+        dataframe=pilot_file,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increase EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename
+    }
+    print(body)
diff --git a/etl/testing_data/the_guiness_partnership_pilot.py b/etl/testing_data/the_guiness_partnership_pilot.py
new file mode 100644
index 00000000..496ea7ea
--- /dev/null
+++ b/etl/testing_data/the_guiness_partnership_pilot.py
@@ -0,0 +1,38 @@
+"""
+This script will create an input csv for the recommendation engine and upload it to S3, which can be used for
+testing
+"""
+import os
+
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
+USER_ID = 8
+PORTFOLIO_ID = 59
+
+
+def app():
+    pilot_file = pd.DataFrame(
+        [
+            {"address": "10 Elm Close", "postcode": "CV37 8XL", "Notes": None},
+            {"address": "21, Spring Lane", "postcode": "MK17 0QP", "Notes": None},
+        ]
+    )
+
+    # Store the data in s3
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/the_guiness_partnership_pilot_file.csv"
+    save_csv_to_s3(
+        dataframe=pilot_file,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increase EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename
+    }
+    print(body)

From 4b73aa75b2ea9db59b15886cf68f811402767f0a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 23 Jan 2024 18:19:29 +0000
Subject: [PATCH 35/48] fixed the bug in matching ha6 assets and surveys

---
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 54cd7c58..63a72714 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -226,11 +226,18 @@ class DataLoader:
     @staticmethod
     def merge_ha_6(asset_list, survey_list):
 
-        # Correct the asset list
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close")
-        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way")
+        # Correct the asset list across propertyaddress and matching_address
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place")
+        asset_list["matching_address"] = asset_list["matching_address"].str.replace("baggott place", "baggotts place")
+
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree")
+        asset_list["matching_address"] = asset_list["matching_address"].str.replace("cherry tree", "cherrytree")
+
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close")
+        asset_list["matching_address"] = asset_list["matching_address"].str.replace("maryhill close", "mary hill close")
+
+        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way")
+        asset_list["matching_address"] = asset_list["matching_address"].str.replace("moffat way", "moffatt way")
 
         # Correct the survey list
         survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(

From 04aeaae613351c030af740b5f4d4637057bc43a2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 00:17:17 +0000
Subject: [PATCH 36/48] working on new ha batch

---
 backend/Property.py                           |   2 +-
 etl/eligibility/ha_15_32/app.py               |  75 ++++-----
 .../ha_15_32/ha_analysis_batch_3.py           | 144 ++++++++++++++++--
 etl/epc/Record.py                             |   9 +-
 4 files changed, 167 insertions(+), 63 deletions(-)

diff --git a/backend/Property.py b/backend/Property.py
index e6ae8bbe..e527c1ea 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -68,7 +68,7 @@ class Property(Definitions):
         self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
         self.restricted_measures = False
         self.year_built = epc_record.get("year_built")
-        self.number_of_rooms = epc_record.prepared_epc.get("number_heated_rooms")
+        self.number_of_rooms = epc_record.prepared_epc.get("number_habitable_rooms")
         self.age_band = epc_record.get("age_band")
         self.construction_age_band = epc_record.get("construction_age_band")
         self.number_of_floors = epc_record.get("number_of_floors")
diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py
index ce216364..a68bf272 100644
--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@@ -4,6 +4,7 @@ used by the Warmfront team, to identify which properties are eligible for ECO4 a
 work is being done in December 2023, prior to completion of acquisition
 """
 import pickle
+from etl.epc.Record import EPCRecord
 from pathlib import Path
 from tqdm import tqdm
 import pandas as pd
@@ -345,48 +346,31 @@ def prepare_model_data_row(
     :param modelling_epc:
     :return:
     """
+
+    epc_records = {
+        'original_epc': modelling_epc.copy(),
+        'full_sap_epc': full_sap_epc.copy(),
+        'old_data': old_data.copy(),
+    }
+
+    prepared_epc = EPCRecord(
+        epc_records=epc_records,
+        run_mode="newdata",
+        cleaning_data=cleaning_data
+    )
+
     p = Property(
         id=property_id,
         postcode=modelling_epc["postcode"],
         address=modelling_epc["address1"],
-        data=modelling_epc,
-        old_data=old_data,
-        full_sap_epc=full_sap_epc
+        epc_record=prepared_epc
     )
 
-    p.get_components(cleaned, photo_supply_lookup=photo_supply_lookup,
-                     floor_area_decile_thresholds=floor_area_decile_thresholds)
-
-    # THIS IS TEMP AND SHOULDN'T BE HERE
-    data_to_clean = p.get_model_data()
-    if data_to_clean["NUMBER_HEATED_ROOMS"] in ['', None]:
-        data_to_clean["NUMBER_HEATED_ROOMS"] = data_to_clean["NUMBER_HABITABLE_ROOMS"]
-        p.data["number-heated-rooms"] = data_to_clean["NUMBER_HABITABLE_ROOMS"]
-
-    # This is temp - this should happen after scoring
-    cleaned_property_data = DataProcessor.apply_averages_cleaning(
-        data_to_clean=pd.DataFrame([dict(**data_to_clean, LOCAL_AUTHORITY=p.data["local-authority"])]),
-        cleaning_data=cleaning_data,
-        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+    p.get_components(
+        cleaned, photo_supply_lookup=photo_supply_lookup, floor_area_decile_thresholds=floor_area_decile_thresholds
     )
-    p.set_number_lighting_outlets(cleaned_property_data)
 
-    data_processor = DataProcessor(None, newdata=True)
-    data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
-
-    data_processor.pre_process()
-
-    starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
-    ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
-    fixed_data = data_processor.get_fixed_features()
-
-    # We update the ending record with the recommended updates and we set lodgement date to today
-    ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)
-
-    # We simulate the impact of the retrofit using expected performance of the wall and roof,
-    # after retrofit. We use the minimal u-values required to meet building regulations part L
-    # TODO: Check the performance of the materials warmfront's installers use, particularly for
-    #       cavity
+    p.create_base_difference_epc_record(cleaned_lookup=cleaned)
 
     cavity_simulation = {
         "recommendation_id": "-".join([property_id, "cavity"]),
@@ -402,21 +386,16 @@ def prepare_model_data_row(
         "parts": [{"depth": 270}]
     }
 
-    cavity_scoring = create_recommendation_scoring_data(
-        property=p,
-        recommendation=cavity_simulation,
-        starting_epc_data=starting_epc_data,
-        ending_epc_data=ending_epc_data,
-        fixed_data=fixed_data,
-    )
+    simulations = [
+        [cavity_simulation],
+        [loft_simulation]
+    ]
 
-    loft_scoring = create_recommendation_scoring_data(
-        property=p,
-        recommendation=loft_simulation,
-        starting_epc_data=starting_epc_data,
-        ending_epc_data=ending_epc_data,
-        fixed_data=fixed_data,
-    )
+    p.adjust_difference_record_with_recommendations(simulations)
+
+    # Make sure we definitely have the correct data
+    cavity_scoring = [x for x in p.recommendations_scoring_data if "cavity" in x["id"]][0]
+    loft_scoring = [x for x in p.recommendations_scoring_data if "loft" in x["id"]][0]
 
     return [cavity_scoring, loft_scoring]
 
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 63a72714..1bb0f0c4 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -460,29 +460,155 @@ class DataLoader:
         )
 
 
-def get_epc_data(loader):
+def get_epc_data(
+    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds
+):
     if not loader.data:
         raise ValueError("Data not found - please run loader.load() first")
 
-    property_type_lookup = {}
+    property_type_lookup = {
+        "ha_1": {
+            "built_form": {
+                'Mid Terrace': 'Mid-Terrace',
+                'Semi-Detached': 'Semi-Detached',
+                'End Terrace': 'End-Terrace',
+                'Detached': 'Detached',
+                'Enclosed Mid': 'Mid-Terrace',
+                'Detached Local Connect': 'Detached',
+            }
+        }
+    }
 
     for ha_name, data_assets in loader.data.items():
         # For each HA, we read pull in the data required, and store in S3
-        asset_list = data_assets["asset_list"]
+        asset_list = data_assets["asset_list"].copy()
+
+        # If the survey list is missing, it means we have no yet completed any surveys and therefore should only
+        # consider the most recent EPC
+        consider_penultimate_epc = data_assets["survey_list"] is None
 
         # We iterate through the asset list and pull what we need
+        results = []
+        scoring_data = []
         for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
+
+            if ha_name == "ha_1":
+                property_type = property_meta["Asset Type"]
+                # We correct a small error
+                if property_type == "a":
+                    property_type = "House"
+
+                # Remap bedsits to flats
+                if property_type in ["Bedsit", "Room"]:
+                    property_type = "Flat"
+
+                built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None)
+            else:
+                raise NotImplementedError("Implement me")
+
             searcher = SearchEpc(
-                address1=property_meta["No."],
-                postcode=property_meta["Postcode"],
+                address1=property_meta["HouseNo"],
+                postcode=property_meta["matching_postcode"],
                 auth_token=EPC_AUTH_TOKEN,
                 os_api_key=None,
-                full_address=property_meta["Address"]
+                full_address=property_meta["matching_address"]
             )
-            searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"]
-            searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"]
+            searcher.ordnance_survey_client.property_type = property_type
+            searcher.ordnance_survey_client.built_form = built_form
             searcher.find_property(skip_os=True)
 
+            if searcher.newest_epc.get("estimated"):
+                # We insert the row ID as our proxy for UPRN
+                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
+
+            newest_epc = searcher.newest_epc
+            older_epcs = searcher.older_epcs
+            full_sap_epc = searcher.full_sap_epc
+
+            # If we have a survey list, we check the penultimate, because the property might have been installed
+            penultimate_epc = newest_epc
+            if consider_penultimate_epc:
+                # We also want to get the penultimate epc
+                penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+                if not penultimate_epc:
+                    penultimate_epc = newest_epc
+
+            eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+
+            if (not eligibility.eco4_warmfront["eligible"]) and (
+                not eligibility.gbis_warmfront
+            ) and consider_penultimate_epc:
+                # We check the penultimate epc
+                eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+                eligibility.check_gbis_warmfront()
+                eligibility.check_eco4_warmfront()
+                # If this is the case, we need to update the older epcs
+                # We don't update just to make data cleaning easier
+                if penultimate_epc.get("estimated") is None:
+                    older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]
+
+            # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity
+            # Loft MUST be suitable
+            cavity_age = None
+            if (
+                eligibility.walls["is_cavity_wall"] and
+                eligibility.walls["is_filled_cavity"] and
+                eligibility.loft["suitability"] and
+                eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age"
+            ):
+                # We check the age of the cavity and if it's particularly old, we flag it
+                cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)
+
+            # Full checks
+            eligibility.check_gbis()
+            eligibility.check_eco4()
+
+            if eligibility.eco4_warmfront["eligible"]:
+                if eligibility.epc["uprn"] == "":
+                    eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
+
+                scoring_dictionary = prepare_model_data_row(
+                    property_id=property_meta["asset_list_row_id"],
+                    modelling_epc=eligibility.epc,
+                    cleaned=cleaned,
+                    cleaning_data=cleaning_data,
+                    created_at=created_at,
+                    old_data=older_epcs,
+                    full_sap_epc=full_sap_epc,
+                    photo_supply_lookup=photo_supply_lookup,
+                    floor_area_decile_thresholds=floor_area_decile_thresholds
+                )
+                scoring_data.extend(scoring_dictionary)
+
+            results.append(
+                {
+                    "row_id": property_meta["asset_list_row_id"],
+                    "uprn": eligibility.epc["uprn"],
+                    "property_type": eligibility.epc["property-type"],
+                    "gbis_eligible": eligibility.gbis_warmfront,
+                    "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                    "eco4_message": eligibility.eco4_warmfront["message"],
+                    "sap": float(eligibility.epc["current-energy-efficiency"]),
+                    "gbis_eligible_future": eligibility.gbis["eligible"],
+                    "gbis_eligible_future_message": eligibility.gbis["message"],
+                    "eco4_eligible_future": eligibility.eco4["eligible"],
+                    "eco4_eligible_future_message": eligibility.eco4["message"],
+                    # Property components
+                    "roof": eligibility.roof["clean_description"],
+                    "walls": eligibility.walls["clean_description"],
+                    "cavity_type": eligibility.cavity["type"],
+                    "heating": eligibility.epc["mainheat-description"],
+                    "tenure": eligibility.tenure,
+                    "date_epc": eligibility.epc["lodgement-date"],
+                    "loft_thickness": eligibility.roof["insulation_thickness"],
+                    "cavity_age": cavity_age,
+                    **eligibility.walls,
+                    **eligibility.roof,
+                }
+            )
+
 
 def app():
     """
@@ -491,7 +617,7 @@ def app():
     :return:
     """
 
-    use_cache = False
+    use_cache = True
 
     files = {
         "ha_1": {
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 6fb4d5d9..f0bbcbfa 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -361,7 +361,7 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        if self.prepared_epc["fixed-lighting-outlets-count"] == "":
+        if self.prepared_epc["fixed-lighting-outlets-count"] in ["", None] + list(DATA_ANOMALY_MATCHES):
 
             # We check old EPCs and the full SAP EPC
 
@@ -537,7 +537,7 @@ class EPCRecord:
                 else:
                     value = 0
             else:
-                value = int(value)
+                value = int(float(value))
 
             self.prepared_epc[attribute] = value
 
@@ -583,9 +583,8 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        self.prepared_epc['photo-supply'] = float(self.prepared_epc['photo-supply']) if self.prepared_epc[
-                                                                                            'photo-supply'] != "" \
-            else None
+        self.prepared_epc['photo-supply'] = float(self.prepared_epc['photo-supply']) if (
+            self.prepared_epc['photo-supply'] not in [None, ""]) else None
 
     def _clean_energy(self):
         """

From 013070073c9431db2471c5851a342e8d8779f869 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 10:59:21 +0000
Subject: [PATCH 37/48] updated cleaning of construction age band to also clean
 the prepared epc

---
 etl/epc/Record.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index f0bbcbfa..aac22618 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -618,9 +618,11 @@ class EPCRecord:
         if not self.prepared_epc:
             raise ValueError("EPC Recrod doesn not contain epc data")
 
-        self.construction_age_band = EPCDataProcessor.clean_construction_age_band(
-            self.prepared_epc["construction-age-band"])
-        if self.construction_age_band in DATA_ANOMALY_MATCHES:
+        self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band(
+            self.prepared_epc["construction-age-band"]
+        )
+
+        if self.prepared_epc["construction-age-band"] in DATA_ANOMALY_MATCHES:
             if self.old_data:
                 # Take the most recent
                 max_datetime = max(
@@ -630,15 +632,17 @@ class EPCRecord:
                 most_recent = [old_record for old_record in self.old_data if
                                old_record["lodgement-datetime"] == max_datetime]
 
-                self.construction_age_band = EPCDataProcessor.clean_construction_age_band(
+                self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band(
                     most_recent[0]["construction-age-band"]
                 )
 
+        self.construction_age_band = self.prepared_epc["construction-age-band"]
         self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)
 
         if (self.prepared_epc["transaction-type"] == "new dwelling") and (self.age_band is None):
             self.age_band = "L"
             self.construction_age_band = 'England and Wales: 2012 onwards'
+            self.prepared_epc["construction-age-band"] = self.construction_age_band
 
         if self.age_band is None:
             raise ValueError("age_band is missing")

From f2872def6480cb000010e77431654dc62ee44f8d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 11:16:11 +0000
Subject: [PATCH 38/48] Added None to DATA_ANOMALY_MATCHES

---
 etl/epc/settings.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/etl/epc/settings.py b/etl/epc/settings.py
index 24c23ebc..33bab190 100644
--- a/etl/epc/settings.py
+++ b/etl/epc/settings.py
@@ -43,7 +43,9 @@ DATA_ANOMALY_MATCHES = {
     # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
     "NULL",
     # We sometimes see fields populated with just an empty string.
-    ""
+    "",
+    # We sometimes find None values - particulatly when we produce an estimated EPC
+    None,
 }
 
 DATA_ANOMALY_SUBSTRINGS = {

From 60e3221fa312c813116889d46c6276b06f0b3068 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 12:19:44 +0000
Subject: [PATCH 39/48] patching eligibility for missing rows in cleaned_lookup

---
 .../ha_15_32/ha_analysis_batch_3.py            | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 1bb0f0c4..14b6dfcf 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -485,7 +485,7 @@ def get_epc_data(
 
         # If the survey list is missing, it means we have no yet completed any surveys and therefore should only
         # consider the most recent EPC
-        consider_penultimate_epc = data_assets["survey_list"] is None
+        consider_penultimate_epc = data_assets["survey_list"] is not None
 
         # We iterate through the asset list and pull what we need
         results = []
@@ -669,6 +669,22 @@ def app():
     )
     cleaned = msgpack.unpackb(cleaned, raw=False)
 
+    # Patch to handle a missing description
+    cleaned["floor-description"].extend(
+        [
+            {'original_description': 'To external air, uninsulated (assumed)',
+             'clean_description': 'To external air, no insulation', 'thermal_transmittance': None,
+             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': False,
+             'is_to_external_air': True, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
+             'insulation_thickness': 'none'},
+            {'original_description': 'To unheated space, uninsulated (assumed)',
+             'clean_description': 'To unheated space, uninsulated', 'thermal_transmittance': None,
+             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': True,
+             'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
+             'insulation_thickness': 'average'}
+        ]
+    )
+
     cleaning_data = read_dataframe_from_s3_parquet(
         bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
     )

From f5d780a1b0ab920cfdcaf69a0e6d341ef865e11c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 12:59:56 +0000
Subject: [PATCH 40/48] Added back in filling of age with national average

---
 .../ha_15_32/ha_analysis_batch_3.py           | 21 +++++++++++++++++++
 etl/epc/Record.py                             |  4 +++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 14b6dfcf..9143df5f 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -610,6 +610,27 @@ def get_epc_data(
             )
 
 
+def analyse_ha_data():
+    """
+    The approach we take within this function is the following:
+    For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
+    characterisation can be broken down as the following:
+    1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria
+    2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to
+    a CIGA check
+    3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft
+    insulation
+    4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under
+    any circumstances, given the available data
+
+    Then, for any property that has NOT been identified by Warmfront, we identify properties that look like they would
+    qualify under the strictest criteria, and mark these as potential additional opportunities.
+
+    :return:
+    """
+    pass
+
+
 def app():
     """
     This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107.
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index aac22618..9fcf31ff 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -645,7 +645,9 @@ class EPCRecord:
             self.prepared_epc["construction-age-band"] = self.construction_age_band
 
         if self.age_band is None:
-            raise ValueError("age_band is missing")
+            self.age_band = "C"
+            self.construction_age_band = "England and Wales: 1930-1949"
+            self.prepared_epc["construction-age-band"] = self.construction_age_band
 
     def _clean_year_built(self):
         """

From d557653129a3aac4b8b45d1c5d6a14d01ac6ac8b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 14:16:33 +0000
Subject: [PATCH 41/48] patched issue with cleaned lookup

---
 .../ha_15_32/ha_analysis_batch_3.py           | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 9143df5f..85486e17 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -609,6 +609,75 @@ def get_epc_data(
                 }
             )
 
+        scoring_df = pd.DataFrame(scoring_data)
+        scoring_df = scoring_df.drop(
+            columns=[
+                "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
+                "carbon_ending"
+            ]
+        )
+
+        model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
+
+        all_predictions = model_api.predict_all(
+            df=scoring_df,
+            bucket="retrofit-data-dev",
+            prediction_buckets={
+                "sap_change_predictions": "retrofit-sap-predictions-dev",
+                "heat_demand_predictions": "retrofit-heat-predictions-dev",
+                "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+            }
+        )
+
+        results_df = pd.DataFrame(results)
+
+        predictions = all_predictions["sap_change_predictions"].copy()
+
+        predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+            results_df[["row_id", "sap"]], how="left", on="row_id"
+        )
+        predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+        predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+        results_df = results_df.merge(
+            predictions[["sap_uplift", "row_id"]],
+            how="left",
+            on="row_id"
+        )
+        results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+        eligibility_assessment = []
+        for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+            # The upgrade requirements are dependent on the current SAP
+
+            # If the property is an F or G, it only needs to upgrade to an %
+            if row["sap"] <= 38:
+                if row["post_install_sap"] >= 57:
+                    eligibility_classification = "highest confidence"
+                elif row["post_install_sap"] >= 55:
+                    eligibility_classification = "high confidence"
+                elif row["post_install_sap"] >= 53:
+                    eligibility_classification = "medium confidence"
+                else:
+                    eligibility_classification = "unlikely"
+            else:
+
+                if row["post_install_sap"] >= 71:
+                    eligibility_classification = "highest confidence"
+                elif row["post_install_sap"] >= 69:
+                    eligibility_classification = "high confidence"
+                elif row["post_install_sap"] >= 67:
+                    eligibility_classification = "medium confidence"
+                else:
+                    eligibility_classification = "unlikely"
+
+            eligibility_assessment.append(
+                {
+                    "row_id": row["row_id"],
+                    "eligibility_classification": eligibility_classification
+                }
+            )
+
 
 def analyse_ha_data():
     """
@@ -706,6 +775,23 @@ def app():
         ]
     )
 
+    # We treat unknown loft insulation as no insulation
+    cleaned["roof-description"].extend(
+        [
+            {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation',
+             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_pitched': True,
+             'is_roof_room': False,
+             'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
+             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'}
+        ]
+    )
+
+    # We patch this record because there is another property below
+    for x in cleaned["floor-description"]:
+        if x["original_description"] == '(Same dwelling below) insulated (assumed)':
+            x["another_property_below"] = True
+            x["thermal_transmittance"] = 0
+
     cleaning_data = read_dataframe_from_s3_parquet(
         bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
     )

From edb541f3dc3dca9f03fc75b1e7e399fcf9d6790f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 14:36:46 +0000
Subject: [PATCH 42/48] patching heating controls

---
 backend/Property.py                             |  1 +
 etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/backend/Property.py b/backend/Property.py
index e527c1ea..4d26857d 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -319,6 +319,7 @@ class Property(Definitions):
             attributes = [
                 x for x in cleaned[description] if x["original_description"] == self.data[description]
             ]
+            
             if len(attributes) > 1:
                 raise ValueError("Either No attributes or multiple found for %s" % description)
 
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 85486e17..66183599 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -786,6 +786,16 @@ def app():
         ]
     )
 
+    # Patch mainheatcont-description
+    cleaned["mainheatcont-description"].extend(
+        [
+            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False,
+             'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False,
+             'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False,
+             'rate_control': False}
+        ]
+    )
+
     # We patch this record because there is another property below
     for x in cleaned["floor-description"]:
         if x["original_description"] == '(Same dwelling below) insulated (assumed)':

From ef27d6b1640b6f2003b0d6b3c30c40ce15418486 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jan 2024 21:21:01 +0000
Subject: [PATCH 43/48] Added booleans to clean missings

---
 BaseUtility.py                                |   4 +-
 backend/Property.py                           |   9 +-
 etl/eligibility/Eligibility.py                |  15 ++
 .../ha_15_32/ha_analysis_batch_3.py           | 237 +++++++++++++++++-
 etl/epc/Dataset.py                            |  33 ++-
 etl/epc/settings.py                           |   2 +
 6 files changed, 288 insertions(+), 12 deletions(-)

diff --git a/BaseUtility.py b/BaseUtility.py
index bd2f091e..e799144d 100644
--- a/BaseUtility.py
+++ b/BaseUtility.py
@@ -45,7 +45,9 @@ class Definitions:
         # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
         "NULL",
         # We sometimes see fields populated with just an empty string.
-        ""
+        "",
+        # An older value which rarely shows up but has been seen in the data.
+        "UNKNOWN",
     }
 
     DATA_ANOMALY_SUBSTRINGS = {
diff --git a/backend/Property.py b/backend/Property.py
index 4d26857d..82695b75 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -13,7 +13,7 @@ from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
 from etl.solar.SolarPhotoSupply import SolarPhotoSupply
 from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet
-from BaseUtility import Definitions
+from etl.epc.settings import DATA_ANOMALY_MATCHES
 from recommendations.rdsap_tables import england_wales_age_band_lookup, FLOOR_LEVEL_MAP
 from recommendations.recommendation_utils import (
     estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows
@@ -25,7 +25,7 @@ DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT =
 logger = setup_logger()
 
 
-class Property(Definitions):
+class Property:
     ATTRIBUTE_MAP = {
         "floor-description": "floor",
         "hotwater-description": "hotwater",
@@ -51,6 +51,8 @@ class Property(Definitions):
     spatial = None
     base_difference_record = None
 
+    DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES
+
     def __init__(self, id, postcode, address, epc_record):
 
         self.epc_record = epc_record
@@ -302,6 +304,7 @@ class Property(Definitions):
         self.set_basic_property_dimensions()
 
         for description, attribute in cleaned.items():
+
             if self.data[description] in self.DATA_ANOMALY_MATCHES:
                 template = cleaned[description][0]
                 fill_dict = dict(zip(template.keys(), [None] * len(template)))
@@ -319,7 +322,7 @@ class Property(Definitions):
             attributes = [
                 x for x in cleaned[description] if x["original_description"] == self.data[description]
             ]
-            
+
             if len(attributes) > 1:
                 raise ValueError("Either No attributes or multiple found for %s" % description)
 
diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 13966655..6a5c03e1 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -233,6 +233,13 @@ class Eligibility:
     def room_roof_insulation(self):
         is_room_roof = self.roof["is_roof_room"]
 
+        if not is_room_roof:
+            self.room_roof = {
+                "suitability": False,
+                "thickness": None
+            }
+            return
+
         insulation_thickness = convert_thickness_to_numeric(
             self.roof["insulation_thickness"],
             self.roof["is_pitched"],
@@ -246,6 +253,14 @@ class Eligibility:
 
     def flat_roof_insulation(self):
         is_flat = self.roof["is_flat"]
+
+        if not is_flat:
+            self.flat_roof = {
+                "suitability": False,
+                "thickness": None
+            }
+            return
+
         insulation_thickness = convert_thickness_to_numeric(
             self.roof["insulation_thickness"],
             self.roof["is_pitched"],
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 66183599..8ee5d743 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -154,6 +154,10 @@ class DataLoader:
 
             asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
 
+        # Finally, we process property_type or built form, where needed
+        if ha_name == "ha_6":
+            asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6)
+
         return asset_list
 
     def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
@@ -412,6 +416,34 @@ class DataLoader:
 
         return matching_lookup
 
+    @staticmethod
+    def identify_built_form_ha6(property_string):
+        """
+        Identify the built form of a property from the given string.
+
+        :param property_string: The string describing the property
+        :return: The identified built form, or None if it cannot be identified
+        """
+        # Define keywords for each built form
+        built_forms = {
+            'Semi-Detached': ['semi detached'],
+            'Detached': ['detached'],
+            'Mid-Terrace': ['mid terrace', 'mid town house'],
+            'End-Terrace': ['end terrace', 'end town house']
+        }
+
+        # Normalize the input string to lower case for comparison
+        property_string_normalized = property_string.lower()
+
+        # Search for each built form keyword in the input string
+        for built_form, keywords in built_forms.items():
+            for keyword in keywords:
+                if keyword in property_string_normalized:
+                    return built_form
+
+        # Return None if no built form is identified
+        return None
+
     def load(self):
 
         if self.use_cache:
@@ -461,7 +493,7 @@ class DataLoader:
 
 
 def get_epc_data(
-    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds
+    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
 ):
     if not loader.data:
         raise ValueError("Data not found - please run loader.load() first")
@@ -476,10 +508,39 @@ def get_epc_data(
                 'Enclosed Mid': 'Mid-Terrace',
                 'Detached Local Connect': 'Detached',
             }
+        },
+        "ha_6": {
+            "property_type": {
+                'HOUSE': "House",
+                'GROUND FLOOR FLAT': "Flat",
+                'UPPER FLOOR FLAT': "Flat",
+                'MAISONETTE': "Maisonette",
+                'BUNGALOW': "Bungalow",
+                'WARDEN BUNGALOW': "Bungalow",
+                'WARDEN FLAT': "Flat",
+                'EXTRACARE SCHEME': "Flat",
+            }
+
         }
     }
 
+    outputs = {}
     for ha_name, data_assets in loader.data.items():
+
+        if not pull_data:
+            # Then we retrieve the data from S3
+            processed_ha_results = read_pickle_from_s3(
+                bucket_name="retrofit-datalake-dev",
+                s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
+            )
+
+            outputs[ha_name] = {
+                "results_df": processed_ha_results["results_df"],
+                "scoring_data": processed_ha_results["scoring_df"],
+                "nodata": processed_ha_results["nodata"]
+            }
+            continue
+
         # For each HA, we read pull in the data required, and store in S3
         asset_list = data_assets["asset_list"].copy()
 
@@ -490,8 +551,12 @@ def get_epc_data(
         # We iterate through the asset list and pull what we need
         results = []
         scoring_data = []
+        nodata = []
         for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
 
+            if property_meta["matching_postcode"] is None:
+                continue
+
             if ha_name == "ha_1":
                 property_type = property_meta["Asset Type"]
                 # We correct a small error
@@ -503,6 +568,9 @@ def get_epc_data(
                     property_type = "Flat"
 
                 built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None)
+            elif ha_name == "ha_6":
+                property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]]
+                built_form = property_meta["built_form"]
             else:
                 raise NotImplementedError("Implement me")
 
@@ -517,6 +585,10 @@ def get_epc_data(
             searcher.ordnance_survey_client.built_form = built_form
             searcher.find_property(skip_os=True)
 
+            if searcher.newest_epc is None:
+                nodata.append(property_meta)
+                continue
+
             if searcher.newest_epc.get("estimated"):
                 # We insert the row ID as our proxy for UPRN
                 searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
@@ -606,6 +678,7 @@ def get_epc_data(
                     "cavity_age": cavity_age,
                     **eligibility.walls,
                     **eligibility.roof,
+                    "is_estimated": searcher.newest_epc.get("estimated") is not None
                 }
             )
 
@@ -619,6 +692,10 @@ def get_epc_data(
 
         model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
 
+        # scoring_df["is_community"].value_counts()
+        # scoring_df[scoring_df["is_community"] == "Unknown"]
+        # property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze()
+
         all_predictions = model_api.predict_all(
             df=scoring_df,
             bucket="retrofit-data-dev",
@@ -678,8 +755,33 @@ def get_epc_data(
                 }
             )
 
+        eligibility_assessment = pd.DataFrame(eligibility_assessment)
 
-def analyse_ha_data():
+        results_df = results_df.merge(
+            eligibility_assessment, how="left", on="row_id"
+        )
+
+        # We store the results in S3 as a pickle
+        save_pickle_to_s3(
+            data={
+                "results_df": results_df,
+                "scoring_data": scoring_df,
+                "nodata": nodata
+            },
+            bucket_name="retrofit-datalake-dev",
+            s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
+        )
+
+        outputs[ha_name] = {
+            "results_df": results_df,
+            "scoring_data": scoring_df,
+            "nodata": nodata
+        }
+
+    return outputs
+
+
+def analyse_ha_data(outputs, loader):
     """
     The approach we take within this function is the following:
     For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
@@ -697,6 +799,127 @@ def analyse_ha_data():
 
     :return:
     """
+
+    for ha_name, datasets in outputs.items():
+
+        # TODO: This is placeholder because we don't have the schemes that the properties have been qualified for
+        #       yet
+        #
+        import random
+        randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=inputs["asset_list"].shape[0])
+        inputs["asset_list"]["randomly_allocated_schemes"] = randomly_allocated_schemes
+        inputs["asset_list"]["funding_scheme"] = None
+        inputs["asset_list"]["funding_scheme"] = np.where(
+            inputs["asset_list"]["row_meaning"] == "identified potential eco works (CWI)",
+            inputs["asset_list"]["randomly_allocated_schemes"],
+            inputs["asset_list"]["funding_scheme"]
+        )
+
+        # End placholder
+
+        results_df = datasets["results_df"].copy()
+
+        inputs = [x for k, x in loader.data.items() if k == ha_name][0]
+
+        analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
+            columns={"row_meaning": "asset_identification_status"}
+        ).merge(
+            results_df,
+            how="left",
+            right_on="row_id",
+            left_on="asset_list_row_id"
+        )
+
+        # If we have a survey list, we merge this onto the results
+
+        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
+
+        properties_sold = (
+            inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
+            inputs["survey_list"] is not None else 0
+        )
+        properties_sold_eco4 = (
+            properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
+            properties_sold != 0 else 0
+        )
+        properties_sold_gbis = (
+            properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
+            properties_sold != 0 else 0
+        )
+
+        # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is
+        # remaining
+
+        if inputs["matched_lookup"] is not None:
+            analysis_data = analysis_data.merge(
+                inputs["matched_lookup"], how="left", on="asset_list_row_id"
+            )
+            # Drop any rows that have a survey_list_row_id
+            analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
+
+        # We now calculate the number of remaining properties, by scheme
+        # TODO: We might need to tweak a bit of the knowledge
+        remaining_properties = analysis_data[
+            analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
+            ]
+
+        remaining_properties_by_scheme = (
+            remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
+        )
+        remaining_properties_eco4 = remaining_properties_by_scheme[
+            remaining_properties_by_scheme["funding_scheme"] == "ECO4"
+            ]["asset_list_row_id"].values[0]
+
+        remaining_properties_gbis = remaining_properties_by_scheme[
+            remaining_properties_by_scheme["funding_scheme"] == "GBIS"
+            ]["asset_list_row_id"].values[0]
+
+        # For the remaining properties, we use the results of the eligibility process to classify the property into
+        # one of multiple categories
+        #
+        # For properties that have been identified as ECO4
+        # 1) Strict ECO4 candidate - Has required fabric and EPC is below a D
+        #    - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
+        #      here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have
+        #      very old EPCs which may score lower when re-done
+        # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
+        #    - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
+        #      actually look like after retrofit and so the EPC currently being a C or above means little, because
+        #      the updated EPC, showing an empty cavity, could bring the property within
+        # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
+        #   - No SAP constraint, for the same reason as in category 2)
+        # 4) Does not look like ECO4 candidate
+        #
+        # For properties that have been identified as GBIS
+        # 1) Strict GBIS candidates
+        # 2) Properties that actually look like strict GBIS candidates
+        # 3) Subject to CIGA check - Filled cavity
+        # 4) Does not look like a GBIS candidate
+
+        # ECO4
+        # 1) We identify this if:
+        #   - remaining_properties["eco4_eligible"] == True
+        #   - remaining_properties[""]
+        remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
+        remaining_properties["eco4_message"].value_counts()
+        z = remaining_properties[
+            (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") &
+            (remaining_properties["eco4_eligible"] == True)
+            ]
+
+        k = z[z["property_type"] == "Flat"]
+        k["uprn"]
+
+        ha_analysis_results = {
+            "n_properties_in_asset_list": n_properties_in_asset_list,
+            # ECO4
+            "properties_sold_eco4": properties_sold_eco4,
+            "remaining_properties_eco4": remaining_properties_eco4,
+            # GBIS
+            "properties_sold_gbis": properties_sold_gbis,
+            "remaining_properties_gbis": remaining_properties_gbis
+        }
+
     pass
 
 
@@ -789,10 +1012,10 @@ def app():
     # Patch mainheatcont-description
     cleaned["mainheatcont-description"].extend(
         [
-            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False,
-             'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False,
-             'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False,
-             'rate_control': False}
+            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
+             'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
+             'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
+             'rate_control': None}
         ]
     )
 
@@ -810,4 +1033,4 @@ def app():
 
     photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
 
-    get_epc_data(loader)
+    outputs = get_epc_data(loader)
diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py
index fbc7a2d2..4a159f4b 100644
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@@ -11,6 +11,37 @@ from recommendations.recommendation_utils import (
     get_wall_type
 )
 
+# TODO: Can probably produce this in the property change app and store in S3
+BOOLEAN_VARIABLES = [
+    'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
+    'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home',
+    'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending',
+    'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid',
+    'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters',
+    'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation',
+    'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump',
+    'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
+    'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present',
+    'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration',
+    'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric',
+    'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
+    'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire',
+    'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending',
+    'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending',
+    'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending',
+    'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending',
+    'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending',
+    'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending',
+    'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending',
+    'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending',
+    'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending',
+    'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending',
+    'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending',
+    'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats',
+    'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network',
+    'is_community_ending', 'no_individual_heating_or_community_network_ending'
+]
+
 
 class BaseDataset:
     """
@@ -439,7 +470,7 @@ class TrainingDataset(BaseDataset):
 
         for col in missings.index:
             unique_values = self.df[col].unique()
-            if True in unique_values or False in unique_values:
+            if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES):
                 self.df[col] = self.df[col].fillna(False)
             if "none" in unique_values:
                 self.df[col] = self.df[col].fillna("none")
diff --git a/etl/epc/settings.py b/etl/epc/settings.py
index 33bab190..87f27972 100644
--- a/etl/epc/settings.py
+++ b/etl/epc/settings.py
@@ -46,6 +46,8 @@ DATA_ANOMALY_MATCHES = {
     "",
     # We sometimes find None values - particulatly when we produce an estimated EPC
     None,
+    # An older value which rarely shows up but has been seen in the data.
+    "UNKNOWN",
 }
 
 DATA_ANOMALY_SUBSTRINGS = {

From 3cfb2002e41a4ec5b3120b7f5d0ac781a94f1310 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 25 Jan 2024 14:38:24 +0000
Subject: [PATCH 44/48] Handling property type for ha 107 and 39

---
 .../ha_15_32/ha_analysis_batch_3.py           | 126 +++++++++++++++++-
 1 file changed, 119 insertions(+), 7 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 8ee5d743..dfd95100 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -125,11 +125,11 @@ class DataLoader:
         elif ha_name == "ha_39":
             # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
             asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
-                                             asset_list["add_2"].str.lower().str.strip() + ", " + \
-                                             asset_list["add_3"].str.lower().str.strip() + ", " + \
-                                             asset_list["add_4"].str.lower().str.strip() + ", " + \
-                                             asset_list["add_5"].str.lower().str.strip() + ", " + \
-                                             asset_list["post_code"].str.lower().str.strip()
+                                             asset_list["add_2"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["add_3"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["post_code"].astype(str).str.lower().str.strip()
             asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip()
         elif ha_name == "ha_107":
             # Create matching_address by concatenating House No, Street, Town, District, Postcode
@@ -520,10 +520,70 @@ def get_epc_data(
                 'WARDEN FLAT': "Flat",
                 'EXTRACARE SCHEME': "Flat",
             }
-
+        },
+        "ha_14": {
+            "property_type": {
+                "House": "House",
+                "Flat": "Flat",
+                "Bungalow": "Bungalow",
+                "Maisonette": "Maisonette",
+            }
+        },
+        "ha_39": {
+            "Semi house": {"property_type": "House", "built_form": "Semi-Detached"},
+            "1st floor flat": {"property_type": "Flat", "built_form": None},
+            "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"},
+            "Ground floor flat": {"property_type": "Flat", "built_form": None},
+            "End terrace house": {"property_type": "House", "built_form": "End-Terrace"},
+            "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"},
+            "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"},
+            "2nd floor flat": {"property_type": "Flat", "built_form": None},
+            "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"},
+            "3rd floor flat": {"property_type": "Flat", "built_form": None},
+            "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"},
+            "Maisonette": {"property_type": "Maisonette", "built_form": None},
+            "Detached house": {"property_type": "House", "built_form": "Detached"},
+            "Lower ground floor flat": {"property_type": "Flat", "built_form": None},
+            "Dormer bungalow": {"property_type": "Bungalow", "built_form": None},
+            "Basement flat": {"property_type": "Flat", "built_form": None},
+            "Cluster House": {"property_type": "House", "built_form": "Detached"},
+            "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None},
+            "Ground floor flat with study": {"property_type": "Flat", "built_form": None},
+            "4th floor flat": {"property_type": "Flat", "built_form": None},
+            "1st floor flat with study room": {"property_type": "Flat", "built_form": None},
+            "2nd floor flat with study": {"property_type": "Flat", "built_form": None},
+        },
+        "ha_107": {
+            "property_type": {
+                "HOUSE": "House",
+                "BUNGALOW": "Bungalow",
+                "GRD FLOOR FLAT": "Flat",
+                "FIRST FLOOR FLAT": "Flat",
+                "SHELTERED BUNGALOW": "Bungalow",
+                "MAISONETTE": "Maisonette",
+                "SECOND FLOOR FLAT": "Flat",
+                "SHELTERED FIRST FLR": "Flat",
+                "SHELTERED GROUND FLR": "Flat",
+                "GRD FLOOR BED SIT": "House"
+            },
+            "built_form": {
+                "Semi Detached": "Semi-Detached",
+                "Mid Terrace": "Mid-Terrace",
+                "End Terrace": "End-Terrace",
+                "Detached": "Detached",
+                "Detatched": "Detached",
+            }
         }
     }
 
+    # TODO: Sort these
+    # DwellingType
+    # UNKNOWN                  395
+    # SHELTERED FIRST FLR       77
+    #       62
+    # ROOM                       4
+    # GRD FLOOR BED SIT          3
+
     outputs = {}
     for ha_name, data_assets in loader.data.items():
 
@@ -571,11 +631,63 @@ def get_epc_data(
             elif ha_name == "ha_6":
                 property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]]
                 built_form = property_meta["built_form"]
+            elif ha_name == "ha_14":
+                if property_meta["Asset Type Description"] == "Block - Repair":
+                    # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address
+                    if "room" in property_meta["Address 1"].lower():
+                        property_type = "House"
+                    else:
+                        property_type = "Flat"
+
+                else:
+                    property_type = property_type_lookup[ha_name]["property_type"][
+                        property_meta["Asset Type Description"]
+                    ]
+
+                built_form = None
+            elif ha_name == "ha_39":
+
+                property_type_config = property_type_lookup[ha_name].get(property_meta["ConstructionStyle"], {})
+                property_type = property_type_config.get("property_type", None)
+                built_form = property_type_config.get("built_form", None)
+
+                if property_type is None:
+                    # We check for the presence of room or flat
+                    if "flat" in property_meta["matching_address"]:
+                        property_type = "Flat"
+                    else:
+                        property_type = "House"
+            elif ha_name == "ha_107":
+
+                dwelling_style = property_meta["Dwelling Style"]
+                if isinstance(dwelling_style, str):
+                    dwelling_style = dwelling_style.strip()
+
+                property_type = property_type_lookup[ha_name]["property_type"].get(property_meta["DwellingType"])
+                built_form = property_type_lookup[ha_name]["built_form"].get(dwelling_style, None)
+
+                if property_type is None:
+                    if built_form in ["Semi-Detached", "Mid-Terrace", "End-Terrace", "Detached"]:
+                        property_type = "House"
+
+                    if "flat" in property_meta["Wall Construction"].lower():
+                        property_type = "Flat"
+
+                    if (property_meta["DwellingType"] == "UNKNOWN") & (property_meta["Dwelling Style"] == 0):
+                        # Handle a few specific cases
+                        property_type = "Bungalow"
+
+                    if property_meta["Street"] == "School View":
+                        property_type = "Bungalow"
+
+                if property_type is None:
+                    blah
+
             else:
                 raise NotImplementedError("Implement me")
 
             searcher = SearchEpc(
-                address1=property_meta["HouseNo"],
+                address1=str(property_meta["HouseNo"]),
                 postcode=property_meta["matching_postcode"],
                 auth_token=EPC_AUTH_TOKEN,
                 os_api_key=None,

From 5c6bac1f8a2823037b0a1ac28481f741e7110ee9 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 26 Jan 2024 11:00:12 +0000
Subject: [PATCH 45/48] working on eligibility

---
 etl/eligibility/Eligibility.py                | 49 ++++++++++----
 .../ha_15_32/ha_analysis_batch_3.py           | 64 ++++++++++---------
 2 files changed, 70 insertions(+), 43 deletions(-)

diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 6a5c03e1..00c72a8e 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -114,7 +114,8 @@ class Eligibility:
             self.loft = {
                 "suitability": False,
                 "thickness": None,
-                "reason": "roof not loft"
+                "reason": "roof not loft",
+                "thickness_classification": None
             }
             return
 
@@ -125,18 +126,32 @@ class Eligibility:
             is_flat=self.roof["is_flat"]
         )
 
+        if insulation_thickness <= 100:
+            thickness_classification = "0-100mm"
+        elif insulation_thickness <= 270:
+            thickness_classification = "100-270mm"
+        else:
+            thickness_classification = "270mm+"
+
         if insulation_thickness <= loft_thickness_threshold:
+            # We produce a thickness classification for the loft
+            # 0 - 100mm insulation
+            # 100 - 270mm insulation
+            # 270mm+ insulation
+
             self.loft = {
                 "suitability": True,
                 "thickness": insulation_thickness,
-                "reason": None
+                "reason": None,
+                "thickness_classification": thickness_classification
             }
 
         if insulation_thickness <= high_loft_thickness_threshold:
             self.loft = {
                 "suitability": True,
                 "thickness": insulation_thickness,
-                "reason": "high loft thickness but below regulation"
+                "reason": "high loft thickness but below regulation",
+                "thickness_classification": thickness_classification
             }
             return
 
@@ -145,7 +160,8 @@ class Eligibility:
             self.loft = {
                 "suitability": False,
                 "thickness": insulation_thickness,
-                "reason": "existing insulation"
+                "reason": "existing insulation",
+                "thickness_classification": thickness_classification
             }
             return
 
@@ -371,20 +387,21 @@ class Eligibility:
         """
 
         current_sap = int(self.epc["current-energy-efficiency"])
-
-        if current_sap >= 69:
-            self.eco4_warmfront = {
-                "eligible": False,
-                "message": "sap too high"
-            }
-            return
-
         self.cavity_insulation()
         self.loft_insulation()
 
         # make sure conditions 2 and 3 are true
         is_eligible = self.cavity["suitability"] & self.loft["suitability"]
 
+        if current_sap >= 69:
+            self.eco4_warmfront = {
+                "eligible": False,
+                "message": "sap too high",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
         if post_retrofit_sap is None:
 
             if current_sap >= 55:
@@ -401,7 +418,9 @@ class Eligibility:
 
             self.eco4_warmfront = {
                 "eligible": is_eligible,
-                "message": message
+                "message": message,
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
             }
             return
 
@@ -409,7 +428,9 @@ class Eligibility:
 
         self.eco4_warmfront = {
             "eligible": is_eligible,
-            "message": None
+            "message": None,
+            "cavity_type": self.cavity["type"],
+            "loft_type": self.loft["thickness_classification"]
         }
         return
 
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index dfd95100..1212522e 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -12,11 +12,9 @@ from tqdm import tqdm
 from backend.SearchEpc import SearchEpc
 from etl.eligibility.Eligibility import Eligibility
 from etl.eligibility.ha_15_32.app import prepare_model_data_row
-from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from backend.ml_models.api import ModelApi
 from etl.solar.SolarPhotoSupply import SolarPhotoSupply
 from recommendations.recommendation_utils import calculate_cavity_age
-from recommendation_utils import convert_thickness_to_numeric
 
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
@@ -576,14 +574,6 @@ def get_epc_data(
         }
     }
 
-    # TODO: Sort these
-    # DwellingType
-    # UNKNOWN                  395
-    # SHELTERED FIRST FLR       77
-    #       62
-    # ROOM                       4
-    # GRD FLOOR BED SIT          3
-
     outputs = {}
     for ha_name, data_assets in loader.data.items():
 
@@ -596,7 +586,7 @@ def get_epc_data(
 
             outputs[ha_name] = {
                 "results_df": processed_ha_results["results_df"],
-                "scoring_data": processed_ha_results["scoring_df"],
+                "scoring_df": processed_ha_results["scoring_df"],
                 "nodata": processed_ha_results["nodata"]
             }
             continue
@@ -680,9 +670,6 @@ def get_epc_data(
                     if property_meta["Street"] == "School View":
                         property_type = "Bungalow"
 
-                if property_type is None:
-                    blah
-
             else:
                 raise NotImplementedError("Implement me")
 
@@ -790,7 +777,9 @@ def get_epc_data(
                     "cavity_age": cavity_age,
                     **eligibility.walls,
                     **eligibility.roof,
-                    "is_estimated": searcher.newest_epc.get("estimated") is not None
+                    "is_estimated": searcher.newest_epc.get("estimated") is not None,
+                    "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"],
+                    "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"]
                 }
             )
 
@@ -877,7 +866,7 @@ def get_epc_data(
         save_pickle_to_s3(
             data={
                 "results_df": results_df,
-                "scoring_data": scoring_df,
+                "scoring_df": scoring_df,
                 "nodata": nodata
             },
             bucket_name="retrofit-datalake-dev",
@@ -886,7 +875,7 @@ def get_epc_data(
 
         outputs[ha_name] = {
             "results_df": results_df,
-            "scoring_data": scoring_df,
+            "scoring_df": scoring_df,
             "nodata": nodata
         }
 
@@ -914,6 +903,7 @@ def analyse_ha_data(outputs, loader):
 
     for ha_name, datasets in outputs.items():
 
+        inputs = [x for k, x in loader.data.items() if k == ha_name][0]
         # TODO: This is placeholder because we don't have the schemes that the properties have been qualified for
         #       yet
         #
@@ -930,9 +920,6 @@ def analyse_ha_data(outputs, loader):
         # End placholder
 
         results_df = datasets["results_df"].copy()
-
-        inputs = [x for k, x in loader.data.items() if k == ha_name][0]
-
         analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
             columns={"row_meaning": "asset_identification_status"}
         ).merge(
@@ -970,19 +957,20 @@ def analyse_ha_data(outputs, loader):
             analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
 
         # We now calculate the number of remaining properties, by scheme
-        # TODO: We might need to tweak a bit of the knowledge
+        # TODO: We might need to tweak a bit of the logic
         remaining_properties = analysis_data[
             analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
-            ]
+            ].copy()
+        remaining_properties["prospect_type"] = None
 
         remaining_properties_by_scheme = (
             remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
         )
-        remaining_properties_eco4 = remaining_properties_by_scheme[
+        n_remaining_properties_eco4 = remaining_properties_by_scheme[
             remaining_properties_by_scheme["funding_scheme"] == "ECO4"
             ]["asset_list_row_id"].values[0]
 
-        remaining_properties_gbis = remaining_properties_by_scheme[
+        n_remaining_properties_gbis = remaining_properties_by_scheme[
             remaining_properties_by_scheme["funding_scheme"] == "GBIS"
             ]["asset_list_row_id"].values[0]
 
@@ -990,7 +978,8 @@ def analyse_ha_data(outputs, loader):
         # one of multiple categories
         #
         # For properties that have been identified as ECO4
-        # 1) Strict ECO4 candidate - Has required fabric and EPC is below a D
+        # 1) Strict ECO4 candidate - Has required fabric and EPC is D or below. We consider D or below here, because
+        #    Warmfront regularly re-surveys properties which then fall within the SAP requirement
         #    - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
         #      here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have
         #      very old EPCs which may score lower when re-done
@@ -1008,10 +997,25 @@ def analyse_ha_data(outputs, loader):
         # 3) Subject to CIGA check - Filled cavity
         # 4) Does not look like a GBIS candidate
 
+        remaining_eco4_df = remaining_properties[
+            remaining_properties["funding_scheme"] == "ECO4"
+            ].copy()
         # ECO4
         # 1) We identify this if:
         #   - remaining_properties["eco4_eligible"] == True
-        #   - remaining_properties[""]
+
+        remaining_eco4_df["prospect_type"] = np.where(
+            remaining_eco4_df["eco4_eligible"] == True,
+            "strict ECO4",
+            remaining_eco4_df["prospect_type"]
+        )
+
+        # 2) We identify this if it has a filled cavity but meets the loft conditions
+
+        remaining_eco4_df["prospect_type"]
+
+        z = remaining_eco4_df[remaining_eco4_df["eco4_message"] == "sap too high"]
+
         remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
         remaining_properties["eco4_message"].value_counts()
         z = remaining_properties[
@@ -1026,10 +1030,10 @@ def analyse_ha_data(outputs, loader):
             "n_properties_in_asset_list": n_properties_in_asset_list,
             # ECO4
             "properties_sold_eco4": properties_sold_eco4,
-            "remaining_properties_eco4": remaining_properties_eco4,
+            "n_remaining_properties_eco4": n_remaining_properties_eco4,
             # GBIS
             "properties_sold_gbis": properties_sold_gbis,
-            "remaining_properties_gbis": remaining_properties_gbis
+            "n_remaining_properties_gbis": n_remaining_properties_gbis
         }
 
     pass
@@ -1145,4 +1149,6 @@ def app():
 
     photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
 
-    outputs = get_epc_data(loader)
+    outputs = get_epc_data(
+        loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False
+    )

From b6c57c7253ec86b59ef1599489a405a9466ce505 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 26 Jan 2024 17:17:43 +0000
Subject: [PATCH 46/48] created template of code to create the ha analysis
 results

---
 etl/eligibility/Eligibility.py                |   6 +-
 .../ha_15_32/ha_analysis_batch_3.py           | 242 +++++++++++++++---
 2 files changed, 207 insertions(+), 41 deletions(-)

diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 00c72a8e..1d868338 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -177,15 +177,13 @@ class Eligibility:
         is_empty = (not self.walls["is_filled_cavity"]) or (
             self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["average", "above average"]
         )
-        is_partial_filled = (
-            self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["below average"]
-        )
+        is_partial_filled = "partial" in self.walls["clean_description"].lower()
         # We look for potentially under performing cavities - anything that is assumed, as built and insulated
         is_underperforming = (
             self.walls["is_as_built"] and self.walls["insulation_thickness"] in ["average"] and self.walls["is_assumed"]
         )
 
-        is_unfilled_cavity = is_cavity and is_empty
+        is_unfilled_cavity = is_cavity and (is_empty and not is_partial_filled)
         is_partial_filled_cavity = is_cavity and is_partial_filled
         is_underperforming_cavity = is_cavity and is_underperforming
 
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 1212522e..1ed95a30 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -901,6 +901,7 @@ def analyse_ha_data(outputs, loader):
     :return:
     """
 
+    ha_analysis_results = []
     for ha_name, datasets in outputs.items():
 
         inputs = [x for k, x in loader.data.items() if k == ha_name][0]
@@ -917,9 +918,20 @@ def analyse_ha_data(outputs, loader):
             inputs["asset_list"]["funding_scheme"]
         )
 
+        # TODO: Also temp, just for HA 6
+        if ha_name == "ha_6":
+            inputs["survey_list"]["funding_scheme"] = None
+            inputs["survey_list"]["funding_scheme"] = np.where(
+                inputs["survey_list"][
+                    'AFFORDABLE WARMTH                 OR EPC FOR HOUSING ASSOCIATION '] == "AFFORDABLE WARMTH",
+                "ECO4",
+                "GBIS"
+            )
+
         # End placholder
 
         results_df = datasets["results_df"].copy()
+
         analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
             columns={"row_meaning": "asset_identification_status"}
         ).merge(
@@ -929,23 +941,6 @@ def analyse_ha_data(outputs, loader):
             left_on="asset_list_row_id"
         )
 
-        # If we have a survey list, we merge this onto the results
-
-        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
-
-        properties_sold = (
-            inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
-            inputs["survey_list"] is not None else 0
-        )
-        properties_sold_eco4 = (
-            properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
-            properties_sold != 0 else 0
-        )
-        properties_sold_gbis = (
-            properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
-            properties_sold != 0 else 0
-        )
-
         # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is
         # remaining
 
@@ -956,8 +951,23 @@ def analyse_ha_data(outputs, loader):
             # Drop any rows that have a survey_list_row_id
             analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
 
+        # If we have a survey list, we merge this onto the results
+        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
+
+        properties_sold = (
+            inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
+            inputs["survey_list"] is not None else pd.DataFrame(columns=["funding_scheme"])
+        )
+        properties_sold_eco4 = (
+            properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
+            (not properties_sold.empty) and ("ECO4" in properties_sold["funding_scheme"].values) else 0
+        )
+        properties_sold_gbis = (
+            properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
+            (not properties_sold.empty) and ("GBIS" in properties_sold["funding_scheme"].values) else 0
+        )
+
         # We now calculate the number of remaining properties, by scheme
-        # TODO: We might need to tweak a bit of the logic
         remaining_properties = analysis_data[
             analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
             ].copy()
@@ -966,6 +976,7 @@ def analyse_ha_data(outputs, loader):
         remaining_properties_by_scheme = (
             remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
         )
+
         n_remaining_properties_eco4 = remaining_properties_by_scheme[
             remaining_properties_by_scheme["funding_scheme"] == "ECO4"
             ]["asset_list_row_id"].values[0]
@@ -983,13 +994,17 @@ def analyse_ha_data(outputs, loader):
         #    - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
         #      here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have
         #      very old EPCs which may score lower when re-done
-        # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
+        # 2) Meets Fabric requirements, not SAP
+        #    Warmfront has identified the property as eligible, but the EPC is not D or below. We consider this but
+        #    label it separately, as it is not a strict ECO4 candidate
+        # 3) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
         #    - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
         #      actually look like after retrofit and so the EPC currently being a C or above means little, because
         #      the updated EPC, showing an empty cavity, could bring the property within
-        # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
+        # 4) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
         #   - No SAP constraint, for the same reason as in category 2)
-        # 4) Does not look like ECO4 candidate
+        # 5) Looks like GBIS instead
+        # 6) Does not look like ECO4 candidate
         #
         # For properties that have been identified as GBIS
         # 1) Strict GBIS candidates
@@ -1000,43 +1015,156 @@ def analyse_ha_data(outputs, loader):
         remaining_eco4_df = remaining_properties[
             remaining_properties["funding_scheme"] == "ECO4"
             ].copy()
+
+        ####################################
         # ECO4
+        ####################################
+
         # 1) We identify this if:
         #   - remaining_properties["eco4_eligible"] == True
 
         remaining_eco4_df["prospect_type"] = np.where(
-            remaining_eco4_df["eco4_eligible"] == True,
+            (remaining_eco4_df["eco4_eligible"] == True),
             "strict ECO4",
             remaining_eco4_df["prospect_type"]
         )
 
-        # 2) We identify this if it has a filled cavity but meets the loft conditions
+        # 2) Meets fabric requirements
+        remaining_eco4_df["prospect_type"] = np.where(
+            (
+                (remaining_eco4_df["eco4_message"] == "sap too high") &
+                remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) &
+                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
+            ),
+            "ECO4 if SAP downgrade",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        remaining_eco4_df["prospect_type"]
+        # 3) We identify this if it has a filled cavity but meets the loft conditions
+        # TODO: Consider if we should also allow 100-270mm or if we should add some slight tolerance (e.g. 150mm)
+        #       to account for measurement error
+        remaining_eco4_df["prospect_type"] = np.where(
+            (
+                remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) &
+                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
+            ),
+            "Filled cavity - subject to CIGA check",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        z = remaining_eco4_df[remaining_eco4_df["eco4_message"] == "sap too high"]
+        # 4) We identify this by ensuring the cavity is empty or partial, and the loft has between 101 and 270mm
+        remaining_eco4_df["prospect_type"] = np.where(
+            (
+                remaining_eco4_df["eligibility_cavity_type"].isin(["empty", "partial"]) &
+                remaining_eco4_df["eligibility_loft_type"].isin(["100-270mm"])
+            ),
+            "ECO4 prospect - empty cavity, loft insulation below regulation",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
-        remaining_properties["eco4_message"].value_counts()
-        z = remaining_properties[
-            (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") &
-            (remaining_properties["eco4_eligible"] == True)
-            ]
+        # 5) Looks like GBIS instead
+        remaining_eco4_df["prospect_type"] = np.where(
+            (remaining_eco4_df["gbis_eligible"] == True),
+            "Looks like GBIS",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        k = z[z["property_type"] == "Flat"]
-        k["uprn"]
+        # 6) This is everything else (i.e. both the cavity is full and the loft insulation is above 100mm)
+        remaining_eco4_df["prospect_type"] = remaining_eco4_df["prospect_type"].fillna(
+            "Does not look like ECO4 candidate"
+        )
 
-        ha_analysis_results = {
+        ####################################
+        # GBIS
+        ####################################
+
+        remaining_gbis = remaining_properties[
+            remaining_properties["funding_scheme"] == "GBIS"
+            ].copy()
+
+        # 1) Strict GBIS candidates
+        remaining_gbis["prospect_type"] = np.where(
+            (
+                (remaining_gbis["gbis_eligible"] == True) & (remaining_gbis["eco4_eligible"] == False)
+            ),
+            "strict GBIS",
+            remaining_gbis["prospect_type"]
+        )
+
+        # 2) GBIS candidates that look like strict ECO4 candidates
+        remaining_gbis["prospect_type"] = np.where(
+            (remaining_gbis["eco4_eligible"] == True),
+            "Upgradable to ECO4",
+            remaining_gbis["prospect_type"]
+        )
+
+        # 3) Subject to CIGA check - Filled cavity
+        remaining_gbis["prospect_type"] = np.where(
+            (
+                remaining_gbis["eligibility_cavity_type"].isin(["full"])
+            ),
+            "Filled cavity - subject to CIGA check",
+            remaining_gbis["prospect_type"]
+        )
+
+        # 4) Everything else
+        remaining_gbis["prospect_type"] = remaining_gbis["prospect_type"].fillna(
+            "Does not look like GBIS candidate"
+        )
+
+        ####################################
+        # Surplus properties
+        ####################################
+
+        # Take properties that were not identified by Warmfront and identify those that look like they would qualify
+        # under the strictest criteria
+        surplus_df = analysis_data[
+            analysis_data["asset_identification_status"] != "identified potential eco works (CWI)"
+            ].copy()
+
+        eco4_surplus = surplus_df[
+            (
+                (surplus_df["eco4_eligible"] == True) & (surplus_df["eco4_message"] == "subject to post retrofit sap") &
+                (
+                    surplus_df["eligibility_classification"].isin(
+                        ["high confidence", "highest confidence", "medium confidence"]
+                    )
+                )
+            )
+        ].copy()
+
+        gbis_surplus = surplus_df[
+            (
+                (surplus_df["gbis_eligible"] == True) & (surplus_df["eco4_eligible"] == False) & (
+                surplus_df["eligibility_cavity_type"].isin(["empty", "partial"])
+            )
+            )
+        ].copy()
+
+        ha_analysis_results.append({
             "n_properties_in_asset_list": n_properties_in_asset_list,
+            ############
             # ECO4
+            ############
             "properties_sold_eco4": properties_sold_eco4,
             "n_remaining_properties_eco4": n_remaining_properties_eco4,
+            **remaining_eco4_df["prospect_type"].value_counts().to_dict(),
+            ############
             # GBIS
+            ############
             "properties_sold_gbis": properties_sold_gbis,
-            "n_remaining_properties_gbis": n_remaining_properties_gbis
-        }
+            "n_remaining_properties_gbis": n_remaining_properties_gbis,
+            **remaining_gbis["prospect_type"].value_counts().to_dict(),
+            ############
+            # Surplus properties
+            ############
+            "n_eco4_surplus": eco4_surplus.shape[0],
+            "n_gbis_surplus": gbis_surplus.shape[0],
+        })
 
-    pass
+    ha_analysis_results = pd.DataFrame(ha_analysis_results)
+
+    # Todo: create revenue figures and automate creation of excel
 
 
 def app():
@@ -1152,3 +1280,43 @@ def app():
     outputs = get_epc_data(
         loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False
     )
+
+    # for ha_name, datasets in outputs.items():
+    #     datasets["results_df"] = datasets["results_df"].drop(
+    #         columns=["eligibility_cavity_type", "eligibility_loft_type"]
+    #     )
+    #
+    #     # Re-do
+    #     res = []
+    #     for _, row in tqdm(datasets["results_df"].iterrows(), total=datasets["results_df"].shape[0]):
+    #         epc = {
+    #             "walls-description": row["walls"],
+    #             "roof-description": row["roof"],
+    #             "floor-description": "",
+    #             "tenure": "",
+    #             "current-energy-efficiency": row["sap"],
+    #         }
+    #         eligibility = Eligibility(epc=epc, cleaned=cleaned)
+    #         eligibility.check_eco4_warmfront()
+    #         res.append(
+    #             {
+    #                 "row_id": row["row_id"],
+    #                 "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"],
+    #                 "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"]
+    #             }
+    #         )
+    #
+    #     # Merge back on
+    #     res = pd.DataFrame(res)
+    #     datasets["results_df"] = datasets["results_df"].merge(res, how="left", on="row_id")
+    #
+    #     # Re-save in s3
+    #     save_pickle_to_s3(
+    #         data={
+    #             "results_df": datasets["results_df"],
+    #             "scoring_df": datasets["scoring_df"],
+    #             "nodata": datasets["nodata"]
+    #         },
+    #         bucket_name="retrofit-datalake-dev",
+    #         s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
+    #     )

From 55e28942e48bb8cf55e7c95875533710d7e21ea1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 29 Jan 2024 12:13:22 +0000
Subject: [PATCH 47/48] Added automated creation of excel and added missing
 files to git

---
 etl/eligibility/Eligibility.py                |  28 +-
 .../ha_15_32/WFT Sales data analysis.py       | 665 ++++++++++++++++++
 etl/eligibility/ha_15_32/cancellation.py      | 113 +++
 .../ha_15_32/ha_analysis_batch_3.py           | 100 ++-
 4 files changed, 876 insertions(+), 30 deletions(-)
 create mode 100644 etl/eligibility/ha_15_32/WFT Sales data analysis.py
 create mode 100644 etl/eligibility/ha_15_32/cancellation.py

diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 1d868338..906ff594 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -128,7 +128,7 @@ class Eligibility:
 
         if insulation_thickness <= 100:
             thickness_classification = "0-100mm"
-        elif insulation_thickness <= 270:
+        elif insulation_thickness <= high_loft_thickness_threshold:
             thickness_classification = "100-270mm"
         else:
             thickness_classification = "270mm+"
@@ -146,24 +146,14 @@ class Eligibility:
                 "thickness_classification": thickness_classification
             }
 
-        if insulation_thickness <= high_loft_thickness_threshold:
-            self.loft = {
-                "suitability": True,
-                "thickness": insulation_thickness,
-                "reason": "high loft thickness but below regulation",
-                "thickness_classification": thickness_classification
-            }
-            return
-
-        if insulation_thickness > high_loft_thickness_threshold:
-            # Insulation is already thick enough
-            self.loft = {
-                "suitability": False,
-                "thickness": insulation_thickness,
-                "reason": "existing insulation",
-                "thickness_classification": thickness_classification
-            }
-            return
+        # Insulation is already thick enough
+        self.loft = {
+            "suitability": False,
+            "thickness": insulation_thickness,
+            "reason": "existing insulation",
+            "thickness_classification": thickness_classification
+        }
+        return
 
     def cavity_insulation(self):
 
diff --git a/etl/eligibility/ha_15_32/WFT Sales data analysis.py b/etl/eligibility/ha_15_32/WFT Sales data analysis.py
new file mode 100644
index 00000000..a088fe43
--- /dev/null
+++ b/etl/eligibility/ha_15_32/WFT Sales data analysis.py	
@@ -0,0 +1,665 @@
+import numpy as np
+import pandas as pd
+
+ECO4_NEW_RATES = 1710
+GBIS_NEW_RATES = 600
+
+
+def app():
+    # Load in the excel
+    nov_ha_data = pd.read_excel(
+        'etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx',
+    )
+    # Drop rows where HA name is null
+    nov_ha_data = nov_ha_data.dropna(subset=["HA Name"])
+    nov_ha_data["ha_number"] = nov_ha_data["HA Name"].str.extract(r"(\d+)").astype(int)
+    nov_ha_data = nov_ha_data.sort_values("ha_number", ascending=True)
+
+    variance_explanations = pd.read_excel(
+        'etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx',
+        sheet_name="Variance explanations"
+    )
+
+    september_figures = pd.read_excel(
+        "etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS SEP 23 UPDATE (2).xlsx",
+        sheet_name="HA Stats"
+    )
+
+    historical_invoices = pd.read_excel(
+        "etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx",
+        sheet_name="Jul 22 to Oct 23"
+    )
+    # Drop rows where installer rates is null
+    historical_invoices = historical_invoices[~pd.isnull(historical_invoices["INSTALLER RATES"])]
+    historical_invoices = historical_invoices[historical_invoices["INSTALLER RATES"] != "NA "]
+    # By Scheme, take a weighted mean of the INSTALLER RATES, weighted on the number of rows
+    n_invoices = historical_invoices.groupby(["Scheme", "INSTALLER RATES"])["Invoice number"].count().reset_index()
+    n_invoices = n_invoices[n_invoices["Scheme"].isin(["Eco 4", "GBIS"])]
+    historical_scheme_rates = n_invoices.groupby("Scheme").apply(
+        lambda x: np.average(x["INSTALLER RATES"], weights=x["Invoice number"])
+    ).reset_index().rename(columns={0: "Historical rates"})
+
+    # We keep only the sales data entries that have sales > 0
+    sales_data = nov_ha_data[nov_ha_data["Sales"] > 0]
+
+    # We now need to adjust sales data depending on the variance explanations
+    sales_data = sales_data.merge(
+        variance_explanations[["HA", 'Which figure is correct']],
+        how="left",
+        left_on="ha_number",
+        right_on="HA"
+    )
+
+    def adjust_sales(row):
+        if pd.isnull(row["Which figure is correct"]):
+            return row["Sales"]
+
+        if row["Which figure is correct"] == "HA facts & figures":
+            return row['No. of Tech surveys complete']
+
+        if row["Which figure is correct"] == "Billed amount":
+            return row["Sales"]
+
+        if row["Which figure is correct"] in ["Both correct, HA facts and figures includes November", "Both correct"]:
+            return row["Sales"]
+
+        raise ValueError(f"Unknown value for 'Which figure is correct': {row['Which figure is correct']}")
+
+    # Apply the variance-explanation adjustment row by row to get the adjusted sales figure
+    sales_data["adjusted_sales"] = sales_data.apply(lambda row: adjust_sales(row), axis=1)
+
+    # We therefore adjust GBIS and ECO4 sales data based on adjusted sales
+    sales_data["adjusted_eco4_sales"] = sales_data["No. of Tech surveys complete - Eco 4"] / sales_data["Sales"] * \
+                                        sales_data["adjusted_sales"]
+
+    sales_data["adjusted_gbis_sales"] = sales_data["No. of Tech surveys complete - GBIS"] / sales_data["Sales"] * \
+                                        sales_data["adjusted_sales"]
+
+    sales_data["cancellation_rate"] = (sales_data["Sales"] - sales_data["adjusted_sales"]) / sales_data["Sales"]
+
+    # The difference between the adjusted sales and the actual sales is the cancellation
+    cancellations = (sales_data["adjusted_sales"].sum() - sales_data["Sales"].sum()) / sales_data["Sales"].sum()
+
+    # Given the cancellations, we can now adjust the expected remaining surveys
+    sales_data["No. of Tech surveys remaining"] = sales_data["No. of Tech surveys remaining"] * (
+        1 - sales_data["cancellation_rate"]
+    )
+
+    # We now merge on the expected values for September
+    sales_data = sales_data.merge(
+        september_figures[["Redacted HA", "ECO4", "GBIS"]].rename(
+            columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"}
+        ),
+        how="left",
+        on="HA Name",
+    )
+
+    sales_data["Sept Expected ECO4"] = sales_data["Sept Expected ECO4"].fillna(0)
+    sales_data["Sept Expected GBIS"] = sales_data["Sept Expected GBIS"].fillna(0)
+
+    # We calculate the ECO4 and GBIS conversion rates with the adjusted numbers
+    sales_data["ECO4 Conversion"] = sales_data["adjusted_eco4_sales"] / sales_data["adjusted_sales"]
+    sales_data["GBIS Conversion"] = sales_data["adjusted_gbis_sales"] / sales_data["adjusted_sales"]
+
+    # We now calculate the expected remaining ECO4 and GBIS sales
+    # We take the number of remaining surveys and multiply by the conversion rate for each scheme, which tells us
+    # how many more we should expect to see
+    sales_data["Expected Remaining ECO4"] = sales_data["No. of Tech surveys remaining"] * sales_data["ECO4 Conversion"]
+    sales_data["Expected Remaining GBIS"] = sales_data["No. of Tech surveys remaining"] * sales_data["GBIS Conversion"]
+
+    # We now produce a forecasted ECO4 and GBIS sales figure
+    sales_data["Forecasted ECO4 Sales"] = sales_data["adjusted_eco4_sales"] + sales_data["Expected Remaining ECO4"]
+    sales_data["Forecasted GBIS Sales"] = sales_data["adjusted_gbis_sales"] + sales_data["Expected Remaining GBIS"]
+
+    # Take the columns we're interested in
+    # HA  # Properties	Sept ECO4 Figures	Sept GBIS Figures	Nov Total Sales	Nov ECO4 Sales	Nov GBIS Sales
+    # Remaining Surveys	ECO4 conversion	GBIS conversion	Forecasted ECO4 Sales	Forecasted GBIS sales	ECO4 Change
+    # GBIS Change
+    sales_data_formatted = sales_data[[
+        "HA Name",
+        "ASSET LIST no.",
+        "Sept Expected ECO4",
+        "Sept Expected GBIS",
+        "adjusted_sales",
+        "adjusted_eco4_sales",
+        "adjusted_gbis_sales",
+        "No. of Tech surveys remaining",
+        "ECO4 Conversion",
+        "GBIS Conversion",
+        "Forecasted ECO4 Sales",
+        "Forecasted GBIS Sales"
+    ]].rename(
+        columns={
+            "adjusted_sales": "Oct Total Sales (adjusted for variance)",
+            "adjusted_eco4_sales": "Oct ECO4 Sales (adjusted for variance)",
+            "adjusted_gbis_sales": "Oct GBIS Sales (adjusted for variance)",
+            "No. of Tech surveys remaining": "Remaining Surveys",
+        }
+    )
+
+    # Convert columns which should be integers to integers
+    for col in ["ASSET LIST no.", "Remaining Surveys", "Sept Expected ECO4", "Sept Expected GBIS",
+                "Oct Total Sales (adjusted for variance)", "Oct ECO4 Sales (adjusted for variance)",
+                "Oct GBIS Sales (adjusted for variance)", "Forecasted ECO4 Sales", "Forecasted GBIS Sales"]:
+        sales_data_formatted[col] = sales_data_formatted[col].fillna(0)
+        sales_data_formatted[col] = sales_data_formatted[col].astype(int)
+
+    # Remove HA 17 because this was EPCs only. We also remove HA33 because they do not have access to the full portfolio
+    sales_data_formatted = sales_data_formatted[
+        ~sales_data_formatted["HA Name"].isin(["HA 17", "HA 33"])
+    ]
+
+    # September expected ECO4 and GBIS
+    sept_expected_eco4 = sales_data_formatted["Sept Expected ECO4"].sum()
+    sept_expected_gbis = sales_data_formatted["Sept Expected GBIS"].sum()
+
+    # Completed so far
+    oct_eco4_sales = sales_data_formatted["Oct ECO4 Sales (adjusted for variance)"].sum()
+    oct_gbis_sales = sales_data_formatted["Oct GBIS Sales (adjusted for variance)"].sum()
+
+    # Forecasted figures
+    forecasted_eco4_sales = sales_data_formatted["Forecasted ECO4 Sales"].sum()
+    forecasted_gbis_sales = sales_data_formatted["Forecasted GBIS Sales"].sum()
+
+    # Expected remaining sales
+    expected_remaining_eco4_sales = forecasted_eco4_sales - oct_eco4_sales
+    expected_remaining_gbis_sales = forecasted_gbis_sales - oct_gbis_sales
+
+    # Forecast change vs September
+    forecasted_eco4_change = 100 * (forecasted_eco4_sales - sept_expected_eco4) / sept_expected_eco4
+    forecasted_gbis_change = 100 * (forecasted_gbis_sales - sept_expected_gbis) / sept_expected_gbis
+
+    aggregates = pd.DataFrame(
+        columns=["Scheme", "Sept Expected", "Oct Completed", "Forecasted Remaining Sales", "Forecasted Total Sales",
+                 "Forecasted Change vs Sept"],
+        data=[
+            ["ECO4", sept_expected_eco4, oct_eco4_sales, expected_remaining_eco4_sales, forecasted_eco4_sales,
+             forecasted_eco4_change],
+            ["GBIS", sept_expected_gbis, oct_gbis_sales, expected_remaining_gbis_sales, forecasted_gbis_sales,
+             forecasted_gbis_change],
+        ]
+    )
+
+    # Multiply by historical rates to get revenue
+    # For ECO4, this is ~£1456 and for GBIS it's ~£432
+    historical_gbis_price = historical_scheme_rates[
+        historical_scheme_rates["Scheme"] == "GBIS"
+        ]["Historical rates"].iloc[0]
+
+    historical_eco4_price = historical_scheme_rates[
+        historical_scheme_rates["Scheme"] == "Eco 4"
+        ]["Historical rates"].iloc[0]
+
+    aggregates["Sept Expected Revenue"] = np.where(
+        aggregates["Scheme"] == "ECO4",
+        aggregates["Sept Expected"] * historical_eco4_price,
+        aggregates["Sept Expected"] * historical_gbis_price
+    )
+
+    aggregates["Completed Revenue"] = np.where(
+        aggregates["Scheme"] == "ECO4",
+        aggregates["Oct Completed"] * historical_eco4_price,
+        aggregates["Oct Completed"] * historical_gbis_price
+    )
+
+    # We use the new rates for the forecasted revenue
+    aggregates["Forecasted Remaining Revenue"] = np.where(
+        aggregates["Scheme"] == "ECO4",
+        aggregates["Forecasted Remaining Sales"] * ECO4_NEW_RATES,
+        aggregates["Forecasted Remaining Sales"] * GBIS_NEW_RATES
+    )
+
+    # We also calculate the forecasted remaining revenue at the original price
+    aggregates["Forecasted Remaining Revenue (original price)"] = np.where(
+        aggregates["Scheme"] == "ECO4",
+        aggregates["Forecasted Remaining Sales"] * historical_eco4_price,
+        aggregates["Forecasted Remaining Sales"] * historical_gbis_price
+    )
+
+    aggregates["Forecasted Revenue"] = aggregates["Completed Revenue"] + aggregates["Forecasted Remaining Revenue"]
+
+    # Forecasted revenue with original price
+    aggregates["Forecasted Revenue (original price)"] = (
+        aggregates["Completed Revenue"] + aggregates["Forecasted Remaining Revenue (original price)"]
+    )
+
+    # Create a totals row which sums up the two rows
+
+    forecasted_change_vs_sept = 100 * (
+        aggregates["Forecasted Total Sales"].sum() - aggregates["Sept Expected"].sum()
+    ) / aggregates["Sept Expected"].sum()
+
+    aggregates = pd.concat(
+        [
+            aggregates,
+            pd.DataFrame(
+                [
+                    ["Total", aggregates["Sept Expected"].sum(), aggregates["Oct Completed"].sum(),
+                     aggregates["Forecasted Remaining Sales"].sum(), aggregates["Forecasted Total Sales"].sum(),
+                     forecasted_change_vs_sept,
+                     aggregates["Sept Expected Revenue"].sum(), aggregates["Completed Revenue"].sum(),
+                     aggregates["Forecasted Remaining Revenue"].sum(),
+                     aggregates["Forecasted Remaining Revenue (original price)"].sum(),
+                     aggregates["Forecasted Revenue"].sum(),
+                     aggregates["Forecasted Revenue (original price)"].sum(),
+                     ]
+                ],
+                columns=aggregates.columns
+            )
+        ]
+    )
+
+    # For each property in the asset list, we now calculate an average conversion rate to ECO4 and GBIS
+    # We do this by taking the forecasted sales values for each schemes and dividing by the number of properties
+
+    number_properties = sales_data_formatted["ASSET LIST no."].sum()
+    eco4_conversion_rate = forecasted_eco4_sales / number_properties
+    gbis_conversion_rate = forecasted_gbis_sales / number_properties
+
+    # We also attribute a future value per property
+    future_eco4_value = ECO4_NEW_RATES * eco4_conversion_rate
+    future_gbis_value = GBIS_NEW_RATES * gbis_conversion_rate
+
+    # We also calculate a revenue figure for the old rates
+    historical_eco4_value = historical_eco4_price * eco4_conversion_rate
+    historical_gbis_value = historical_gbis_price * gbis_conversion_rate
+
+    # For the HAs that have not begun selling, we estimate the value of the projects
+    # We start with some problem HAs
+
+    # HA 7, HA 24, HA 25
+    # These HAs have no sales data, so we use the expected figures
+
+    problem_has_data = nov_ha_data[
+        (nov_ha_data["HA Name"].isin(["HA 7", "HA 24", "HA 25"]))
+    ].copy()
+    # Merge on the september expected figures
+    problem_has_data = problem_has_data.merge(
+        september_figures[["Redacted HA", "ECO4", "GBIS"]].rename(
+            columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"}
+        ),
+        how="left",
+        on="HA Name",
+    )
+    # Fill NAs
+    problem_has_data["Sept Expected ECO4"] = problem_has_data["Sept Expected ECO4"].fillna(0)
+    problem_has_data["Sept Expected GBIS"] = problem_has_data["Sept Expected GBIS"].fillna(0)
+
+    # We now calculate the expected ECO4 and GBIS sales based on the average conversion rates
+    problem_has_data["Expected ECO4 Sales"] = problem_has_data["ASSET LIST no."] * eco4_conversion_rate
+    problem_has_data["Expected GBIS Sales"] = problem_has_data["ASSET LIST no."] * gbis_conversion_rate
+
+    # Filter just on columns we're interested in
+    problem_has_data = problem_has_data[[
+        "HA Name",
+        "ASSET LIST no.",
+        "Sept Expected ECO4",
+        "Sept Expected GBIS",
+        "ECO4",
+        "GBIS",
+        "Expected ECO4 Sales",
+        "Expected GBIS Sales"
+    ]].rename(
+        columns={
+            "ECO4": "Nov Expected ECO4",
+            "GBIS": "Nov Expected GBIS",
+        }
+    )
+
+    # Fill NAs
+    problem_has_data["Nov Expected ECO4"] = problem_has_data["Nov Expected ECO4"].fillna(0)
+    problem_has_data["Nov Expected GBIS"] = problem_has_data["Nov Expected GBIS"].fillna(0)
+
+    # We calculate HA level Sept, Nov expected revenue, based on historical rates and then forecasted revenue
+    problem_has_data["Sept Expected ECO4 Value"] = problem_has_data["Sept Expected ECO4"] * historical_eco4_price
+    problem_has_data["Sept Expected GBIS Value"] = problem_has_data["Sept Expected GBIS"] * historical_gbis_price
+
+    problem_has_data["Nov Expected ECO4 Value"] = problem_has_data["Nov Expected ECO4"] * historical_eco4_price
+    problem_has_data["Nov Expected GBIS Value"] = problem_has_data["Nov Expected GBIS"] * historical_gbis_price
+
+    problem_has_data["Forecasted ECO4 Revenue"] = problem_has_data["ASSET LIST no."] * future_eco4_value
+    problem_has_data["Forecasted GBIS Revenue"] = problem_has_data["ASSET LIST no."] * future_gbis_value
+
+    # Totals
+    problem_has_data["Sept Expected Total Value"] = problem_has_data["Sept Expected ECO4 Value"] + \
+                                                    problem_has_data["Sept Expected GBIS Value"]
+    problem_has_data["Nov Expected Total Value"] = problem_has_data["Nov Expected ECO4 Value"] + \
+                                                   problem_has_data["Nov Expected GBIS Value"]
+    problem_has_data["Forecasted Total Revenue"] = problem_has_data["Forecasted ECO4 Revenue"] + \
+                                                   problem_has_data["Forecasted GBIS Revenue"]
+
+    # We calculate a total expected value for September, November and then forecasted
+    problem_has_expected_eco4_value = problem_has_data["Sept Expected ECO4"].sum() * historical_eco4_price
+    problem_has_expected_gbis_value = problem_has_data["Sept Expected GBIS"].sum() * historical_gbis_price
+    problem_has_expected_total_value = problem_has_expected_eco4_value + problem_has_expected_gbis_value
+
+    problem_has_nov_eco4_value = problem_has_data["Nov Expected ECO4"].sum() * historical_eco4_price
+    problem_has_nov_gbis_value = problem_has_data["Nov Expected GBIS"].sum() * historical_gbis_price
+    problem_has_nov_total_value = problem_has_nov_eco4_value + problem_has_nov_gbis_value
+
+    forecasted_eco4_value = problem_has_data["ASSET LIST no."].sum() * future_eco4_value
+    forecasted_gbis_value = problem_has_data["ASSET LIST no."].sum() * future_gbis_value
+    problem_has_forecasted_total_value = forecasted_eco4_value + forecasted_gbis_value
+
+    problem_has_summary = pd.DataFrame(
+        columns=["Scheme", "Sept Expected", "Nov Expected", "Forecasted"],
+        data=[
+            ["ECO4", problem_has_expected_eco4_value, problem_has_nov_eco4_value, forecasted_eco4_value],
+            ["GBIS", problem_has_expected_gbis_value, problem_has_nov_gbis_value, forecasted_gbis_value],
+            ["Total", problem_has_expected_total_value, problem_has_nov_total_value, problem_has_forecasted_total_value]
+        ]
+    )
+
+    # We now also estimate the value of the remaining HAs based on historical sales performance and new rates
+    # We take the HAs that are not in the sales data
+    remaining_has = nov_ha_data[
+        ~nov_ha_data["HA Name"].isin(sales_data_formatted["HA Name"])
+    ].copy()
+
+    # Merge on the september expected figures
+    remaining_has = remaining_has.merge(
+        september_figures[["Redacted HA", "ECO4", "GBIS"]].rename(
+            columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"}
+        ),
+        how="left",
+        on="HA Name",
+    )
+
+    # We update the asset list size for HA 33, because they do not have access to the full portfolio
+    remaining_has.loc[remaining_has["HA Name"] == "HA 33", "ASSET LIST no."] = 20699
+    # We also remove HA 17
+    remaining_has = remaining_has[~remaining_has["HA Name"].isin(["HA 17"])]
+
+    # We now calculate the expected ECO4 and GBIS sales based on the average conversion rates
+    remaining_has["Expected ECO4 Sales"] = remaining_has["ASSET LIST no."] * eco4_conversion_rate
+    remaining_has["Expected GBIS Sales"] = remaining_has["ASSET LIST no."] * gbis_conversion_rate
+
+    # Filter just on columns we're interested in
+    remaining_has = remaining_has[[
+        "HA Name",
+        "ASSET LIST no.",
+        "Sept Expected ECO4",
+        "Sept Expected GBIS",
+        "ECO4",
+        "GBIS",
+    ]].rename(
+        columns={
+            "ECO4": "Nov Expected ECO4",
+            "GBIS": "Nov Expected GBIS",
+        }
+    )
+
+    remaining_has = remaining_has.fillna(0)
+
+    # We take just HAs that had an initial september expectation for ECO4 or GBIS, or that now have a Nov expectation
+    remaining_has = remaining_has[
+        (remaining_has["Sept Expected ECO4"] > 0) | (remaining_has["Sept Expected GBIS"] > 0) |
+        (remaining_has["Nov Expected ECO4"] > 0) | (remaining_has["Nov Expected GBIS"] > 0)
+        ]
+
+    # Expected sales based on asset list size and conversion rate
+    remaining_has["Forecasted Sales ECO4"] = remaining_has["ASSET LIST no."] * eco4_conversion_rate
+    remaining_has["Forecasted Sales GBIS"] = remaining_has["ASSET LIST no."] * gbis_conversion_rate
+
+    # Calculate the total expected value for September and November
+    remaining_has["Sept Expected ECO4 Value"] = remaining_has["Sept Expected ECO4"] * historical_eco4_price
+    remaining_has["Sept Expected GBIS Value"] = remaining_has["Sept Expected GBIS"] * historical_gbis_price
+
+    remaining_has["Nov Expected ECO4 Value"] = remaining_has["Nov Expected ECO4"] * historical_eco4_price
+    remaining_has["Nov Expected GBIS Value"] = remaining_has["Nov Expected GBIS"] * historical_gbis_price
+
+    # Calculate forecasted revenue
+    remaining_has["Forecasted ECO4 Revenue"] = remaining_has["ASSET LIST no."] * future_eco4_value
+    remaining_has["Forecasted GBIS Revenue"] = remaining_has["ASSET LIST no."] * future_gbis_value
+
+    # We also calculate forecasted revenue with the original price
+    remaining_has["Forecasted ECO4 Revenue (original price)"] = remaining_has["ASSET LIST no."] * historical_eco4_value
+    remaining_has["Forecasted GBIS Revenue (original price)"] = remaining_has["ASSET LIST no."] * historical_gbis_value
+
+    # Calculate totals for each scheme
+    remaining_has_september_eco4_sales = remaining_has["Sept Expected ECO4"].sum()
+    remaining_has_september_gbis_sales = remaining_has["Sept Expected GBIS"].sum()
+
+    remaining_has_november_eco4_sales = remaining_has["Nov Expected ECO4"].sum()
+    remaining_has_november_gbis_sales = remaining_has["Nov Expected GBIS"].sum()
+
+    remaining_has_forecasted_eco4_sales = remaining_has["Forecasted Sales ECO4"].sum()
+    remaining_has_forecasted_gbis_sales = remaining_has["Forecasted Sales GBIS"].sum()
+
+    remaining_has_september_eco4_value = remaining_has["Sept Expected ECO4 Value"].sum()
+    remaining_has_september_gbis_value = remaining_has["Sept Expected GBIS Value"].sum()
+
+    remaining_has_november_eco4_value = remaining_has["Nov Expected ECO4 Value"].sum()
+    remaining_has_november_gbis_value = remaining_has["Nov Expected GBIS Value"].sum()
+
+    remaining_has_forecasted_eco4_value = remaining_has["Forecasted ECO4 Revenue"].sum()
+    remaining_has_forecasted_gbis_value = remaining_has["Forecasted GBIS Revenue"].sum()
+
+    remaining_has_forecasted_eco4_value_original_price = remaining_has["Forecasted ECO4 Revenue (original price)"].sum()
+    remaining_has_forecasted_gbis_value_original_price = remaining_has["Forecasted GBIS Revenue (original price)"].sum()
+
+    # Calculate the change in forecasted sales against the September expected sales
+    remaining_has_foecast_change_eco4 = 100 * (
+        remaining_has["Forecasted Sales ECO4"].sum() - remaining_has["Sept Expected ECO4"].sum()
+    ) / remaining_has["Sept Expected ECO4"].sum()
+
+    remaining_has_foecast_change_gbis = 100 * (
+        remaining_has["Forecasted Sales GBIS"].sum() - remaining_has["Sept Expected GBIS"].sum()
+    ) / remaining_has["Sept Expected GBIS"].sum()
+
+    # Total change
+    remaining_has_foecast_change_total = 100 * (
+        remaining_has["Forecasted Sales ECO4"].sum() + remaining_has["Forecasted Sales GBIS"].sum() -
+        remaining_has["Sept Expected ECO4"].sum() - remaining_has["Sept Expected GBIS"].sum()
+    ) / (remaining_has["Sept Expected ECO4"].sum() + remaining_has["Sept Expected GBIS"].sum())
+
+    asset_list_size = remaining_has["ASSET LIST no."].sum()
+
+    # Create a summary table of the rest with the totals for ECO4, GBIS and then a total row
+    remaining_has_aggregate = pd.DataFrame(
+        columns=["Scheme", "Asset List Size", "Sept Expected Sales", "Nov Expected Sales", "Forecasted Sales",
+                 "Forecasted Change vs Sept",
+                 "Sept Expected Value", "Nov Expected Value", "Forecasted Value", "Forecasted Value (original price)"],
+        data=[
+            [
+                "ECO4", asset_list_size, remaining_has_september_eco4_sales, remaining_has_november_eco4_sales,
+                remaining_has_forecasted_eco4_sales, remaining_has_foecast_change_eco4,
+                remaining_has_september_eco4_value,
+                remaining_has_november_eco4_value, remaining_has_forecasted_eco4_value,
+                remaining_has_forecasted_eco4_value_original_price
+            ],
+            [
+                "GBIS", asset_list_size, remaining_has_september_gbis_sales, remaining_has_november_gbis_sales,
+                remaining_has_forecasted_gbis_sales, remaining_has_foecast_change_gbis,
+                remaining_has_september_gbis_value,
+                remaining_has_november_gbis_value, remaining_has_forecasted_gbis_value,
+                remaining_has_forecasted_gbis_value_original_price
+            ],
+            [
+                "Total", asset_list_size, remaining_has_september_eco4_sales + remaining_has_september_gbis_sales,
+                                          remaining_has_november_eco4_sales + remaining_has_november_gbis_sales,
+                                          remaining_has_forecasted_eco4_sales + remaining_has_forecasted_gbis_sales,
+                remaining_has_foecast_change_total,
+                                          remaining_has_september_eco4_value + remaining_has_september_gbis_value,
+                                          remaining_has_november_eco4_value + remaining_has_november_gbis_value,
+                                          remaining_has_forecasted_eco4_value + remaining_has_forecasted_gbis_value,
+                                          remaining_has_forecasted_eco4_value_original_price +
+                                          remaining_has_forecasted_gbis_value_original_price
+            ]
+        ]
+    )
+
+    # Calculate pipeline value
+    pipeline_value = aggregates[["Scheme", "Completed Revenue", "Forecasted Remaining Revenue"]].merge(
+        remaining_has_aggregate[["Scheme", "Forecasted Value"]].rename(
+            columns={"Forecasted Value": "Forecasted Revenue, Unconfirmed HAs"}
+        ), how="inner", on="Scheme"
+    )
+
+    # Calculate the total
+    pipeline_value["Total Value"] = (
+        pipeline_value["Completed Revenue"] + pipeline_value["Forecasted Remaining Revenue"] + pipeline_value[
+        "Forecasted Revenue, Unconfirmed HAs"]
+    )
+
+    # TODO: Insert model figures
+    model_results = pd.DataFrame(
+        [
+            {
+                # This one, we don't have sales data
+                "HA Name": "HA 15",
+                "Model Expected Additional ECO4 (unit level)": None,
+                "Model Expected Total ECO4 (unit level)": 296,
+                "Model Expected Additional GBIS (unit level)": None,
+                "Model Expected Total GBIS (unit level)": 209,
+            },
+            {
+                "HA Name": "HA 16",
+                # Old before re-run
+                # "Model Expected Additional ECO4 (unit level)": 418,
+                # "Model Expected Total ECO4 (unit level)": 1820,
+                # "Model Expected Additional GBIS (unit level)": 576,
+                # "Model Expected Total GBIS (unit level)": 612,
+
+                # IN the partial sales data, WFT have completed 1407 ECO4, 36 GBIS
+                "Model Expected Additional ECO4 (unit level)": 411 + 342 + 235,
+                "Model Expected Total ECO4 (unit level)": 1407 + 411 + 342 + 235,
+                "Model Expected Additional GBIS (unit level)": 223,
+                "Model Expected Total GBIS (unit level)": 36 + 223,
+            },
+            {
+                "HA Name": "HA 24",
+                "Model Expected Additional ECO4 (unit level)": 224,
+                "Model Expected Total ECO4 (unit level)": 848,
+                "Model Expected Additional GBIS (unit level)": 552,
+                "Model Expected Total GBIS (unit level)": 552,
+            },
+            {
+                "HA Name": "HA 25",
+                "Model Expected Additional ECO4 (unit level)": None,
+                "Model Expected Total ECO4 (unit level)": 1709 + 59,
+                "Model Expected Additional GBIS (unit level)": None,
+                "Model Expected Total GBIS (unit level)": 2004 + 107,
+            }
+        ]
+    )
+
+    sales_data_formatted["Remaining ECO4 Sales"] = (
+        sales_data_formatted["Forecasted ECO4 Sales"] - sales_data_formatted["Oct ECO4 Sales (adjusted for variance)"]
+    )
+
+    sales_data_formatted["Remaining GBIS Sales"] = (
+        sales_data_formatted["Forecasted GBIS Sales"] - sales_data_formatted["Oct GBIS Sales (adjusted for variance)"]
+    )
+
+    sales_data_formatted["Completed ECO4 Revenue"] = (sales_data_formatted[
+                                                          "Oct ECO4 Sales (adjusted for variance)"] *
+                                                      historical_eco4_price)
+    sales_data_formatted["Completed GBIS Revenue"] = (sales_data_formatted[
+                                                          "Oct GBIS Sales (adjusted for variance)"] *
+                                                      historical_gbis_price)
+
+    ha_subset_with_sales = ["HA 15", "HA 16", "HA 24"]
+
+    has_subset_with_sales_value = sales_data_formatted[
+        sales_data_formatted["HA Name"].isin(ha_subset_with_sales)
+    ].copy()[
+        [
+            "HA Name",
+            "Oct ECO4 Sales (adjusted for variance)",
+            "Oct GBIS Sales (adjusted for variance)",
+            "Remaining ECO4 Sales",
+            "Remaining GBIS Sales",
+            "Forecasted ECO4 Sales",
+            "Forecasted GBIS Sales",
+            "Completed ECO4 Revenue",
+            "Completed GBIS Revenue"
+        ]
+    ]
+
+    has_subset_with_sales_value["Remaining ECO4 Revenue"] = has_subset_with_sales_value[
+                                                                "Remaining ECO4 Sales"] * ECO4_NEW_RATES
+    has_subset_with_sales_value["Remaining GBIS Revenue"] = has_subset_with_sales_value[
+                                                                "Remaining GBIS Sales"] * GBIS_NEW_RATES
+
+    has_subset_with_sales_value["Remaining Total Revenue"] = (
+        has_subset_with_sales_value["Remaining ECO4 Revenue"] + has_subset_with_sales_value["Remaining GBIS Revenue"]
+    )
+
+    model_results["Model Expected Additional ECO4 Revenue"] = (
+        model_results["Model Expected Additional ECO4 (unit level)"] * ECO4_NEW_RATES
+    )
+
+    model_results["Model Expected Additional GBIS revenue"] = (
+        model_results["Model Expected Additional GBIS (unit level)"] * GBIS_NEW_RATES
+    )
+
+    model_results["Model Expected Additional Total Revenue"] = (
+        model_results["Model Expected Additional ECO4 Revenue"] + model_results[
+        "Model Expected Additional GBIS revenue"]
+    )
+
+    # Show more columns with pandas
+    pd.set_option('display.max_rows', 500)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
+
+    # Look at HA 16
+    ha16_model = model_results[model_results["HA Name"] == "HA 16"]
+    has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 16"]
+
+    # WFT: For HA 16: 4,598,190 ECO4, 57,000 GBIS
+    # Model:
+
+    # Look at HA 24
+    ha24_model = model_results[model_results["HA Name"] == "HA 24"]
+    has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 24"]
+
+    # Look at HA 15
+    ha15_data = has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 15"]
+    ha15_portfolio_value = ha15_data["Completed ECO4 Revenue"] + ha15_data[
+        "Completed GBIS Revenue"] + ha15_data["Remaining Total Revenue"]
+    # # This doesn't have sales data so in the model analysis, we just value the ha as a whole
+    ha15_model = model_results[model_results["HA Name"] == "HA 15"]
+    ha15_value = ha15_model["Model Expected Total ECO4 (unit level)"].iloc[0] * ECO4_NEW_RATES + \
+                 ha15_model["Model Expected Total GBIS (unit level)"].iloc[0] * GBIS_NEW_RATES
+
+    model_results["Expected ECO4 Revenue"] = model_results["Model Expected Total ECO4 (unit level)"] * ECO4_NEW_RATES
+    model_results["Expected GBIS Revenue"] = model_results["Model Expected Total GBIS (unit level)"] * GBIS_NEW_RATES
+    model_results["Expected Total Revenue"] = model_results["Expected ECO4 Revenue"] + model_results[
+        "Expected GBIS Revenue"]
+    model_results[model_results["HA Name"].isin(["HA 15"])]
+
+    # We now create a final excel with all of the data
+    # We want:
+    # 1) aggregates
+    # 2) sales_data_formatted
+    # 3) remaining_has_aggregate
+    # 4) remaining_has
+    # 5) problem_has_summary
+
+    # Function to get the maximum column width
+    def get_col_widths(dataframe):
+        # Index width first (longest of the index values / index name), then one width per column (label vs values)
+        idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
+        # NOTE(review): entry 0 is the index width, but the sheets below are written with index=False - confirm
+        return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(col)) for col in dataframe.columns]
+
+    # Create a Pandas Excel writer using XlsxWriter as the engine
+    with pd.ExcelWriter('HA Pipeline Analysis.xlsx', engine='xlsxwriter') as writer:
+        # Write each dataframe to a different worksheet without the index
+        for df, sheet in [(aggregates, 'Forecasted Sales'),
+                          (sales_data_formatted, 'Sales Data'),
+                          (remaining_has_aggregate, 'Remaining HAs Value'),
+                          (remaining_has, 'Remaining HAs data'),
+                          (pipeline_value, 'Pipeline Value'),
+                          (problem_has_summary, 'Problem HAs Analysis'),
+                          (problem_has_data, 'Problem HAs Data')
+
+                          ]:
+
+            df.to_excel(writer, sheet_name=sheet, index=False)
+
+            # Auto-adjust columns' width
+            for i, width in enumerate(get_col_widths(df)):
+                writer.sheets[sheet].set_column(i, i, width)
diff --git a/etl/eligibility/ha_15_32/cancellation.py b/etl/eligibility/ha_15_32/cancellation.py
new file mode 100644
index 00000000..849add45
--- /dev/null
+++ b/etl/eligibility/ha_15_32/cancellation.py
@@ -0,0 +1,113 @@
+import openpyxl
+import pandas as pd
+import numpy as np
+
+
+def get_excel_survey_list(workbook_path, worksheet_name=None):
+    """
+    Read a survey worksheet into a DataFrame, adding the fill colour of each row's first cell as `row_colour`.
+    :param workbook_path: path of the Excel workbook to open
+    :param worksheet_name: sheet to read; the workbook's active sheet is used when None
+    :return: one row per worksheet row below the header; `row_colour` is None for colour index '00000000'
+    """
+    workbook = openpyxl.load_workbook(workbook_path)
+    sheet = workbook.active if worksheet_name is None else workbook[worksheet_name]
+
+    records, fill_colours = [], []
+    # values_only=False yields cell objects, so the fill can be read alongside the values
+    for cells in sheet.iter_rows(min_row=2, values_only=False):
+        records.append([cell.value for cell in cells])
+        colour_index = cells[0].fill.start_color.index
+        fill_colours.append(None if colour_index == '00000000' else colour_index)
+
+    # Row 1 is assumed to hold the column headers
+    survey_list = pd.DataFrame(records, columns=[cell.value for cell in sheet[1]])
+    survey_list["row_colour"] = fill_colours
+    return survey_list
+
+
+def load_data():
+    """
+    Load the HA 16, HA 24 and HA 25 survey lists and combine them into one cancellation dataset.
+    :return: DataFrame of HA, survey_status and cancellation_reason, excluding "no update"/"loft remaining" rows
+    """
+    # HA 16 and HA 24 are ECO4 lists read from the active sheet; HA 25 is an ECO3 list read from the CAVITY sheet
+    ha16_survey_list = get_excel_survey_list('etl/eligibility/ha_15_32/HESTIA- HA 16 ECO4 SURVEY LIST.xlsx')
+    ha24_survey_list = get_excel_survey_list('etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx')
+    ha25_survey_list = get_excel_survey_list(
+        'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx', worksheet_name="CAVITY"
+    )
+
+    # Drop columns that contain no values at all (how='all'); presumably these are the unnamed columns
+    ha25_survey_list = ha25_survey_list.dropna(axis=1, how='all')
+
+    # Standardise the installation status column
+    ha16_survey_list["survey_status"] = ha16_survey_list["INSTALLED OR CANCELLED"].copy()
+    ha16_survey_list["survey_status"] = ha16_survey_list["survey_status"].replace(
+        {
+            "NO UPDATE - CHECKED 2.10.23": "no update",
+            "NO UPDATE - CHECKED 18.12.23": "no update",
+            "INSTALLED": "installed",
+            "CANCELLED": "cancelled",
+            "LOFT STILL TO BE INSTALLED": "loft remaining",
+        }
+    )
+
+    ha24_survey_list["survey_status"] = ha24_survey_list["INSTALLED OR CANCELLED"].copy()
+    ha24_survey_list["survey_status"] = ha24_survey_list["survey_status"].replace(
+        {
+            "NO UPDATE - CHECKED 21.11.23": "no update",
+            "NO UPDATE - CHECKED 18.12.23": "no update",
+            "INSTALLED": "installed",
+            "CANCELLED": "cancelled",
+            "LOFT STILL TO BE INSTALLED": "loft remaining",
+            "SEE NOTES >>": "see notes",
+        }
+    )
+
+    # HA 25 is prepared differently: its status is derived from the fill colour of each row's first cell
+    ha25_colour_statuses = {
+        "FF7030A0": "installed",
+        "FF92D050": "installed",
+        "FF38FD23": "installed",
+        "FFFF0000": "cancelled",
+        "FFFFFF00": "filler row - drop",
+    }
+    # Any colour not listed above (including a None colour) is reported as "unknown"
+    ha25_survey_list["survey_status"] = ha25_survey_list["row_colour"].map(ha25_colour_statuses).fillna("unknown")
+    # .copy() so the column assignments below write to an independent frame rather than a filtered slice
+    ha25_survey_list = ha25_survey_list[ha25_survey_list["survey_status"] != "filler row - drop"].copy()
+
+    # We standardise the cancellation reasons - just create a new column
+    ha16_survey_list["cancellation_reason"] = ha16_survey_list["INSTALLERS NOTES ; REASONS FOR CANCELLATIONS"].copy()
+    ha24_survey_list["cancellation_reason"] = ha24_survey_list["INSTALLERS NOTES ; REASONS FOR CANCELLATIONS"].copy()
+    # There's no cancellation reason for HA25
+    ha25_survey_list["cancellation_reason"] = "No reason provided"
+
+    # Combine the dataframes
+    ha16_survey_list["HA"] = "HA 16"
+    ha24_survey_list["HA"] = "HA 24"
+    ha25_survey_list["HA"] = "HA 25"
+
+    cancellation_data = pd.concat(
+        [
+            ha16_survey_list[["HA", "survey_status", "cancellation_reason"]],
+            ha24_survey_list[["HA", "survey_status", "cancellation_reason"]],
+            ha25_survey_list[["HA", "survey_status", "cancellation_reason"]]
+        ]
+    )
+
+    # Take just rows that we have a confirmed status for
+    cancellation_data = cancellation_data[~cancellation_data["survey_status"].isin(["no update", "loft remaining"])]
+
+    return cancellation_data
+
+
+def app():
+    """
+    Entry point for analysing the cancellation data provided by Warmfront; at present it only loads the data.
+    :return: None
+    """
+
+    # Cancellations of jobs that completed invasive surveys and the installer could not conclude the work
+    survey_outcomes = load_data()
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 1ed95a30..e94babcd 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -882,6 +882,13 @@ def get_epc_data(
     return outputs
 
 
+def get_col_widths(dataframe):
+    """Return [index width, *column widths]; each width is the longest string among a label and its values."""
+    idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
+    # str(col) so that non-string column labels (e.g. ints) do not raise in len()
+    return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(str(col))) for col in dataframe.columns]
+
+
 def analyse_ha_data(outputs, loader):
     """
     The approach we take within this function is the following:
@@ -901,7 +908,11 @@ def analyse_ha_data(outputs, loader):
     :return:
     """
 
+    eco4_rate = 1710
+    gbis_rate = 600
+
     ha_analysis_results = []
+    ha_revenue_results = []
     for ha_name, datasets in outputs.items():
 
         inputs = [x for k, x in loader.data.items() if k == ha_name][0]
@@ -1034,7 +1045,8 @@ def analyse_ha_data(outputs, loader):
             (
                 (remaining_eco4_df["eco4_message"] == "sap too high") &
                 remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) &
-                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
+                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) &
+                pd.isnull(remaining_eco4_df["prospect_type"])
             ),
             "ECO4 if SAP downgrade",
             remaining_eco4_df["prospect_type"]
@@ -1048,7 +1060,7 @@ def analyse_ha_data(outputs, loader):
                 remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) &
                 remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
             ),
-            "Filled cavity - subject to CIGA check",
+            "ECO4 - Filled cavity - subject to CIGA check",
             remaining_eco4_df["prospect_type"]
         )
 
@@ -1064,7 +1076,7 @@ def analyse_ha_data(outputs, loader):
 
         # 5) Looks like GBIS instead
         remaining_eco4_df["prospect_type"] = np.where(
-            (remaining_eco4_df["gbis_eligible"] == True),
+            (remaining_eco4_df["gbis_eligible"] == True) & pd.isnull(remaining_eco4_df["prospect_type"]),
             "Looks like GBIS",
             remaining_eco4_df["prospect_type"]
         )
@@ -1094,16 +1106,17 @@ def analyse_ha_data(outputs, loader):
         # 2) GBIS candidates that look like strict ECO4 candidates
         remaining_gbis["prospect_type"] = np.where(
             (remaining_gbis["eco4_eligible"] == True),
-            "Upgradable to ECO4",
+            "GBIS - Upgradable to ECO4",
             remaining_gbis["prospect_type"]
         )
 
         # 3) Subject to CIGA check - Filled cavity
         remaining_gbis["prospect_type"] = np.where(
             (
-                remaining_gbis["eligibility_cavity_type"].isin(["full"])
+                remaining_gbis["eligibility_cavity_type"].isin(["full"]) &
+                pd.isnull(remaining_gbis["prospect_type"])
             ),
-            "Filled cavity - subject to CIGA check",
+            "GBIS - Filled cavity - subject to CIGA check",
             remaining_gbis["prospect_type"]
         )
 
@@ -1141,30 +1154,95 @@ def analyse_ha_data(outputs, loader):
             )
         ].copy()
 
-        ha_analysis_results.append({
+        # Perform some checks to make sure we have all of the values
+        remaining_eco4_dict = remaining_eco4_df["prospect_type"].value_counts().to_dict()
+        if n_remaining_properties_eco4 != sum([v for k, v in remaining_eco4_dict.items()]):
+            raise ValueError(
+                "Number of remaining properties does not match the number of properties in remaining ECO4 dict"
+            )
+
+        remaining_gbis_dict = remaining_gbis["prospect_type"].value_counts().to_dict()
+        if n_remaining_properties_gbis != sum([v for k, v in remaining_gbis_dict.items()]):
+            raise ValueError(
+                "Number of remaining properties does not match the number of properties in remaining GBIS dict"
+            )
+
+        to_append = {
+            "ha_name": ha_name,
             "n_properties_in_asset_list": n_properties_in_asset_list,
             ############
             # ECO4
             ############
             "properties_sold_eco4": properties_sold_eco4,
             "n_remaining_properties_eco4": n_remaining_properties_eco4,
-            **remaining_eco4_df["prospect_type"].value_counts().to_dict(),
+            **remaining_eco4_dict,
             ############
             # GBIS
             ############
             "properties_sold_gbis": properties_sold_gbis,
             "n_remaining_properties_gbis": n_remaining_properties_gbis,
-            **remaining_gbis["prospect_type"].value_counts().to_dict(),
+            **remaining_gbis_dict,
             ############
             # GBIS
             ############
             "n_eco4_surplus": eco4_surplus.shape[0],
             "n_gbis_surplus": gbis_surplus.shape[0],
-        })
+        }
+
+        ha_analysis_results.append(to_append)
+
+        revenue_to_append = {  # .get(..., 0): value_counts() omits prospect types that have no rows
+            "ha_name": ha_name,
+            "£ Remaining from asset list": (
+                n_remaining_properties_eco4 * eco4_rate + n_remaining_properties_gbis * gbis_rate
+            ),
+            "Of which: Strict": (
+                to_append.get('strict ECO4', 0) * eco4_rate + to_append.get('strict GBIS', 0) * gbis_rate +
+                to_append.get('GBIS - Upgradable to ECO4', 0) * gbis_rate
+            ),
+            "Of which: Subject to CIGA": (
+                to_append.get("ECO4 - Filled cavity - subject to CIGA check", 0) * eco4_rate +
+                to_append.get("GBIS - Filled cavity - subject to CIGA check", 0) * gbis_rate
+            ),
+            "Of which: Prospect, not perfect strict prospect": (
+                to_append.get("ECO4 prospect - empty cavity, loft insulation below regulation", 0) * eco4_rate +
+                to_append.get("ECO4 if SAP downgrade", 0) * eco4_rate
+            ),
+            "Of which: Potential downgrade to GBIS": to_append.get("Looks like GBIS", 0) * eco4_rate,
+            "Of which: Does not look like prospect": (
+                to_append.get("Does not look like ECO4 candidate", 0) * eco4_rate +
+                to_append.get("Does not look like GBIS candidate", 0) * gbis_rate
+            ),
+            "Surplus: Unidentified properties": eco4_surplus.shape[0] * eco4_rate + gbis_surplus.shape[0] * gbis_rate,
+            "Surplus: GBIS Updates to ECO4": to_append.get("GBIS - Upgradable to ECO4", 0) * (eco4_rate - gbis_rate)
+        }
+
+        # Sanity check: the "Of which" breakdown must add up to the headline figure, in either direction
+        if abs(revenue_to_append["£ Remaining from asset list"] - (
+            revenue_to_append["Of which: Strict"] + revenue_to_append["Of which: Subject to CIGA"] +
+            revenue_to_append["Of which: Prospect, not perfect strict prospect"] +
+            revenue_to_append["Of which: Potential downgrade to GBIS"] +
+            revenue_to_append["Of which: Does not look like prospect"]
+        )) > 1:
+            raise ValueError("Error between top level revenue figures and breakdown - investigate me")
+
+        ha_revenue_results.append(revenue_to_append)
 
     ha_analysis_results = pd.DataFrame(ha_analysis_results)
+    ha_revenue_results = pd.DataFrame(ha_revenue_results)
 
-    # Todo: create revenue figures and automate creation of excel
+    # Write both result tables to a single Excel workbook, one worksheet each
+    # Create a Pandas Excel writer using XlsxWriter as the engine
+    with pd.ExcelWriter('HA Analysis - batch3.xlsx', engine='xlsxwriter') as writer:
+        # Write each dataframe to a different worksheet without the index
+        for df, sheet in [(ha_revenue_results, 'Total Revenue'),
+                          (ha_analysis_results, 'By ECO4 and GBIS')]:
+
+            df.to_excel(writer, sheet_name=sheet, index=False)
+            # Auto-adjust columns' width. get_col_widths() returns the index width first, but the index is not
+            # written (index=False), so drop that entry to keep width i aligned with worksheet column i
+            for i, width in enumerate(get_col_widths(df)[1:]):
+                writer.sheets[sheet].set_column(i, i, width)
 
 
 def app():

From 6a5430d214d60c0075ed0ad6c38655d34c108a1b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 29 Jan 2024 12:33:33 +0000
Subject: [PATCH 48/48] Pulling ventilation from epc_record class

---
 backend/Property.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/Property.py b/backend/Property.py
index 82695b75..c9cad22f 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -83,7 +83,7 @@ class Property:
             "co2_emissions": epc_record.get("co2_emissions_current"),
         }
         self.ventilation = {
-            "ventilation": epc_record.prepared_epc.get("mechanical_ventilation"),
+            "ventilation": epc_record.get("mechanical_ventilation"),
         }
         self.solar_pv = {
             "solar_pv": epc_record.get("photo_supply"),